softfpu.pp 319 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558855985608561856285638564856585668567856885698570857185728573857485758576857785788579858085818582858385848585858685878588858985908591859285938594859585968597859885998600860186028603860486058606860786088609861086118612861386148615861686178618861986208621862286238624862586268627862886298630863186328633863486358636863786388639864086418642864386448645864686478648864986508651865286538654865586568657865886598660866186628663866486658666866786688669867086718672867386748675867686778678867986808681868286838684868586868687868886898690869186928693869486958696869786988699870087018702870387048705870687078708870987108711871287138714871587168717871887198720872187228723872487258726872787288729873087318732873387348735873687378738873987408741874287438744874587468747874887498750875187528753875487558756875787588759876087618762876387648765876687678768876987708771877287738774877587768777877887798780878187828783878487858786878787888789879087918792879387948795879687978798879988008801880288038804880588068807880888098810881188128813881488158816881788188819882088218822882388248825882688278828882988308831883288338834883588368837883888398840884188428843884488458846884788488849885088518852885388548855885688578858885988608861886288638864886588668867886888698870887188728873887488758876887788788879888088818882888388848885888688878888888988908891889288938894889588968897889888998900890189028903890489058906890789088909891089118912891389148915891689178918891989208921892289238924892589268927892889298930893189328933893489358936893789388939894089418942894389448945894689478948894989508951895289538954895589568957895889598960896189628963896489658966896789688969897089718972897389748975897689778978897989808981898289838984898589868987898889898990899189928993899489958996899789988999900090019002900390049005900690079008900990109011901290139014901590169017901890199020902190229023902490259026902790289029903090319032903390349035903690379038903990409041904290439044904590469047904890499050905190529053905490559056905790589059906090619062906390649065906690679068906990709071907290739074907590769077907890799080908190829083908490859086908790889089909090919092909390949095909690979098909991009101910291039104910591069107910891099110911191129113911491159116911791189119912091219122912391249125912691279128912991309131913291339134913591369137913891399140914191429143914491459146914791489149915091519152915391549155915691579158915991609161916291639164916591669167916891699170917191729173917491759176917791789179918091819182918391849185918691879188918991909191919291939194919591969197919891999200920192029203920492059206920792089209921092119212921392149215921692179218921992209221922292239224922592269227922892299230923192329233923492359236923792389239924092419242924392449245
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. float64 = record
  103. case byte of
  104. 1: (low,high : bits32);
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 2: (dummy : double);
  109. end;
  110. floatx80 = record
  111. case byte of
  112. 1: (low : qword;high : word);
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 2: (dummy : extended);
  117. end;
  118. float128 = record
  119. case byte of
  120. 1: (low,high : qword);
  121. // force the record to be aligned like a double
  122. // else *_to_double will fail for cpus like sparc
  123. // and avoid expensive unpacking/packing operations
  124. 2: (dummy : qword);
  125. end;
  126. {$else}
  127. float64 = record
  128. case byte of
  129. 1: (high,low : bits32);
  130. // force the record to be aligned like a double
  131. // else *_to_double will fail for cpus like sparc
  132. 2: (dummy : double);
  133. end;
  134. floatx80 = record
  135. case byte of
  136. 1: (high : word;low : qword);
  137. // force the record to be aligned like a double
  138. // else *_to_double will fail for cpus like sparc
  139. // and avoid expensive unpacking/packing operations
  140. 2: (dummy : qword);
  141. end;
  142. float128 = record
  143. case byte of
  144. 1: (high : qword;low : qword);
  145. // force the record to be aligned like a double
  146. // else *_to_double will fail for cpus like sparc
  147. // and avoid expensive unpacking/packing operations
  148. 2: (dummy : qword);
  149. end;
  150. {$endif}
  151. {$define FPC_SYSTEM_HAS_float64}
  152. {*
  153. -------------------------------------------------------------------------------
  154. Returns 1 if the double-precision floating-point value `a' is less than
  155. the corresponding value `b', and 0 otherwise. The comparison is performed
  156. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  157. -------------------------------------------------------------------------------
  158. *}
  159. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  160. {*
  161. -------------------------------------------------------------------------------
  162. Returns 1 if the double-precision floating-point value `a' is less than
  163. or equal to the corresponding value `b', and 0 otherwise. The comparison
  164. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  165. Arithmetic.
  166. -------------------------------------------------------------------------------
  167. *}
  168. Function float64_le(a: float64;b: float64): flag; compilerproc;
  169. {*
  170. -------------------------------------------------------------------------------
  171. Returns 1 if the double-precision floating-point value `a' is equal to
  172. the corresponding value `b', and 0 otherwise. The comparison is performed
  173. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  174. -------------------------------------------------------------------------------
  175. *}
  176. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  177. {*
  178. -------------------------------------------------------------------------------
  179. Returns the square root of the double-precision floating-point value `a'.
  180. The operation is performed according to the IEC/IEEE Standard for Binary
  181. Floating-Point Arithmetic.
  182. -------------------------------------------------------------------------------
  183. *}
  184. function float64_sqrt( a: float64 ): float64; compilerproc;
  185. {*
  186. -------------------------------------------------------------------------------
  187. Returns the remainder of the double-precision floating-point value `a'
  188. with respect to the corresponding value `b'. The operation is performed
  189. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  190. -------------------------------------------------------------------------------
  191. *}
  192. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  193. {*
  194. -------------------------------------------------------------------------------
  195. Returns the result of dividing the double-precision floating-point value `a'
  196. by the corresponding value `b'. The operation is performed according to the
  197. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  198. -------------------------------------------------------------------------------
  199. *}
  200. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  201. {*
  202. -------------------------------------------------------------------------------
  203. Returns the result of multiplying the double-precision floating-point values
  204. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  205. for Binary Floating-Point Arithmetic.
  206. -------------------------------------------------------------------------------
  207. *}
  208. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  209. {*
  210. -------------------------------------------------------------------------------
  211. Returns the result of subtracting the double-precision floating-point values
  212. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  213. for Binary Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of adding the double-precision floating-point values `a'
  220. and `b'. The operation is performed according to the IEC/IEEE Standard for
  221. Binary Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Rounds the double-precision floating-point value `a' to an integer,
  228. and returns the result as a double-precision floating-point value. The
  229. operation is performed according to the IEC/IEEE Standard for Binary
  230. Floating-Point Arithmetic.
  231. -------------------------------------------------------------------------------
  232. *}
  233. Function float64_round_to_int(a: float64) : float64; compilerproc;
  234. {*
  235. -------------------------------------------------------------------------------
  236. Returns the result of converting the double-precision floating-point value
  237. `a' to the single-precision floating-point format. The conversion is
  238. performed according to the IEC/IEEE Standard for Binary Floating-Point
  239. Arithmetic.
  240. -------------------------------------------------------------------------------
  241. *}
  242. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  243. {*
  244. -------------------------------------------------------------------------------
  245. Returns the result of converting the double-precision floating-point value
  246. `a' to the 32-bit two's complement integer format. The conversion is
  247. performed according to the IEC/IEEE Standard for Binary Floating-Point
  248. Arithmetic, except that the conversion is always rounded toward zero.
  249. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  250. the conversion overflows, the largest integer with the same sign as `a' is
  251. returned.
  252. -------------------------------------------------------------------------------
  253. *}
  254. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  255. {*
  256. -------------------------------------------------------------------------------
  257. Returns the result of converting the double-precision floating-point value
  258. `a' to the 32-bit two's complement integer format. The conversion is
  259. performed according to the IEC/IEEE Standard for Binary Floating-Point
  260. Arithmetic---which means in particular that the conversion is rounded
  261. according to the current rounding mode. If `a' is a NaN, the largest
  262. positive integer is returned. Otherwise, if the conversion overflows, the
  263. largest integer with the same sign as `a' is returned.
  264. -------------------------------------------------------------------------------
  265. *}
  266. Function float64_to_int32(a: float64): int32; compilerproc;
  267. {*
  268. -------------------------------------------------------------------------------
  269. Returns 1 if the single-precision floating-point value `a' is less than
  270. the corresponding value `b', and 0 otherwise. The comparison is performed
  271. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  272. -------------------------------------------------------------------------------
  273. *}
  274. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  275. {*
  276. -------------------------------------------------------------------------------
  277. Returns 1 if the single-precision floating-point value `a' is less than
  278. or equal to the corresponding value `b', and 0 otherwise. The comparison
  279. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  280. Arithmetic.
  281. -------------------------------------------------------------------------------
  282. *}
  283. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  284. {*
  285. -------------------------------------------------------------------------------
  286. Returns 1 if the single-precision floating-point value `a' is equal to
  287. the corresponding value `b', and 0 otherwise. The comparison is performed
  288. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  289. -------------------------------------------------------------------------------
  290. *}
  291. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  292. {*
  293. -------------------------------------------------------------------------------
  294. Returns the square root of the single-precision floating-point value `a'.
  295. The operation is performed according to the IEC/IEEE Standard for Binary
  296. Floating-Point Arithmetic.
  297. -------------------------------------------------------------------------------
  298. *}
  299. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  300. {*
  301. -------------------------------------------------------------------------------
  302. Returns the remainder of the single-precision floating-point value `a'
  303. with respect to the corresponding value `b'. The operation is performed
  304. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  305. -------------------------------------------------------------------------------
  306. *}
  307. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  308. {*
  309. -------------------------------------------------------------------------------
  310. Returns the result of dividing the single-precision floating-point value `a'
  311. by the corresponding value `b'. The operation is performed according to the
  312. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  313. -------------------------------------------------------------------------------
  314. *}
  315. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  316. {*
  317. -------------------------------------------------------------------------------
  318. Returns the result of multiplying the single-precision floating-point values
  319. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  320. for Binary Floating-Point Arithmetic.
  321. -------------------------------------------------------------------------------
  322. *}
  323. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  324. {*
  325. -------------------------------------------------------------------------------
  326. Returns the result of subtracting the single-precision floating-point values
  327. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  328. for Binary Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of adding the single-precision floating-point values `a'
  335. and `b'. The operation is performed according to the IEC/IEEE Standard for
  336. Binary Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Rounds the single-precision floating-point value `a' to an integer,
  343. and returns the result as a single-precision floating-point value. The
  344. operation is performed according to the IEC/IEEE Standard for Binary
  345. Floating-Point Arithmetic.
  346. -------------------------------------------------------------------------------
  347. *}
  348. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  349. {*
  350. -------------------------------------------------------------------------------
  351. Returns the result of converting the single-precision floating-point value
  352. `a' to the double-precision floating-point format. The conversion is
  353. performed according to the IEC/IEEE Standard for Binary Floating-Point
  354. Arithmetic.
  355. -------------------------------------------------------------------------------
  356. *}
  357. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  358. {*
  359. -------------------------------------------------------------------------------
  360. Returns the result of converting the single-precision floating-point value
  361. `a' to the 32-bit two's complement integer format. The conversion is
  362. performed according to the IEC/IEEE Standard for Binary Floating-Point
  363. Arithmetic, except that the conversion is always rounded toward zero.
  364. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  365. the conversion overflows, the largest integer with the same sign as `a' is
  366. returned.
  367. -------------------------------------------------------------------------------
  368. *}
  369. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  370. {*
  371. -------------------------------------------------------------------------------
  372. Returns the result of converting the single-precision floating-point value
  373. `a' to the 32-bit two's complement integer format. The conversion is
  374. performed according to the IEC/IEEE Standard for Binary Floating-Point
  375. Arithmetic---which means in particular that the conversion is rounded
  376. according to the current rounding mode. If `a' is a NaN, the largest
  377. positive integer is returned. Otherwise, if the conversion overflows, the
  378. largest integer with the same sign as `a' is returned.
  379. -------------------------------------------------------------------------------
  380. *}
  381. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  382. {*
  383. -------------------------------------------------------------------------------
  384. Returns the result of converting the 32-bit two's complement integer `a' to
  385. the double-precision floating-point format. The conversion is performed
  386. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. -------------------------------------------------------------------------------
  388. *}
  389. Function int32_to_float64( a: int32) : float64; compilerproc;
  390. {*
  391. -------------------------------------------------------------------------------
  392. Returns the result of converting the 32-bit two's complement integer `a' to
  393. the single-precision floating-point format. The conversion is performed
  394. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  395. -------------------------------------------------------------------------------
  396. *}
  397. Function int32_to_float32( a: int32): float32rec; compilerproc;
  398. {*----------------------------------------------------------------------------
  399. | Returns the result of converting the 64-bit two's complement integer `a'
  400. | to the double-precision floating-point format. The conversion is performed
  401. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  402. *----------------------------------------------------------------------------*}
  403. Function int64_to_float64( a: int64 ): float64; compilerproc;
  404. Function qword_to_float64( a: qword ): float64; compilerproc;
  405. {*----------------------------------------------------------------------------
  406. | Returns the result of converting the 64-bit two's complement integer `a'
  407. | to the single-precision floating-point format. The conversion is performed
  408. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  409. *----------------------------------------------------------------------------*}
  410. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  411. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  412. // +++
  413. function float32_to_int64( a: float32 ): int64;
  414. function float32_to_int64_round_to_zero( a: float32 ): int64;
  415. function float32_eq_signaling( a: float32; b: float32) : flag;
  416. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  417. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  418. function float32_is_signaling_nan( a : float32 ): flag;
  419. function float32_is_nan( a : float32 ): flag;
  420. function float64_to_int64( a: float64 ): int64;
  421. function float64_to_int64_round_to_zero( a: float64 ): int64;
  422. function float64_eq_signaling( a: float64; b: float64): flag;
  423. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  424. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  425. function float64_is_signaling_nan( a : float64 ): flag;
  426. function float64_is_nan( a : float64 ): flag;
  427. // ===
  428. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  429. {*----------------------------------------------------------------------------
  430. | Extended double-precision rounding precision
  431. *----------------------------------------------------------------------------*}
  432. var // threadvar!?
  433. floatx80_rounding_precision : int8 = 80;
  434. function int32_to_floatx80( a: int32 ): floatx80;
  435. function int64_to_floatx80( a: int64 ): floatx80;
  436. function qword_to_floatx80( a: qword ): floatx80;
  437. function float32_to_floatx80( a: float32 ): floatx80;
  438. function float64_to_floatx80( a: float64 ): floatx80;
  439. function floatx80_to_int32( a: floatx80 ): int32;
  440. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  441. function floatx80_to_int64( a: floatx80 ): int64;
  442. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  443. function floatx80_to_float32( a: floatx80 ): float32;
  444. function floatx80_to_float64( a: floatx80 ): float64;
  445. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  446. function floatx80_to_float128( a: floatx80 ): float128;
  447. {$endif FPC_SOFTFLOAT_FLOAT128}
  448. function floatx80_round_to_int( a: floatx80 ): floatx80;
  449. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  450. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  451. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  452. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  453. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  454. function floatx80_sqrt( a: floatx80 ): floatx80;
  455. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  456. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  457. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  458. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  459. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  460. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  461. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  462. function floatx80_is_nan(a : floatx80 ): flag;
  463. {$endif FPC_SOFTFLOAT_FLOATX80}
  464. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  465. function int32_to_float128( a: int32 ): float128;
  466. function int64_to_float128( a: int64 ): float128;
  467. function qword_to_float128( a: qword ): float128;
  468. function float32_to_float128( a: float32 ): float128;
  469. function float128_is_nan( a : float128): flag;
  470. function float128_is_signaling_nan( a : float128): flag;
  471. function float128_to_int32(a: float128): int32;
  472. function float128_to_int32_round_to_zero(a: float128): int32;
  473. function float128_to_int64(a: float128): int64;
  474. function float128_to_int64_round_to_zero(a: float128): int64;
  475. function float128_to_float32(a: float128): float32;
  476. function float128_to_float64(a: float128): float64;
  477. function float64_to_float128( a : float64) : float128;
  478. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  479. function float128_to_floatx80(a: float128): floatx80;
  480. {$endif FPC_SOFTFLOAT_FLOATX80}
  481. function float128_round_to_int(a: float128): float128;
  482. function float128_add(a: float128; b: float128): float128;
  483. function float128_sub(a: float128; b: float128): float128;
  484. function float128_mul(a: float128; b: float128): float128;
  485. function float128_div(a: float128; b: float128): float128;
  486. function float128_rem(a: float128; b: float128): float128;
  487. function float128_sqrt(a: float128): float128;
  488. function float128_eq(a: float128; b: float128): flag;
  489. function float128_le(a: float128; b: float128): flag;
  490. function float128_lt(a: float128; b: float128): flag;
  491. function float128_eq_signaling(a: float128; b: float128): flag;
  492. function float128_le_quiet(a: float128; b: float128): flag;
  493. function float128_lt_quiet(a: float128; b: float128): flag;
  494. {$endif FPC_SOFTFLOAT_FLOAT128}
  495. CONST
  496. {-------------------------------------------------------------------------------
  497. Software IEC/IEEE floating-point underflow tininess-detection mode.
  498. -------------------------------------------------------------------------------
  499. *}
  500. float_tininess_after_rounding = 0;
  501. float_tininess_before_rounding = 1;
  502. {*
  503. -------------------------------------------------------------------------------
  504. Underflow tininess-detection mode, statically initialized to default value.
  505. (The declaration in `softfloat.h' must match the `int8' type here.)
  506. -------------------------------------------------------------------------------
  507. *}
  508. var // threadvar!?
  509. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  510. {$endif not(defined(fpc_softfpu_implementation))}
  511. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  512. implementation
  513. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  514. {$if not(defined(fpc_softfpu_interface))}
  515. (*****************************************************************************)
  516. (*----------------------------------------------------------------------------*)
  517. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  518. (* division and square root approximations. (Can be specialized to target if *)
  519. (* desired.) *)
  520. (* ---------------------------------------------------------------------------*)
  521. (*****************************************************************************)
  522. { This procedure serves as a single access point to softfloat_exception_flags.
  523. It also helps to reduce code size a bit because softfloat_exception_flags is
  524. a threadvar. }
  525. procedure set_inexact_flag;
  526. begin
  527. include(softfloat_exception_flags,float_flag_inexact);
  528. end;
  529. {*----------------------------------------------------------------------------
  530. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  531. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  532. | input. If `zSign' is 1, the input is negated before being converted to an
  533. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  534. | is simply rounded to an integer, with the inexact exception raised if the
  535. | input cannot be represented exactly as an integer. However, if the fixed-
  536. | point input is too large, the invalid exception is raised and the largest
  537. | positive or negative integer is returned.
  538. *----------------------------------------------------------------------------*}
  539. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  540. var
  541. roundingMode: TFPURoundingMode;
  542. roundNearestEven: boolean;
  543. roundIncrement, roundBits: int8;
  544. z: int32;
  545. begin
  546. roundingMode := softfloat_rounding_mode;
  547. roundNearestEven := (roundingMode = float_round_nearest_even);
  548. roundIncrement := $40;
  549. if not roundNearestEven then
  550. begin
  551. if ( roundingMode = float_round_to_zero ) then
  552. begin
  553. roundIncrement := 0;
  554. end
  555. else begin
  556. roundIncrement := $7F;
  557. if ( zSign<>0 ) then
  558. begin
  559. if ( roundingMode = float_round_up ) then
  560. roundIncrement := 0;
  561. end
  562. else begin
  563. if ( roundingMode = float_round_down ) then
  564. roundIncrement := 0;
  565. end;
  566. end;
  567. end;
  568. roundBits := absZ and $7F;
  569. absZ := ( absZ + roundIncrement ) shr 7;
  570. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) );
  571. z := absZ;
  572. if ( zSign<>0 ) then
  573. z := - z;
  574. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  575. begin
  576. float_raise( float_flag_invalid );
  577. if zSign<>0 then
  578. result:=sbits32($80000000)
  579. else
  580. result:=$7FFFFFFF;
  581. exit;
  582. end;
  583. if ( roundBits<>0 ) then
  584. set_inexact_flag;
  585. result:=z;
  586. end;
  587. {*----------------------------------------------------------------------------
  588. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  589. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  590. | and returns the properly rounded 64-bit integer corresponding to the input.
  591. | If `zSign' is 1, the input is negated before being converted to an integer.
  592. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  593. | the inexact exception raised if the input cannot be represented exactly as
  594. | an integer. However, if the fixed-point input is too large, the invalid
  595. | exception is raised and the largest positive or negative integer is
  596. | returned.
  597. *----------------------------------------------------------------------------*}
  598. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  599. var
  600. roundingMode: TFPURoundingMode;
  601. roundNearestEven, increment: flag;
  602. z: int64;
  603. label
  604. overflow;
  605. begin
  606. roundingMode := softfloat_rounding_mode;
  607. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  608. increment := ord( sbits64(absZ1) < 0 );
  609. if ( roundNearestEven=0 ) then
  610. begin
  611. if ( roundingMode = float_round_to_zero ) then
  612. begin
  613. increment := 0;
  614. end
  615. else begin
  616. if ( zSign<>0 ) then
  617. begin
  618. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  619. end
  620. else begin
  621. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  622. end;
  623. end;
  624. end;
  625. if ( increment<>0 ) then
  626. begin
  627. inc(absZ0);
  628. if ( absZ0 = 0 ) then
  629. goto overflow;
  630. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  631. end;
  632. z := absZ0;
  633. if ( zSign<>0 ) then
  634. z := - z;
  635. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  636. begin
  637. overflow:
  638. float_raise( float_flag_invalid );
  639. if zSign<>0 then
  640. result:=int64($8000000000000000)
  641. else
  642. result:=int64($7FFFFFFFFFFFFFFF);
  643. exit;
  644. end;
  645. if ( absZ1<>0 ) then
  646. set_inexact_flag;
  647. result:=z;
  648. end;
  649. {*
  650. -------------------------------------------------------------------------------
  651. Shifts `a' right by the number of bits given in `count'. If any nonzero
  652. bits are shifted off, they are ``jammed'' into the least significant bit of
  653. the result by setting the least significant bit to 1. The value of `count'
  654. can be arbitrarily large; in particular, if `count' is greater than 32, the
  655. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  656. The result is stored in the location pointed to by `zPtr'.
  657. -------------------------------------------------------------------------------
  658. *}
  659. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  660. var
  661. z: Bits32;
  662. Begin
  663. if ( count = 0 ) then
  664. z := a
  665. else
  666. if ( count < 32 ) then
  667. Begin
  668. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  669. End
  670. else
  671. Begin
  672. z := bits32( a <> 0 );
  673. End;
  674. zPtr := z;
  675. End;
  676. {*----------------------------------------------------------------------------
  677. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  678. | number of bits given in `count'. Any bits shifted off are lost. The value
  679. | of `count' can be arbitrarily large; in particular, if `count' is greater
  680. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  681. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  682. *----------------------------------------------------------------------------*}
  683. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  684. var
  685. z0, z1: bits64;
  686. negCount: int8;
  687. begin
  688. negCount := ( - count ) and 63;
  689. if ( count = 0 ) then
  690. begin
  691. z1 := a1;
  692. z0 := a0;
  693. end
  694. else if ( count < 64 ) then
  695. begin
  696. z1 := ( a0 shl negCount ) or ( a1 shr count );
  697. z0 := a0 shr count;
  698. end
  699. else
  700. begin
  701. if ( count < 128 ) then
  702. z1 := a0 shr ( count and 63 )
  703. else
  704. z1 := 0;
  705. z0 := 0;
  706. end;
  707. z1Ptr := z1;
  708. z0Ptr := z0;
  709. end;
  710. {*----------------------------------------------------------------------------
  711. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  712. | number of bits given in `count'. If any nonzero bits are shifted off, they
  713. | are ``jammed'' into the least significant bit of the result by setting the
  714. | least significant bit to 1. The value of `count' can be arbitrarily large;
  715. | in particular, if `count' is greater than 128, the result will be either
  716. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  717. | nonzero. The result is broken into two 64-bit pieces which are stored at
  718. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  719. *----------------------------------------------------------------------------*}
  720. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  721. var
  722. z0,z1 : bits64;
  723. negCount : int8;
  724. begin
  725. negCount := ( - count ) and 63;
  726. if ( count = 0 ) then begin
  727. z1 := a1;
  728. z0 := a0;
  729. end
  730. else if ( count < 64 ) then begin
  731. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  732. z0 := a0 shr count;
  733. end
  734. else begin
  735. if ( count = 64 ) then begin
  736. z1 := a0 or ord( a1 <> 0 );
  737. end
  738. else if ( count < 128 ) then begin
  739. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  740. end
  741. else begin
  742. z1 := ord( ( a0 or a1 ) <> 0 );
  743. end;
  744. z0 := 0;
  745. end;
  746. z1Ptr := z1;
  747. z0Ptr := z0;
  748. end;
  749. {*
  750. -------------------------------------------------------------------------------
  751. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  752. number of bits given in `count'. Any bits shifted off are lost. The value
  753. of `count' can be arbitrarily large; in particular, if `count' is greater
  754. than 64, the result will be 0. The result is broken into two 32-bit pieces
  755. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  756. -------------------------------------------------------------------------------
  757. *}
  758. Procedure
  759. shift64Right(
  760. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  761. Var
  762. z0, z1: bits32;
  763. negCount : int8;
  764. Begin
  765. negCount := ( - count ) AND 31;
  766. if ( count = 0 ) then
  767. Begin
  768. z1 := a1;
  769. z0 := a0;
  770. End
  771. else if ( count < 32 ) then
  772. Begin
  773. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  774. z0 := a0 shr count;
  775. End
  776. else
  777. Begin
  778. if (count < 64) then
  779. z1 := ( a0 shr ( count AND 31 ) )
  780. else
  781. z1 := 0;
  782. z0 := 0;
  783. End;
  784. z1Ptr := z1;
  785. z0Ptr := z0;
  786. End;
  787. {*
  788. -------------------------------------------------------------------------------
  789. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  790. number of bits given in `count'. If any nonzero bits are shifted off, they
  791. are ``jammed'' into the least significant bit of the result by setting the
  792. least significant bit to 1. The value of `count' can be arbitrarily large;
  793. in particular, if `count' is greater than 64, the result will be either 0
  794. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  795. nonzero. The result is broken into two 32-bit pieces which are stored at
  796. the locations pointed to by `z0Ptr' and `z1Ptr'.
  797. -------------------------------------------------------------------------------
  798. *}
  799. Procedure
  800. shift64RightJamming(
  801. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  802. VAR
  803. z0, z1 : bits32;
  804. negCount : int8;
  805. Begin
  806. negCount := ( - count ) AND 31;
  807. if ( count = 0 ) then
  808. Begin
  809. z1 := a1;
  810. z0 := a0;
  811. End
  812. else
  813. if ( count < 32 ) then
  814. Begin
  815. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  816. z0 := a0 shr count;
  817. End
  818. else
  819. Begin
  820. if ( count = 32 ) then
  821. Begin
  822. z1 := a0 OR bits32( a1 <> 0 );
  823. End
  824. else
  825. if ( count < 64 ) Then
  826. Begin
  827. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  828. End
  829. else
  830. Begin
  831. z1 := bits32( ( a0 OR a1 ) <> 0 );
  832. End;
  833. z0 := 0;
  834. End;
  835. z1Ptr := z1;
  836. z0Ptr := z0;
  837. End;
  838. {*----------------------------------------------------------------------------
  839. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  840. | bits are shifted off, they are ``jammed'' into the least significant bit of
  841. | the result by setting the least significant bit to 1. The value of `count'
  842. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  843. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  844. | The result is stored in the location pointed to by `zPtr'.
  845. *----------------------------------------------------------------------------*}
  846. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  847. var
  848. z: bits64;
  849. begin
  850. if ( count = 0 ) then
  851. begin
  852. z := a;
  853. end
  854. else if ( count < 64 ) then
  855. begin
  856. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  857. end
  858. else
  859. begin
  860. z := ord( a <> 0 );
  861. end;
  862. zPtr := z;
  863. end;
  864. {$if not defined(shift64ExtraRightJamming)}
  865. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  866. overload;
  867. forward;
  868. {$endif}
  869. {*
  870. -------------------------------------------------------------------------------
  871. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  872. by 32 _plus_ the number of bits given in `count'. The shifted result is
  873. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  874. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  875. off form a third 32-bit result as follows: The _last_ bit shifted off is
  876. the most-significant bit of the extra result, and the other 31 bits of the
  877. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  878. were all zero. This extra result is stored in the location pointed to by
  879. `z2Ptr'. The value of `count' can be arbitrarily large.
  880. (This routine makes more sense if `a0', `a1', and `a2' are considered
  881. to form a fixed-point value with binary point between `a1' and `a2'. This
  882. fixed-point value is shifted right by the number of bits given in `count',
  883. and the integer part of the result is returned at the locations pointed to
  884. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  885. corrupted as described above, and is returned at the location pointed to by
  886. `z2Ptr'.)
  887. -------------------------------------------------------------------------------
  888. }
  889. Procedure
  890. shift64ExtraRightJamming(
  891. a0: bits32;
  892. a1: bits32;
  893. a2: bits32;
  894. count: int16;
  895. VAR z0Ptr: bits32;
  896. VAR z1Ptr: bits32;
  897. VAR z2Ptr: bits32
  898. ); overload;
  899. Var
  900. z0, z1, z2: bits32;
  901. negCount : int8;
  902. Begin
  903. negCount := ( - count ) AND 31;
  904. if ( count = 0 ) then
  905. Begin
  906. z2 := a2;
  907. z1 := a1;
  908. z0 := a0;
  909. End
  910. else
  911. Begin
  912. if ( count < 32 ) Then
  913. Begin
  914. z2 := a1 shl negCount;
  915. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  916. z0 := a0 shr count;
  917. End
  918. else
  919. Begin
  920. if ( count = 32 ) then
  921. Begin
  922. z2 := a1;
  923. z1 := a0;
  924. End
  925. else
  926. Begin
  927. a2 := a2 or a1;
  928. if ( count < 64 ) then
  929. Begin
  930. z2 := a0 shl negCount;
  931. z1 := a0 shr ( count AND 31 );
  932. End
  933. else
  934. Begin
  935. if count = 64 then
  936. z2 := a0
  937. else
  938. z2 := bits32(a0 <> 0);
  939. z1 := 0;
  940. End;
  941. End;
  942. z0 := 0;
  943. End;
  944. z2 := z2 or bits32( a2 <> 0 );
  945. End;
  946. z2Ptr := z2;
  947. z1Ptr := z1;
  948. z0Ptr := z0;
  949. End;
  950. {*
  951. -------------------------------------------------------------------------------
  952. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  953. number of bits given in `count'. Any bits shifted off are lost. The value
  954. of `count' must be less than 32. The result is broken into two 32-bit
  955. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  956. -------------------------------------------------------------------------------
  957. *}
  958. Procedure
  959. shortShift64Left(
  960. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  961. Begin
  962. z1Ptr := a1 shl count;
  963. if count = 0 then
  964. z0Ptr := a0
  965. else
  966. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  967. End;
  968. {*
  969. -------------------------------------------------------------------------------
  970. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  971. by the number of bits given in `count'. Any bits shifted off are lost.
  972. The value of `count' must be less than 32. The result is broken into three
  973. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  974. `z1Ptr', and `z2Ptr'.
  975. -------------------------------------------------------------------------------
  976. *}
  977. Procedure
  978. shortShift96Left(
  979. a0: bits32;
  980. a1: bits32;
  981. a2: bits32;
  982. count: int16;
  983. VAR z0Ptr: bits32;
  984. VAR z1Ptr: bits32;
  985. VAR z2Ptr: bits32
  986. );
  987. Var
  988. z0, z1, z2: bits32;
  989. negCount: int8;
  990. Begin
  991. z2 := a2 shl count;
  992. z1 := a1 shl count;
  993. z0 := a0 shl count;
  994. if ( 0 < count ) then
  995. Begin
  996. negCount := ( ( - count ) AND 31 );
  997. z1 := z1 or (a2 shr negCount);
  998. z0 := z0 or (a1 shr negCount);
  999. End;
  1000. z2Ptr := z2;
  1001. z1Ptr := z1;
  1002. z0Ptr := z0;
  1003. End;
  1004. {*----------------------------------------------------------------------------
  1005. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1006. | number of bits given in `count'. Any bits shifted off are lost. The value
  1007. | of `count' must be less than 64. The result is broken into two 64-bit
  1008. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1009. *----------------------------------------------------------------------------*}
  1010. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1011. begin
  1012. z1Ptr := a1 shl count;
  1013. if count=0 then
  1014. z0Ptr:=a0
  1015. else
  1016. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1017. end;
  1018. {*
  1019. -------------------------------------------------------------------------------
  1020. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1021. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1022. any carry out is lost. The result is broken into two 32-bit pieces which
  1023. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1024. -------------------------------------------------------------------------------
  1025. *}
  1026. Procedure
  1027. add64(
  1028. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  1029. Var
  1030. z1: bits32;
  1031. Begin
  1032. z1 := a1 + b1;
  1033. z1Ptr := z1;
  1034. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1035. End;
  1036. {*
  1037. -------------------------------------------------------------------------------
  1038. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1039. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1040. modulo 2^96, so any carry out is lost. The result is broken into three
  1041. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1042. `z1Ptr', and `z2Ptr'.
  1043. -------------------------------------------------------------------------------
  1044. *}
  1045. Procedure
  1046. add96(
  1047. a0: bits32;
  1048. a1: bits32;
  1049. a2: bits32;
  1050. b0: bits32;
  1051. b1: bits32;
  1052. b2: bits32;
  1053. VAR z0Ptr: bits32;
  1054. VAR z1Ptr: bits32;
  1055. VAR z2Ptr: bits32
  1056. );
  1057. var
  1058. z0, z1, z2: bits32;
  1059. carry0, carry1: int8;
  1060. Begin
  1061. z2 := a2 + b2;
  1062. carry1 := int8( z2 < a2 );
  1063. z1 := a1 + b1;
  1064. carry0 := int8( z1 < a1 );
  1065. z0 := a0 + b0;
  1066. z1 := z1 + carry1;
  1067. z0 := z0 + bits32( z1 < carry1 );
  1068. z0 := z0 + carry0;
  1069. z2Ptr := z2;
  1070. z1Ptr := z1;
  1071. z0Ptr := z0;
  1072. End;
  1073. {*----------------------------------------------------------------------------
  1074. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1075. | by the number of bits given in `count'. Any bits shifted off are lost.
  1076. | The value of `count' must be less than 64. The result is broken into three
  1077. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1078. | `z1Ptr', and `z2Ptr'.
  1079. *----------------------------------------------------------------------------*}
  1080. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1081. var
  1082. z0, z1, z2 : bits64;
  1083. negCount : int8;
  1084. begin
  1085. z2 := a2 shl count;
  1086. z1 := a1 shl count;
  1087. z0 := a0 shl count;
  1088. if ( 0 < count ) then
  1089. begin
  1090. negCount := ( ( - count ) and 63 );
  1091. z1 := z1 or (a2 shr negCount);
  1092. z0 := z0 or (a1 shr negCount);
  1093. end;
  1094. z2Ptr := z2;
  1095. z1Ptr := z1;
  1096. z0Ptr := z0;
  1097. end;
  1098. {*----------------------------------------------------------------------------
  1099. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1100. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1101. | any carry out is lost. The result is broken into two 64-bit pieces which
  1102. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1103. *----------------------------------------------------------------------------*}
  1104. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1105. var
  1106. z1 : bits64;
  1107. begin
  1108. z1 := a1 + b1;
  1109. z1Ptr := z1;
  1110. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1111. end;
  1112. {*----------------------------------------------------------------------------
  1113. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1114. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1115. | modulo 2^192, so any carry out is lost. The result is broken into three
  1116. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1117. | `z1Ptr', and `z2Ptr'.
  1118. *----------------------------------------------------------------------------*}
  1119. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1120. var
  1121. z0, z1, z2 : bits64;
  1122. carry0, carry1 : int8;
  1123. begin
  1124. z2 := a2 + b2;
  1125. carry1 := ord( z2 < a2 );
  1126. z1 := a1 + b1;
  1127. carry0 := ord( z1 < a1 );
  1128. z0 := a0 + b0;
  1129. inc(z1, carry1);
  1130. inc(z0, ord( z1 < carry1 ));
  1131. inc(z0, carry0);
  1132. z2Ptr := z2;
  1133. z1Ptr := z1;
  1134. z0Ptr := z0;
  1135. end;
  1136. {*
  1137. -------------------------------------------------------------------------------
  1138. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1139. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1140. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1141. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1142. `z1Ptr'.
  1143. -------------------------------------------------------------------------------
  1144. *}
  1145. Procedure
  1146. sub64(
  1147. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1148. Begin
  1149. z1Ptr := a1 - b1;
  1150. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1151. End;
  1152. {*
  1153. -------------------------------------------------------------------------------
  1154. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1155. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1156. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1157. into three 32-bit pieces which are stored at the locations pointed to by
  1158. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1159. -------------------------------------------------------------------------------
  1160. *}
  1161. Procedure
  1162. sub96(
  1163. a0:bits32;
  1164. a1:bits32;
  1165. a2:bits32;
  1166. b0:bits32;
  1167. b1:bits32;
  1168. b2:bits32;
  1169. VAR z0Ptr:bits32;
  1170. VAR z1Ptr:bits32;
  1171. VAR z2Ptr:bits32
  1172. );
  1173. Var
  1174. z0, z1, z2: bits32;
  1175. borrow0, borrow1: int8;
  1176. Begin
  1177. z2 := a2 - b2;
  1178. borrow1 := int8( a2 < b2 );
  1179. z1 := a1 - b1;
  1180. borrow0 := int8( a1 < b1 );
  1181. z0 := a0 - b0;
  1182. z0 := z0 - bits32( z1 < borrow1 );
  1183. z1 := z1 - borrow1;
  1184. z0 := z0 -borrow0;
  1185. z2Ptr := z2;
  1186. z1Ptr := z1;
  1187. z0Ptr := z0;
  1188. End;
  1189. {*----------------------------------------------------------------------------
  1190. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1191. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1192. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1193. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1194. | `z1Ptr'.
  1195. *----------------------------------------------------------------------------*}
  1196. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1197. begin
  1198. z1Ptr := a1 - b1;
  1199. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1200. end;
  1201. {*----------------------------------------------------------------------------
  1202. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1203. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1204. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1205. | result is broken into three 64-bit pieces which are stored at the locations
  1206. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1207. *----------------------------------------------------------------------------*}
  1208. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1209. var
  1210. z0, z1, z2 : bits64;
  1211. borrow0, borrow1 : int8;
  1212. begin
  1213. z2 := a2 - b2;
  1214. borrow1 := ord( a2 < b2 );
  1215. z1 := a1 - b1;
  1216. borrow0 := ord( a1 < b1 );
  1217. z0 := a0 - b0;
  1218. dec(z0, ord( z1 < borrow1 ));
  1219. dec(z1, borrow1);
  1220. dec(z0, borrow0);
  1221. z2Ptr := z2;
  1222. z1Ptr := z1;
  1223. z0Ptr := z0;
  1224. end;
  1225. {*
  1226. -------------------------------------------------------------------------------
  1227. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1228. into two 32-bit pieces which are stored at the locations pointed to by
  1229. `z0Ptr' and `z1Ptr'.
  1230. -------------------------------------------------------------------------------
  1231. *}
  1232. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1233. :bits32 );
  1234. Var
  1235. aHigh, aLow, bHigh, bLow: bits16;
  1236. z0, zMiddleA, zMiddleB, z1: bits32;
  1237. Begin
  1238. aLow := a and $ffff;
  1239. aHigh := a shr 16;
  1240. bLow := b and $ffff;
  1241. bHigh := b shr 16;
  1242. z1 := ( bits32( aLow) ) * bLow;
  1243. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1244. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1245. z0 := ( bits32 (aHigh) ) * bHigh;
  1246. zMiddleA := zMiddleA + zMiddleB;
  1247. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1248. zMiddleA := zmiddleA shl 16;
  1249. z1 := z1 + zMiddleA;
  1250. z0 := z0 + bits32( z1 < zMiddleA );
  1251. z1Ptr := z1;
  1252. z0Ptr := z0;
  1253. End;
  1254. {*
  1255. -------------------------------------------------------------------------------
  1256. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1257. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1258. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1259. `z2Ptr'.
  1260. -------------------------------------------------------------------------------
  1261. *}
  1262. Procedure
  1263. mul64By32To96(
  1264. a0:bits32;
  1265. a1:bits32;
  1266. b:bits32;
  1267. VAR z0Ptr:bits32;
  1268. VAR z1Ptr:bits32;
  1269. VAR z2Ptr:bits32
  1270. );
  1271. Var
  1272. z0, z1, z2, more1: bits32;
  1273. Begin
  1274. mul32To64( a1, b, z1, z2 );
  1275. mul32To64( a0, b, z0, more1 );
  1276. add64( z0, more1, 0, z1, z0, z1 );
  1277. z2Ptr := z2;
  1278. z1Ptr := z1;
  1279. z0Ptr := z0;
  1280. End;
  1281. {*
  1282. -------------------------------------------------------------------------------
  1283. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1284. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1285. product. The product is broken into four 32-bit pieces which are stored at
  1286. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1287. -------------------------------------------------------------------------------
  1288. *}
  1289. Procedure
  1290. mul64To128(
  1291. a0:bits32;
  1292. a1:bits32;
  1293. b0:bits32;
  1294. b1:bits32;
  1295. VAR z0Ptr:bits32;
  1296. VAR z1Ptr:bits32;
  1297. VAR z2Ptr:bits32;
  1298. VAR z3Ptr:bits32
  1299. );
  1300. Var
  1301. z0, z1, z2, z3: bits32;
  1302. more1, more2: bits32;
  1303. Begin
  1304. mul32To64( a1, b1, z2, z3 );
  1305. mul32To64( a1, b0, z1, more2 );
  1306. add64( z1, more2, 0, z2, z1, z2 );
  1307. mul32To64( a0, b0, z0, more1 );
  1308. add64( z0, more1, 0, z1, z0, z1 );
  1309. mul32To64( a0, b1, more1, more2 );
  1310. add64( more1, more2, 0, z2, more1, z2 );
  1311. add64( z0, z1, 0, more1, z0, z1 );
  1312. z3Ptr := z3;
  1313. z2Ptr := z2;
  1314. z1Ptr := z1;
  1315. z0Ptr := z0;
  1316. End;
  1317. {*----------------------------------------------------------------------------
  1318. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1319. | into two 64-bit pieces which are stored at the locations pointed to by
  1320. | `z0Ptr' and `z1Ptr'.
  1321. *----------------------------------------------------------------------------*}
  1322. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1323. var
  1324. aHigh, aLow, bHigh, bLow : bits32;
  1325. z0, zMiddleA, zMiddleB, z1 : bits64;
  1326. begin
  1327. aLow := a;
  1328. aHigh := a shr 32;
  1329. bLow := b;
  1330. bHigh := b shr 32;
  1331. z1 := ( bits64(aLow) ) * bLow;
  1332. zMiddleA := ( bits64( aLow )) * bHigh;
  1333. zMiddleB := ( bits64( aHigh )) * bLow;
  1334. z0 := ( bits64(aHigh) ) * bHigh;
  1335. inc(zMiddleA, zMiddleB);
  1336. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1337. zMiddleA := zMiddleA shl 32;
  1338. inc(z1, zMiddleA);
  1339. inc(z0, ord( z1 < zMiddleA ));
  1340. z1Ptr := z1;
  1341. z0Ptr := z0;
  1342. end;
  1343. {*----------------------------------------------------------------------------
  1344. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1345. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1346. | product. The product is broken into four 64-bit pieces which are stored at
  1347. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1348. *----------------------------------------------------------------------------*}
  1349. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1350. var
  1351. z0,z1,z2,z3,more1,more2 : bits64;
  1352. begin
  1353. mul64To128( a1, b1, z2, z3 );
  1354. mul64To128( a1, b0, z1, more2 );
  1355. add128( z1, more2, 0, z2, z1, z2 );
  1356. mul64To128( a0, b0, z0, more1 );
  1357. add128( z0, more1, 0, z1, z0, z1 );
  1358. mul64To128( a0, b1, more1, more2 );
  1359. add128( more1, more2, 0, z2, more1, z2 );
  1360. add128( z0, z1, 0, more1, z0, z1 );
  1361. z3Ptr := z3;
  1362. z2Ptr := z2;
  1363. z1Ptr := z1;
  1364. z0Ptr := z0;
  1365. end;
  1366. {*----------------------------------------------------------------------------
  1367. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1368. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1369. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1370. | `z2Ptr'.
  1371. *----------------------------------------------------------------------------*}
  1372. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1373. var
  1374. z0, z1, z2, more1 : bits64;
  1375. begin
  1376. mul64To128( a1, b, z1, z2 );
  1377. mul64To128( a0, b, z0, more1 );
  1378. add128( z0, more1, 0, z1, z0, z1 );
  1379. z2Ptr := z2;
  1380. z1Ptr := z1;
  1381. z0Ptr := z0;
  1382. end;
  1383. {*----------------------------------------------------------------------------
  1384. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1385. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1386. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1387. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1388. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1389. | unsigned integer is returned.
  1390. *----------------------------------------------------------------------------*}
  1391. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1392. var
  1393. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1394. begin
  1395. if ( b <= a0 ) then
  1396. begin
  1397. result:=qword( $FFFFFFFFFFFFFFFF );
  1398. exit;
  1399. end;
  1400. b0 := b shr 32;
  1401. if ( b0 shl 32 <= a0 ) then
  1402. z:=qword( $FFFFFFFF00000000 )
  1403. else
  1404. z:=( a0 div b0 ) shl 32;
  1405. mul64To128( b, z, term0, term1 );
  1406. sub128( a0, a1, term0, term1, rem0, rem1 );
  1407. while ( ( sbits64(rem0) ) < 0 ) do begin
  1408. dec(z,qword( $100000000 ));
  1409. b1 := b shl 32;
  1410. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1411. end;
  1412. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1413. if ( b0 shl 32 <= rem0 ) then
  1414. z:=z or $FFFFFFFF
  1415. else
  1416. z:=z or rem0 div b0;
  1417. result:=z;
  1418. end;
  1419. {*
  1420. -------------------------------------------------------------------------------
  1421. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1422. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1423. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1424. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1425. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1426. unsigned integer is returned.
  1427. -------------------------------------------------------------------------------
  1428. *}
  1429. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1430. Var
  1431. b0, b1: bits32;
  1432. rem0, rem1, term0, term1: bits32;
  1433. z: bits32;
  1434. Begin
  1435. if ( b <= a0 ) then
  1436. Begin
  1437. estimateDiv64To32 := $FFFFFFFF;
  1438. exit;
  1439. End;
  1440. b0 := b shr 16;
  1441. if ( b0 shl 16 <= a0 ) then
  1442. z:= $FFFF0000
  1443. else
  1444. z:= ( a0 div b0 ) shl 16;
  1445. mul32To64( b, z, term0, term1 );
  1446. sub64( a0, a1, term0, term1, rem0, rem1 );
  1447. while ( ( sbits32 (rem0) ) < 0 ) do
  1448. Begin
  1449. z := z - $10000;
  1450. b1 := b shl 16;
  1451. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1452. End;
  1453. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1454. if ( b0 shl 16 <= rem0 ) then
  1455. z := z or $FFFF
  1456. else
  1457. z := z or (rem0 div b0);
  1458. estimateDiv64To32 := z;
  1459. End;
  1460. {*
  1461. -------------------------------------------------------------------------------
  1462. Returns an approximation to the square root of the 32-bit significand given
  1463. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1464. `aExp' (the least significant bit) is 1, the integer returned approximates
  1465. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1466. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1467. case, the approximation returned lies strictly within +/-2 of the exact
  1468. value.
  1469. -------------------------------------------------------------------------------
  1470. *}
  1471. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1472. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1473. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1474. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1475. );
  1476. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1477. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1478. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1479. );
  1480. Var
  1481. index: int8;
  1482. z: bits32;
  1483. Begin
  1484. index := ( a shr 27 ) AND 15;
  1485. if ( aExp AND 1 ) <> 0 then
  1486. Begin
  1487. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1488. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1489. a := a shr 1;
  1490. End
  1491. else
  1492. Begin
  1493. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1494. z := a div z + z;
  1495. if ( $20000 <= z ) then
  1496. z := $FFFF8000
  1497. else
  1498. z := ( z shl 15 );
  1499. if ( z <= a ) then
  1500. Begin
  1501. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1502. exit;
  1503. End;
  1504. End;
  1505. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1506. End;
  1507. {*
  1508. -------------------------------------------------------------------------------
  1509. Returns the number of leading 0 bits before the most-significant 1 bit of
  1510. `a'. If `a' is zero, 32 is returned.
  1511. -------------------------------------------------------------------------------
  1512. *}
  1513. Function countLeadingZeros32( a:bits32 ): int8;
  1514. const countLeadingZerosHigh:array[0..255] of int8 = (
  1515. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1516. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1517. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1518. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1519. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1520. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1521. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1522. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1523. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1524. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1525. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1526. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1527. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1528. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1529. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1530. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1531. );
  1532. Var
  1533. shiftCount: int8;
  1534. Begin
  1535. shiftCount := 0;
  1536. if ( a < $10000 ) then
  1537. Begin
  1538. shiftCount := shiftcount + 16;
  1539. a := a shl 16;
  1540. End;
  1541. if ( a < $1000000 ) then
  1542. Begin
  1543. shiftCount := shiftcount + 8;
  1544. a := a shl 8;
  1545. end;
  1546. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1547. countLeadingZeros32:= shiftCount;
  1548. End;
  1549. {*----------------------------------------------------------------------------
  1550. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1551. | `a'. If `a' is zero, 64 is returned.
  1552. *----------------------------------------------------------------------------*}
  1553. function countLeadingZeros64( a : bits64): int8;
  1554. var
  1555. shiftcount : int8;
  1556. Begin
  1557. shiftCount := 0;
  1558. if ( a < bits64(bits64(1) shl 32 )) then
  1559. shiftCount := shiftcount + 32
  1560. else
  1561. a := a shr 32;
  1562. shiftCount := shiftCount + countLeadingZeros32( a );
  1563. countLeadingZeros64:= shiftCount;
  1564. End;
  1565. {*
  1566. -------------------------------------------------------------------------------
  1567. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1568. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1569. Otherwise, returns 0.
  1570. -------------------------------------------------------------------------------
  1571. *}
  1572. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1573. Begin
  1574. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1575. End;
  1576. {*
  1577. -------------------------------------------------------------------------------
  1578. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1579. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1580. returns 0.
  1581. -------------------------------------------------------------------------------
  1582. *}
  1583. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1584. Begin
  1585. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1586. End;
  1587. const
  1588. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1589. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1590. (*****************************************************************************)
  1591. (* End Low-Level arithmetic *)
  1592. (*****************************************************************************)
  1593. {*
  1594. -------------------------------------------------------------------------------
  1595. Functions and definitions to determine: (1) whether tininess for underflow
  1596. is detected before or after rounding by default, (2) what (if anything)
  1597. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1598. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1599. are propagated from function inputs to output. These details are ENDIAN
  1600. specific
  1601. -------------------------------------------------------------------------------
  1602. *}
  1603. {$IFDEF ENDIAN_LITTLE}
  1604. {*
  1605. -------------------------------------------------------------------------------
  1606. Internal canonical NaN format.
  1607. -------------------------------------------------------------------------------
  1608. *}
  1609. TYPE
  1610. commonNaNT = record
  1611. high, low : bits32;
  1612. sign: flag;
  1613. end;
  1614. {*
  1615. -------------------------------------------------------------------------------
  1616. The pattern for a default generated single-precision NaN.
  1617. -------------------------------------------------------------------------------
  1618. *}
  1619. const float32_default_nan = $FFC00000;
  1620. {*
  1621. -------------------------------------------------------------------------------
  1622. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1623. otherwise returns 0.
  1624. -------------------------------------------------------------------------------
  1625. *}
  1626. Function float32_is_nan( a : float32 ): flag;
  1627. Begin
  1628. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1629. End;
  1630. {*
  1631. -------------------------------------------------------------------------------
  1632. Returns 1 if the single-precision floating-point value `a' is a signaling
  1633. NaN; otherwise returns 0.
  1634. -------------------------------------------------------------------------------
  1635. *}
  1636. Function float32_is_signaling_nan( a : float32 ): flag;
  1637. Begin
  1638. float32_is_signaling_nan := flag
  1639. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1640. End;
  1641. {*
  1642. -------------------------------------------------------------------------------
  1643. Returns the result of converting the single-precision floating-point NaN
  1644. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1645. exception is raised.
  1646. -------------------------------------------------------------------------------
  1647. *}
  1648. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1649. var
  1650. z : commonNaNT ;
  1651. Begin
  1652. if ( float32_is_signaling_nan( a ) <> 0) then
  1653. float_raise( float_flag_invalid );
  1654. z.sign := a shr 31;
  1655. z.low := 0;
  1656. z.high := a shl 9;
  1657. c := z;
  1658. End;
  1659. {*
  1660. -------------------------------------------------------------------------------
  1661. Returns the result of converting the canonical NaN `a' to the single-
  1662. precision floating-point format.
  1663. -------------------------------------------------------------------------------
  1664. *}
  1665. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1666. Begin
  1667. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1668. End;
  1669. {*
  1670. -------------------------------------------------------------------------------
  1671. Takes two single-precision floating-point values `a' and `b', one of which
  1672. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1673. signaling NaN, the invalid exception is raised.
  1674. -------------------------------------------------------------------------------
  1675. *}
  1676. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1677. Var
  1678. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1679. label returnLargerSignificand;
  1680. Begin
  1681. aIsNaN := float32_is_nan( a );
  1682. aIsSignalingNaN := float32_is_signaling_nan( a );
  1683. bIsNaN := float32_is_nan( b );
  1684. bIsSignalingNaN := float32_is_signaling_nan( b );
  1685. a := a or $00400000;
  1686. b := b or $00400000;
  1687. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1688. float_raise( float_flag_invalid );
  1689. if ( aIsSignalingNaN )<> 0 then
  1690. Begin
  1691. if ( bIsSignalingNaN ) <> 0 then
  1692. goto returnLargerSignificand;
  1693. if bIsNan <> 0 then
  1694. propagateFloat32NaN := b
  1695. else
  1696. propagateFloat32NaN := a;
  1697. exit;
  1698. End
  1699. else if ( aIsNaN <> 0) then
  1700. Begin
  1701. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1702. Begin
  1703. propagateFloat32NaN := a;
  1704. exit;
  1705. End;
  1706. returnLargerSignificand:
  1707. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1708. Begin
  1709. propagateFloat32NaN := b;
  1710. exit;
  1711. End;
  1712. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1713. Begin
  1714. propagateFloat32NaN := a;
  1715. End;
  1716. if a < b then
  1717. propagateFloat32NaN := a
  1718. else
  1719. propagateFloat32NaN := b;
  1720. exit;
  1721. End
  1722. else
  1723. Begin
  1724. propagateFloat32NaN := b;
  1725. exit;
  1726. End;
  1727. End;
  1728. {*
  1729. -------------------------------------------------------------------------------
  1730. The pattern for a default generated double-precision NaN. The `high' and
  1731. `low' values hold the most- and least-significant bits, respectively.
  1732. -------------------------------------------------------------------------------
  1733. *}
  1734. const
  1735. float64_default_nan_high = $FFF80000;
  1736. float64_default_nan_low = $00000000;
  1737. {*
  1738. -------------------------------------------------------------------------------
  1739. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1740. otherwise returns 0.
  1741. -------------------------------------------------------------------------------
  1742. *}
  1743. Function float64_is_nan( a : float64 ) : flag;
  1744. Begin
  1745. float64_is_nan :=
  1746. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1747. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1748. End;
  1749. {*
  1750. -------------------------------------------------------------------------------
  1751. Returns 1 if the double-precision floating-point value `a' is a signaling
  1752. NaN; otherwise returns 0.
  1753. -------------------------------------------------------------------------------
  1754. *}
  1755. Function float64_is_signaling_nan( a : float64 ): flag;
  1756. Begin
  1757. float64_is_signaling_nan :=
  1758. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1759. and ( a.low or ( a.high and $0007FFFF ) );
  1760. End;
  1761. {*
  1762. -------------------------------------------------------------------------------
  1763. Returns the result of converting the double-precision floating-point NaN
  1764. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1765. exception is raised.
  1766. -------------------------------------------------------------------------------
  1767. *}
  1768. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1769. Var
  1770. z : commonNaNT;
  1771. Begin
  1772. if ( float64_is_signaling_nan( a )<>0 ) then
  1773. float_raise( float_flag_invalid );
  1774. z.sign := a.high shr 31;
  1775. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1776. c := z;
  1777. End;
  1778. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1779. Var
  1780. z : commonNaNT;
  1781. Begin
  1782. if ( float64_is_signaling_nan( a )<>0 ) then
  1783. float_raise( float_flag_invalid );
  1784. z.sign := a.high shr 31;
  1785. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1786. result := z;
  1787. End;
  1788. {*
  1789. -------------------------------------------------------------------------------
  1790. Returns the result of converting the canonical NaN `a' to the double-
  1791. precision floating-point format.
  1792. -------------------------------------------------------------------------------
  1793. *}
  1794. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1795. Var
  1796. z: float64;
  1797. Begin
  1798. shift64Right( a.high, a.low, 12, z.high, z.low );
  1799. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1800. c := z;
  1801. End;
  1802. {*
  1803. -------------------------------------------------------------------------------
  1804. Takes two double-precision floating-point values `a' and `b', one of which
  1805. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1806. signaling NaN, the invalid exception is raised.
  1807. -------------------------------------------------------------------------------
  1808. *}
  1809. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1810. Var
  1811. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1812. label returnLargerSignificand;
  1813. Begin
  1814. aIsNaN := float64_is_nan( a );
  1815. aIsSignalingNaN := float64_is_signaling_nan( a );
  1816. bIsNaN := float64_is_nan( b );
  1817. bIsSignalingNaN := float64_is_signaling_nan( b );
  1818. a.high := a.high or $00080000;
  1819. b.high := b.high or $00080000;
  1820. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1821. float_raise( float_flag_invalid );
  1822. if ( aIsSignalingNaN )<>0 then
  1823. Begin
  1824. if ( bIsSignalingNaN )<>0 then
  1825. goto returnLargerSignificand;
  1826. if bIsNan <> 0 then
  1827. c := b
  1828. else
  1829. c := a;
  1830. exit;
  1831. End
  1832. else if ( aIsNaN )<> 0 then
  1833. Begin
  1834. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1835. Begin
  1836. c := a;
  1837. exit;
  1838. End;
  1839. returnLargerSignificand:
  1840. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1841. Begin
  1842. c := b;
  1843. exit;
  1844. End;
  1845. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1846. Begin
  1847. c := a;
  1848. exit;
  1849. End;
  1850. if a.high < b.high then
  1851. c := a
  1852. else
  1853. c := b;
  1854. exit;
  1855. End
  1856. else
  1857. Begin
  1858. c := b;
  1859. exit;
  1860. End;
  1861. End;
  1862. {*----------------------------------------------------------------------------
  1863. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1864. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1865. | returns 0.
  1866. *----------------------------------------------------------------------------*}
  1867. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1868. begin
  1869. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1870. end;
  1871. {*----------------------------------------------------------------------------
  1872. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1873. | otherwise returns 0.
  1874. *----------------------------------------------------------------------------*}
  1875. function float128_is_nan( a : float128): flag;
  1876. begin
  1877. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1878. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1879. end;
  1880. {*----------------------------------------------------------------------------
  1881. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1882. | signaling NaN; otherwise returns 0.
  1883. *----------------------------------------------------------------------------*}
  1884. function float128_is_signaling_nan( a : float128): flag;
  1885. begin
  1886. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1887. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1888. end;
  1889. {*----------------------------------------------------------------------------
  1890. | Returns the result of converting the quadruple-precision floating-point NaN
  1891. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1892. | exception is raised.
  1893. *----------------------------------------------------------------------------*}
  1894. function float128ToCommonNaN( a : float128): commonNaNT;
  1895. var
  1896. z: commonNaNT;
  1897. qhigh,qlow : qword;
  1898. begin
  1899. if ( float128_is_signaling_nan( a )<>0) then
  1900. float_raise( float_flag_invalid );
  1901. z.sign := a.high shr 63;
  1902. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1903. z.high:=qhigh shr 32;
  1904. z.low:=qhigh and $ffffffff;
  1905. result:=z;
  1906. end;
  1907. {*----------------------------------------------------------------------------
  1908. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1909. | precision floating-point format.
  1910. *----------------------------------------------------------------------------*}
  1911. function commonNaNToFloat128( a : commonNaNT): float128;
  1912. var
  1913. z: float128;
  1914. begin
  1915. shift128Right( a.high, a.low, 16, z.high, z.low );
  1916. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1917. result:=z;
  1918. end;
  1919. {*----------------------------------------------------------------------------
  1920. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1921. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1922. | `b' is a signaling NaN, the invalid exception is raised.
  1923. *----------------------------------------------------------------------------*}
  1924. function propagateFloat128NaN( a: float128; b : float128): float128;
  1925. var
  1926. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1927. label
  1928. returnLargerSignificand;
  1929. begin
  1930. aIsNaN := float128_is_nan( a );
  1931. aIsSignalingNaN := float128_is_signaling_nan( a );
  1932. bIsNaN := float128_is_nan( b );
  1933. bIsSignalingNaN := float128_is_signaling_nan( b );
  1934. a.high := a.high or int64( $0000800000000000 );
  1935. b.high := b.high or int64( $0000800000000000 );
  1936. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1937. float_raise( float_flag_invalid );
  1938. if ( aIsSignalingNaN )<>0 then
  1939. begin
  1940. if ( bIsSignalingNaN )<>0 then
  1941. goto returnLargerSignificand;
  1942. if bIsNaN<>0 then
  1943. result := b
  1944. else
  1945. result := a;
  1946. exit;
  1947. end
  1948. else if ( aIsNaN )<>0 then
  1949. begin
  1950. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1951. begin
  1952. result := a;
  1953. exit;
  1954. end;
  1955. returnLargerSignificand:
  1956. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1957. begin
  1958. result := b;
  1959. exit;
  1960. end;
  1961. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1962. begin
  1963. result := a;
  1964. exit
  1965. end;
  1966. if ( a.high < b.high ) then
  1967. result := a
  1968. else
  1969. result := b;
  1970. exit;
  1971. end
  1972. else
  1973. result:=b;
  1974. end;
  1975. {$ELSE}
  1976. { Big endian code }
  1977. (*----------------------------------------------------------------------------
  1978. | Internal canonical NaN format.
  1979. *----------------------------------------------------------------------------*)
  1980. type
  1981. commonNANT = record
  1982. high, low : bits32;
  1983. sign : flag;
  1984. end;
  1985. (*----------------------------------------------------------------------------
  1986. | The pattern for a default generated single-precision NaN.
  1987. *----------------------------------------------------------------------------*)
  1988. const float32_default_nan = $7FFFFFFF;
  1989. (*----------------------------------------------------------------------------
  1990. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1991. | otherwise returns 0.
  1992. *----------------------------------------------------------------------------*)
  1993. function float32_is_nan(a: float32): flag;
  1994. begin
  1995. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1996. end;
  1997. (*----------------------------------------------------------------------------
  1998. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1999. | NaN; otherwise returns 0.
  2000. *----------------------------------------------------------------------------*)
  2001. function float32_is_signaling_nan(a: float32):flag;
  2002. begin
  2003. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2004. end;
  2005. (*----------------------------------------------------------------------------
  2006. | Returns the result of converting the single-precision floating-point NaN
  2007. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2008. | exception is raised.
  2009. *----------------------------------------------------------------------------*)
  2010. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  2011. var
  2012. z: commonNANT;
  2013. begin
  2014. if float32_is_signaling_nan(a)<>0 then
  2015. float_raise(float_flag_invalid);
  2016. z.sign := a shr 31;
  2017. z.low := 0;
  2018. z.high := a shl 9;
  2019. c:=z;
  2020. end;
  2021. (*----------------------------------------------------------------------------
  2022. | Returns the result of converting the canonical NaN `a' to the single-
  2023. | precision floating-point format.
  2024. *----------------------------------------------------------------------------*)
  2025. function CommonNanToFloat32(a : CommonNaNT): float32;
  2026. begin
  2027. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2028. end;
  2029. (*----------------------------------------------------------------------------
  2030. | Takes two single-precision floating-point values `a' and `b', one of which
  2031. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2032. | signaling NaN, the invalid exception is raised.
  2033. *----------------------------------------------------------------------------*)
  2034. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2035. var
  2036. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2037. begin
  2038. aIsNaN := float32_is_nan( a );
  2039. aIsSignalingNaN := float32_is_signaling_nan( a );
  2040. bIsNaN := float32_is_nan( b );
  2041. bIsSignalingNaN := float32_is_signaling_nan( b );
  2042. a := a or $00400000;
  2043. b := b or $00400000;
  2044. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2045. float_raise( float_flag_invalid );
  2046. if bIsSignalingNaN<>0 then
  2047. propagateFloat32Nan := b
  2048. else if aIsSignalingNan<>0 then
  2049. propagateFloat32Nan := a
  2050. else if bIsNan<>0 then
  2051. propagateFloat32Nan := b
  2052. else
  2053. propagateFloat32Nan := a;
  2054. end;
  2055. (*----------------------------------------------------------------------------
  2056. | The pattern for a default generated double-precision NaN. The `high' and
  2057. | `low' values hold the most- and least-significant bits, respectively.
  2058. *----------------------------------------------------------------------------*)
  2059. const
  2060. float64_default_nan_high = $7FFFFFFF;
  2061. float64_default_nan_low = $FFFFFFFF;
  2062. (*----------------------------------------------------------------------------
  2063. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2064. | otherwise returns 0.
  2065. *----------------------------------------------------------------------------*)
  2066. function float64_is_nan(a: float64): flag;
  2067. begin
  2068. float64_is_nan := flag (
  2069. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2070. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2071. end;
  2072. (*----------------------------------------------------------------------------
  2073. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2074. | NaN; otherwise returns 0.
  2075. *----------------------------------------------------------------------------*)
  2076. function float64_is_signaling_nan( a:float64): flag;
  2077. begin
  2078. float64_is_signaling_nan := flag(
  2079. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2080. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2081. end;
  2082. (*----------------------------------------------------------------------------
  2083. | Returns the result of converting the double-precision floating-point NaN
  2084. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2085. | exception is raised.
  2086. *----------------------------------------------------------------------------*)
  2087. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2088. var
  2089. z : commonNaNT;
  2090. begin
  2091. if ( float64_is_signaling_nan( a )<>0 ) then
  2092. float_raise( float_flag_invalid );
  2093. z.sign := a.high shr 31;
  2094. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2095. c:=z;
  2096. end;
  2097. (*----------------------------------------------------------------------------
  2098. | Returns the result of converting the canonical NaN `a' to the double-
  2099. | precision floating-point format.
  2100. *----------------------------------------------------------------------------*)
  2101. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2102. var
  2103. z: float64;
  2104. begin
  2105. shift64Right( a.high, a.low, 12, z.high, z.low );
  2106. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2107. c:=z;
  2108. end;
  2109. (*----------------------------------------------------------------------------
  2110. | Takes two double-precision floating-point values `a' and `b', one of which
  2111. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2112. | signaling NaN, the invalid exception is raised.
  2113. *----------------------------------------------------------------------------*)
  2114. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2115. var
  2116. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2117. begin
  2118. aIsNaN := float64_is_nan( a );
  2119. aIsSignalingNaN := float64_is_signaling_nan( a );
  2120. bIsNaN := float64_is_nan( b );
  2121. bIsSignalingNaN := float64_is_signaling_nan( b );
  2122. a.high := a.high or $00080000;
  2123. b.high := b.high or $00080000;
  2124. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2125. float_raise( float_flag_invalid );
  2126. if bIsSignalingNaN<>0 then
  2127. c := b
  2128. else if aIsSignalingNan<>0 then
  2129. c := a
  2130. else if bIsNan<>0 then
  2131. c := b
  2132. else
  2133. c := a;
  2134. end;
  2135. {$ENDIF}
  2136. (****************************************************************************)
  2137. (* END ENDIAN SPECIFIC CODE *)
  2138. (****************************************************************************)
  2139. {*
  2140. -------------------------------------------------------------------------------
  2141. Returns the fraction bits of the single-precision floating-point value `a'.
  2142. -------------------------------------------------------------------------------
  2143. *}
  2144. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2145. Begin
  2146. ExtractFloat32Frac := A AND $007FFFFF;
  2147. End;
  2148. {*
  2149. -------------------------------------------------------------------------------
  2150. Returns the exponent bits of the single-precision floating-point value `a'.
  2151. -------------------------------------------------------------------------------
  2152. *}
  2153. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2154. Begin
  2155. extractFloat32Exp := (a shr 23) AND $FF;
  2156. End;
  2157. {*
  2158. -------------------------------------------------------------------------------
  2159. Returns the sign bit of the single-precision floating-point value `a'.
  2160. -------------------------------------------------------------------------------
  2161. *}
  2162. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2163. Begin
  2164. extractFloat32Sign := a shr 31;
  2165. End;
  2166. {*
  2167. -------------------------------------------------------------------------------
  2168. Normalizes the subnormal single-precision floating-point value represented
  2169. by the denormalized significand `aSig'. The normalized exponent and
  2170. significand are stored at the locations pointed to by `zExpPtr' and
  2171. `zSigPtr', respectively.
  2172. -------------------------------------------------------------------------------
  2173. *}
  2174. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2175. Var
  2176. ShiftCount : BYTE;
  2177. Begin
  2178. shiftCount := countLeadingZeros32( aSig ) - 8;
  2179. zSigPtr := aSig shl shiftCount;
  2180. zExpPtr := 1 - shiftCount;
  2181. End;
  2182. {*
  2183. -------------------------------------------------------------------------------
  2184. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2185. single-precision floating-point value, returning the result. After being
  2186. shifted into the proper positions, the three fields are simply added
  2187. together to form the result. This means that any integer portion of `zSig'
  2188. will be added into the exponent. Since a properly normalized significand
  2189. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2190. than the desired result exponent whenever `zSig' is a complete, normalized
  2191. significand.
  2192. -------------------------------------------------------------------------------
  2193. *}
  2194. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2195. Begin
  2196. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2197. + zSig;
  2198. End;
  2199. {*
  2200. -------------------------------------------------------------------------------
  2201. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2202. and significand `zSig', and returns the proper single-precision floating-
  2203. point value corresponding to the abstract input. Ordinarily, the abstract
  2204. value is simply rounded and packed into the single-precision format, with
  2205. the inexact exception raised if the abstract input cannot be represented
  2206. exactly. However, if the abstract value is too large, the overflow and
  2207. inexact exceptions are raised and an infinity or maximal finite value is
  2208. returned. If the abstract value is too small, the input value is rounded to
  2209. a subnormal number, and the underflow and inexact exceptions are raised if
  2210. the abstract input cannot be represented exactly as a subnormal single-
  2211. precision floating-point number.
  2212. The input significand `zSig' has its binary point between bits 30
  2213. and 29, which is 7 bits to the left of the usual location. This shifted
  2214. significand must be normalized or smaller. If `zSig' is not normalized,
  2215. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2216. and it must not require rounding. In the usual case that `zSig' is
  2217. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2218. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2219. Binary Floating-Point Arithmetic.
  2220. -------------------------------------------------------------------------------
  2221. *}
  2222. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2223. Var
  2224. roundingMode : TFPURoundingMode;
  2225. roundNearestEven : boolean;
  2226. roundIncrement, roundBits : BYTE;
  2227. IsTiny : boolean;
  2228. Begin
  2229. roundingMode := softfloat_rounding_mode;
  2230. roundNearestEven := (roundingMode = float_round_nearest_even);
  2231. roundIncrement := $40;
  2232. if not roundNearestEven then
  2233. Begin
  2234. if ( roundingMode = float_round_to_zero ) Then
  2235. Begin
  2236. roundIncrement := 0;
  2237. End
  2238. else
  2239. Begin
  2240. roundIncrement := $7F;
  2241. if ( zSign <> 0 ) then
  2242. Begin
  2243. if roundingMode = float_round_up then roundIncrement := 0;
  2244. End
  2245. else
  2246. Begin
  2247. if roundingMode = float_round_down then roundIncrement := 0;
  2248. End;
  2249. End
  2250. End;
  2251. roundBits := zSig AND $7F;
  2252. if ($FD <= bits16 (zExp) ) then
  2253. Begin
  2254. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2255. Begin
  2256. float_raise( [float_flag_overflow,float_flag_inexact] );
  2257. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2258. exit;
  2259. End;
  2260. if ( zExp < 0 ) then
  2261. Begin
  2262. isTiny :=
  2263. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2264. OR ( zExp < -1 )
  2265. OR ( (zSig + roundIncrement) < $80000000 );
  2266. shift32RightJamming( zSig, - zExp, zSig );
  2267. zExp := 0;
  2268. roundBits := zSig AND $7F;
  2269. if ( isTiny and (roundBits<>0) ) then
  2270. float_raise( float_flag_underflow );
  2271. End;
  2272. End;
  2273. if ( roundBits )<> 0 then
  2274. set_inexact_flag;
  2275. zSig := ( zSig + roundIncrement ) shr 7;
  2276. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2277. if ( zSig = 0 ) then zExp := 0;
  2278. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2279. End;
  2280. {*
  2281. -------------------------------------------------------------------------------
  2282. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2283. and significand `zSig', and returns the proper single-precision floating-
  2284. point value corresponding to the abstract input. This routine is just like
  2285. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2286. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2287. floating-point exponent.
  2288. -------------------------------------------------------------------------------
  2289. *}
  2290. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2291. Var
  2292. ShiftCount : int8;
  2293. Begin
  2294. shiftCount := countLeadingZeros32( zSig ) - 1;
  2295. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2296. End;
  2297. {*
  2298. -------------------------------------------------------------------------------
  2299. Returns the most-significant 20 fraction bits of the double-precision
  2300. floating-point value `a'.
  2301. -------------------------------------------------------------------------------
  2302. *}
  2303. Function extractFloat64Frac0(a: float64): bits32; inline;
  2304. Begin
  2305. extractFloat64Frac0 := a.high and $000FFFFF;
  2306. End;
  2307. {*
  2308. -------------------------------------------------------------------------------
  2309. Returns the least-significant 32 fraction bits of the double-precision
  2310. floating-point value `a'.
  2311. -------------------------------------------------------------------------------
  2312. *}
  2313. Function extractFloat64Frac1(a: float64): bits32; inline;
  2314. Begin
  2315. extractFloat64Frac1 := a.low;
  2316. End;
  2317. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2318. Function extractFloat64Frac(a: float64): bits64; inline;
  2319. Begin
  2320. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2321. End;
  2322. {*
  2323. -------------------------------------------------------------------------------
  2324. Returns the exponent bits of the double-precision floating-point value `a'.
  2325. -------------------------------------------------------------------------------
  2326. *}
  2327. Function extractFloat64Exp(a: float64): int16; inline;
  2328. Begin
  2329. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2330. End;
  2331. {*
  2332. -------------------------------------------------------------------------------
  2333. Returns the sign bit of the double-precision floating-point value `a'.
  2334. -------------------------------------------------------------------------------
  2335. *}
  2336. Function extractFloat64Sign(a: float64) : flag; inline;
  2337. Begin
  2338. extractFloat64Sign := a.high shr 31;
  2339. End;
  2340. {*
  2341. -------------------------------------------------------------------------------
  2342. Normalizes the subnormal double-precision floating-point value represented
  2343. by the denormalized significand formed by the concatenation of `aSig0' and
  2344. `aSig1'. The normalized exponent is stored at the location pointed to by
  2345. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2346. stored at the location pointed to by `zSig0Ptr', and the least significant
  2347. 32 bits of the normalized significand are stored at the location pointed to
  2348. by `zSig1Ptr'.
  2349. -------------------------------------------------------------------------------
  2350. *}
  2351. Procedure normalizeFloat64Subnormal(
  2352. aSig0: bits32;
  2353. aSig1: bits32;
  2354. VAR zExpPtr : Int16;
  2355. VAR zSig0Ptr : Bits32;
  2356. VAR zSig1Ptr : Bits32
  2357. );
  2358. Var
  2359. ShiftCount : Int8;
  2360. Begin
  2361. if ( aSig0 = 0 ) then
  2362. Begin
  2363. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2364. if ( shiftCount < 0 ) then
  2365. Begin
  2366. zSig0Ptr := aSig1 shr ( - shiftCount );
  2367. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2368. End
  2369. else
  2370. Begin
  2371. zSig0Ptr := aSig1 shl shiftCount;
  2372. zSig1Ptr := 0;
  2373. End;
  2374. zExpPtr := - shiftCount - 31;
  2375. End
  2376. else
  2377. Begin
  2378. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2379. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2380. zExpPtr := 1 - shiftCount;
  2381. End;
  2382. End;
  2383. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2384. var
  2385. shiftCount : int8;
  2386. begin
  2387. shiftCount := countLeadingZeros64( aSig ) - 11;
  2388. zSigPtr := aSig shl shiftCount;
  2389. zExpPtr := 1 - shiftCount;
  2390. end;
  2391. {*
  2392. -------------------------------------------------------------------------------
  2393. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2394. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2395. point value, returning the result. After being shifted into the proper
  2396. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2397. together to form the most significant 32 bits of the result. This means
  2398. that any integer portion of `zSig0' will be added into the exponent. Since
  2399. a properly normalized significand will have an integer portion equal to 1,
  2400. the `zExp' input should be 1 less than the desired result exponent whenever
  2401. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2402. -------------------------------------------------------------------------------
  2403. *}
  2404. Procedure
  2405. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2406. var
  2407. z: Float64;
  2408. Begin
  2409. z.low := zSig1;
  2410. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2411. c := z;
  2412. End;
  2413. {*----------------------------------------------------------------------------
  2414. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2415. | double-precision floating-point value, returning the result. After being
  2416. | shifted into the proper positions, the three fields are simply added
  2417. | together to form the result. This means that any integer portion of `zSig'
  2418. | will be added into the exponent. Since a properly normalized significand
  2419. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2420. | than the desired result exponent whenever `zSig' is a complete, normalized
  2421. | significand.
  2422. *----------------------------------------------------------------------------*}
  2423. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2424. begin
  2425. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2426. end;
  2427. {*
  2428. -------------------------------------------------------------------------------
  2429. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2430. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2431. and `zSig2', and returns the proper double-precision floating-point value
  2432. corresponding to the abstract input. Ordinarily, the abstract value is
  2433. simply rounded and packed into the double-precision format, with the inexact
  2434. exception raised if the abstract input cannot be represented exactly.
  2435. However, if the abstract value is too large, the overflow and inexact
  2436. exceptions are raised and an infinity or maximal finite value is returned.
  2437. If the abstract value is too small, the input value is rounded to a
  2438. subnormal number, and the underflow and inexact exceptions are raised if the
  2439. abstract input cannot be represented exactly as a subnormal double-precision
  2440. floating-point number.
  2441. The input significand must be normalized or smaller. If the input
  2442. significand is not normalized, `zExp' must be 0; in that case, the result
  2443. returned is a subnormal number, and it must not require rounding. In the
  2444. usual case that the input significand is normalized, `zExp' must be 1 less
  2445. than the ``true'' floating-point exponent. The handling of underflow and
  2446. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2447. -------------------------------------------------------------------------------
  2448. *}
  2449. Procedure
  2450. roundAndPackFloat64(
  2451. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2452. Var
  2453. roundingMode : TFPURoundingMode;
  2454. roundNearestEven, increment, isTiny : Flag;
  2455. Begin
  2456. roundingMode := softfloat_rounding_mode;
  2457. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2458. increment := flag( sbits32 (zSig2) < 0 );
  2459. if ( roundNearestEven = flag(FALSE) ) then
  2460. Begin
  2461. if ( roundingMode = float_round_to_zero ) then
  2462. increment := 0
  2463. else
  2464. Begin
  2465. if ( zSign )<> 0 then
  2466. Begin
  2467. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2468. End
  2469. else
  2470. Begin
  2471. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2472. End
  2473. End
  2474. End;
  2475. if ( $7FD <= bits16 (zExp) ) then
  2476. Begin
  2477. if (( $7FD < zExp )
  2478. or (( zExp = $7FD )
  2479. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2480. and (increment<>0)
  2481. )
  2482. ) then
  2483. Begin
  2484. float_raise( [float_flag_overflow,float_flag_inexact] );
  2485. if (( roundingMode = float_round_to_zero )
  2486. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2487. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2488. ) then
  2489. Begin
  2490. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2491. exit;
  2492. End;
  2493. packFloat64( zSign, $7FF, 0, 0, c );
  2494. exit;
  2495. End;
  2496. if ( zExp < 0 ) then
  2497. Begin
  2498. isTiny :=
  2499. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2500. or flag( zExp < -1 )
  2501. or flag(increment = 0)
  2502. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2503. shift64ExtraRightJamming(
  2504. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2505. zExp := 0;
  2506. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2507. if ( roundNearestEven )<>0 then
  2508. Begin
  2509. increment := flag( sbits32 (zSig2) < 0 );
  2510. End
  2511. else
  2512. Begin
  2513. if ( zSign )<>0 then
  2514. Begin
  2515. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2516. End
  2517. else
  2518. Begin
  2519. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2520. End
  2521. End;
  2522. End;
  2523. End;
  2524. if ( zSig2 )<>0 then
  2525. set_inexact_flag;
  2526. if ( increment )<>0 then
  2527. Begin
  2528. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2529. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2530. End
  2531. else
  2532. Begin
  2533. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2534. End;
  2535. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2536. End;
  2537. {*----------------------------------------------------------------------------
  2538. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2539. | and significand `zSig', and returns the proper double-precision floating-
  2540. | point value corresponding to the abstract input. Ordinarily, the abstract
  2541. | value is simply rounded and packed into the double-precision format, with
  2542. | the inexact exception raised if the abstract input cannot be represented
  2543. | exactly. However, if the abstract value is too large, the overflow and
  2544. | inexact exceptions are raised and an infinity or maximal finite value is
  2545. | returned. If the abstract value is too small, the input value is rounded
  2546. | to a subnormal number, and the underflow and inexact exceptions are raised
  2547. | if the abstract input cannot be represented exactly as a subnormal double-
  2548. | precision floating-point number.
  2549. | The input significand `zSig' has its binary point between bits 62
  2550. | and 61, which is 10 bits to the left of the usual location. This shifted
  2551. | significand must be normalized or smaller. If `zSig' is not normalized,
  2552. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2553. | and it must not require rounding. In the usual case that `zSig' is
  2554. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2555. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2556. | Binary Floating-Point Arithmetic.
  2557. *----------------------------------------------------------------------------*}
  2558. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2559. var
  2560. roundingMode: TFPURoundingMode;
  2561. roundNearestEven: flag;
  2562. roundIncrement, roundBits: int16;
  2563. isTiny: flag;
  2564. begin
  2565. roundingMode := softfloat_rounding_mode;
  2566. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2567. roundIncrement := $200;
  2568. if ( roundNearestEven=0 ) then
  2569. begin
  2570. if ( roundingMode = float_round_to_zero ) then
  2571. begin
  2572. roundIncrement := 0;
  2573. end
  2574. else begin
  2575. roundIncrement := $3FF;
  2576. if ( zSign<>0 ) then
  2577. begin
  2578. if ( roundingMode = float_round_up ) then
  2579. roundIncrement := 0;
  2580. end
  2581. else begin
  2582. if ( roundingMode = float_round_down ) then
  2583. roundIncrement := 0;
  2584. end
  2585. end
  2586. end;
  2587. roundBits := zSig and $3FF;
  2588. if ( $7FD <= bits16(zExp) ) then
  2589. begin
  2590. if ( ( $7FD < zExp )
  2591. or ( ( zExp = $7FD )
  2592. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2593. ) then
  2594. begin
  2595. float_raise( [float_flag_overflow,float_flag_inexact] );
  2596. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2597. exit;
  2598. end;
  2599. if ( zExp < 0 ) then
  2600. begin
  2601. isTiny := ord(
  2602. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2603. or ( zExp < -1 )
  2604. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2605. shift64RightJamming( zSig, - zExp, zSig );
  2606. zExp := 0;
  2607. roundBits := zSig and $3FF;
  2608. if ( isTiny and roundBits )<>0 then
  2609. float_raise( float_flag_underflow );
  2610. end
  2611. end;
  2612. if ( roundBits<>0 ) then
  2613. set_inexact_flag;
  2614. zSig := ( zSig + roundIncrement ) shr 10;
  2615. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2616. if ( zSig = 0 ) then
  2617. zExp := 0;
  2618. result:=packFloat64( zSign, zExp, zSig );
  2619. end;
  2620. {*
  2621. -------------------------------------------------------------------------------
  2622. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2623. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2624. returns the proper double-precision floating-point value corresponding
  2625. to the abstract input. This routine is just like `roundAndPackFloat64'
  2626. except that the input significand has fewer bits and does not have to be
  2627. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2628. point exponent.
  2629. -------------------------------------------------------------------------------
  2630. *}
  2631. Procedure
  2632. normalizeRoundAndPackFloat64(
  2633. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2634. Var
  2635. shiftCount : int8;
  2636. zSig2 : bits32;
  2637. Begin
  2638. if ( zSig0 = 0 ) then
  2639. Begin
  2640. zSig0 := zSig1;
  2641. zSig1 := 0;
  2642. zExp := zExp -32;
  2643. End;
  2644. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2645. if ( 0 <= shiftCount ) then
  2646. Begin
  2647. zSig2 := 0;
  2648. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2649. End
  2650. else
  2651. Begin
  2652. shift64ExtraRightJamming
  2653. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2654. End;
  2655. zExp := zExp - shiftCount;
  2656. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2657. End;
  2658. {*
  2659. -------------------------------------------------------------------------------
  2660. Returns the result of converting the 32-bit two's complement integer `a' to
  2661. the single-precision floating-point format. The conversion is performed
  2662. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2663. -------------------------------------------------------------------------------
  2664. *}
  2665. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2666. Var
  2667. zSign : Flag;
  2668. Begin
  2669. if ( a = 0 ) then
  2670. Begin
  2671. int32_to_float32.float32 := 0;
  2672. exit;
  2673. End;
  2674. if ( a = sbits32 ($80000000) ) then
  2675. Begin
  2676. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2677. exit;
  2678. end;
  2679. zSign := flag( a < 0 );
  2680. If zSign<>0 then
  2681. a := -a;
  2682. int32_to_float32.float32:=
  2683. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2684. End;
  2685. {*
  2686. -------------------------------------------------------------------------------
  2687. Returns the result of converting the 32-bit two's complement integer `a' to
  2688. the double-precision floating-point format. The conversion is performed
  2689. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2690. -------------------------------------------------------------------------------
  2691. *}
  2692. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2693. var
  2694. zSign : flag;
  2695. absA : bits32;
  2696. shiftCount : int8;
  2697. zSig0, zSig1 : bits32;
  2698. Begin
  2699. if ( a = 0 ) then
  2700. Begin
  2701. packFloat64( 0, 0, 0, 0, result );
  2702. exit;
  2703. end;
  2704. zSign := flag( a < 0 );
  2705. if ZSign<>0 then
  2706. AbsA := -a
  2707. else
  2708. AbsA := a;
  2709. shiftCount := countLeadingZeros32( absA ) - 11;
  2710. if ( 0 <= shiftCount ) then
  2711. Begin
  2712. zSig0 := absA shl shiftCount;
  2713. zSig1 := 0;
  2714. End
  2715. else
  2716. Begin
  2717. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2718. End;
  2719. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2720. End;
  2721. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2722. {$if not defined(packFloatx80)}
  2723. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2724. forward;
  2725. {$endif}
  2726. {*----------------------------------------------------------------------------
  2727. | Returns the result of converting the 32-bit two's complement integer `a'
  2728. | to the extended double-precision floating-point format. The conversion
  2729. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2730. | Arithmetic.
  2731. *----------------------------------------------------------------------------*}
  2732. function int32_to_floatx80( a: int32 ): floatx80;
  2733. var
  2734. zSign: flag;
  2735. absA: uint32;
  2736. shiftCount: int8;
  2737. zSig: bits64;
  2738. begin
  2739. if ( a = 0 ) then begin
  2740. result := packFloatx80( 0, 0, 0 );
  2741. exit;
  2742. end;
  2743. zSign := ord( a < 0 );
  2744. if zSign <> 0 then absA := - a else absA := a;
  2745. shiftCount := countLeadingZeros32( absA ) + 32;
  2746. zSig := absA;
  2747. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2748. end;
  2749. {$endif FPC_SOFTFLOAT_FLOATX80}
  2750. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2751. {$if not defined(packFloat128)}
  2752. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2753. forward;
  2754. {$endif}
  2755. {*----------------------------------------------------------------------------
  2756. | Returns the result of converting the 32-bit two's complement integer `a' to
  2757. | the quadruple-precision floating-point format. The conversion is performed
  2758. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2759. *----------------------------------------------------------------------------*}
  2760. function int32_to_float128( a: int32 ): float128;
  2761. var
  2762. zSign: flag;
  2763. absA: uint32;
  2764. shiftCount: int8;
  2765. zSig0: bits64;
  2766. begin
  2767. if ( a = 0 ) then begin
  2768. result := packFloat128( 0, 0, 0, 0 );
  2769. exit;
  2770. end;
  2771. zSign := ord( a < 0 );
  2772. if zSign <> 0 then absA := - a else absA := a;
  2773. shiftCount := countLeadingZeros32( absA ) + 17;
  2774. zSig0 := absA;
  2775. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2776. end;
  2777. {$endif FPC_SOFTFLOAT_FLOAT128}
  2778. {*
  2779. -------------------------------------------------------------------------------
  2780. Returns the result of converting the single-precision floating-point value
  2781. `a' to the 32-bit two's complement integer format. The conversion is
  2782. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2783. Arithmetic---which means in particular that the conversion is rounded
  2784. according to the current rounding mode. If `a' is a NaN, the largest
  2785. positive integer is returned. Otherwise, if the conversion overflows, the
  2786. largest integer with the same sign as `a' is returned.
  2787. -------------------------------------------------------------------------------
  2788. *}
  2789. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2790. Var
  2791. aSign: flag;
  2792. aExp, shiftCount: int16;
  2793. aSig, aSigExtra: bits32;
  2794. z: int32;
  2795. roundingMode: TFPURoundingMode;
  2796. Begin
  2797. aSig := extractFloat32Frac( a.float32 );
  2798. aExp := extractFloat32Exp( a.float32 );
  2799. aSign := extractFloat32Sign( a.float32 );
  2800. shiftCount := aExp - $96;
  2801. if ( 0 <= shiftCount ) then
  2802. Begin
  2803. if ( $9E <= aExp ) then
  2804. Begin
  2805. if ( a.float32 <> $CF000000 ) then
  2806. Begin
  2807. float_raise( float_flag_invalid );
  2808. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2809. Begin
  2810. float32_to_int32 := $7FFFFFFF;
  2811. exit;
  2812. End;
  2813. End;
  2814. float32_to_int32 := sbits32 ($80000000);
  2815. exit;
  2816. End;
  2817. z := ( aSig or $00800000 ) shl shiftCount;
  2818. if ( aSign<>0 ) then z := - z;
  2819. End
  2820. else
  2821. Begin
  2822. if ( aExp < $7E ) then
  2823. Begin
  2824. aSigExtra := aExp OR aSig;
  2825. z := 0;
  2826. End
  2827. else
  2828. Begin
  2829. aSig := aSig OR $00800000;
  2830. aSigExtra := aSig shl ( shiftCount and 31 );
  2831. z := aSig shr ( - shiftCount );
  2832. End;
  2833. if ( aSigExtra<>0 ) then
  2834. set_inexact_flag;
  2835. roundingMode := softfloat_rounding_mode;
  2836. if ( roundingMode = float_round_nearest_even ) then
  2837. Begin
  2838. if ( sbits32 (aSigExtra) < 0 ) then
  2839. Begin
  2840. Inc(z);
  2841. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2842. z := z and not 1;
  2843. End;
  2844. if ( aSign<>0 ) then
  2845. z := - z;
  2846. End
  2847. else
  2848. Begin
  2849. aSigExtra := flag( aSigExtra <> 0 );
  2850. if ( aSign<>0 ) then
  2851. Begin
  2852. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2853. z := - z;
  2854. End
  2855. else
  2856. Begin
  2857. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2858. End
  2859. End;
  2860. End;
  2861. float32_to_int32 := z;
  2862. End;
  2863. {*
  2864. -------------------------------------------------------------------------------
  2865. Returns the result of converting the single-precision floating-point value
  2866. `a' to the 32-bit two's complement integer format. The conversion is
  2867. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2868. Arithmetic, except that the conversion is always rounded toward zero.
  2869. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2870. the conversion overflows, the largest integer with the same sign as `a' is
  2871. returned.
  2872. -------------------------------------------------------------------------------
  2873. *}
  2874. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2875. Var
  2876. aSign : flag;
  2877. aExp, shiftCount : int16;
  2878. aSig : bits32;
  2879. z : int32;
  2880. Begin
  2881. aSig := extractFloat32Frac( a.float32 );
  2882. aExp := extractFloat32Exp( a.float32 );
  2883. aSign := extractFloat32Sign( a.float32 );
  2884. shiftCount := aExp - $9E;
  2885. if ( 0 <= shiftCount ) then
  2886. Begin
  2887. if ( a.float32 <> $CF000000 ) then
  2888. Begin
  2889. float_raise( float_flag_invalid );
  2890. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2891. Begin
  2892. float32_to_int32_round_to_zero := $7FFFFFFF;
  2893. exit;
  2894. end;
  2895. End;
  2896. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2897. exit;
  2898. End
  2899. else
  2900. if ( aExp <= $7E ) then
  2901. Begin
  2902. if ( aExp or aSig )<>0 then
  2903. set_inexact_flag;
  2904. float32_to_int32_round_to_zero := 0;
  2905. exit;
  2906. End;
  2907. aSig := ( aSig or $00800000 ) shl 8;
  2908. z := aSig shr ( - shiftCount );
  2909. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2910. Begin
  2911. set_inexact_flag;
  2912. End;
  2913. if ( aSign<>0 ) then z := - z;
  2914. float32_to_int32_round_to_zero := z;
  2915. End;
  2916. {*----------------------------------------------------------------------------
  2917. | Returns the result of converting the single-precision floating-point value
  2918. | `a' to the 64-bit two's complement integer format. The conversion is
  2919. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2920. | Arithmetic---which means in particular that the conversion is rounded
  2921. | according to the current rounding mode. If `a' is a NaN, the largest
  2922. | positive integer is returned. Otherwise, if the conversion overflows, the
  2923. | largest integer with the same sign as `a' is returned.
  2924. *----------------------------------------------------------------------------*}
  2925. function float32_to_int64( a: float32 ): int64;
  2926. var
  2927. aSign: flag;
  2928. aExp, shiftCount: int16;
  2929. aSig: bits32;
  2930. aSig64, aSigExtra: bits64;
  2931. begin
  2932. aSig := extractFloat32Frac( a );
  2933. aExp := extractFloat32Exp( a );
  2934. aSign := extractFloat32Sign( a );
  2935. shiftCount := $BE - aExp;
  2936. if ( shiftCount < 0 ) then begin
  2937. float_raise( float_flag_invalid );
  2938. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2939. result := $7FFFFFFFFFFFFFFF;
  2940. exit;
  2941. end;
  2942. result := $8000000000000000;
  2943. exit;
  2944. end;
  2945. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  2946. aSig64 := aSig;
  2947. aSig64 := aSig64 shl 40;
  2948. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  2949. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  2950. end;
  2951. {*----------------------------------------------------------------------------
  2952. | Returns the result of converting the single-precision floating-point value
  2953. | `a' to the 64-bit two's complement integer format. The conversion is
  2954. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2955. | Arithmetic, except that the conversion is always rounded toward zero. If
  2956. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  2957. | conversion overflows, the largest integer with the same sign as `a' is
  2958. | returned.
  2959. *----------------------------------------------------------------------------*}
  2960. function float32_to_int64_round_to_zero( a: float32 ): int64;
  2961. var
  2962. aSign: flag;
  2963. aExp, shiftCount: int16;
  2964. aSig: bits32;
  2965. aSig64: bits64;
  2966. z: int64;
  2967. begin
  2968. aSig := extractFloat32Frac( a );
  2969. aExp := extractFloat32Exp( a );
  2970. aSign := extractFloat32Sign( a );
  2971. shiftCount := aExp - $BE;
  2972. if ( 0 <= shiftCount ) then begin
  2973. if ( a <> $DF000000 ) then begin
  2974. float_raise( float_flag_invalid );
  2975. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2976. result := $7FFFFFFFFFFFFFFF;
  2977. exit;
  2978. end;
  2979. end;
  2980. result := $8000000000000000;
  2981. exit;
  2982. end
  2983. else if ( aExp <= $7E ) then begin
  2984. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  2985. result := 0;
  2986. exit;
  2987. end;
  2988. aSig64 := aSig or $00800000;
  2989. aSig64 := aSig64 shl 40;
  2990. z := aSig64 shr ( - shiftCount );
  2991. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  2992. set_inexact_flag;
  2993. if ( aSign <> 0 ) then z := - z;
  2994. result := z;
  2995. end;
  2996. {*
  2997. -------------------------------------------------------------------------------
  2998. Returns the result of converting the single-precision floating-point value
  2999. `a' to the double-precision floating-point format. The conversion is
  3000. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3001. Arithmetic.
  3002. -------------------------------------------------------------------------------
  3003. *}
  3004. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3005. Var
  3006. aSign : flag;
  3007. aExp : int16;
  3008. aSig, zSig0, zSig1: bits32;
  3009. tmp : CommonNanT;
  3010. Begin
  3011. aSig := extractFloat32Frac( a.float32 );
  3012. aExp := extractFloat32Exp( a.float32 );
  3013. aSign := extractFloat32Sign( a.float32 );
  3014. if ( aExp = $FF ) then
  3015. Begin
  3016. if ( aSig<>0 ) then
  3017. Begin
  3018. float32ToCommonNaN(a.float32, tmp);
  3019. commonNaNToFloat64(tmp , result);
  3020. exit;
  3021. End;
  3022. packFloat64( aSign, $7FF, 0, 0, result);
  3023. exit;
  3024. End;
  3025. if ( aExp = 0 ) then
  3026. Begin
  3027. if ( aSig = 0 ) then
  3028. Begin
  3029. packFloat64( aSign, 0, 0, 0, result );
  3030. exit;
  3031. end;
  3032. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3033. Dec(aExp);
  3034. End;
  3035. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3036. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3037. End;
  3038. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3039. {*----------------------------------------------------------------------------
  3040. | Returns the result of converting the canonical NaN `a' to the extended
  3041. | double-precision floating-point format.
  3042. *----------------------------------------------------------------------------*}
  3043. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3044. var
  3045. z : floatx80;
  3046. begin
  3047. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3048. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3049. result := z;
  3050. end;
  3051. {*----------------------------------------------------------------------------
  3052. | Returns the result of converting the single-precision floating-point value
  3053. | `a' to the extended double-precision floating-point format. The conversion
  3054. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3055. | Arithmetic.
  3056. *----------------------------------------------------------------------------*}
  3057. function float32_to_floatx80( a: float32 ): floatx80;
  3058. var
  3059. aSign: flag;
  3060. aExp: int16;
  3061. aSig: bits32;
  3062. tmp: commonNaNT;
  3063. begin
  3064. aSig := extractFloat32Frac( a );
  3065. aExp := extractFloat32Exp( a );
  3066. aSign := extractFloat32Sign( a );
  3067. if ( aExp = $FF ) then begin
  3068. if ( aSig <> 0 ) then begin
  3069. float32ToCommonNaN( a, tmp );
  3070. result := commonNaNToFloatx80( tmp );
  3071. exit;
  3072. end;
  3073. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3074. exit;
  3075. end;
  3076. if ( aExp = 0 ) then begin
  3077. if ( aSig = 0 ) then begin
  3078. result := packFloatx80( aSign, 0, 0 );
  3079. exit;
  3080. end;
  3081. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3082. end;
  3083. aSig := aSig or $00800000;
  3084. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3085. end;
  3086. {$endif FPC_SOFTFLOAT_FLOATX80}
  3087. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3088. {*----------------------------------------------------------------------------
  3089. | Returns the result of converting the single-precision floating-point value
  3090. | `a' to the double-precision floating-point format. The conversion is
  3091. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3092. | Arithmetic.
  3093. *----------------------------------------------------------------------------*}
  3094. function float32_to_float128( a: float32 ): float128;
  3095. var
  3096. aSign: flag;
  3097. aExp: int16;
  3098. aSig: bits32;
  3099. tmp: commonNaNT;
  3100. begin
  3101. aSig := extractFloat32Frac( a );
  3102. aExp := extractFloat32Exp( a );
  3103. aSign := extractFloat32Sign( a );
  3104. if ( aExp = $FF ) then begin
  3105. if ( aSig <> 0 ) then begin
  3106. float32ToCommonNaN( a, tmp );
  3107. result := commonNaNToFloat128( tmp );
  3108. exit;
  3109. end;
  3110. result := packFloat128( aSign, $7FFF, 0, 0 );
  3111. exit;
  3112. end;
  3113. if ( aExp = 0 ) then begin
  3114. if ( aSig = 0 ) then begin
  3115. result := packFloat128( aSign, 0, 0, 0 );
  3116. exit;
  3117. end;
  3118. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3119. dec( aExp );
  3120. end;
  3121. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3122. end;
  3123. {$endif FPC_SOFTFLOAT_FLOAT128}
  3124. {*
  3125. -------------------------------------------------------------------------------
  3126. Rounds the single-precision floating-point value `a' to an integer,
  3127. and returns the result as a single-precision floating-point value. The
  3128. operation is performed according to the IEC/IEEE Standard for Binary
  3129. Floating-Point Arithmetic.
  3130. -------------------------------------------------------------------------------
  3131. *}
  3132. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3133. Var
  3134. aSign: flag;
  3135. aExp: int16;
  3136. lastBitMask, roundBitsMask: bits32;
  3137. roundingMode: TFPURoundingMode;
  3138. z: float32;
  3139. Begin
  3140. aExp := extractFloat32Exp( a.float32 );
  3141. if ( $96 <= aExp ) then
  3142. Begin
  3143. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3144. Begin
  3145. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3146. exit;
  3147. End;
  3148. float32_round_to_int:=a;
  3149. exit;
  3150. End;
  3151. if ( aExp <= $7E ) then
  3152. Begin
  3153. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3154. Begin
  3155. float32_round_to_int:=a;
  3156. exit;
  3157. end;
  3158. set_inexact_flag;
  3159. aSign := extractFloat32Sign( a.float32 );
  3160. case ( softfloat_rounding_mode ) of
  3161. float_round_nearest_even:
  3162. Begin
  3163. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3164. Begin
  3165. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3166. exit;
  3167. End;
  3168. End;
  3169. float_round_down:
  3170. Begin
  3171. if aSign <> 0 then
  3172. float32_round_to_int.float32 := $BF800000
  3173. else
  3174. float32_round_to_int.float32 := 0;
  3175. exit;
  3176. End;
  3177. float_round_up:
  3178. Begin
  3179. if aSign <> 0 then
  3180. float32_round_to_int.float32 := $80000000
  3181. else
  3182. float32_round_to_int.float32 := $3F800000;
  3183. exit;
  3184. End;
  3185. end;
  3186. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3187. exit;
  3188. End;
  3189. lastBitMask := 1;
  3190. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3191. lastBitMask := lastBitMask shl ($96 - aExp);
  3192. roundBitsMask := lastBitMask - 1;
  3193. z := a.float32;
  3194. roundingMode := softfloat_rounding_mode;
  3195. if ( roundingMode = float_round_nearest_even ) then
  3196. Begin
  3197. z := z + (lastBitMask shr 1);
  3198. if ( ( z and roundBitsMask ) = 0 ) then
  3199. z := z and not lastBitMask;
  3200. End
  3201. else if ( roundingMode <> float_round_to_zero ) then
  3202. Begin
  3203. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3204. Begin
  3205. z := z + roundBitsMask;
  3206. End;
  3207. End;
  3208. z := z and not roundBitsMask;
  3209. if ( z <> a.float32 ) then
  3210. set_inexact_flag;
  3211. float32_round_to_int.float32 := z;
  3212. End;
  3213. {*
  3214. -------------------------------------------------------------------------------
  3215. Returns the result of adding the absolute values of the single-precision
  3216. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3217. before being returned. `zSign' is ignored if the result is a NaN.
  3218. The addition is performed according to the IEC/IEEE Standard for Binary
  3219. Floating-Point Arithmetic.
  3220. -------------------------------------------------------------------------------
  3221. *}
  3222. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3223. Var
  3224. aExp, bExp, zExp: int16;
  3225. aSig, bSig, zSig: bits32;
  3226. expDiff: int16;
  3227. label roundAndPack;
  3228. Begin
  3229. aSig:=extractFloat32Frac( a );
  3230. aExp:=extractFloat32Exp( a );
  3231. bSig:=extractFloat32Frac( b );
  3232. bExp := extractFloat32Exp( b );
  3233. expDiff := aExp - bExp;
  3234. aSig := aSig shl 6;
  3235. bSig := bSig shl 6;
  3236. if ( 0 < expDiff ) then
  3237. Begin
  3238. if ( aExp = $FF ) then
  3239. Begin
  3240. if ( aSig <> 0) then
  3241. Begin
  3242. addFloat32Sigs := propagateFloat32NaN( a, b );
  3243. exit;
  3244. End;
  3245. addFloat32Sigs := a;
  3246. exit;
  3247. End;
  3248. if ( bExp = 0 ) then
  3249. Begin
  3250. Dec(expDiff);
  3251. End
  3252. else
  3253. Begin
  3254. bSig := bSig or $20000000;
  3255. End;
  3256. shift32RightJamming( bSig, expDiff, bSig );
  3257. zExp := aExp;
  3258. End
  3259. else
  3260. If ( expDiff < 0 ) then
  3261. Begin
  3262. if ( bExp = $FF ) then
  3263. Begin
  3264. if ( bSig<>0 ) then
  3265. Begin
  3266. addFloat32Sigs := propagateFloat32NaN( a, b );
  3267. exit;
  3268. end;
  3269. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3270. exit;
  3271. End;
  3272. if ( aExp = 0 ) then
  3273. Begin
  3274. Inc(expDiff);
  3275. End
  3276. else
  3277. Begin
  3278. aSig := aSig OR $20000000;
  3279. End;
  3280. shift32RightJamming( aSig, - expDiff, aSig );
  3281. zExp := bExp;
  3282. End
  3283. else
  3284. Begin
  3285. if ( aExp = $FF ) then
  3286. Begin
  3287. if ( aSig OR bSig )<> 0 then
  3288. Begin
  3289. addFloat32Sigs := propagateFloat32NaN( a, b );
  3290. exit;
  3291. end;
  3292. addFloat32Sigs := a;
  3293. exit;
  3294. End;
  3295. if ( aExp = 0 ) then
  3296. Begin
  3297. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3298. exit;
  3299. end;
  3300. zSig := $40000000 + aSig + bSig;
  3301. zExp := aExp;
  3302. goto roundAndPack;
  3303. End;
  3304. aSig := aSig OR $20000000;
  3305. zSig := ( aSig + bSig ) shl 1;
  3306. Dec(zExp);
  3307. if ( sbits32 (zSig) < 0 ) then
  3308. Begin
  3309. zSig := aSig + bSig;
  3310. Inc(zExp);
  3311. End;
  3312. roundAndPack:
  3313. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3314. End;
  3315. {*
  3316. -------------------------------------------------------------------------------
  3317. Returns the result of subtracting the absolute values of the single-
  3318. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3319. difference is negated before being returned. `zSign' is ignored if the
  3320. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3321. Standard for Binary Floating-Point Arithmetic.
  3322. -------------------------------------------------------------------------------
  3323. *}
  3324. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3325. Var
  3326. aExp, bExp, zExp: int16;
  3327. aSig, bSig, zSig: bits32;
  3328. expDiff : int16;
  3329. label aExpBigger;
  3330. label bExpBigger;
  3331. label aBigger;
  3332. label bBigger;
  3333. label normalizeRoundAndPack;
  3334. Begin
  3335. aSig := extractFloat32Frac( a );
  3336. aExp := extractFloat32Exp( a );
  3337. bSig := extractFloat32Frac( b );
  3338. bExp := extractFloat32Exp( b );
  3339. expDiff := aExp - bExp;
  3340. aSig := aSig shl 7;
  3341. bSig := bSig shl 7;
  3342. if ( 0 < expDiff ) then goto aExpBigger;
  3343. if ( expDiff < 0 ) then goto bExpBigger;
  3344. if ( aExp = $FF ) then
  3345. Begin
  3346. if ( aSig OR bSig )<> 0 then
  3347. Begin
  3348. subFloat32Sigs := propagateFloat32NaN( a, b );
  3349. exit;
  3350. End;
  3351. float_raise( float_flag_invalid );
  3352. subFloat32Sigs := float32_default_nan;
  3353. exit;
  3354. End;
  3355. if ( aExp = 0 ) then
  3356. Begin
  3357. aExp := 1;
  3358. bExp := 1;
  3359. End;
  3360. if ( bSig < aSig ) Then goto aBigger;
  3361. if ( aSig < bSig ) Then goto bBigger;
  3362. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3363. exit;
  3364. bExpBigger:
  3365. if ( bExp = $FF ) then
  3366. Begin
  3367. if ( bSig<>0 ) then
  3368. Begin
  3369. subFloat32Sigs := propagateFloat32NaN( a, b );
  3370. exit;
  3371. End;
  3372. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3373. exit;
  3374. End;
  3375. if ( aExp = 0 ) then
  3376. Begin
  3377. Inc(expDiff);
  3378. End
  3379. else
  3380. Begin
  3381. aSig := aSig OR $40000000;
  3382. End;
  3383. shift32RightJamming( aSig, - expDiff, aSig );
  3384. bSig := bSig OR $40000000;
  3385. bBigger:
  3386. zSig := bSig - aSig;
  3387. zExp := bExp;
  3388. zSign := zSign xor 1;
  3389. goto normalizeRoundAndPack;
  3390. aExpBigger:
  3391. if ( aExp = $FF ) then
  3392. Begin
  3393. if ( aSig <> 0) then
  3394. Begin
  3395. subFloat32Sigs := propagateFloat32NaN( a, b );
  3396. exit;
  3397. End;
  3398. subFloat32Sigs := a;
  3399. exit;
  3400. End;
  3401. if ( bExp = 0 ) then
  3402. Begin
  3403. Dec(expDiff);
  3404. End
  3405. else
  3406. Begin
  3407. bSig := bSig OR $40000000;
  3408. End;
  3409. shift32RightJamming( bSig, expDiff, bSig );
  3410. aSig := aSig OR $40000000;
  3411. aBigger:
  3412. zSig := aSig - bSig;
  3413. zExp := aExp;
  3414. normalizeRoundAndPack:
  3415. Dec(zExp);
  3416. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3417. End;
  3418. {*
  3419. -------------------------------------------------------------------------------
  3420. Returns the result of adding the single-precision floating-point values `a'
  3421. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3422. Binary Floating-Point Arithmetic.
  3423. -------------------------------------------------------------------------------
  3424. *}
  3425. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3426. Var
  3427. aSign, bSign: Flag;
  3428. Begin
  3429. aSign := extractFloat32Sign( a.float32 );
  3430. bSign := extractFloat32Sign( b.float32 );
  3431. if ( aSign = bSign ) then
  3432. Begin
  3433. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3434. End
  3435. else
  3436. Begin
  3437. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3438. End;
  3439. End;
  3440. {*
  3441. -------------------------------------------------------------------------------
  3442. Returns the result of subtracting the single-precision floating-point values
  3443. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3444. for Binary Floating-Point Arithmetic.
  3445. -------------------------------------------------------------------------------
  3446. *}
  3447. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3448. Var
  3449. aSign, bSign: flag;
  3450. Begin
  3451. aSign := extractFloat32Sign( a.float32 );
  3452. bSign := extractFloat32Sign( b.float32 );
  3453. if ( aSign = bSign ) then
  3454. Begin
  3455. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3456. End
  3457. else
  3458. Begin
  3459. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3460. End;
  3461. End;
  3462. {*
  3463. -------------------------------------------------------------------------------
  3464. Returns the result of multiplying the single-precision floating-point values
  3465. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3466. for Binary Floating-Point Arithmetic.
  3467. -------------------------------------------------------------------------------
  3468. *}
  3469. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3470. Var
  3471. aSign, bSign, zSign: flag;
  3472. aExp, bExp, zExp : int16;
  3473. aSig, bSig, zSig0, zSig1: bits32;
  3474. Begin
  3475. aSig := extractFloat32Frac( a.float32 );
  3476. aExp := extractFloat32Exp( a.float32 );
  3477. aSign := extractFloat32Sign( a.float32 );
  3478. bSig := extractFloat32Frac( b.float32 );
  3479. bExp := extractFloat32Exp( b.float32 );
  3480. bSign := extractFloat32Sign( b.float32 );
  3481. zSign := aSign xor bSign;
  3482. if ( aExp = $FF ) then
  3483. Begin
  3484. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3485. Begin
  3486. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3487. exit;
  3488. End;
  3489. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3490. Begin
  3491. float_raise( float_flag_invalid );
  3492. float32_mul.float32 := float32_default_nan;
  3493. exit;
  3494. End;
  3495. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3496. exit;
  3497. End;
  3498. if ( bExp = $FF ) then
  3499. Begin
  3500. if ( bSig <> 0 ) then
  3501. Begin
  3502. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3503. exit;
  3504. End;
  3505. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3506. Begin
  3507. float_raise( float_flag_invalid );
  3508. float32_mul.float32 := float32_default_nan;
  3509. exit;
  3510. End;
  3511. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3512. exit;
  3513. End;
  3514. if ( aExp = 0 ) then
  3515. Begin
  3516. if ( aSig = 0 ) then
  3517. Begin
  3518. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3519. exit;
  3520. End;
  3521. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3522. End;
  3523. if ( bExp = 0 ) then
  3524. Begin
  3525. if ( bSig = 0 ) then
  3526. Begin
  3527. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3528. exit;
  3529. End;
  3530. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3531. End;
  3532. zExp := aExp + bExp - $7F;
  3533. aSig := ( aSig OR $00800000 ) shl 7;
  3534. bSig := ( bSig OR $00800000 ) shl 8;
  3535. mul32To64( aSig, bSig, zSig0, zSig1 );
  3536. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3537. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3538. Begin
  3539. zSig0 := zSig0 shl 1;
  3540. Dec(zExp);
  3541. End;
  3542. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3543. End;
  3544. {*
  3545. -------------------------------------------------------------------------------
  3546. Returns the result of dividing the single-precision floating-point value `a'
  3547. by the corresponding value `b'. The operation is performed according to the
  3548. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3549. -------------------------------------------------------------------------------
  3550. *}
  3551. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3552. Var
  3553. aSign, bSign, zSign: flag;
  3554. aExp, bExp, zExp: int16;
  3555. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3556. Begin
  3557. aSig := extractFloat32Frac( a.float32 );
  3558. aExp := extractFloat32Exp( a.float32 );
  3559. aSign := extractFloat32Sign( a.float32 );
  3560. bSig := extractFloat32Frac( b.float32 );
  3561. bExp := extractFloat32Exp( b.float32 );
  3562. bSign := extractFloat32Sign( b.float32 );
  3563. zSign := aSign xor bSign;
  3564. if ( aExp = $FF ) then
  3565. Begin
  3566. if ( aSig <> 0 ) then
  3567. Begin
  3568. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3569. exit;
  3570. End;
  3571. if ( bExp = $FF ) then
  3572. Begin
  3573. if ( bSig <> 0) then
  3574. Begin
  3575. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3576. exit;
  3577. End;
  3578. float_raise( float_flag_invalid );
  3579. float32_div.float32 := float32_default_nan;
  3580. exit;
  3581. End;
  3582. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3583. exit;
  3584. End;
  3585. if ( bExp = $FF ) then
  3586. Begin
  3587. if ( bSig <> 0) then
  3588. Begin
  3589. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3590. exit;
  3591. End;
  3592. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3593. exit;
  3594. End;
  3595. if ( bExp = 0 ) Then
  3596. Begin
  3597. if ( bSig = 0 ) Then
  3598. Begin
  3599. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3600. Begin
  3601. float_raise( float_flag_invalid );
  3602. float32_div.float32 := float32_default_nan;
  3603. exit;
  3604. End;
  3605. float_raise( float_flag_divbyzero );
  3606. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3607. exit;
  3608. End;
  3609. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3610. End;
  3611. if ( aExp = 0 ) Then
  3612. Begin
  3613. if ( aSig = 0 ) Then
  3614. Begin
  3615. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3616. exit;
  3617. End;
  3618. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3619. End;
  3620. zExp := aExp - bExp + $7D;
  3621. aSig := ( aSig OR $00800000 ) shl 7;
  3622. bSig := ( bSig OR $00800000 ) shl 8;
  3623. if ( bSig <= ( aSig + aSig ) ) then
  3624. Begin
  3625. aSig := aSig shr 1;
  3626. Inc(zExp);
  3627. End;
  3628. zSig := estimateDiv64To32( aSig, 0, bSig );
  3629. if ( ( zSig and $3F ) <= 2 ) then
  3630. Begin
  3631. mul32To64( bSig, zSig, term0, term1 );
  3632. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3633. while ( sbits32 (rem0) < 0 ) do
  3634. Begin
  3635. Dec(zSig);
  3636. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3637. End;
  3638. zSig := zSig or bits32( rem1 <> 0 );
  3639. End;
  3640. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3641. End;
  3642. {*
  3643. -------------------------------------------------------------------------------
  3644. Returns the remainder of the single-precision floating-point value `a'
  3645. with respect to the corresponding value `b'. The operation is performed
  3646. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3647. -------------------------------------------------------------------------------
  3648. *}
  3649. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3650. Var
  3651. aSign, zSign: flag;
  3652. aExp, bExp, expDiff: int16;
  3653. aSig, bSig, q, alternateASig: bits32;
  3654. sigMean: sbits32;
  3655. Begin
  3656. aSig := extractFloat32Frac( a.float32 );
  3657. aExp := extractFloat32Exp( a.float32 );
  3658. aSign := extractFloat32Sign( a.float32 );
  3659. bSig := extractFloat32Frac( b.float32 );
  3660. bExp := extractFloat32Exp( b.float32 );
  3661. if ( aExp = $FF ) then
  3662. Begin
  3663. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3664. Begin
  3665. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3666. exit;
  3667. End;
  3668. float_raise( float_flag_invalid );
  3669. float32_rem.float32 := float32_default_nan;
  3670. exit;
  3671. End;
  3672. if ( bExp = $FF ) then
  3673. Begin
  3674. if ( bSig <> 0 ) then
  3675. Begin
  3676. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3677. exit;
  3678. End;
  3679. float32_rem := a;
  3680. exit;
  3681. End;
  3682. if ( bExp = 0 ) then
  3683. Begin
  3684. if ( bSig = 0 ) then
  3685. Begin
  3686. float_raise( float_flag_invalid );
  3687. float32_rem.float32 := float32_default_nan;
  3688. exit;
  3689. End;
  3690. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3691. End;
  3692. if ( aExp = 0 ) then
  3693. Begin
  3694. if ( aSig = 0 ) then
  3695. Begin
  3696. float32_rem := a;
  3697. exit;
  3698. End;
  3699. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3700. End;
  3701. expDiff := aExp - bExp;
  3702. aSig := ( aSig OR $00800000 ) shl 8;
  3703. bSig := ( bSig OR $00800000 ) shl 8;
  3704. if ( expDiff < 0 ) then
  3705. Begin
  3706. if ( expDiff < -1 ) then
  3707. Begin
  3708. float32_rem := a;
  3709. exit;
  3710. End;
  3711. aSig := aSig shr 1;
  3712. End;
  3713. q := bits32( bSig <= aSig );
  3714. if ( q <> 0) then
  3715. aSig := aSig - bSig;
  3716. expDiff := expDiff - 32;
  3717. while ( 0 < expDiff ) do
  3718. Begin
  3719. q := estimateDiv64To32( aSig, 0, bSig );
  3720. if (2 < q) then
  3721. q := q - 2
  3722. else
  3723. q := 0;
  3724. aSig := - ( ( bSig shr 2 ) * q );
  3725. expDiff := expDiff - 30;
  3726. End;
  3727. expDiff := expDiff + 32;
  3728. if ( 0 < expDiff ) then
  3729. Begin
  3730. q := estimateDiv64To32( aSig, 0, bSig );
  3731. if (2 < q) then
  3732. q := q - 2
  3733. else
  3734. q := 0;
  3735. q := q shr (32 - expDiff);
  3736. bSig := bSig shr 2;
  3737. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3738. End
  3739. else
  3740. Begin
  3741. aSig := aSig shr 2;
  3742. bSig := bSig shr 2;
  3743. End;
  3744. Repeat
  3745. alternateASig := aSig;
  3746. Inc(q);
  3747. aSig := aSig - bSig;
  3748. Until not ( 0 <= sbits32 (aSig) );
  3749. sigMean := aSig + alternateASig;
  3750. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3751. Begin
  3752. aSig := alternateASig;
  3753. End;
  3754. zSign := flag( sbits32 (aSig) < 0 );
  3755. if ( zSign<>0 ) then
  3756. aSig := - aSig;
  3757. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3758. End;
  3759. {*
  3760. -------------------------------------------------------------------------------
  3761. Returns the square root of the single-precision floating-point value `a'.
  3762. The operation is performed according to the IEC/IEEE Standard for Binary
  3763. Floating-Point Arithmetic.
  3764. -------------------------------------------------------------------------------
  3765. *}
  3766. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3767. Var
  3768. aSign : flag;
  3769. aExp, zExp : int16;
  3770. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3771. label roundAndPack;
  3772. Begin
  3773. aSig := extractFloat32Frac( a.float32 );
  3774. aExp := extractFloat32Exp( a.float32 );
  3775. aSign := extractFloat32Sign( a.float32 );
  3776. if ( aExp = $FF ) then
  3777. Begin
  3778. if ( aSig <> 0) then
  3779. Begin
  3780. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3781. exit;
  3782. End;
  3783. if ( aSign = 0) then
  3784. Begin
  3785. float32_sqrt := a;
  3786. exit;
  3787. End;
  3788. float_raise( float_flag_invalid );
  3789. float32_sqrt.float32 := float32_default_nan;
  3790. exit;
  3791. End;
  3792. if ( aSign <> 0) then
  3793. Begin
  3794. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3795. Begin
  3796. float32_sqrt := a;
  3797. exit;
  3798. End;
  3799. float_raise( float_flag_invalid );
  3800. float32_sqrt.float32 := float32_default_nan;
  3801. exit;
  3802. End;
  3803. if ( aExp = 0 ) then
  3804. Begin
  3805. if ( aSig = 0 ) then
  3806. Begin
  3807. float32_sqrt.float32 := 0;
  3808. exit;
  3809. End;
  3810. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3811. End;
  3812. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3813. aSig := ( aSig OR $00800000 ) shl 8;
  3814. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3815. if ( ( zSig and $7F ) <= 5 ) then
  3816. Begin
  3817. if ( zSig < 2 ) then
  3818. Begin
  3819. zSig := $7FFFFFFF;
  3820. goto roundAndPack;
  3821. End
  3822. else
  3823. Begin
  3824. aSig := aSig shr (aExp and 1);
  3825. mul32To64( zSig, zSig, term0, term1 );
  3826. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3827. while ( sbits32 (rem0) < 0 ) do
  3828. Begin
  3829. Dec(zSig);
  3830. shortShift64Left( 0, zSig, 1, term0, term1 );
  3831. term1 := term1 or 1;
  3832. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3833. End;
  3834. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3835. End;
  3836. End;
  3837. shift32RightJamming( zSig, 1, zSig );
  3838. roundAndPack:
  3839. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3840. End;
  3841. {*
  3842. -------------------------------------------------------------------------------
  3843. Returns 1 if the single-precision floating-point value `a' is equal to
  3844. the corresponding value `b', and 0 otherwise. The comparison is performed
  3845. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3846. -------------------------------------------------------------------------------
  3847. *}
  3848. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3849. Begin
  3850. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3851. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3852. ) then
  3853. Begin
  3854. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3855. Begin
  3856. float_raise( float_flag_invalid );
  3857. End;
  3858. float32_eq := 0;
  3859. exit;
  3860. End;
  3861. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3862. End;
  3863. {*
  3864. -------------------------------------------------------------------------------
  3865. Returns 1 if the single-precision floating-point value `a' is less than
  3866. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3867. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3868. Arithmetic.
  3869. -------------------------------------------------------------------------------
  3870. *}
  3871. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3872. var
  3873. aSign, bSign: flag;
  3874. Begin
  3875. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3876. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3877. ) then
  3878. Begin
  3879. float_raise( float_flag_invalid );
  3880. float32_le := 0;
  3881. exit;
  3882. End;
  3883. aSign := extractFloat32Sign( a.float32 );
  3884. bSign := extractFloat32Sign( b.float32 );
  3885. if ( aSign <> bSign ) then
  3886. Begin
  3887. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3888. exit;
  3889. End;
  3890. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3891. End;
  3892. {*
  3893. -------------------------------------------------------------------------------
  3894. Returns 1 if the single-precision floating-point value `a' is less than
  3895. the corresponding value `b', and 0 otherwise. The comparison is performed
  3896. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3897. -------------------------------------------------------------------------------
  3898. *}
  3899. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3900. var
  3901. aSign, bSign: flag;
  3902. Begin
  3903. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3904. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3905. ) then
  3906. Begin
  3907. float_raise( float_flag_invalid );
  3908. float32_lt :=0;
  3909. exit;
  3910. End;
  3911. aSign := extractFloat32Sign( a.float32 );
  3912. bSign := extractFloat32Sign( b.float32 );
  3913. if ( aSign <> bSign ) then
  3914. Begin
  3915. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3916. exit;
  3917. End;
  3918. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3919. End;
  3920. {*
  3921. -------------------------------------------------------------------------------
  3922. Returns 1 if the single-precision floating-point value `a' is equal to
  3923. the corresponding value `b', and 0 otherwise. The invalid exception is
  3924. raised if either operand is a NaN. Otherwise, the comparison is performed
  3925. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3926. -------------------------------------------------------------------------------
  3927. *}
  3928. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3929. Begin
  3930. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3931. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3932. ) then
  3933. Begin
  3934. float_raise( float_flag_invalid );
  3935. float32_eq_signaling := 0;
  3936. exit;
  3937. End;
  3938. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3939. End;
  3940. {*
  3941. -------------------------------------------------------------------------------
  3942. Returns 1 if the single-precision floating-point value `a' is less than or
  3943. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3944. cause an exception. Otherwise, the comparison is performed according to the
  3945. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3946. -------------------------------------------------------------------------------
  3947. *}
  3948. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3949. Var
  3950. aSign, bSign: flag;
  3951. Begin
  3952. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3953. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3954. ) then
  3955. Begin
  3956. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3957. Begin
  3958. float_raise( float_flag_invalid );
  3959. End;
  3960. float32_le_quiet := 0;
  3961. exit;
  3962. End;
  3963. aSign := extractFloat32Sign( a );
  3964. bSign := extractFloat32Sign( b );
  3965. if ( aSign <> bSign ) then
  3966. Begin
  3967. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3968. exit;
  3969. End;
  3970. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3971. End;
  3972. {*
  3973. -------------------------------------------------------------------------------
  3974. Returns 1 if the single-precision floating-point value `a' is less than
  3975. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3976. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3977. Standard for Binary Floating-Point Arithmetic.
  3978. -------------------------------------------------------------------------------
  3979. *}
  3980. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3981. Var
  3982. aSign, bSign: flag;
  3983. Begin
  3984. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3985. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3986. ) then
  3987. Begin
  3988. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3989. Begin
  3990. float_raise( float_flag_invalid );
  3991. End;
  3992. float32_lt_quiet := 0;
  3993. exit;
  3994. End;
  3995. aSign := extractFloat32Sign( a );
  3996. bSign := extractFloat32Sign( b );
  3997. if ( aSign <> bSign ) then
  3998. Begin
  3999. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4000. exit;
  4001. End;
  4002. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4003. End;
  4004. {*
  4005. -------------------------------------------------------------------------------
  4006. Returns the result of converting the double-precision floating-point value
  4007. `a' to the 32-bit two's complement integer format. The conversion is
  4008. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4009. Arithmetic---which means in particular that the conversion is rounded
  4010. according to the current rounding mode. If `a' is a NaN, the largest
  4011. positive integer is returned. Otherwise, if the conversion overflows, the
  4012. largest integer with the same sign as `a' is returned.
  4013. -------------------------------------------------------------------------------
  4014. *}
  4015. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4016. var
  4017. aSign: flag;
  4018. aExp, shiftCount: int16;
  4019. aSig0, aSig1, absZ, aSigExtra: bits32;
  4020. z: int32;
  4021. roundingMode: TFPURoundingMode;
  4022. label invalid;
  4023. Begin
  4024. aSig1 := extractFloat64Frac1( a );
  4025. aSig0 := extractFloat64Frac0( a );
  4026. aExp := extractFloat64Exp( a );
  4027. aSign := extractFloat64Sign( a );
  4028. shiftCount := aExp - $413;
  4029. if ( 0 <= shiftCount ) then
  4030. Begin
  4031. if ( $41E < aExp ) then
  4032. Begin
  4033. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4034. aSign := 0;
  4035. goto invalid;
  4036. End;
  4037. shortShift64Left(
  4038. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4039. if ( $80000000 < absZ ) then
  4040. goto invalid;
  4041. End
  4042. else
  4043. Begin
  4044. aSig1 := flag( aSig1 <> 0 );
  4045. if ( aExp < $3FE ) then
  4046. Begin
  4047. aSigExtra := aExp OR aSig0 OR aSig1;
  4048. absZ := 0;
  4049. End
  4050. else
  4051. Begin
  4052. aSig0 := aSig0 OR $00100000;
  4053. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4054. absZ := aSig0 shr ( - shiftCount );
  4055. End;
  4056. End;
  4057. roundingMode := softfloat_rounding_mode;
  4058. if ( roundingMode = float_round_nearest_even ) then
  4059. Begin
  4060. if ( sbits32(aSigExtra) < 0 ) then
  4061. Begin
  4062. Inc(absZ);
  4063. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4064. absZ := absZ and not 1;
  4065. End;
  4066. if aSign <> 0 then
  4067. z := - absZ
  4068. else
  4069. z := absZ;
  4070. End
  4071. else
  4072. Begin
  4073. aSigExtra := bits32( aSigExtra <> 0 );
  4074. if ( aSign <> 0) then
  4075. Begin
  4076. z := - ( absZ
  4077. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4078. End
  4079. else
  4080. Begin
  4081. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4082. End
  4083. End;
  4084. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4085. Begin
  4086. invalid:
  4087. float_raise( float_flag_invalid );
  4088. if (aSign <> 0 ) then
  4089. float64_to_int32 := sbits32 ($80000000)
  4090. else
  4091. float64_to_int32 := $7FFFFFFF;
  4092. exit;
  4093. End;
  4094. if ( aSigExtra <> 0) then
  4095. set_inexact_flag;
  4096. float64_to_int32 := z;
  4097. End;
  4098. {*
  4099. -------------------------------------------------------------------------------
  4100. Returns the result of converting the double-precision floating-point value
  4101. `a' to the 32-bit two's complement integer format. The conversion is
  4102. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4103. Arithmetic, except that the conversion is always rounded toward zero.
  4104. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4105. the conversion overflows, the largest integer with the same sign as `a' is
  4106. returned.
  4107. -------------------------------------------------------------------------------
  4108. *}
  4109. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4110. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4111. Var
  4112. aSign: flag;
  4113. aExp, shiftCount: int16;
  4114. aSig0, aSig1, absZ, aSigExtra: bits32;
  4115. z: int32;
  4116. label invalid;
  4117. Begin
  4118. aSig1 := extractFloat64Frac1( a );
  4119. aSig0 := extractFloat64Frac0( a );
  4120. aExp := extractFloat64Exp( a );
  4121. aSign := extractFloat64Sign( a );
  4122. shiftCount := aExp - $413;
  4123. if ( 0 <= shiftCount ) then
  4124. Begin
  4125. if ( $41E < aExp ) then
  4126. Begin
  4127. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4128. aSign := 0;
  4129. goto invalid;
  4130. End;
  4131. shortShift64Left(
  4132. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4133. End
  4134. else
  4135. Begin
  4136. if ( aExp < $3FF ) then
  4137. Begin
  4138. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4139. Begin
  4140. set_inexact_flag;
  4141. End;
  4142. float64_to_int32_round_to_zero := 0;
  4143. exit;
  4144. End;
  4145. aSig0 := aSig0 or $00100000;
  4146. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4147. absZ := aSig0 shr ( - shiftCount );
  4148. End;
  4149. if aSign <> 0 then
  4150. z := - absZ
  4151. else
  4152. z := absZ;
  4153. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4154. Begin
  4155. invalid:
  4156. float_raise( float_flag_invalid );
  4157. if (aSign <> 0) then
  4158. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4159. else
  4160. float64_to_int32_round_to_zero := $7FFFFFFF;
  4161. exit;
  4162. End;
  4163. if ( aSigExtra <> 0) then
  4164. set_inexact_flag;
  4165. float64_to_int32_round_to_zero := z;
  4166. End;
  4167. {*----------------------------------------------------------------------------
  4168. | Returns the result of converting the double-precision floating-point value
  4169. | `a' to the 64-bit two's complement integer format. The conversion is
  4170. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4171. | Arithmetic---which means in particular that the conversion is rounded
  4172. | according to the current rounding mode. If `a' is a NaN, the largest
  4173. | positive integer is returned. Otherwise, if the conversion overflows, the
  4174. | largest integer with the same sign as `a' is returned.
  4175. *----------------------------------------------------------------------------*}
  4176. function float64_to_int64( a: float64 ): int64;
  4177. var
  4178. aSign: flag;
  4179. aExp, shiftCount: int16;
  4180. aSig, aSigExtra: bits64;
  4181. begin
  4182. aSig := extractFloat64Frac( a );
  4183. aExp := extractFloat64Exp( a );
  4184. aSign := extractFloat64Sign( a );
  4185. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4186. shiftCount := $433 - aExp;
  4187. if ( shiftCount <= 0 ) then begin
  4188. if ( $43E < aExp ) then begin
  4189. float_raise( float_flag_invalid );
  4190. if ( ( aSign = 0 )
  4191. or ( ( aExp = $7FF )
  4192. and ( aSig <> $0010000000000000 ) )
  4193. ) then begin
  4194. result := $7FFFFFFFFFFFFFFF;
  4195. exit;
  4196. end;
  4197. result := $8000000000000000;
  4198. exit;
  4199. end;
  4200. aSigExtra := 0;
  4201. aSig := aSig shl ( - shiftCount );
  4202. end
  4203. else
  4204. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4205. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4206. end;
  4207. {*----------------------------------------------------------------------------
  4208. | Returns the result of converting the double-precision floating-point value
  4209. | `a' to the 64-bit two's complement integer format. The conversion is
  4210. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4211. | Arithmetic, except that the conversion is always rounded toward zero.
  4212. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4213. | the conversion overflows, the largest integer with the same sign as `a' is
  4214. | returned.
  4215. *----------------------------------------------------------------------------*}
  4216. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4217. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4218. var
  4219. aSign: flag;
  4220. aExp, shiftCount: int16;
  4221. aSig: bits64;
  4222. z: int64;
  4223. begin
  4224. aSig := extractFloat64Frac( a );
  4225. aExp := extractFloat64Exp( a );
  4226. aSign := extractFloat64Sign( a );
  4227. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4228. shiftCount := aExp - $433;
  4229. if ( 0 <= shiftCount ) then begin
  4230. if ( $43E <= aExp ) then begin
  4231. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4232. float_raise( float_flag_invalid );
  4233. if ( ( aSign = 0 )
  4234. or ( ( aExp = $7FF )
  4235. and ( aSig <> $0010000000000000 ) )
  4236. ) then begin
  4237. result := $7FFFFFFFFFFFFFFF;
  4238. exit;
  4239. end;
  4240. end;
  4241. result := $8000000000000000;
  4242. exit;
  4243. end;
  4244. z := aSig shl shiftCount;
  4245. end
  4246. else begin
  4247. if ( aExp < $3FE ) then begin
  4248. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4249. result := 0;
  4250. exit;
  4251. end;
  4252. z := aSig shr ( - shiftCount );
  4253. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4254. set_inexact_flag;
  4255. end;
  4256. if ( aSign <> 0 ) then z := - z;
  4257. result := z;
  4258. end;
  4259. {*
  4260. -------------------------------------------------------------------------------
  4261. Returns the result of converting the double-precision floating-point value
  4262. `a' to the single-precision floating-point format. The conversion is
  4263. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4264. Arithmetic.
  4265. -------------------------------------------------------------------------------
  4266. *}
  4267. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4268. Var
  4269. aSign: flag;
  4270. aExp: int16;
  4271. aSig0, aSig1, zSig: bits32;
  4272. allZero: bits32;
  4273. tmp : CommonNanT;
  4274. Begin
  4275. aSig1 := extractFloat64Frac1( a );
  4276. aSig0 := extractFloat64Frac0( a );
  4277. aExp := extractFloat64Exp( a );
  4278. aSign := extractFloat64Sign( a );
  4279. if ( aExp = $7FF ) then
  4280. Begin
  4281. if ( aSig0 OR aSig1 ) <> 0 then
  4282. Begin
  4283. float64ToCommonNaN( a, tmp );
  4284. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4285. exit;
  4286. End;
  4287. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4288. exit;
  4289. End;
  4290. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4291. if ( aExp <> 0) then
  4292. zSig := zSig OR $40000000;
  4293. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4294. End;
  4295. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4296. {*----------------------------------------------------------------------------
  4297. | Returns the result of converting the double-precision floating-point value
  4298. | `a' to the extended double-precision floating-point format. The conversion
  4299. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4300. | Arithmetic.
  4301. *----------------------------------------------------------------------------*}
  4302. function float64_to_floatx80( a: float64 ): floatx80;
  4303. var
  4304. aSign: flag;
  4305. aExp: int16;
  4306. aSig: bits64;
  4307. begin
  4308. aSig := extractFloat64Frac( a );
  4309. aExp := extractFloat64Exp( a );
  4310. aSign := extractFloat64Sign( a );
  4311. if ( aExp = $7FF ) then begin
  4312. if ( aSig <> 0 ) then begin
  4313. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4314. exit;
  4315. end;
  4316. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4317. exit;
  4318. end;
  4319. if ( aExp = 0 ) then begin
  4320. if ( aSig = 0 ) then begin
  4321. result := packFloatx80( aSign, 0, 0 );
  4322. exit;
  4323. end;
  4324. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4325. end;
  4326. result :=
  4327. packFloatx80(
  4328. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4329. end;
  4330. {$endif FPC_SOFTFLOAT_FLOATX80}
  4331. {*
  4332. -------------------------------------------------------------------------------
  4333. Rounds the double-precision floating-point value `a' to an integer,
  4334. and returns the result as a double-precision floating-point value. The
  4335. operation is performed according to the IEC/IEEE Standard for Binary
  4336. Floating-Point Arithmetic.
  4337. -------------------------------------------------------------------------------
  4338. *}
  4339. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4340. Var
  4341. aSign: flag;
  4342. aExp: int16;
  4343. lastBitMask, roundBitsMask: bits32;
  4344. roundingMode: TFPURoundingMode;
  4345. z: float64;
  4346. Begin
  4347. aExp := extractFloat64Exp( a );
  4348. if ( $413 <= aExp ) then
  4349. Begin
  4350. if ( $433 <= aExp ) then
  4351. Begin
  4352. if ( ( aExp = $7FF )
  4353. AND
  4354. (
  4355. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4356. ) <>0)
  4357. ) then
  4358. Begin
  4359. propagateFloat64NaN( a, a, result );
  4360. exit;
  4361. End;
  4362. result := a;
  4363. exit;
  4364. End;
  4365. lastBitMask := 1;
  4366. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4367. roundBitsMask := lastBitMask - 1;
  4368. z := a;
  4369. roundingMode := softfloat_rounding_mode;
  4370. if ( roundingMode = float_round_nearest_even ) then
  4371. Begin
  4372. if ( lastBitMask <> 0) then
  4373. Begin
  4374. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4375. if ( ( z.low and roundBitsMask ) = 0 ) then
  4376. z.low := z.low and not lastBitMask;
  4377. End
  4378. else
  4379. Begin
  4380. if ( sbits32 (z.low) < 0 ) then
  4381. Begin
  4382. Inc(z.high);
  4383. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4384. z.high := z.high and not 1;
  4385. End;
  4386. End;
  4387. End
  4388. else if ( roundingMode <> float_round_to_zero ) then
  4389. Begin
  4390. if ( extractFloat64Sign( z )
  4391. xor flag( roundingMode = float_round_up ) )<> 0 then
  4392. Begin
  4393. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4394. End;
  4395. End;
  4396. z.low := z.low and not roundBitsMask;
  4397. End
  4398. else
  4399. Begin
  4400. if ( aExp <= $3FE ) then
  4401. Begin
  4402. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4403. Begin
  4404. result := a;
  4405. exit;
  4406. End;
  4407. set_inexact_flag;
  4408. aSign := extractFloat64Sign( a );
  4409. case ( softfloat_rounding_mode ) of
  4410. float_round_nearest_even:
  4411. Begin
  4412. if ( ( aExp = $3FE )
  4413. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4414. ) then
  4415. Begin
  4416. packFloat64( aSign, $3FF, 0, 0, result );
  4417. exit;
  4418. End;
  4419. End;
  4420. float_round_down:
  4421. Begin
  4422. if aSign<>0 then
  4423. packFloat64( 1, $3FF, 0, 0, result )
  4424. else
  4425. packFloat64( 0, 0, 0, 0, result );
  4426. exit;
  4427. End;
  4428. float_round_up:
  4429. Begin
  4430. if aSign <> 0 then
  4431. packFloat64( 1, 0, 0, 0, result )
  4432. else
  4433. packFloat64( 0, $3FF, 0, 0, result );
  4434. exit;
  4435. End;
  4436. end;
  4437. packFloat64( aSign, 0, 0, 0, result );
  4438. exit;
  4439. End;
  4440. lastBitMask := 1;
  4441. lastBitMask := lastBitMask shl ($413 - aExp);
  4442. roundBitsMask := lastBitMask - 1;
  4443. z.low := 0;
  4444. z.high := a.high;
  4445. roundingMode := softfloat_rounding_mode;
  4446. if ( roundingMode = float_round_nearest_even ) then
  4447. Begin
  4448. z.high := z.high + lastBitMask shr 1;
  4449. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4450. Begin
  4451. z.high := z.high and not lastBitMask;
  4452. End;
  4453. End
  4454. else if ( roundingMode <> float_round_to_zero ) then
  4455. Begin
  4456. if ( extractFloat64Sign( z )
  4457. xor flag( roundingMode = float_round_up ) )<> 0 then
  4458. Begin
  4459. z.high := z.high or bits32( a.low <> 0 );
  4460. z.high := z.high + roundBitsMask;
  4461. End;
  4462. End;
  4463. z.high := z.high and not roundBitsMask;
  4464. End;
  4465. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4466. Begin
  4467. set_inexact_flag;
  4468. End;
  4469. result := z;
  4470. End;
  4471. {*
  4472. -------------------------------------------------------------------------------
  4473. Returns the result of adding the absolute values of the double-precision
  4474. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4475. before being returned. `zSign' is ignored if the result is a NaN.
  4476. The addition is performed according to the IEC/IEEE Standard for Binary
  4477. Floating-Point Arithmetic.
  4478. -------------------------------------------------------------------------------
  4479. *}
  4480. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4481. Var
  4482. aExp, bExp, zExp: int16;
  4483. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4484. expDiff: int16;
  4485. label shiftRight1;
  4486. label roundAndPack;
  4487. Begin
  4488. aSig1 := extractFloat64Frac1( a );
  4489. aSig0 := extractFloat64Frac0( a );
  4490. aExp := extractFloat64Exp( a );
  4491. bSig1 := extractFloat64Frac1( b );
  4492. bSig0 := extractFloat64Frac0( b );
  4493. bExp := extractFloat64Exp( b );
  4494. expDiff := aExp - bExp;
  4495. if ( 0 < expDiff ) then
  4496. Begin
  4497. if ( aExp = $7FF ) then
  4498. Begin
  4499. if ( aSig0 OR aSig1 ) <> 0 then
  4500. Begin
  4501. propagateFloat64NaN( a, b, out );
  4502. exit;
  4503. end;
  4504. out := a;
  4505. exit;
  4506. End;
  4507. if ( bExp = 0 ) then
  4508. Begin
  4509. Dec(expDiff);
  4510. End
  4511. else
  4512. Begin
  4513. bSig0 := bSig0 or $00100000;
  4514. End;
  4515. shift64ExtraRightJamming(
  4516. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4517. zExp := aExp;
  4518. End
  4519. else if ( expDiff < 0 ) then
  4520. Begin
  4521. if ( bExp = $7FF ) then
  4522. Begin
  4523. if ( bSig0 OR bSig1 ) <> 0 then
  4524. Begin
  4525. propagateFloat64NaN( a, b, out );
  4526. exit;
  4527. End;
  4528. packFloat64( zSign, $7FF, 0, 0, out );
  4529. exit;
  4530. End;
  4531. if ( aExp = 0 ) then
  4532. Begin
  4533. Inc(expDiff);
  4534. End
  4535. else
  4536. Begin
  4537. aSig0 := aSig0 or $00100000;
  4538. End;
  4539. shift64ExtraRightJamming(
  4540. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4541. zExp := bExp;
  4542. End
  4543. else
  4544. Begin
  4545. if ( aExp = $7FF ) then
  4546. Begin
  4547. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4548. Begin
  4549. propagateFloat64NaN( a, b, out );
  4550. exit;
  4551. End;
  4552. out := a;
  4553. exit;
  4554. End;
  4555. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4556. if ( aExp = 0 ) then
  4557. Begin
  4558. packFloat64( zSign, 0, zSig0, zSig1, out );
  4559. exit;
  4560. End;
  4561. zSig2 := 0;
  4562. zSig0 := zSig0 or $00200000;
  4563. zExp := aExp;
  4564. goto shiftRight1;
  4565. End;
  4566. aSig0 := aSig0 or $00100000;
  4567. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4568. Dec(zExp);
  4569. if ( zSig0 < $00200000 ) then
  4570. goto roundAndPack;
  4571. Inc(zExp);
  4572. shiftRight1:
  4573. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4574. roundAndPack:
  4575. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4576. End;
  4577. {*
  4578. -------------------------------------------------------------------------------
  4579. Returns the result of subtracting the absolute values of the double-
  4580. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4581. difference is negated before being returned. `zSign' is ignored if the
  4582. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4583. Standard for Binary Floating-Point Arithmetic.
  4584. -------------------------------------------------------------------------------
  4585. *}
  4586. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4587. Var
  4588. aExp, bExp, zExp: int16;
  4589. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4590. expDiff: int16;
  4591. z: float64;
  4592. label aExpBigger;
  4593. label bExpBigger;
  4594. label aBigger;
  4595. label bBigger;
  4596. label normalizeRoundAndPack;
  4597. Begin
  4598. aSig1 := extractFloat64Frac1( a );
  4599. aSig0 := extractFloat64Frac0( a );
  4600. aExp := extractFloat64Exp( a );
  4601. bSig1 := extractFloat64Frac1( b );
  4602. bSig0 := extractFloat64Frac0( b );
  4603. bExp := extractFloat64Exp( b );
  4604. expDiff := aExp - bExp;
  4605. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4606. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4607. if ( 0 < expDiff ) then goto aExpBigger;
  4608. if ( expDiff < 0 ) then goto bExpBigger;
  4609. if ( aExp = $7FF ) then
  4610. Begin
  4611. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4612. Begin
  4613. propagateFloat64NaN( a, b, out );
  4614. exit;
  4615. End;
  4616. float_raise( float_flag_invalid );
  4617. z.low := float64_default_nan_low;
  4618. z.high := float64_default_nan_high;
  4619. out := z;
  4620. exit;
  4621. End;
  4622. if ( aExp = 0 ) then
  4623. Begin
  4624. aExp := 1;
  4625. bExp := 1;
  4626. End;
  4627. if ( bSig0 < aSig0 ) then goto aBigger;
  4628. if ( aSig0 < bSig0 ) then goto bBigger;
  4629. if ( bSig1 < aSig1 ) then goto aBigger;
  4630. if ( aSig1 < bSig1 ) then goto bBigger;
  4631. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4632. exit;
  4633. bExpBigger:
  4634. if ( bExp = $7FF ) then
  4635. Begin
  4636. if ( bSig0 OR bSig1 ) <> 0 then
  4637. Begin
  4638. propagateFloat64NaN( a, b, out );
  4639. exit;
  4640. End;
  4641. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4642. exit;
  4643. End;
  4644. if ( aExp = 0 ) then
  4645. Begin
  4646. Inc(expDiff);
  4647. End
  4648. else
  4649. Begin
  4650. aSig0 := aSig0 or $40000000;
  4651. End;
  4652. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4653. bSig0 := bSig0 or $40000000;
  4654. bBigger:
  4655. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4656. zExp := bExp;
  4657. zSign := zSign xor 1;
  4658. goto normalizeRoundAndPack;
  4659. aExpBigger:
  4660. if ( aExp = $7FF ) then
  4661. Begin
  4662. if ( aSig0 OR aSig1 ) <> 0 then
  4663. Begin
  4664. propagateFloat64NaN( a, b, out );
  4665. exit;
  4666. End;
  4667. out := a;
  4668. exit;
  4669. End;
  4670. if ( bExp = 0 ) then
  4671. Begin
  4672. Dec(expDiff);
  4673. End
  4674. else
  4675. Begin
  4676. bSig0 := bSig0 or $40000000;
  4677. End;
  4678. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4679. aSig0 := aSig0 or $40000000;
  4680. aBigger:
  4681. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4682. zExp := aExp;
  4683. normalizeRoundAndPack:
  4684. Dec(zExp);
  4685. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4686. End;
  4687. {*
  4688. -------------------------------------------------------------------------------
  4689. Returns the result of adding the double-precision floating-point values `a'
  4690. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4691. Binary Floating-Point Arithmetic.
  4692. -------------------------------------------------------------------------------
  4693. *}
  4694. Function float64_add( a: float64; b : float64) : Float64;
  4695. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4696. Var
  4697. aSign, bSign: flag;
  4698. Begin
  4699. aSign := extractFloat64Sign( a );
  4700. bSign := extractFloat64Sign( b );
  4701. if ( aSign = bSign ) then
  4702. Begin
  4703. addFloat64Sigs( a, b, aSign, result );
  4704. End
  4705. else
  4706. Begin
  4707. subFloat64Sigs( a, b, aSign, result );
  4708. End;
  4709. End;
  4710. {*
  4711. -------------------------------------------------------------------------------
  4712. Returns the result of subtracting the double-precision floating-point values
  4713. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4714. for Binary Floating-Point Arithmetic.
  4715. -------------------------------------------------------------------------------
  4716. *}
  4717. Function float64_sub(a: float64; b : float64) : Float64;
  4718. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4719. Var
  4720. aSign, bSign: flag;
  4721. Begin
  4722. aSign := extractFloat64Sign( a );
  4723. bSign := extractFloat64Sign( b );
  4724. if ( aSign = bSign ) then
  4725. Begin
  4726. subFloat64Sigs( a, b, aSign, result );
  4727. End
  4728. else
  4729. Begin
  4730. addFloat64Sigs( a, b, aSign, result );
  4731. End;
  4732. End;
  4733. {*
  4734. -------------------------------------------------------------------------------
  4735. Returns the result of multiplying the double-precision floating-point values
  4736. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4737. for Binary Floating-Point Arithmetic.
  4738. -------------------------------------------------------------------------------
  4739. *}
  4740. Function float64_mul( a: float64; b:float64) : Float64;
  4741. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4742. Var
  4743. aSign, bSign, zSign: flag;
  4744. aExp, bExp, zExp: int16;
  4745. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4746. z: float64;
  4747. label invalid;
  4748. Begin
  4749. aSig1 := extractFloat64Frac1( a );
  4750. aSig0 := extractFloat64Frac0( a );
  4751. aExp := extractFloat64Exp( a );
  4752. aSign := extractFloat64Sign( a );
  4753. bSig1 := extractFloat64Frac1( b );
  4754. bSig0 := extractFloat64Frac0( b );
  4755. bExp := extractFloat64Exp( b );
  4756. bSign := extractFloat64Sign( b );
  4757. zSign := aSign xor bSign;
  4758. if ( aExp = $7FF ) then
  4759. Begin
  4760. if ( (( aSig0 OR aSig1 ) <>0)
  4761. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4762. Begin
  4763. propagateFloat64NaN( a, b, result );
  4764. exit;
  4765. End;
  4766. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4767. packFloat64( zSign, $7FF, 0, 0, result );
  4768. exit;
  4769. End;
  4770. if ( bExp = $7FF ) then
  4771. Begin
  4772. if ( bSig0 OR bSig1 )<> 0 then
  4773. Begin
  4774. propagateFloat64NaN( a, b, result );
  4775. exit;
  4776. End;
  4777. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4778. Begin
  4779. invalid:
  4780. float_raise( float_flag_invalid );
  4781. z.low := float64_default_nan_low;
  4782. z.high := float64_default_nan_high;
  4783. result := z;
  4784. exit;
  4785. End;
  4786. packFloat64( zSign, $7FF, 0, 0, result );
  4787. exit;
  4788. End;
  4789. if ( aExp = 0 ) then
  4790. Begin
  4791. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4792. Begin
  4793. packFloat64( zSign, 0, 0, 0, result );
  4794. exit;
  4795. End;
  4796. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4797. End;
  4798. if ( bExp = 0 ) then
  4799. Begin
  4800. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4801. Begin
  4802. packFloat64( zSign, 0, 0, 0, result );
  4803. exit;
  4804. End;
  4805. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4806. End;
  4807. zExp := aExp + bExp - $400;
  4808. aSig0 := aSig0 or $00100000;
  4809. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4810. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4811. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4812. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4813. if ( $00200000 <= zSig0 ) then
  4814. Begin
  4815. shift64ExtraRightJamming(
  4816. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4817. Inc(zExp);
  4818. End;
  4819. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4820. End;
  4821. {*
  4822. -------------------------------------------------------------------------------
  4823. Returns the result of dividing the double-precision floating-point value `a'
  4824. by the corresponding value `b'. The operation is performed according to the
  4825. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4826. -------------------------------------------------------------------------------
  4827. *}
  4828. Function float64_div(a: float64; b : float64) : Float64;
  4829. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4830. Var
  4831. aSign, bSign, zSign: flag;
  4832. aExp, bExp, zExp: int16;
  4833. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4834. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4835. z: float64;
  4836. label invalid;
  4837. Begin
  4838. aSig1 := extractFloat64Frac1( a );
  4839. aSig0 := extractFloat64Frac0( a );
  4840. aExp := extractFloat64Exp( a );
  4841. aSign := extractFloat64Sign( a );
  4842. bSig1 := extractFloat64Frac1( b );
  4843. bSig0 := extractFloat64Frac0( b );
  4844. bExp := extractFloat64Exp( b );
  4845. bSign := extractFloat64Sign( b );
  4846. zSign := aSign xor bSign;
  4847. if ( aExp = $7FF ) then
  4848. Begin
  4849. if ( aSig0 OR aSig1 )<> 0 then
  4850. Begin
  4851. propagateFloat64NaN( a, b, result );
  4852. exit;
  4853. end;
  4854. if ( bExp = $7FF ) then
  4855. Begin
  4856. if ( bSig0 OR bSig1 )<>0 then
  4857. Begin
  4858. propagateFloat64NaN( a, b, result );
  4859. exit;
  4860. End;
  4861. goto invalid;
  4862. End;
  4863. packFloat64( zSign, $7FF, 0, 0, result );
  4864. exit;
  4865. End;
  4866. if ( bExp = $7FF ) then
  4867. Begin
  4868. if ( bSig0 OR bSig1 )<> 0 then
  4869. Begin
  4870. propagateFloat64NaN( a, b, result );
  4871. exit;
  4872. End;
  4873. packFloat64( zSign, 0, 0, 0, result );
  4874. exit;
  4875. End;
  4876. if ( bExp = 0 ) then
  4877. Begin
  4878. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4879. Begin
  4880. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  4881. Begin
  4882. invalid:
  4883. float_raise( float_flag_invalid );
  4884. z.low := float64_default_nan_low;
  4885. z.high := float64_default_nan_high;
  4886. result := z;
  4887. exit;
  4888. End;
  4889. float_raise( float_flag_divbyzero );
  4890. packFloat64( zSign, $7FF, 0, 0, result );
  4891. exit;
  4892. End;
  4893. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4894. End;
  4895. if ( aExp = 0 ) then
  4896. Begin
  4897. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4898. Begin
  4899. packFloat64( zSign, 0, 0, 0, result );
  4900. exit;
  4901. End;
  4902. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4903. End;
  4904. zExp := aExp - bExp + $3FD;
  4905. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4906. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4907. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4908. Begin
  4909. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4910. Inc(zExp);
  4911. End;
  4912. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4913. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4914. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4915. while ( sbits32 (rem0) < 0 ) do
  4916. Begin
  4917. Dec(zSig0);
  4918. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4919. End;
  4920. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4921. if ( ( zSig1 and $3FF ) <= 4 ) then
  4922. Begin
  4923. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4924. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4925. while ( sbits32 (rem1) < 0 ) do
  4926. Begin
  4927. Dec(zSig1);
  4928. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4929. End;
  4930. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4931. End;
  4932. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4933. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4934. End;
  4935. {*
  4936. -------------------------------------------------------------------------------
  4937. Returns the remainder of the double-precision floating-point value `a'
  4938. with respect to the corresponding value `b'. The operation is performed
  4939. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4940. -------------------------------------------------------------------------------
  4941. *}
  4942. Function float64_rem(a: float64; b : float64) : float64;
  4943. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4944. Var
  4945. aSign, zSign: flag;
  4946. aExp, bExp, expDiff: int16;
  4947. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4948. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4949. sigMean0: sbits32;
  4950. z: float64;
  4951. label invalid;
  4952. Begin
  4953. aSig1 := extractFloat64Frac1( a );
  4954. aSig0 := extractFloat64Frac0( a );
  4955. aExp := extractFloat64Exp( a );
  4956. aSign := extractFloat64Sign( a );
  4957. bSig1 := extractFloat64Frac1( b );
  4958. bSig0 := extractFloat64Frac0( b );
  4959. bExp := extractFloat64Exp( b );
  4960. if ( aExp = $7FF ) then
  4961. Begin
  4962. if ((( aSig0 OR aSig1 )<>0)
  4963. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4964. Begin
  4965. propagateFloat64NaN( a, b, result );
  4966. exit;
  4967. End;
  4968. goto invalid;
  4969. End;
  4970. if ( bExp = $7FF ) then
  4971. Begin
  4972. if ( bSig0 OR bSig1 ) <> 0 then
  4973. Begin
  4974. propagateFloat64NaN( a, b, result );
  4975. exit;
  4976. End;
  4977. result := a;
  4978. exit;
  4979. End;
  4980. if ( bExp = 0 ) then
  4981. Begin
  4982. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4983. Begin
  4984. invalid:
  4985. float_raise( float_flag_invalid );
  4986. z.low := float64_default_nan_low;
  4987. z.high := float64_default_nan_high;
  4988. result := z;
  4989. exit;
  4990. End;
  4991. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4992. End;
  4993. if ( aExp = 0 ) then
  4994. Begin
  4995. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4996. Begin
  4997. result := a;
  4998. exit;
  4999. End;
  5000. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5001. End;
  5002. expDiff := aExp - bExp;
  5003. if ( expDiff < -1 ) then
  5004. Begin
  5005. result := a;
  5006. exit;
  5007. End;
  5008. shortShift64Left(
  5009. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5010. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5011. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5012. if ( q )<>0 then
  5013. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5014. expDiff := expDiff - 32;
  5015. while ( 0 < expDiff ) do
  5016. Begin
  5017. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5018. if 4 < q then
  5019. q:= q - 4
  5020. else
  5021. q := 0;
  5022. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5023. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5024. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5025. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5026. expDiff := expDiff - 29;
  5027. End;
  5028. if ( -32 < expDiff ) then
  5029. Begin
  5030. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5031. if 4 < q then
  5032. q := q - 4
  5033. else
  5034. q := 0;
  5035. q := q shr (- expDiff);
  5036. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5037. expDiff := expDiff + 24;
  5038. if ( expDiff < 0 ) then
  5039. Begin
  5040. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5041. End
  5042. else
  5043. Begin
  5044. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5045. End;
  5046. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5047. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5048. End
  5049. else
  5050. Begin
  5051. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5052. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5053. End;
  5054. Repeat
  5055. alternateASig0 := aSig0;
  5056. alternateASig1 := aSig1;
  5057. Inc(q);
  5058. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5059. Until not ( 0 <= sbits32 (aSig0) );
  5060. add64(
  5061. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5062. if ( ( sigMean0 < 0 )
  5063. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5064. Begin
  5065. aSig0 := alternateASig0;
  5066. aSig1 := alternateASig1;
  5067. End;
  5068. zSign := flag( sbits32 (aSig0) < 0 );
  5069. if ( zSign <> 0 ) then
  5070. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5071. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5072. End;
  5073. {*
  5074. -------------------------------------------------------------------------------
  5075. Returns the square root of the double-precision floating-point value `a'.
  5076. The operation is performed according to the IEC/IEEE Standard for Binary
  5077. Floating-Point Arithmetic.
  5078. -------------------------------------------------------------------------------
  5079. *}
  5080. function float64_sqrt( a: float64 ): float64;
  5081. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5082. Var
  5083. aSign: flag;
  5084. aExp, zExp: int16;
  5085. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5086. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5087. label invalid;
  5088. Begin
  5089. aSig1 := extractFloat64Frac1( a );
  5090. aSig0 := extractFloat64Frac0( a );
  5091. aExp := extractFloat64Exp( a );
  5092. aSign := extractFloat64Sign( a );
  5093. if ( aExp = $7FF ) then
  5094. Begin
  5095. if ( aSig0 OR aSig1 ) <> 0 then
  5096. Begin
  5097. propagateFloat64NaN( a, a, result );
  5098. exit;
  5099. End;
  5100. if ( aSign = 0) then
  5101. Begin
  5102. result := a;
  5103. exit;
  5104. End;
  5105. goto invalid;
  5106. End;
  5107. if ( aSign <> 0 ) then
  5108. Begin
  5109. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5110. Begin
  5111. result := a;
  5112. exit;
  5113. End;
  5114. invalid:
  5115. float_raise( float_flag_invalid );
  5116. result.low := float64_default_nan_low;
  5117. result.high := float64_default_nan_high;
  5118. exit;
  5119. End;
  5120. if ( aExp = 0 ) then
  5121. Begin
  5122. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5123. Begin
  5124. packFloat64( 0, 0, 0, 0, result );
  5125. exit;
  5126. End;
  5127. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5128. End;
  5129. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5130. aSig0 := aSig0 or $00100000;
  5131. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5132. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5133. if ( zSig0 = 0 ) then
  5134. zSig0 := $7FFFFFFF;
  5135. doubleZSig0 := zSig0 + zSig0;
  5136. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5137. mul32To64( zSig0, zSig0, term0, term1 );
  5138. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5139. while ( sbits32 (rem0) < 0 ) do
  5140. Begin
  5141. Dec(zSig0);
  5142. doubleZSig0 := doubleZSig0 - 2;
  5143. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5144. End;
  5145. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5146. if ( ( zSig1 and $1FF ) <= 5 ) then
  5147. Begin
  5148. if ( zSig1 = 0 ) then
  5149. zSig1 := 1;
  5150. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5151. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5152. mul32To64( zSig1, zSig1, term2, term3 );
  5153. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5154. while ( sbits32 (rem1) < 0 ) do
  5155. Begin
  5156. Dec(zSig1);
  5157. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5158. term3 := term3 or 1;
  5159. term2 := term2 or doubleZSig0;
  5160. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5161. End;
  5162. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5163. End;
  5164. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5165. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5166. End;
  5167. {*
  5168. -------------------------------------------------------------------------------
  5169. Returns 1 if the double-precision floating-point value `a' is equal to
  5170. the corresponding value `b', and 0 otherwise. The comparison is performed
  5171. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5172. -------------------------------------------------------------------------------
  5173. *}
  5174. Function float64_eq(a: float64; b: float64): flag;
  5175. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5176. Begin
  5177. if
  5178. (
  5179. ( extractFloat64Exp( a ) = $7FF )
  5180. AND
  5181. (
  5182. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5183. )
  5184. )
  5185. OR (
  5186. ( extractFloat64Exp( b ) = $7FF )
  5187. AND (
  5188. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5189. )
  5190. )
  5191. ) then
  5192. Begin
  5193. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5194. float_raise( float_flag_invalid );
  5195. float64_eq := 0;
  5196. exit;
  5197. End;
  5198. float64_eq := flag(
  5199. ( a.low = b.low )
  5200. AND ( ( a.high = b.high )
  5201. OR ( ( a.low = 0 )
  5202. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5203. ));
  5204. End;
  5205. {*
  5206. -------------------------------------------------------------------------------
  5207. Returns 1 if the double-precision floating-point value `a' is less than
  5208. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5209. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5210. Arithmetic.
  5211. -------------------------------------------------------------------------------
  5212. *}
  5213. Function float64_le(a: float64;b: float64): flag;
  5214. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5215. Var
  5216. aSign, bSign: flag;
  5217. Begin
  5218. if
  5219. (
  5220. ( extractFloat64Exp( a ) = $7FF )
  5221. AND
  5222. (
  5223. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5224. )
  5225. )
  5226. OR (
  5227. ( extractFloat64Exp( b ) = $7FF )
  5228. AND (
  5229. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5230. )
  5231. )
  5232. ) then
  5233. Begin
  5234. float_raise( float_flag_invalid );
  5235. float64_le := 0;
  5236. exit;
  5237. End;
  5238. aSign := extractFloat64Sign( a );
  5239. bSign := extractFloat64Sign( b );
  5240. if ( aSign <> bSign ) then
  5241. Begin
  5242. float64_le := flag(
  5243. (aSign <> 0)
  5244. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5245. = 0 ));
  5246. exit;
  5247. End;
  5248. if aSign <> 0 then
  5249. float64_le := le64( b.high, b.low, a.high, a.low )
  5250. else
  5251. float64_le := le64( a.high, a.low, b.high, b.low );
  5252. End;
  5253. {*
  5254. -------------------------------------------------------------------------------
  5255. Returns 1 if the double-precision floating-point value `a' is less than
  5256. the corresponding value `b', and 0 otherwise. The comparison is performed
  5257. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5258. -------------------------------------------------------------------------------
  5259. *}
  5260. Function float64_lt(a: float64;b: float64): flag;
  5261. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5262. Var
  5263. aSign, bSign: flag;
  5264. Begin
  5265. if
  5266. (
  5267. ( extractFloat64Exp( a ) = $7FF )
  5268. AND
  5269. (
  5270. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5271. )
  5272. )
  5273. OR (
  5274. ( extractFloat64Exp( b ) = $7FF )
  5275. AND (
  5276. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5277. )
  5278. )
  5279. ) then
  5280. Begin
  5281. float_raise( float_flag_invalid );
  5282. float64_lt := 0;
  5283. exit;
  5284. End;
  5285. aSign := extractFloat64Sign( a );
  5286. bSign := extractFloat64Sign( b );
  5287. if ( aSign <> bSign ) then
  5288. Begin
  5289. float64_lt := flag(
  5290. (aSign <> 0)
  5291. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5292. <> 0 ));
  5293. exit;
  5294. End;
  5295. if aSign <> 0 then
  5296. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5297. else
  5298. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5299. End;
  5300. {*
  5301. -------------------------------------------------------------------------------
  5302. Returns 1 if the double-precision floating-point value `a' is equal to
  5303. the corresponding value `b', and 0 otherwise. The invalid exception is
  5304. raised if either operand is a NaN. Otherwise, the comparison is performed
  5305. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5306. -------------------------------------------------------------------------------
  5307. *}
  5308. Function float64_eq_signaling( a: float64; b: float64): flag;
  5309. Begin
  5310. if
  5311. (
  5312. ( extractFloat64Exp( a ) = $7FF )
  5313. AND
  5314. (
  5315. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5316. )
  5317. )
  5318. OR (
  5319. ( extractFloat64Exp( b ) = $7FF )
  5320. AND (
  5321. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5322. )
  5323. )
  5324. ) then
  5325. Begin
  5326. float_raise( float_flag_invalid );
  5327. float64_eq_signaling := 0;
  5328. exit;
  5329. End;
  5330. float64_eq_signaling := flag(
  5331. ( a.low = b.low )
  5332. AND ( ( a.high = b.high )
  5333. OR ( ( a.low = 0 )
  5334. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5335. ));
  5336. End;
  5337. {*
  5338. -------------------------------------------------------------------------------
  5339. Returns 1 if the double-precision floating-point value `a' is less than or
  5340. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5341. cause an exception. Otherwise, the comparison is performed according to the
  5342. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5343. -------------------------------------------------------------------------------
  5344. *}
  5345. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5346. Var
  5347. aSign, bSign : flag;
  5348. Begin
  5349. if
  5350. (
  5351. ( extractFloat64Exp( a ) = $7FF )
  5352. AND
  5353. (
  5354. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5355. )
  5356. )
  5357. OR (
  5358. ( extractFloat64Exp( b ) = $7FF )
  5359. AND (
  5360. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5361. )
  5362. )
  5363. ) then
  5364. Begin
  5365. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5366. float_raise( float_flag_invalid );
  5367. float64_le_quiet := 0;
  5368. exit;
  5369. End;
  5370. aSign := extractFloat64Sign( a );
  5371. bSign := extractFloat64Sign( b );
  5372. if ( aSign <> bSign ) then
  5373. Begin
  5374. float64_le_quiet := flag
  5375. ((aSign <> 0)
  5376. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5377. = 0 ));
  5378. exit;
  5379. End;
  5380. if aSign <> 0 then
  5381. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5382. else
  5383. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5384. End;
  5385. {*
  5386. -------------------------------------------------------------------------------
  5387. Returns 1 if the double-precision floating-point value `a' is less than
  5388. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5389. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5390. Standard for Binary Floating-Point Arithmetic.
  5391. -------------------------------------------------------------------------------
  5392. *}
  5393. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5394. Var
  5395. aSign, bSign: flag;
  5396. Begin
  5397. if
  5398. (
  5399. ( extractFloat64Exp( a ) = $7FF )
  5400. AND
  5401. (
  5402. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5403. )
  5404. )
  5405. OR (
  5406. ( extractFloat64Exp( b ) = $7FF )
  5407. AND (
  5408. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5409. )
  5410. )
  5411. ) then
  5412. Begin
  5413. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5414. float_raise( float_flag_invalid );
  5415. float64_lt_quiet := 0;
  5416. exit;
  5417. End;
  5418. aSign := extractFloat64Sign( a );
  5419. bSign := extractFloat64Sign( b );
  5420. if ( aSign <> bSign ) then
  5421. Begin
  5422. float64_lt_quiet := flag(
  5423. (aSign<>0)
  5424. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5425. <> 0 ));
  5426. exit;
  5427. End;
  5428. If aSign <> 0 then
  5429. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5430. else
  5431. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5432. End;
  5433. {*----------------------------------------------------------------------------
  5434. | Returns the result of converting the 64-bit two's complement integer `a'
  5435. | to the single-precision floating-point format. The conversion is performed
  5436. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5437. *----------------------------------------------------------------------------*}
  5438. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5439. var
  5440. zSign : flag;
  5441. absA : uint64;
  5442. shiftCount: int8;
  5443. Begin
  5444. if ( a = 0 ) then
  5445. begin
  5446. int64_to_float32.float32 := 0;
  5447. exit;
  5448. end;
  5449. if a < 0 then
  5450. zSign := flag(TRUE)
  5451. else
  5452. zSign := flag(FALSE);
  5453. if zSign<>0 then
  5454. absA := -a
  5455. else
  5456. absA := a;
  5457. shiftCount := countLeadingZeros64( absA ) - 40;
  5458. if ( 0 <= shiftCount ) then
  5459. begin
  5460. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5461. end
  5462. else
  5463. begin
  5464. shiftCount := shiftCount + 7;
  5465. if ( shiftCount < 0 ) then
  5466. shift64RightJamming( absA, - shiftCount, absA )
  5467. else
  5468. absA := absA shl shiftCount;
  5469. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5470. end;
  5471. End;
  5472. {*----------------------------------------------------------------------------
  5473. | Returns the result of converting the 64-bit two's complement integer `a'
  5474. | to the single-precision floating-point format. The conversion is performed
  5475. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5476. | Unisgned version.
  5477. *----------------------------------------------------------------------------*}
  5478. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5479. var
  5480. absA : uint64;
  5481. shiftCount: int8;
  5482. Begin
  5483. if ( a = 0 ) then
  5484. begin
  5485. qword_to_float32.float32 := 0;
  5486. exit;
  5487. end;
  5488. absA := a;
  5489. shiftCount := countLeadingZeros64( absA ) - 40;
  5490. if ( 0 <= shiftCount ) then
  5491. begin
  5492. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5493. end
  5494. else
  5495. begin
  5496. shiftCount := shiftCount + 7;
  5497. if ( shiftCount < 0 ) then
  5498. shift64RightJamming( absA, - shiftCount, absA )
  5499. else
  5500. absA := absA shl shiftCount;
  5501. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5502. end;
  5503. End;
  5504. {*----------------------------------------------------------------------------
  5505. | Returns the result of converting the 64-bit two's complement integer `a'
  5506. | to the double-precision floating-point format. The conversion is performed
  5507. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5508. *----------------------------------------------------------------------------*}
  5509. function qword_to_float64( a: qword ): float64;
  5510. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5511. var
  5512. shiftcount : int8;
  5513. Begin
  5514. if ( a = 0 ) then
  5515. Begin
  5516. result:=packFloat64( 0, 0, 0);
  5517. exit;
  5518. end;
  5519. shiftCount := countLeadingZeros64( a ) - 11;
  5520. if ( 0 <= shiftCount ) then
  5521. a := a shl shiftcount
  5522. else
  5523. a := a shr (-shiftCount);
  5524. result := packFloat64( 0, $432 - shiftCount, a );
  5525. End;
  5526. {*----------------------------------------------------------------------------
  5527. | Returns the result of converting the 64-bit two's complement integer `a'
  5528. | to the double-precision floating-point format. The conversion is performed
  5529. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5530. *----------------------------------------------------------------------------*}
  5531. function int64_to_float64( a: int64 ): float64;
  5532. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5533. var
  5534. zSign : flag;
  5535. AbsA : bits64;
  5536. shiftcount : int8;
  5537. Begin
  5538. if ( a = 0 ) then
  5539. Begin
  5540. result:=packFloat64( 0, 0, 0);
  5541. exit;
  5542. end;
  5543. zSign := flag( a < 0 );
  5544. if ZSign<>0 then
  5545. AbsA := -a
  5546. else
  5547. AbsA := a;
  5548. shiftCount := countLeadingZeros64( absA ) - 11;
  5549. if ( 0 <= shiftCount ) then
  5550. absA := absA shl shiftcount
  5551. else
  5552. absA := absA shr (-shiftcount);
  5553. result := packFloat64( zSign, $432 - shiftCount, absA );
  5554. End;
  5555. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5556. {*----------------------------------------------------------------------------
  5557. | Returns the result of converting the 64-bit two's complement integer `a'
  5558. | to the extended double-precision floating-point format. The conversion
  5559. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5560. | Arithmetic.
  5561. *----------------------------------------------------------------------------*}
  5562. function int64_to_floatx80( a: int64 ): floatx80;
  5563. var
  5564. zSign: flag;
  5565. absA: uint64;
  5566. shiftCount: int8;
  5567. begin
  5568. if ( a = 0 ) then begin
  5569. result := packFloatx80( 0, 0, 0 );
  5570. exit;
  5571. end;
  5572. zSign := ord( a < 0 );
  5573. if zSign <> 0 then absA := - a else absA := a;
  5574. shiftCount := countLeadingZeros64( absA );
  5575. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5576. end;
  5577. {*----------------------------------------------------------------------------
  5578. | Returns the result of converting the 64-bit two's complement integer `a'
  5579. | to the extended double-precision floating-point format. The conversion
  5580. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5581. | Arithmetic.
  5582. | Unsigned version.
  5583. *----------------------------------------------------------------------------*}
  5584. function qword_to_floatx80( a: qword ): floatx80;
  5585. var
  5586. absA: bits64;
  5587. shiftCount: int8;
  5588. begin
  5589. if ( a = 0 ) then begin
  5590. result := packFloatx80( 0, 0, 0 );
  5591. exit;
  5592. end;
  5593. absA := a;
  5594. shiftCount := countLeadingZeros64( absA );
  5595. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5596. end;
  5597. {$endif FPC_SOFTFLOAT_FLOATX80}
  5598. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5599. {*----------------------------------------------------------------------------
  5600. | Returns the result of converting the 64-bit two's complement integer `a' to
  5601. | the quadruple-precision floating-point format. The conversion is performed
  5602. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5603. *----------------------------------------------------------------------------*}
  5604. function int64_to_float128( a: int64 ): float128;
  5605. var
  5606. zSign: flag;
  5607. absA: uint64;
  5608. shiftCount: int8;
  5609. zExp: int32;
  5610. zSig0, zSig1: bits64;
  5611. begin
  5612. if ( a = 0 ) then begin
  5613. result := packFloat128( 0, 0, 0, 0 );
  5614. exit;
  5615. end;
  5616. zSign := ord( a < 0 );
  5617. if zSign <> 0 then absA := - a else absA := a;
  5618. shiftCount := countLeadingZeros64( absA ) + 49;
  5619. zExp := $406E - shiftCount;
  5620. if ( 64 <= shiftCount ) then begin
  5621. zSig1 := 0;
  5622. zSig0 := absA;
  5623. dec( shiftCount, 64 );
  5624. end
  5625. else begin
  5626. zSig1 := absA;
  5627. zSig0 := 0;
  5628. end;
  5629. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5630. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5631. end;
  5632. {*----------------------------------------------------------------------------
  5633. | Returns the result of converting the 64-bit two's complement integer `a' to
  5634. | the quadruple-precision floating-point format. The conversion is performed
  5635. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5636. | Unsigned version.
  5637. *----------------------------------------------------------------------------*}
  5638. function qword_to_float128( a: qword ): float128;
  5639. var
  5640. absA: bits64;
  5641. shiftCount: int8;
  5642. zExp: int32;
  5643. zSig0, zSig1: bits64;
  5644. begin
  5645. if ( a = 0 ) then begin
  5646. result := packFloat128( 0, 0, 0, 0 );
  5647. exit;
  5648. end;
  5649. absA := a;
  5650. shiftCount := countLeadingZeros64( absA ) + 49;
  5651. zExp := $406E - shiftCount;
  5652. if ( 64 <= shiftCount ) then begin
  5653. zSig1 := 0;
  5654. zSig0 := absA;
  5655. dec( shiftCount, 64 );
  5656. end
  5657. else begin
  5658. zSig1 := absA;
  5659. zSig0 := 0;
  5660. end;
  5661. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5662. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5663. end;
  5664. {$endif FPC_SOFTFLOAT_FLOAT128}
  5665. {*----------------------------------------------------------------------------
  5666. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5667. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5668. | Otherwise, returns 0.
  5669. *----------------------------------------------------------------------------*}
  5670. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5671. begin
  5672. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5673. end;
  5674. {*----------------------------------------------------------------------------
  5675. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5676. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5677. | Otherwise, returns 0.
  5678. *----------------------------------------------------------------------------*}
  5679. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5680. begin
  5681. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5682. end;
  5683. {*----------------------------------------------------------------------------
  5684. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5685. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5686. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5687. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5688. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5689. | the most-significant bit of the extra result, and the other 63 bits of the
  5690. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5691. | were all zero. This extra result is stored in the location pointed to by
  5692. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5693. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5694. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5695. | fixed-point value is shifted right by the number of bits given in `count',
  5696. | and the integer part of the result is returned at the locations pointed to
  5697. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5698. | corrupted as described above, and is returned at the location pointed to by
  5699. | `z2Ptr'.)
  5700. *----------------------------------------------------------------------------*}
  5701. procedure shift128ExtraRightJamming(
  5702. a0: bits64;
  5703. a1: bits64;
  5704. a2: bits64;
  5705. count: int16;
  5706. var z0Ptr: bits64;
  5707. var z1Ptr: bits64;
  5708. var z2Ptr: bits64);
  5709. var
  5710. z0, z1, z2: bits64;
  5711. negCount: int8;
  5712. begin
  5713. negCount := ( - count ) and 63;
  5714. if ( count = 0 ) then
  5715. begin
  5716. z2 := a2;
  5717. z1 := a1;
  5718. z0 := a0;
  5719. end
  5720. else begin
  5721. if ( count < 64 ) then
  5722. begin
  5723. z2 := a1 shl negCount;
  5724. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5725. z0 := a0 shr count;
  5726. end
  5727. else begin
  5728. if ( count = 64 ) then
  5729. begin
  5730. z2 := a1;
  5731. z1 := a0;
  5732. end
  5733. else begin
  5734. a2 := a2 or a1;
  5735. if ( count < 128 ) then
  5736. begin
  5737. z2 := a0 shl negCount;
  5738. z1 := a0 shr ( count and 63 );
  5739. end
  5740. else begin
  5741. if ( count = 128 ) then
  5742. z2 := a0
  5743. else
  5744. z2 := ord( a0 <> 0 );
  5745. z1 := 0;
  5746. end;
  5747. end;
  5748. z0 := 0;
  5749. end;
  5750. z2 := z2 or ord( a2 <> 0 );
  5751. end;
  5752. z2Ptr := z2;
  5753. z1Ptr := z1;
  5754. z0Ptr := z0;
  5755. end;
  5756. {*----------------------------------------------------------------------------
  5757. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5758. | _plus_ the number of bits given in `count'. The shifted result is at most
  5759. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5760. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5761. | shifted off is the most-significant bit of the extra result, and the other
  5762. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5763. | bits shifted off were all zero. This extra result is stored in the location
  5764. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5765. | (This routine makes more sense if `a0' and `a1' are considered to form
  5766. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5767. | point value is shifted right by the number of bits given in `count', and
  5768. | the integer part of the result is returned at the location pointed to by
  5769. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5770. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5771. *----------------------------------------------------------------------------*}
  5772. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5773. var
  5774. z0, z1: bits64;
  5775. negCount: int8;
  5776. begin
  5777. negCount := ( - count ) and 63;
  5778. if ( count = 0 ) then
  5779. begin
  5780. z1 := a1;
  5781. z0 := a0;
  5782. end
  5783. else if ( count < 64 ) then
  5784. begin
  5785. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5786. z0 := a0 shr count;
  5787. end
  5788. else begin
  5789. if ( count = 64 ) then
  5790. begin
  5791. z1 := a0 or ord( a1 <> 0 );
  5792. end
  5793. else begin
  5794. z1 := ord( ( a0 or a1 ) <> 0 );
  5795. end;
  5796. z0 := 0;
  5797. end;
  5798. z1Ptr := z1;
  5799. z0Ptr := z0;
  5800. end;
  5801. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5802. {*----------------------------------------------------------------------------
  5803. | Returns the fraction bits of the extended double-precision floating-point
  5804. | value `a'.
  5805. *----------------------------------------------------------------------------*}
  5806. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5807. begin
  5808. result:=a.low;
  5809. end;
  5810. {*----------------------------------------------------------------------------
  5811. | Returns the exponent bits of the extended double-precision floating-point
  5812. | value `a'.
  5813. *----------------------------------------------------------------------------*}
  5814. function extractFloatx80Exp(a : floatx80): int32;inline;
  5815. begin
  5816. result:=a.high and $7FFF;
  5817. end;
  5818. {*----------------------------------------------------------------------------
  5819. | Returns the sign bit of the extended double-precision floating-point value
  5820. | `a'.
  5821. *----------------------------------------------------------------------------*}
  5822. function extractFloatx80Sign(a : floatx80): flag;inline;
  5823. begin
  5824. result:=a.high shr 15;
  5825. end;
  5826. {*----------------------------------------------------------------------------
  5827. | Normalizes the subnormal extended double-precision floating-point value
  5828. | represented by the denormalized significand `aSig'. The normalized exponent
  5829. | and significand are stored at the locations pointed to by `zExpPtr' and
  5830. | `zSigPtr', respectively.
  5831. *----------------------------------------------------------------------------*}
  5832. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5833. var
  5834. shiftCount: int8;
  5835. begin
  5836. shiftCount := countLeadingZeros64( aSig );
  5837. zSigPtr := aSig shl shiftCount;
  5838. zExpPtr := 1 - shiftCount;
  5839. end;
  5840. {*----------------------------------------------------------------------------
  5841. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5842. | extended double-precision floating-point value, returning the result.
  5843. *----------------------------------------------------------------------------*}
  5844. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5845. var
  5846. z: floatx80;
  5847. begin
  5848. z.low := zSig;
  5849. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5850. result:=z;
  5851. end;
  5852. {*----------------------------------------------------------------------------
  5853. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5854. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5855. | and returns the proper extended double-precision floating-point value
  5856. | corresponding to the abstract input. Ordinarily, the abstract value is
  5857. | rounded and packed into the extended double-precision format, with the
  5858. | inexact exception raised if the abstract input cannot be represented
  5859. | exactly. However, if the abstract value is too large, the overflow and
  5860. | inexact exceptions are raised and an infinity or maximal finite value is
  5861. | returned. If the abstract value is too small, the input value is rounded to
  5862. | a subnormal number, and the underflow and inexact exceptions are raised if
  5863. | the abstract input cannot be represented exactly as a subnormal extended
  5864. | double-precision floating-point number.
  5865. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5866. | number of bits as single or double precision, respectively. Otherwise, the
  5867. | result is rounded to the full precision of the extended double-precision
  5868. | format.
  5869. | The input significand must be normalized or smaller. If the input
  5870. | significand is not normalized, `zExp' must be 0; in that case, the result
  5871. | returned is a subnormal number, and it must not require rounding. The
  5872. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5873. | Floating-Point Arithmetic.
  5874. *----------------------------------------------------------------------------*}
  5875. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5876. var
  5877. roundingMode: int8;
  5878. roundNearestEven, increment, isTiny: flag;
  5879. roundIncrement, roundMask, roundBits: int64;
  5880. label
  5881. precision80, overflow;
  5882. begin
  5883. roundingMode := softfloat_rounding_mode;
  5884. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5885. if ( roundingPrecision = 80 ) then
  5886. goto precision80;
  5887. if ( roundingPrecision = 64 ) then
  5888. begin
  5889. roundIncrement := int64( $0000000000000400 );
  5890. roundMask := int64( $00000000000007FF );
  5891. end
  5892. else if ( roundingPrecision = 32 ) then
  5893. begin
  5894. roundIncrement := int64( $0000008000000000 );
  5895. roundMask := int64( $000000FFFFFFFFFF );
  5896. end
  5897. else begin
  5898. goto precision80;
  5899. end;
  5900. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5901. if ( not (roundNearestEven<>0) ) then
  5902. begin
  5903. if ( roundingMode = float_round_to_zero ) then
  5904. begin
  5905. roundIncrement := 0;
  5906. end
  5907. else begin
  5908. roundIncrement := roundMask;
  5909. if ( zSign<>0 ) then
  5910. begin
  5911. if ( roundingMode = float_round_up ) then
  5912. roundIncrement := 0;
  5913. end
  5914. else begin
  5915. if ( roundingMode = float_round_down ) then
  5916. roundIncrement := 0;
  5917. end;
  5918. end;
  5919. end;
  5920. roundBits := zSig0 and roundMask;
  5921. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  5922. if ( ( $7FFE < zExp )
  5923. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5924. ) then begin
  5925. goto overflow;
  5926. end;
  5927. if ( zExp <= 0 ) then begin
  5928. isTiny := ord (
  5929. ( softfloat_detect_tininess = float_tininess_before_rounding )
  5930. or ( zExp < 0 )
  5931. or ( zSig0 <= zSig0 + roundIncrement ) );
  5932. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5933. zExp := 0;
  5934. roundBits := zSig0 and roundMask;
  5935. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  5936. if ( roundBits <> 0 ) then set_inexact_flag;
  5937. inc( zSig0, roundIncrement );
  5938. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  5939. roundIncrement := roundMask + 1;
  5940. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  5941. roundMask := roundMask or roundIncrement;
  5942. end;
  5943. zSig0 := zSig0 and not roundMask;
  5944. result:=packFloatx80( zSign, zExp, zSig0 );
  5945. exit;
  5946. end;
  5947. end;
  5948. if ( roundBits <> 0 ) then set_inexact_flag;
  5949. inc( zSig0, roundIncrement );
  5950. if ( zSig0 < roundIncrement ) then begin
  5951. inc(zExp);
  5952. zSig0 := bits64( $8000000000000000 );
  5953. end;
  5954. roundIncrement := roundMask + 1;
  5955. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  5956. roundMask := roundMask or roundIncrement;
  5957. end;
  5958. zSig0 := zSig0 and not roundMask;
  5959. if ( zSig0 = 0 ) then zExp := 0;
  5960. result:=packFloatx80( zSign, zExp, zSig0 );
  5961. exit;
  5962. precision80:
  5963. increment := ord ( sbits64( zSig1 ) < 0 );
  5964. if ( roundNearestEven = 0 ) then begin
  5965. if ( roundingMode = float_round_to_zero ) then begin
  5966. increment := 0;
  5967. end
  5968. else begin
  5969. if ( zSign <> 0 ) then begin
  5970. increment := ord ( roundingMode = float_round_down ) and zSig1;
  5971. end
  5972. else begin
  5973. increment := ord ( roundingMode = float_round_up ) and zSig1;
  5974. end;
  5975. end;
  5976. end;
  5977. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  5978. if ( ( $7FFE < zExp )
  5979. or ( ( zExp = $7FFE )
  5980. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  5981. and ( increment <> 0 )
  5982. )
  5983. ) then begin
  5984. roundMask := 0;
  5985. overflow:
  5986. float_raise( [float_flag_overflow,float_flag_inexact] );
  5987. if ( ( roundingMode = float_round_to_zero )
  5988. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  5989. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  5990. ) then begin
  5991. result:=packFloatx80( zSign, $7FFE, not roundMask );
  5992. exit;
  5993. end;
  5994. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  5995. exit;
  5996. end;
  5997. if ( zExp <= 0 ) then begin
  5998. isTiny := ord(
  5999. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6000. or ( zExp < 0 )
  6001. or ( increment = 0 )
  6002. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6003. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6004. zExp := 0;
  6005. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6006. if ( zSig1 <> 0 ) then set_inexact_flag;
  6007. if ( roundNearestEven <> 0 ) then begin
  6008. increment := ord( sbits64( zSig1 ) < 0 );
  6009. end
  6010. else begin
  6011. if ( zSign <> 0 ) then begin
  6012. increment := ord( roundingMode = float_round_down ) and zSig1;
  6013. end
  6014. else begin
  6015. increment := ord( roundingMode = float_round_up ) and zSig1;
  6016. end;
  6017. end;
  6018. if ( increment <> 0 ) then begin
  6019. inc(zSig0);
  6020. zSig0 :=
  6021. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6022. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6023. end;
  6024. result:=packFloatx80( zSign, zExp, zSig0 );
  6025. exit;
  6026. end;
  6027. end;
  6028. if ( zSig1 <> 0 ) then set_inexact_flag;
  6029. if ( increment <> 0 ) then begin
  6030. inc(zSig0);
  6031. if ( zSig0 = 0 ) then begin
  6032. inc(zExp);
  6033. zSig0 := bits64( $8000000000000000 );
  6034. end
  6035. else begin
  6036. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6037. end;
  6038. end
  6039. else begin
  6040. if ( zSig0 = 0 ) then zExp := 0;
  6041. end;
  6042. result:=packFloatx80( zSign, zExp, zSig0 );
  6043. end;
  6044. {*----------------------------------------------------------------------------
  6045. | Takes an abstract floating-point value having sign `zSign', exponent
  6046. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6047. | and returns the proper extended double-precision floating-point value
  6048. | corresponding to the abstract input. This routine is just like
  6049. | `roundAndPackFloatx80' except that the input significand does not have to be
  6050. | normalized.
  6051. *----------------------------------------------------------------------------*}
  6052. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6053. var
  6054. shiftCount: int8;
  6055. begin
  6056. if ( zSig0 = 0 ) then begin
  6057. zSig0 := zSig1;
  6058. zSig1 := 0;
  6059. dec( zExp, 64 );
  6060. end;
  6061. shiftCount := countLeadingZeros64( zSig0 );
  6062. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6063. zExp := zExp - shiftCount;
  6064. result :=
  6065. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6066. end;
  6067. {*----------------------------------------------------------------------------
  6068. | Returns the result of converting the extended double-precision floating-
  6069. | point value `a' to the 32-bit two's complement integer format. The
  6070. | conversion is performed according to the IEC/IEEE Standard for Binary
  6071. | Floating-Point Arithmetic---which means in particular that the conversion
  6072. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6073. | largest positive integer is returned. Otherwise, if the conversion
  6074. | overflows, the largest integer with the same sign as `a' is returned.
  6075. *----------------------------------------------------------------------------*}
  6076. function floatx80_to_int32(a: floatx80): int32;
  6077. var
  6078. aSign: flag;
  6079. aExp, shiftCount: int32;
  6080. aSig: bits64;
  6081. begin
  6082. aSig := extractFloatx80Frac( a );
  6083. aExp := extractFloatx80Exp( a );
  6084. aSign := extractFloatx80Sign( a );
  6085. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6086. shiftCount := $4037 - aExp;
  6087. if ( shiftCount <= 0 ) then shiftCount := 1;
  6088. shift64RightJamming( aSig, shiftCount, aSig );
  6089. result := roundAndPackInt32( aSign, aSig );
  6090. end;
  6091. {*----------------------------------------------------------------------------
  6092. | Returns the result of converting the extended double-precision floating-
  6093. | point value `a' to the 32-bit two's complement integer format. The
  6094. | conversion is performed according to the IEC/IEEE Standard for Binary
  6095. | Floating-Point Arithmetic, except that the conversion is always rounded
  6096. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6097. | Otherwise, if the conversion overflows, the largest integer with the same
  6098. | sign as `a' is returned.
  6099. *----------------------------------------------------------------------------*}
  6100. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6101. var
  6102. aSign: flag;
  6103. aExp, shiftCount: int32;
  6104. aSig, savedASig: bits64;
  6105. z: int32;
  6106. label
  6107. invalid;
  6108. begin
  6109. aSig := extractFloatx80Frac( a );
  6110. aExp := extractFloatx80Exp( a );
  6111. aSign := extractFloatx80Sign( a );
  6112. if ( $401E < aExp ) then begin
  6113. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6114. goto invalid;
  6115. end
  6116. else if ( aExp < $3FFF ) then begin
  6117. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6118. result := 0;
  6119. exit;
  6120. end;
  6121. shiftCount := $403E - aExp;
  6122. savedASig := aSig;
  6123. aSig := aSig shr shiftCount;
  6124. z := aSig;
  6125. if ( aSign <> 0 ) then z := - z;
  6126. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6127. invalid:
  6128. float_raise( float_flag_invalid );
  6129. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6130. exit;
  6131. end;
  6132. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6133. set_inexact_flag;
  6134. end;
  6135. result := z;
  6136. end;
  6137. {*----------------------------------------------------------------------------
  6138. | Returns the result of converting the extended double-precision floating-
  6139. | point value `a' to the 64-bit two's complement integer format. The
  6140. | conversion is performed according to the IEC/IEEE Standard for Binary
  6141. | Floating-Point Arithmetic---which means in particular that the conversion
  6142. | is rounded according to the current rounding mode. If `a' is a NaN,
  6143. | the largest positive integer is returned. Otherwise, if the conversion
  6144. | overflows, the largest integer with the same sign as `a' is returned.
  6145. *----------------------------------------------------------------------------*}
  6146. function floatx80_to_int64(a: floatx80): int64;
  6147. var
  6148. aSign: flag;
  6149. aExp, shiftCount: int32;
  6150. aSig, aSigExtra: bits64;
  6151. begin
  6152. aSig := extractFloatx80Frac( a );
  6153. aExp := extractFloatx80Exp( a );
  6154. aSign := extractFloatx80Sign( a );
  6155. shiftCount := $403E - aExp;
  6156. if ( shiftCount <= 0 ) then begin
  6157. if ( shiftCount <> 0 ) then begin
  6158. float_raise( float_flag_invalid );
  6159. if ( ( aSign = 0 )
  6160. or ( ( aExp = $7FFF )
  6161. and ( aSig <> bits64( $8000000000000000 ) ) )
  6162. ) then begin
  6163. result := $7FFFFFFFFFFFFFFF;
  6164. exit;
  6165. end;
  6166. result := $8000000000000000;
  6167. exit;
  6168. end;
  6169. aSigExtra := 0;
  6170. end
  6171. else begin
  6172. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6173. end;
  6174. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6175. end;
  6176. {*----------------------------------------------------------------------------
  6177. | Returns the result of converting the extended double-precision floating-
  6178. | point value `a' to the 64-bit two's complement integer format. The
  6179. | conversion is performed according to the IEC/IEEE Standard for Binary
  6180. | Floating-Point Arithmetic, except that the conversion is always rounded
  6181. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6182. | Otherwise, if the conversion overflows, the largest integer with the same
  6183. | sign as `a' is returned.
  6184. *----------------------------------------------------------------------------*}
  6185. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6186. var
  6187. aSign: flag;
  6188. aExp, shiftCount: int32;
  6189. aSig: bits64;
  6190. z: int64;
  6191. begin
  6192. aSig := extractFloatx80Frac( a );
  6193. aExp := extractFloatx80Exp( a );
  6194. aSign := extractFloatx80Sign( a );
  6195. shiftCount := aExp - $403E;
  6196. if ( 0 <= shiftCount ) then begin
  6197. aSig := $7FFFFFFFFFFFFFFF;
  6198. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6199. float_raise( float_flag_invalid );
  6200. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6201. result := $7FFFFFFFFFFFFFFF;
  6202. exit;
  6203. end;
  6204. end;
  6205. result := $8000000000000000;
  6206. exit;
  6207. end
  6208. else if ( aExp < $3FFF ) then begin
  6209. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6210. result := 0;
  6211. exit;
  6212. end;
  6213. z := aSig shr ( - shiftCount );
  6214. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6215. set_inexact_flag;
  6216. end;
  6217. if ( aSign <> 0 ) then z := - z;
  6218. result := z;
  6219. end;
  6220. {*----------------------------------------------------------------------------
  6221. | The pattern for a default generated extended double-precision NaN. The
  6222. | `high' and `low' values hold the most- and least-significant bits,
  6223. | respectively.
  6224. *----------------------------------------------------------------------------*}
  6225. const
  6226. floatx80_default_nan_high = $FFFF;
  6227. floatx80_default_nan_low = bits64( $C000000000000000 );
  6228. {*----------------------------------------------------------------------------
  6229. | Returns 1 if the extended double-precision floating-point value `a' is a
  6230. | signaling NaN; otherwise returns 0.
  6231. *----------------------------------------------------------------------------*}
  6232. function floatx80_is_signaling_nan(a : floatx80): flag;
  6233. var
  6234. aLow: bits64;
  6235. begin
  6236. aLow := a.low and not $4000000000000000;
  6237. result := ord(
  6238. ( a.high and $7FFF = $7FFF )
  6239. and ( bits64( aLow shl 1 ) <> 0 )
  6240. and ( a.low = aLow ) );
  6241. end;
  6242. {*----------------------------------------------------------------------------
  6243. | Returns the result of converting the extended double-precision floating-
  6244. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6245. | invalid exception is raised.
  6246. *----------------------------------------------------------------------------*}
  6247. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6248. var
  6249. z: commonNaNT;
  6250. begin
  6251. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6252. z.sign := a.high shr 15;
  6253. z.low := 0;
  6254. z.high := a.low shl 1;
  6255. result := z;
  6256. end;
  6257. {*----------------------------------------------------------------------------
  6258. | Returns 1 if the extended double-precision floating-point value `a' is a
  6259. | NaN; otherwise returns 0.
  6260. *----------------------------------------------------------------------------*}
  6261. function floatx80_is_nan(a : floatx80 ): flag;
  6262. begin
  6263. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6264. end;
  6265. {*----------------------------------------------------------------------------
  6266. | Takes two extended double-precision floating-point values `a' and `b', one
  6267. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6268. | `b' is a signaling NaN, the invalid exception is raised.
  6269. *----------------------------------------------------------------------------*}
  6270. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6271. var
  6272. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6273. label
  6274. returnLargerSignificand;
  6275. begin
  6276. aIsNaN := floatx80_is_nan( a );
  6277. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6278. bIsNaN := floatx80_is_nan( b );
  6279. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6280. a.low := a.low or $C000000000000000;
  6281. b.low := b.low or $C000000000000000;
  6282. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6283. if aIsSignalingNaN <> 0 then begin
  6284. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6285. if bIsNaN <> 0 then result := b else result := a;
  6286. exit;
  6287. end
  6288. else if aIsNaN <>0 then begin
  6289. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6290. result := a;
  6291. exit;
  6292. end;
  6293. returnLargerSignificand:
  6294. if ( a.low < b.low ) then begin
  6295. result := b;
  6296. exit;
  6297. end;
  6298. if ( b.low < a.low ) then begin
  6299. result := a;
  6300. exit;
  6301. end;
  6302. if a.high < b.high then result := a else result := b;
  6303. exit;
  6304. end
  6305. else
  6306. result := b;
  6307. end;
  6308. {*----------------------------------------------------------------------------
  6309. | Returns the result of converting the extended double-precision floating-
  6310. | point value `a' to the single-precision floating-point format. The
  6311. | conversion is performed according to the IEC/IEEE Standard for Binary
  6312. | Floating-Point Arithmetic.
  6313. *----------------------------------------------------------------------------*}
  6314. function floatx80_to_float32(a: floatx80): float32;
  6315. var
  6316. aSign: flag;
  6317. aExp: int32;
  6318. aSig: bits64;
  6319. begin
  6320. aSig := extractFloatx80Frac( a );
  6321. aExp := extractFloatx80Exp( a );
  6322. aSign := extractFloatx80Sign( a );
  6323. if ( aExp = $7FFF ) then begin
  6324. if bits64( aSig shl 1 ) <> 0 then begin
  6325. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6326. exit;
  6327. end;
  6328. result := packFloat32( aSign, $FF, 0 );
  6329. exit;
  6330. end;
  6331. shift64RightJamming( aSig, 33, aSig );
  6332. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6333. result := roundAndPackFloat32( aSign, aExp, aSig );
  6334. end;
  6335. {*----------------------------------------------------------------------------
  6336. | Returns the result of converting the extended double-precision floating-
  6337. | point value `a' to the double-precision floating-point format. The
  6338. | conversion is performed according to the IEC/IEEE Standard for Binary
  6339. | Floating-Point Arithmetic.
  6340. *----------------------------------------------------------------------------*}
  6341. function floatx80_to_float64(a: floatx80): float64;
  6342. var
  6343. aSign: flag;
  6344. aExp: int32;
  6345. aSig, zSig: bits64;
  6346. begin
  6347. aSig := extractFloatx80Frac( a );
  6348. aExp := extractFloatx80Exp( a );
  6349. aSign := extractFloatx80Sign( a );
  6350. if ( aExp = $7FFF ) then begin
  6351. if bits64( aSig shl 1 ) <> 0 then begin
  6352. commonNaNToFloat64( floatx80ToCommonNaN( a ), result );
  6353. exit;
  6354. end;
  6355. result := packFloat64( aSign, $7FF, 0 );
  6356. exit;
  6357. end;
  6358. shift64RightJamming( aSig, 1, zSig );
  6359. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6360. result := roundAndPackFloat64( aSign, aExp, zSig );
  6361. end;
  6362. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6363. {*----------------------------------------------------------------------------
  6364. | Returns the result of converting the extended double-precision floating-
  6365. | point value `a' to the quadruple-precision floating-point format. The
  6366. | conversion is performed according to the IEC/IEEE Standard for Binary
  6367. | Floating-Point Arithmetic.
  6368. *----------------------------------------------------------------------------*}
  6369. function floatx80_to_float128(a: floatx80): float128;
  6370. var
  6371. aSign: flag;
  6372. aExp: int16;
  6373. aSig, zSig0, zSig1: bits64;
  6374. begin
  6375. aSig := extractFloatx80Frac( a );
  6376. aExp := extractFloatx80Exp( a );
  6377. aSign := extractFloatx80Sign( a );
  6378. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6379. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6380. exit;
  6381. end;
  6382. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6383. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6384. end;
  6385. {$endif FPC_SOFTFLOAT_FLOAT128}
  6386. {*----------------------------------------------------------------------------
  6387. | Rounds the extended double-precision floating-point value `a' to an integer,
  6388. | and Returns the result as an extended quadruple-precision floating-point
  6389. | value. The operation is performed according to the IEC/IEEE Standard for
  6390. | Binary Floating-Point Arithmetic.
  6391. *----------------------------------------------------------------------------*}
  6392. function floatx80_round_to_int(a: floatx80): floatx80;
  6393. var
  6394. aSign: flag;
  6395. aExp: int32;
  6396. lastBitMask, roundBitsMask: bits64;
  6397. roundingMode: int8;
  6398. z: floatx80;
  6399. begin
  6400. aExp := extractFloatx80Exp( a );
  6401. if ( $403E <= aExp ) then begin
  6402. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6403. result := propagateFloatx80NaN( a, a );
  6404. exit;
  6405. end;
  6406. result := a;
  6407. exit;
  6408. end;
  6409. if ( aExp < $3FFF ) then begin
  6410. if ( ( aExp = 0 )
  6411. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6412. result := a;
  6413. exit;
  6414. end;
  6415. set_inexact_flag;
  6416. aSign := extractFloatx80Sign( a );
  6417. case softfloat_rounding_mode of
  6418. float_round_nearest_even:
  6419. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6420. ) then begin
  6421. result :=
  6422. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6423. exit;
  6424. end;
  6425. float_round_down: begin
  6426. if aSign <> 0 then
  6427. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6428. else
  6429. result := packFloatx80( 0, 0, 0 );
  6430. exit;
  6431. end;
  6432. float_round_up: begin
  6433. if aSign <> 0 then
  6434. result := packFloatx80( 1, 0, 0 )
  6435. else
  6436. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6437. exit;
  6438. end;
  6439. end;
  6440. result := packFloatx80( aSign, 0, 0 );
  6441. exit;
  6442. end;
  6443. lastBitMask := 1;
  6444. lastBitMask := lastBitMask shl ( $403E - aExp );
  6445. roundBitsMask := lastBitMask - 1;
  6446. z := a;
  6447. roundingMode := softfloat_rounding_mode;
  6448. if ( roundingMode = float_round_nearest_even ) then begin
  6449. inc( z.low, lastBitMask shr 1 );
  6450. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6451. end
  6452. else if ( roundingMode <> float_round_to_zero ) then begin
  6453. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6454. inc( z.low, roundBitsMask );
  6455. end;
  6456. end;
  6457. z.low := z.low and not roundBitsMask;
  6458. if ( z.low = 0 ) then begin
  6459. inc(z.high);
  6460. z.low := bits64( $8000000000000000 );
  6461. end;
  6462. if ( z.low <> a.low ) then set_inexact_flag;
  6463. result := z;
  6464. end;
  6465. {*----------------------------------------------------------------------------
  6466. | Returns the result of adding the absolute values of the extended double-
  6467. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6468. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6469. | The addition is performed according to the IEC/IEEE Standard for Binary
  6470. | Floating-Point Arithmetic.
  6471. *----------------------------------------------------------------------------*}
  6472. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6473. var
  6474. aExp, bExp, zExp: int32;
  6475. aSig, bSig, zSig0, zSig1: bits64;
  6476. expDiff: int32;
  6477. label
  6478. shiftRight1, roundAndPack;
  6479. begin
  6480. aSig := extractFloatx80Frac( a );
  6481. aExp := extractFloatx80Exp( a );
  6482. bSig := extractFloatx80Frac( b );
  6483. bExp := extractFloatx80Exp( b );
  6484. expDiff := aExp - bExp;
  6485. if ( 0 < expDiff ) then begin
  6486. if ( aExp = $7FFF ) then begin
  6487. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6488. result := propagateFloatx80NaN( a, b );
  6489. exit;
  6490. end;
  6491. result := a;
  6492. exit;
  6493. end;
  6494. if ( bExp = 0 ) then dec(expDiff);
  6495. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6496. zExp := aExp;
  6497. end
  6498. else if ( expDiff < 0 ) then begin
  6499. if ( bExp = $7FFF ) then begin
  6500. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6501. result := propagateFloatx80NaN( a, b );
  6502. exit;
  6503. end;
  6504. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6505. exit;
  6506. end;
  6507. if ( aExp = 0 ) then inc(expDiff);
  6508. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6509. zExp := bExp;
  6510. end
  6511. else begin
  6512. if ( aExp = $7FFF ) then begin
  6513. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6514. result := propagateFloatx80NaN( a, b );
  6515. exit;
  6516. end;
  6517. result := a;
  6518. exit;
  6519. end;
  6520. zSig1 := 0;
  6521. zSig0 := aSig + bSig;
  6522. if ( aExp = 0 ) then begin
  6523. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6524. goto roundAndPack;
  6525. end;
  6526. zExp := aExp;
  6527. goto shiftRight1;
  6528. end;
  6529. zSig0 := aSig + bSig;
  6530. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6531. shiftRight1:
  6532. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6533. zSig0 := zSig0 or $8000000000000000;
  6534. inc(zExp);
  6535. roundAndPack:
  6536. result :=
  6537. roundAndPackFloatx80(
  6538. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6539. end;
  6540. {*----------------------------------------------------------------------------
  6541. | Returns the result of subtracting the absolute values of the extended
  6542. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6543. | difference is negated before being returned. `zSign' is ignored if the
  6544. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6545. | Standard for Binary Floating-Point Arithmetic.
  6546. *----------------------------------------------------------------------------*}
  6547. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6548. var
  6549. aExp, bExp, zExp: int32;
  6550. aSig, bSig, zSig0, zSig1: bits64;
  6551. expDiff: int32;
  6552. z: floatx80;
  6553. label
  6554. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6555. begin
  6556. aSig := extractFloatx80Frac( a );
  6557. aExp := extractFloatx80Exp( a );
  6558. bSig := extractFloatx80Frac( b );
  6559. bExp := extractFloatx80Exp( b );
  6560. expDiff := aExp - bExp;
  6561. if ( 0 < expDiff ) then goto aExpBigger;
  6562. if ( expDiff < 0 ) then goto bExpBigger;
  6563. if ( aExp = $7FFF ) then begin
  6564. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6565. result := propagateFloatx80NaN( a, b );
  6566. exit;
  6567. end;
  6568. float_raise( float_flag_invalid );
  6569. z.low := floatx80_default_nan_low;
  6570. z.high := floatx80_default_nan_high;
  6571. result := z;
  6572. exit;
  6573. end;
  6574. if ( aExp = 0 ) then begin
  6575. aExp := 1;
  6576. bExp := 1;
  6577. end;
  6578. zSig1 := 0;
  6579. if ( bSig < aSig ) then goto aBigger;
  6580. if ( aSig < bSig ) then goto bBigger;
  6581. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6582. exit;
  6583. bExpBigger:
  6584. if ( bExp = $7FFF ) then begin
  6585. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6586. result := propagateFloatx80NaN( a, b );
  6587. exit;
  6588. end;
  6589. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6590. exit;
  6591. end;
  6592. if ( aExp = 0 ) then inc(expDiff);
  6593. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6594. bBigger:
  6595. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6596. zExp := bExp;
  6597. zSign := zSign xor 1;
  6598. goto normalizeRoundAndPack;
  6599. aExpBigger:
  6600. if ( aExp = $7FFF ) then begin
  6601. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6602. result := propagateFloatx80NaN( a, b );
  6603. exit;
  6604. end;
  6605. result := a;
  6606. exit;
  6607. end;
  6608. if ( bExp = 0 ) then dec(expDiff);
  6609. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6610. aBigger:
  6611. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6612. zExp := aExp;
  6613. normalizeRoundAndPack:
  6614. result :=
  6615. normalizeRoundAndPackFloatx80(
  6616. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6617. end;
  6618. {*----------------------------------------------------------------------------
  6619. | Returns the result of adding the extended double-precision floating-point
  6620. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6621. | Standard for Binary Floating-Point Arithmetic.
  6622. *----------------------------------------------------------------------------*}
  6623. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6624. var
  6625. aSign, bSign: flag;
  6626. begin
  6627. aSign := extractFloatx80Sign( a );
  6628. bSign := extractFloatx80Sign( b );
  6629. if ( aSign = bSign ) then begin
  6630. result := addFloatx80Sigs( a, b, aSign );
  6631. end
  6632. else begin
  6633. result := subFloatx80Sigs( a, b, aSign );
  6634. end;
  6635. end;
  6636. {*----------------------------------------------------------------------------
  6637. | Returns the result of subtracting the extended double-precision floating-
  6638. | point values `a' and `b'. The operation is performed according to the
  6639. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6640. *----------------------------------------------------------------------------*}
  6641. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6642. var
  6643. aSign, bSign: flag;
  6644. begin
  6645. aSign := extractFloatx80Sign( a );
  6646. bSign := extractFloatx80Sign( b );
  6647. if ( aSign = bSign ) then begin
  6648. result := subFloatx80Sigs( a, b, aSign );
  6649. end
  6650. else begin
  6651. result := addFloatx80Sigs( a, b, aSign );
  6652. end;
  6653. end;
  6654. {*----------------------------------------------------------------------------
  6655. | Returns the result of multiplying the extended double-precision floating-
  6656. | point values `a' and `b'. The operation is performed according to the
  6657. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6658. *----------------------------------------------------------------------------*}
  6659. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6660. var
  6661. aSign, bSign, zSign: flag;
  6662. aExp, bExp, zExp: int32;
  6663. aSig, bSig, zSig0, zSig1: bits64;
  6664. z: floatx80;
  6665. label
  6666. invalid;
  6667. begin
  6668. aSig := extractFloatx80Frac( a );
  6669. aExp := extractFloatx80Exp( a );
  6670. aSign := extractFloatx80Sign( a );
  6671. bSig := extractFloatx80Frac( b );
  6672. bExp := extractFloatx80Exp( b );
  6673. bSign := extractFloatx80Sign( b );
  6674. zSign := aSign xor bSign;
  6675. if ( aExp = $7FFF ) then begin
  6676. if ( bits64( aSig shl 1 ) <> 0 )
  6677. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6678. result := propagateFloatx80NaN( a, b );
  6679. exit;
  6680. end;
  6681. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6682. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6683. exit;
  6684. end;
  6685. if ( bExp = $7FFF ) then begin
  6686. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6687. result := propagateFloatx80NaN( a, b );
  6688. exit;
  6689. end;
  6690. if ( ( aExp or aSig ) = 0 ) then begin
  6691. invalid:
  6692. float_raise( float_flag_invalid );
  6693. z.low := floatx80_default_nan_low;
  6694. z.high := floatx80_default_nan_high;
  6695. result := z;
  6696. exit;
  6697. end;
  6698. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6699. exit;
  6700. end;
  6701. if ( aExp = 0 ) then begin
  6702. if ( aSig = 0 ) then begin
  6703. result := packFloatx80( zSign, 0, 0 );
  6704. exit;
  6705. end;
  6706. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6707. end;
  6708. if ( bExp = 0 ) then begin
  6709. if ( bSig = 0 ) then begin
  6710. result := packFloatx80( zSign, 0, 0 );
  6711. exit;
  6712. end;
  6713. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6714. end;
  6715. zExp := aExp + bExp - $3FFE;
  6716. mul64To128( aSig, bSig, zSig0, zSig1 );
  6717. if 0 < sbits64( zSig0 ) then begin
  6718. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6719. dec(zExp);
  6720. end;
  6721. result :=
  6722. roundAndPackFloatx80(
  6723. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6724. end;
  6725. {*----------------------------------------------------------------------------
  6726. | Returns the result of dividing the extended double-precision floating-point
  6727. | value `a' by the corresponding value `b'. The operation is performed
  6728. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6729. *----------------------------------------------------------------------------*}
  6730. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6731. var
  6732. aSign, bSign, zSign: flag;
  6733. aExp, bExp, zExp: int32;
  6734. aSig, bSig, zSig0, zSig1: bits64;
  6735. rem0, rem1, rem2, term0, term1, term2: bits64;
  6736. z: floatx80;
  6737. label
  6738. invalid;
  6739. begin
  6740. aSig := extractFloatx80Frac( a );
  6741. aExp := extractFloatx80Exp( a );
  6742. aSign := extractFloatx80Sign( a );
  6743. bSig := extractFloatx80Frac( b );
  6744. bExp := extractFloatx80Exp( b );
  6745. bSign := extractFloatx80Sign( b );
  6746. zSign := aSign xor bSign;
  6747. if ( aExp = $7FFF ) then begin
  6748. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6749. result := propagateFloatx80NaN( a, b );
  6750. exit;
  6751. end;
  6752. if ( bExp = $7FFF ) then begin
  6753. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6754. result := propagateFloatx80NaN( a, b );
  6755. exit;
  6756. end;
  6757. goto invalid;
  6758. end;
  6759. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6760. exit;
  6761. end;
  6762. if ( bExp = $7FFF ) then begin
  6763. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6764. result := propagateFloatx80NaN( a, b );
  6765. exit;
  6766. end;
  6767. result := packFloatx80( zSign, 0, 0 );
  6768. exit;
  6769. end;
  6770. if ( bExp = 0 ) then begin
  6771. if ( bSig = 0 ) then begin
  6772. if ( ( aExp or aSig ) = 0 ) then begin
  6773. invalid:
  6774. float_raise( float_flag_invalid );
  6775. z.low := floatx80_default_nan_low;
  6776. z.high := floatx80_default_nan_high;
  6777. result := z;
  6778. exit;
  6779. end;
  6780. float_raise( float_flag_divbyzero );
  6781. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6782. exit;
  6783. end;
  6784. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6785. end;
  6786. if ( aExp = 0 ) then begin
  6787. if ( aSig = 0 ) then begin
  6788. result := packFloatx80( zSign, 0, 0 );
  6789. exit;
  6790. end;
  6791. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6792. end;
  6793. zExp := aExp - bExp + $3FFE;
  6794. rem1 := 0;
  6795. if ( bSig <= aSig ) then begin
  6796. shift128Right( aSig, 0, 1, aSig, rem1 );
  6797. inc(zExp);
  6798. end;
  6799. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6800. mul64To128( bSig, zSig0, term0, term1 );
  6801. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6802. while ( sbits64( rem0 ) < 0 ) do begin
  6803. dec(zSig0);
  6804. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6805. end;
  6806. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6807. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6808. mul64To128( bSig, zSig1, term1, term2 );
  6809. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6810. while ( sbits64( rem1 ) < 0 ) do begin
  6811. dec(zSig1);
  6812. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6813. end;
  6814. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6815. end;
  6816. result :=
  6817. roundAndPackFloatx80(
  6818. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6819. end;
  6820. {*----------------------------------------------------------------------------
  6821. | Returns the remainder of the extended double-precision floating-point value
  6822. | `a' with respect to the corresponding value `b'. The operation is performed
  6823. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6824. *----------------------------------------------------------------------------*}
  6825. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6826. var
  6827. aSign, zSign: flag;
  6828. aExp, bExp, expDiff: int32;
  6829. aSig0, aSig1, bSig: bits64;
  6830. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6831. z: floatx80;
  6832. label
  6833. invalid;
  6834. begin
  6835. aSig0 := extractFloatx80Frac( a );
  6836. aExp := extractFloatx80Exp( a );
  6837. aSign := extractFloatx80Sign( a );
  6838. bSig := extractFloatx80Frac( b );
  6839. bExp := extractFloatx80Exp( b );
  6840. if ( aExp = $7FFF ) then begin
  6841. if ( bits64( aSig0 shl 1 ) <> 0 )
  6842. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6843. result := propagateFloatx80NaN( a, b );
  6844. exit;
  6845. end;
  6846. goto invalid;
  6847. end;
  6848. if ( bExp = $7FFF ) then begin
  6849. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6850. result := propagateFloatx80NaN( a, b );
  6851. exit;
  6852. end;
  6853. result := a;
  6854. exit;
  6855. end;
  6856. if ( bExp = 0 ) then begin
  6857. if ( bSig = 0 ) then begin
  6858. invalid:
  6859. float_raise( float_flag_invalid );
  6860. z.low := floatx80_default_nan_low;
  6861. z.high := floatx80_default_nan_high;
  6862. result := z;
  6863. exit;
  6864. end;
  6865. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6866. end;
  6867. if ( aExp = 0 ) then begin
  6868. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6869. result := a;
  6870. exit;
  6871. end;
  6872. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6873. end;
  6874. bSig := bSig or $8000000000000000;
  6875. zSign := aSign;
  6876. expDiff := aExp - bExp;
  6877. aSig1 := 0;
  6878. if ( expDiff < 0 ) then begin
  6879. if ( expDiff < -1 ) then begin
  6880. result := a;
  6881. exit;
  6882. end;
  6883. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6884. expDiff := 0;
  6885. end;
  6886. q := ord( bSig <= aSig0 );
  6887. if ( q <> 0 ) then dec( aSig0, bSig );
  6888. dec( expDiff, 64 );
  6889. while ( 0 < expDiff ) do begin
  6890. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6891. if ( 2 < q ) then q := q - 2 else q := 0;
  6892. mul64To128( bSig, q, term0, term1 );
  6893. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6894. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6895. dec( expDiff, 62 );
  6896. end;
  6897. inc( expDiff, 64 );
  6898. if ( 0 < expDiff ) then begin
  6899. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6900. if ( 2 < q ) then q:= q - 2 else q := 0;
  6901. q := q shr ( 64 - expDiff );
  6902. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6903. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6904. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6905. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  6906. inc(q);
  6907. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6908. end;
  6909. end
  6910. else begin
  6911. term1 := 0;
  6912. term0 := bSig;
  6913. end;
  6914. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6915. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6916. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6917. and ( q and 1 <> 0 ) )
  6918. then begin
  6919. aSig0 := alternateASig0;
  6920. aSig1 := alternateASig1;
  6921. zSign := ord( zSign = 0 );
  6922. end;
  6923. result :=
  6924. normalizeRoundAndPackFloatx80(
  6925. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6926. end;
  6927. {*----------------------------------------------------------------------------
  6928. | Returns the square root of the extended double-precision floating-point
  6929. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6930. | for Binary Floating-Point Arithmetic.
  6931. *----------------------------------------------------------------------------*}
  6932. function floatx80_sqrt(a: floatx80): floatx80;
  6933. var
  6934. aSign: flag;
  6935. aExp, zExp: int32;
  6936. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6937. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6938. z: floatx80;
  6939. label
  6940. invalid;
  6941. begin
  6942. aSig0 := extractFloatx80Frac( a );
  6943. aExp := extractFloatx80Exp( a );
  6944. aSign := extractFloatx80Sign( a );
  6945. if ( aExp = $7FFF ) then begin
  6946. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  6947. result := propagateFloatx80NaN( a, a );
  6948. exit;
  6949. end;
  6950. if ( aSign = 0 ) then begin
  6951. result := a;
  6952. exit;
  6953. end;
  6954. goto invalid;
  6955. end;
  6956. if ( aSign <> 0 ) then begin
  6957. if ( ( aExp or aSig0 ) = 0 ) then begin
  6958. result := a;
  6959. exit;
  6960. end;
  6961. invalid:
  6962. float_raise( float_flag_invalid );
  6963. z.low := floatx80_default_nan_low;
  6964. z.high := floatx80_default_nan_high;
  6965. result := z;
  6966. exit;
  6967. end;
  6968. if ( aExp = 0 ) then begin
  6969. if ( aSig0 = 0 ) then begin
  6970. result := packFloatx80( 0, 0, 0 );
  6971. exit;
  6972. end;
  6973. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6974. end;
  6975. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  6976. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  6977. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6978. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6979. doubleZSig0 := zSig0 shl 1;
  6980. mul64To128( zSig0, zSig0, term0, term1 );
  6981. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6982. while ( sbits64( rem0 ) < 0 ) do begin
  6983. dec(zSig0);
  6984. dec( doubleZSig0, 2 );
  6985. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  6986. end;
  6987. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6988. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  6989. if ( zSig1 = 0 ) then zSig1 := 1;
  6990. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6991. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6992. mul64To128( zSig1, zSig1, term2, term3 );
  6993. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6994. while ( sbits64( rem1 ) < 0 ) do begin
  6995. dec(zSig1);
  6996. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6997. term3 := term3 or 1;
  6998. term2 := term2 or doubleZSig0;
  6999. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7000. end;
  7001. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7002. end;
  7003. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7004. zSig0 := zSig0 or doubleZSig0;
  7005. result :=
  7006. roundAndPackFloatx80(
  7007. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7008. end;
  7009. {*----------------------------------------------------------------------------
  7010. | Returns 1 if the extended double-precision floating-point value `a' is
  7011. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7012. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7013. | Arithmetic.
  7014. *----------------------------------------------------------------------------*}
  7015. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7016. begin
  7017. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7018. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7019. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7020. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7021. ) then begin
  7022. if ( floatx80_is_signaling_nan( a )
  7023. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7024. float_raise( float_flag_invalid );
  7025. end;
  7026. result := 0;
  7027. exit;
  7028. end;
  7029. result := ord(
  7030. ( a.low = b.low )
  7031. and ( ( a.high = b.high )
  7032. or ( ( a.low = 0 )
  7033. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7034. ) );
  7035. end;
  7036. {*----------------------------------------------------------------------------
  7037. | Returns 1 if the extended double-precision floating-point value `a' is
  7038. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7039. | comparison is performed according to the IEC/IEEE Standard for Binary
  7040. | Floating-Point Arithmetic.
  7041. *----------------------------------------------------------------------------*}
  7042. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7043. var
  7044. aSign, bSign: flag;
  7045. begin
  7046. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7047. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7048. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7049. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7050. then begin
  7051. float_raise( float_flag_invalid );
  7052. result := 0;
  7053. exit;
  7054. end;
  7055. aSign := extractFloatx80Sign( a );
  7056. bSign := extractFloatx80Sign( b );
  7057. if ( aSign <> bSign ) then begin
  7058. result := ord(
  7059. ( aSign <> 0 )
  7060. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7061. exit;
  7062. end;
  7063. if aSign<>0 then
  7064. result := le128( b.high, b.low, a.high, a.low )
  7065. else
  7066. result := le128( a.high, a.low, b.high, b.low );
  7067. end;
  7068. {*----------------------------------------------------------------------------
  7069. | Returns 1 if the extended double-precision floating-point value `a' is
  7070. | less than the corresponding value `b', and 0 otherwise. The comparison
  7071. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7072. | Arithmetic.
  7073. *----------------------------------------------------------------------------*}
  7074. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7075. var
  7076. aSign, bSign: flag;
  7077. begin
  7078. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7079. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7080. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7081. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7082. then begin
  7083. float_raise( float_flag_invalid );
  7084. result := 0;
  7085. exit;
  7086. end;
  7087. aSign := extractFloatx80Sign( a );
  7088. bSign := extractFloatx80Sign( b );
  7089. if ( aSign <> bSign ) then begin
  7090. result := ord(
  7091. ( aSign <> 0 )
  7092. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7093. exit;
  7094. end;
  7095. if aSign <> 0 then
  7096. result := lt128( b.high, b.low, a.high, a.low )
  7097. else
  7098. result := lt128( a.high, a.low, b.high, b.low );
  7099. end;
  7100. {*----------------------------------------------------------------------------
  7101. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7102. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7103. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7104. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7105. *----------------------------------------------------------------------------*}
  7106. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7107. begin
  7108. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7109. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7110. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7111. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7112. then begin
  7113. float_raise( float_flag_invalid );
  7114. result := 0;
  7115. exit;
  7116. end;
  7117. result := ord(
  7118. ( a.low = b.low )
  7119. and ( ( a.high = b.high )
  7120. or ( ( a.low = 0 )
  7121. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7122. ) );
  7123. end;
  7124. {*----------------------------------------------------------------------------
  7125. | Returns 1 if the extended double-precision floating-point value `a' is less
  7126. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7127. | do not cause an exception. Otherwise, the comparison is performed according
  7128. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7129. *----------------------------------------------------------------------------*}
  7130. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7131. var
  7132. aSign, bSign: flag;
  7133. begin
  7134. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7135. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7136. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7137. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7138. then begin
  7139. if ( floatx80_is_signaling_nan( a )
  7140. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7141. float_raise( float_flag_invalid );
  7142. end;
  7143. result := 0;
  7144. exit;
  7145. end;
  7146. aSign := extractFloatx80Sign( a );
  7147. bSign := extractFloatx80Sign( b );
  7148. if ( aSign <> bSign ) then begin
  7149. result := ord(
  7150. ( aSign <> 0 )
  7151. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7152. exit;
  7153. end;
  7154. if aSign <> 0 then
  7155. result := le128( b.high, b.low, a.high, a.low )
  7156. else
  7157. result := le128( a.high, a.low, b.high, b.low );
  7158. end;
  7159. {*----------------------------------------------------------------------------
  7160. | Returns 1 if the extended double-precision floating-point value `a' is less
  7161. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7162. | an exception. Otherwise, the comparison is performed according to the
  7163. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7164. *----------------------------------------------------------------------------*}
  7165. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7166. var
  7167. aSign, bSign: flag;
  7168. begin
  7169. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7170. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7171. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7172. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7173. then begin
  7174. if ( floatx80_is_signaling_nan( a )
  7175. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7176. float_raise( float_flag_invalid );
  7177. end;
  7178. result := 0;
  7179. exit;
  7180. end;
  7181. aSign := extractFloatx80Sign( a );
  7182. bSign := extractFloatx80Sign( b );
  7183. if ( aSign <> bSign ) then begin
  7184. result := ord(
  7185. ( aSign <> 0 )
  7186. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7187. exit;
  7188. end;
  7189. if aSign <> 0 then
  7190. result := lt128( b.high, b.low, a.high, a.low )
  7191. else
  7192. result := lt128( a.high, a.low, b.high, b.low );
  7193. end;
  7194. {$endif FPC_SOFTFLOAT_FLOATX80}
  7195. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7196. {*----------------------------------------------------------------------------
  7197. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7198. | floating-point value `a'.
  7199. *----------------------------------------------------------------------------*}
  7200. function extractFloat128Frac1(a : float128): bits64;
  7201. begin
  7202. result:=a.low;
  7203. end;
  7204. {*----------------------------------------------------------------------------
  7205. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7206. | floating-point value `a'.
  7207. *----------------------------------------------------------------------------*}
  7208. function extractFloat128Frac0(a : float128): bits64;
  7209. begin
  7210. result:=a.high and int64($0000FFFFFFFFFFFF);
  7211. end;
  7212. {*----------------------------------------------------------------------------
  7213. | Returns the exponent bits of the quadruple-precision floating-point value
  7214. | `a'.
  7215. *----------------------------------------------------------------------------*}
  7216. function extractFloat128Exp(a : float128): int32;
  7217. begin
  7218. result:=( a.high shr 48 ) and $7FFF;
  7219. end;
  7220. {*----------------------------------------------------------------------------
  7221. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7222. *----------------------------------------------------------------------------*}
  7223. function extractFloat128Sign(a : float128): flag;
  7224. begin
  7225. result:=a.high shr 63;
  7226. end;
  7227. {*----------------------------------------------------------------------------
  7228. | Normalizes the subnormal quadruple-precision floating-point value
  7229. | represented by the denormalized significand formed by the concatenation of
  7230. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7231. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7232. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7233. | least significant 64 bits of the normalized significand are stored at the
  7234. | location pointed to by `zSig1Ptr'.
  7235. *----------------------------------------------------------------------------*}
  7236. procedure normalizeFloat128Subnormal(
  7237. aSig0: bits64;
  7238. aSig1: bits64;
  7239. var zExpPtr: int32;
  7240. var zSig0Ptr: bits64;
  7241. var zSig1Ptr: bits64);
  7242. var
  7243. shiftCount: int8;
  7244. begin
  7245. if ( aSig0 = 0 ) then
  7246. begin
  7247. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7248. if ( shiftCount < 0 ) then
  7249. begin
  7250. zSig0Ptr := aSig1 shr ( - shiftCount );
  7251. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7252. end
  7253. else begin
  7254. zSig0Ptr := aSig1 shl shiftCount;
  7255. zSig1Ptr := 0;
  7256. end;
  7257. zExpPtr := - shiftCount - 63;
  7258. end
  7259. else begin
  7260. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7261. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7262. zExpPtr := 1 - shiftCount;
  7263. end;
  7264. end;
  7265. {*----------------------------------------------------------------------------
  7266. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7267. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7268. | floating-point value, returning the result. After being shifted into the
  7269. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7270. | added together to form the most significant 32 bits of the result. This
  7271. | means that any integer portion of `zSig0' will be added into the exponent.
  7272. | Since a properly normalized significand will have an integer portion equal
  7273. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7274. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7275. | significand.
  7276. *----------------------------------------------------------------------------*}
  7277. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7278. var
  7279. z: float128;
  7280. begin
  7281. z.low := zSig1;
  7282. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7283. result:=z;
  7284. end;
  7285. {*----------------------------------------------------------------------------
  7286. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7287. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7288. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7289. | corresponding to the abstract input. Ordinarily, the abstract value is
  7290. | simply rounded and packed into the quadruple-precision format, with the
  7291. | inexact exception raised if the abstract input cannot be represented
  7292. | exactly. However, if the abstract value is too large, the overflow and
  7293. | inexact exceptions are raised and an infinity or maximal finite value is
  7294. | returned. If the abstract value is too small, the input value is rounded to
  7295. | a subnormal number, and the underflow and inexact exceptions are raised if
  7296. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7297. | precision floating-point number.
  7298. | The input significand must be normalized or smaller. If the input
  7299. | significand is not normalized, `zExp' must be 0; in that case, the result
  7300. | returned is a subnormal number, and it must not require rounding. In the
  7301. | usual case that the input significand is normalized, `zExp' must be 1 less
  7302. | than the ``true'' floating-point exponent. The handling of underflow and
  7303. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7304. *----------------------------------------------------------------------------*}
  7305. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7306. var
  7307. roundingMode: int8;
  7308. roundNearestEven, increment, isTiny: flag;
  7309. begin
  7310. roundingMode := softfloat_rounding_mode;
  7311. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7312. increment := ord( sbits64(zSig2) < 0 );
  7313. if ( roundNearestEven=0 ) then
  7314. begin
  7315. if ( roundingMode = float_round_to_zero ) then
  7316. begin
  7317. increment := 0;
  7318. end
  7319. else begin
  7320. if ( zSign<>0 ) then
  7321. begin
  7322. increment := ord( roundingMode = float_round_down ) and zSig2;
  7323. end
  7324. else begin
  7325. increment := ord( roundingMode = float_round_up ) and zSig2;
  7326. end;
  7327. end;
  7328. end;
  7329. if ( $7FFD <= bits32(zExp) ) then
  7330. begin
  7331. if ( ord( $7FFD < zExp )
  7332. or ( ord( zExp = $7FFD )
  7333. and eq128(
  7334. int64( $0001FFFFFFFFFFFF ),
  7335. bits64( $FFFFFFFFFFFFFFFF ),
  7336. zSig0,
  7337. zSig1
  7338. )
  7339. and increment
  7340. )
  7341. )<>0 then
  7342. begin
  7343. float_raise( [float_flag_overflow,float_flag_inexact] );
  7344. if ( ord( roundingMode = float_round_to_zero )
  7345. or ( zSign and ord( roundingMode = float_round_up ) )
  7346. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7347. )<>0 then
  7348. begin
  7349. result :=
  7350. packFloat128(
  7351. zSign,
  7352. $7FFE,
  7353. int64( $0000FFFFFFFFFFFF ),
  7354. bits64( $FFFFFFFFFFFFFFFF )
  7355. );
  7356. exit;
  7357. end;
  7358. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7359. exit;
  7360. end;
  7361. if ( zExp < 0 ) then
  7362. begin
  7363. isTiny :=
  7364. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7365. or ( zExp < -1 )
  7366. or not( increment<>0 )
  7367. or boolean(lt128(
  7368. zSig0,
  7369. zSig1,
  7370. int64( $0001FFFFFFFFFFFF ),
  7371. bits64( $FFFFFFFFFFFFFFFF )
  7372. )));
  7373. shift128ExtraRightJamming(
  7374. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7375. zExp := 0;
  7376. if ( isTiny and zSig2 )<>0 then
  7377. float_raise( float_flag_underflow );
  7378. if ( roundNearestEven<>0 ) then
  7379. begin
  7380. increment := ord( sbits64(zSig2) < 0 );
  7381. end
  7382. else begin
  7383. if ( zSign<>0 ) then
  7384. begin
  7385. increment := ord( roundingMode = float_round_down ) and zSig2;
  7386. end
  7387. else begin
  7388. increment := ord( roundingMode = float_round_up ) and zSig2;
  7389. end;
  7390. end;
  7391. end;
  7392. end;
  7393. if ( zSig2<>0 ) then
  7394. set_inexact_flag;
  7395. if ( increment<>0 ) then
  7396. begin
  7397. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7398. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7399. end
  7400. else begin
  7401. if ( ( zSig0 or zSig1 ) = 0 ) then
  7402. zExp := 0;
  7403. end;
  7404. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7405. end;
  7406. {*----------------------------------------------------------------------------
  7407. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7408. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7409. | returns the proper quadruple-precision floating-point value corresponding
  7410. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7411. | except that the input significand has fewer bits and does not have to be
  7412. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7413. | point exponent.
  7414. *----------------------------------------------------------------------------*}
  7415. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7416. var
  7417. shiftCount: int8;
  7418. zSig2: bits64;
  7419. begin
  7420. if ( zSig0 = 0 ) then
  7421. begin
  7422. zSig0 := zSig1;
  7423. zSig1 := 0;
  7424. dec(zExp, 64);
  7425. end;
  7426. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7427. if ( 0 <= shiftCount ) then
  7428. begin
  7429. zSig2 := 0;
  7430. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7431. end
  7432. else begin
  7433. shift128ExtraRightJamming(
  7434. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7435. end;
  7436. dec(zExp, shiftCount);
  7437. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7438. end;
  7439. {*----------------------------------------------------------------------------
  7440. | Returns the result of converting the quadruple-precision floating-point
  7441. | value `a' to the 32-bit two's complement integer format. The conversion
  7442. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7443. | Arithmetic---which means in particular that the conversion is rounded
  7444. | according to the current rounding mode. If `a' is a NaN, the largest
  7445. | positive integer is returned. Otherwise, if the conversion overflows, the
  7446. | largest integer with the same sign as `a' is returned.
  7447. *----------------------------------------------------------------------------*}
  7448. function float128_to_int32(a: float128): int32;
  7449. var
  7450. aSign: flag;
  7451. aExp, shiftCount: int32;
  7452. aSig0, aSig1: bits64;
  7453. begin
  7454. aSig1 := extractFloat128Frac1( a );
  7455. aSig0 := extractFloat128Frac0( a );
  7456. aExp := extractFloat128Exp( a );
  7457. aSign := extractFloat128Sign( a );
  7458. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7459. aSign := 0;
  7460. if ( aExp<>0 ) then
  7461. aSig0 := aSig0 or int64( $0001000000000000 );
  7462. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7463. shiftCount := $4028 - aExp;
  7464. if ( 0 < shiftCount ) then
  7465. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7466. result := roundAndPackInt32( aSign, aSig0 );
  7467. end;
  7468. {*----------------------------------------------------------------------------
  7469. | Returns the result of converting the quadruple-precision floating-point
  7470. | value `a' to the 32-bit two's complement integer format. The conversion
  7471. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7472. | Arithmetic, except that the conversion is always rounded toward zero. If
  7473. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7474. | conversion overflows, the largest integer with the same sign as `a' is
  7475. | returned.
  7476. *----------------------------------------------------------------------------*}
  7477. function float128_to_int32_round_to_zero(a: float128): int32;
  7478. var
  7479. aSign: flag;
  7480. aExp, shiftCount: int32;
  7481. aSig0, aSig1, savedASig: bits64;
  7482. z: int32;
  7483. label
  7484. invalid;
  7485. begin
  7486. aSig1 := extractFloat128Frac1( a );
  7487. aSig0 := extractFloat128Frac0( a );
  7488. aExp := extractFloat128Exp( a );
  7489. aSign := extractFloat128Sign( a );
  7490. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7491. if ( $401E < aExp ) then
  7492. begin
  7493. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7494. aSign := 0;
  7495. goto invalid;
  7496. end
  7497. else if ( aExp < $3FFF ) then
  7498. begin
  7499. if ( aExp or aSig0 )<>0 then
  7500. set_inexact_flag;
  7501. result := 0;
  7502. exit;
  7503. end;
  7504. aSig0 := aSig0 or int64( $0001000000000000 );
  7505. shiftCount := $402F - aExp;
  7506. savedASig := aSig0;
  7507. aSig0 := aSig0 shr shiftCount;
  7508. z := aSig0;
  7509. if ( aSign )<>0 then
  7510. z := - z;
  7511. if ( ord( z < 0 ) xor aSign )<>0 then
  7512. begin
  7513. invalid:
  7514. float_raise( float_flag_invalid );
  7515. if aSign<>0 then
  7516. result:= int32( $80000000 )
  7517. else
  7518. result:=$7FFFFFFF;
  7519. exit;
  7520. end;
  7521. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7522. begin
  7523. set_inexact_flag;
  7524. end;
  7525. result := z;
  7526. end;
  7527. {*----------------------------------------------------------------------------
  7528. | Returns the result of converting the quadruple-precision floating-point
  7529. | value `a' to the 64-bit two's complement integer format. The conversion
  7530. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7531. | Arithmetic---which means in particular that the conversion is rounded
  7532. | according to the current rounding mode. If `a' is a NaN, the largest
  7533. | positive integer is returned. Otherwise, if the conversion overflows, the
  7534. | largest integer with the same sign as `a' is returned.
  7535. *----------------------------------------------------------------------------*}
  7536. function float128_to_int64(a: float128): int64;
  7537. var
  7538. aSign: flag;
  7539. aExp, shiftCount: int32;
  7540. aSig0, aSig1: bits64;
  7541. begin
  7542. aSig1 := extractFloat128Frac1( a );
  7543. aSig0 := extractFloat128Frac0( a );
  7544. aExp := extractFloat128Exp( a );
  7545. aSign := extractFloat128Sign( a );
  7546. if ( aExp<>0 ) then
  7547. aSig0 := aSig0 or int64( $0001000000000000 );
  7548. shiftCount := $402F - aExp;
  7549. if ( shiftCount <= 0 ) then
  7550. begin
  7551. if ( $403E < aExp ) then
  7552. begin
  7553. float_raise( float_flag_invalid );
  7554. if ( (aSign=0)
  7555. or ( ( aExp = $7FFF )
  7556. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7557. )
  7558. ) then
  7559. begin
  7560. result := int64( $7FFFFFFFFFFFFFFF );
  7561. exit;
  7562. end;
  7563. result := int64( $8000000000000000 );
  7564. exit;
  7565. end;
  7566. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7567. end
  7568. else begin
  7569. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7570. end;
  7571. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7572. end;
  7573. {*----------------------------------------------------------------------------
  7574. | Returns the result of converting the quadruple-precision floating-point
  7575. | value `a' to the 64-bit two's complement integer format. The conversion
  7576. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7577. | Arithmetic, except that the conversion is always rounded toward zero.
  7578. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7579. | the conversion overflows, the largest integer with the same sign as `a' is
  7580. | returned.
  7581. *----------------------------------------------------------------------------*}
  7582. function float128_to_int64_round_to_zero(a: float128): int64;
  7583. var
  7584. aSign: flag;
  7585. aExp, shiftCount: int32;
  7586. aSig0, aSig1: bits64;
  7587. z: int64;
  7588. begin
  7589. aSig1 := extractFloat128Frac1( a );
  7590. aSig0 := extractFloat128Frac0( a );
  7591. aExp := extractFloat128Exp( a );
  7592. aSign := extractFloat128Sign( a );
  7593. if ( aExp<>0 ) then
  7594. aSig0 := aSig0 or int64( $0001000000000000 );
  7595. shiftCount := aExp - $402F;
  7596. if ( 0 < shiftCount ) then
  7597. begin
  7598. if ( $403E <= aExp ) then
  7599. begin
  7600. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7601. if ( ( a.high = bits64( $C03E000000000000 ) )
  7602. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7603. begin
  7604. if ( aSig1<>0 ) then
  7605. set_inexact_flag;
  7606. end
  7607. else begin
  7608. float_raise( float_flag_invalid );
  7609. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7610. begin
  7611. result := int64( $7FFFFFFFFFFFFFFF );
  7612. exit;
  7613. end;
  7614. end;
  7615. result := int64( $8000000000000000 );
  7616. exit;
  7617. end;
  7618. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7619. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7620. begin
  7621. set_inexact_flag;
  7622. end;
  7623. end
  7624. else begin
  7625. if ( aExp < $3FFF ) then
  7626. begin
  7627. if ( aExp or aSig0 or aSig1 )<>0 then
  7628. begin
  7629. set_inexact_flag;
  7630. end;
  7631. result := 0;
  7632. exit;
  7633. end;
  7634. z := aSig0 shr ( - shiftCount );
  7635. if ( (aSig1<>0)
  7636. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7637. begin
  7638. set_inexact_flag;
  7639. end;
  7640. end;
  7641. if ( aSign<>0 ) then
  7642. z := - z;
  7643. result := z;
  7644. end;
  7645. {*----------------------------------------------------------------------------
  7646. | Returns the result of converting the quadruple-precision floating-point
  7647. | value `a' to the single-precision floating-point format. The conversion
  7648. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7649. | Arithmetic.
  7650. *----------------------------------------------------------------------------*}
  7651. function float128_to_float32(a: float128): float32;
  7652. var
  7653. aSign: flag;
  7654. aExp: int32;
  7655. aSig0, aSig1: bits64;
  7656. zSig: bits32;
  7657. begin
  7658. aSig1 := extractFloat128Frac1( a );
  7659. aSig0 := extractFloat128Frac0( a );
  7660. aExp := extractFloat128Exp( a );
  7661. aSign := extractFloat128Sign( a );
  7662. if ( aExp = $7FFF ) then
  7663. begin
  7664. if ( aSig0 or aSig1 )<>0 then
  7665. begin
  7666. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7667. exit;
  7668. end;
  7669. result := packFloat32( aSign, $FF, 0 );
  7670. exit;
  7671. end;
  7672. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7673. shift64RightJamming( aSig0, 18, aSig0 );
  7674. zSig := aSig0;
  7675. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7676. begin
  7677. zSig := zSig or $40000000;
  7678. dec(aExp,$3F81);
  7679. end;
  7680. result := roundAndPackFloat32( aSign, aExp, zSig );
  7681. end;
  7682. {*----------------------------------------------------------------------------
  7683. | Returns the result of converting the quadruple-precision floating-point
  7684. | value `a' to the double-precision floating-point format. The conversion
  7685. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7686. | Arithmetic.
  7687. *----------------------------------------------------------------------------*}
  7688. function float128_to_float64(a: float128): float64;
  7689. var
  7690. aSign: flag;
  7691. aExp: int32;
  7692. aSig0, aSig1: bits64;
  7693. begin
  7694. aSig1 := extractFloat128Frac1( a );
  7695. aSig0 := extractFloat128Frac0( a );
  7696. aExp := extractFloat128Exp( a );
  7697. aSign := extractFloat128Sign( a );
  7698. if ( aExp = $7FFF ) then
  7699. begin
  7700. if ( aSig0 or aSig1 )<>0 then
  7701. begin
  7702. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7703. exit;
  7704. end;
  7705. result:=packFloat64( aSign, $7FF, 0);
  7706. exit;
  7707. end;
  7708. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7709. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7710. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7711. begin
  7712. aSig0 := aSig0 or int64( $4000000000000000 );
  7713. dec(aExp,$3C01);
  7714. end;
  7715. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7716. end;
  7717. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7718. {*----------------------------------------------------------------------------
  7719. | Returns the result of converting the quadruple-precision floating-point
  7720. | value `a' to the extended double-precision floating-point format. The
  7721. | conversion is performed according to the IEC/IEEE Standard for Binary
  7722. | Floating-Point Arithmetic.
  7723. *----------------------------------------------------------------------------*}
  7724. function float128_to_floatx80(a: float128): floatx80;
  7725. var
  7726. aSign: flag;
  7727. aExp: int32;
  7728. aSig0, aSig1: bits64;
  7729. begin
  7730. aSig1 := extractFloat128Frac1( a );
  7731. aSig0 := extractFloat128Frac0( a );
  7732. aExp := extractFloat128Exp( a );
  7733. aSign := extractFloat128Sign( a );
  7734. if ( aExp = $7FFF ) then begin
  7735. if ( aSig0 or aSig1 <> 0 ) then begin
  7736. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7737. exit;
  7738. end;
  7739. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7740. exit;
  7741. end;
  7742. if ( aExp = 0 ) then begin
  7743. if ( ( aSig0 or aSig1 ) = 0 ) then
  7744. begin
  7745. result := packFloatx80( aSign, 0, 0 );
  7746. exit;
  7747. end;
  7748. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7749. end
  7750. else begin
  7751. aSig0 := aSig0 or int64( $0001000000000000 );
  7752. end;
  7753. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7754. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7755. end;
  7756. {$endif FPC_SOFTFLOAT_FLOATX80}
  7757. {*----------------------------------------------------------------------------
  7758. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7759. | Returns the result as a quadruple-precision floating-point value. The
  7760. | operation is performed according to the IEC/IEEE Standard for Binary
  7761. | Floating-Point Arithmetic.
  7762. *----------------------------------------------------------------------------*}
  7763. function float128_round_to_int(a: float128): float128;
  7764. var
  7765. aSign: flag;
  7766. aExp: int32;
  7767. lastBitMask, roundBitsMask: bits64;
  7768. roundingMode: int8;
  7769. z: float128;
  7770. begin
  7771. aExp := extractFloat128Exp( a );
  7772. if ( $402F <= aExp ) then
  7773. begin
  7774. if ( $406F <= aExp ) then
  7775. begin
  7776. if ( ( aExp = $7FFF )
  7777. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7778. ) then
  7779. begin
  7780. result := propagateFloat128NaN( a, a );
  7781. exit;
  7782. end;
  7783. result := a;
  7784. exit;
  7785. end;
  7786. lastBitMask := 1;
  7787. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7788. roundBitsMask := lastBitMask - 1;
  7789. z := a;
  7790. roundingMode := softfloat_rounding_mode;
  7791. if ( roundingMode = float_round_nearest_even ) then
  7792. begin
  7793. if ( lastBitMask )<>0 then
  7794. begin
  7795. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7796. if ( ( z.low and roundBitsMask ) = 0 ) then
  7797. z.low := z.low and not(lastBitMask);
  7798. end
  7799. else begin
  7800. if ( sbits64(z.low) < 0 ) then
  7801. begin
  7802. inc(z.high);
  7803. if ( bits64( z.low shl 1 ) = 0 ) then
  7804. z.high := z.high and not bits64( 1 );
  7805. end;
  7806. end;
  7807. end
  7808. else if ( roundingMode <> float_round_to_zero ) then
  7809. begin
  7810. if ( extractFloat128Sign( z )
  7811. xor ord( roundingMode = float_round_up ) )<>0 then
  7812. begin
  7813. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7814. end;
  7815. end;
  7816. z.low := z.low and not(roundBitsMask);
  7817. end
  7818. else begin
  7819. if ( aExp < $3FFF ) then
  7820. begin
  7821. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7822. begin
  7823. result := a;
  7824. exit;
  7825. end;
  7826. set_inexact_flag;
  7827. aSign := extractFloat128Sign( a );
  7828. case softfloat_rounding_mode of
  7829. float_round_nearest_even:
  7830. if ( ( aExp = $3FFE )
  7831. and ( (extractFloat128Frac0( a )<>0)
  7832. or (extractFloat128Frac1( a )<>0) )
  7833. ) then begin
  7834. begin
  7835. result := packFloat128( aSign, $3FFF, 0, 0 );
  7836. exit;
  7837. end;
  7838. end;
  7839. float_round_down:
  7840. begin
  7841. if aSign<>0 then
  7842. result:=packFloat128( 1, $3FFF, 0, 0 )
  7843. else
  7844. result:=packFloat128( 0, 0, 0, 0 );
  7845. exit;
  7846. end;
  7847. float_round_up:
  7848. begin
  7849. if aSign<>0 then
  7850. result := packFloat128( 1, 0, 0, 0 )
  7851. else
  7852. result:=packFloat128( 0, $3FFF, 0, 0 );
  7853. exit;
  7854. end;
  7855. end;
  7856. result := packFloat128( aSign, 0, 0, 0 );
  7857. exit;
  7858. end;
  7859. lastBitMask := 1;
  7860. lastBitMask := lastBitMask shl ($402F - aExp);
  7861. roundBitsMask := lastBitMask - 1;
  7862. z.low := 0;
  7863. z.high := a.high;
  7864. roundingMode := softfloat_rounding_mode;
  7865. if ( roundingMode = float_round_nearest_even ) then begin
  7866. inc(z.high,lastBitMask shr 1);
  7867. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7868. z.high := z.high and not(lastBitMask);
  7869. end;
  7870. end
  7871. else if ( roundingMode <> float_round_to_zero ) then begin
  7872. if ( (extractFloat128Sign( z )<>0)
  7873. xor ( roundingMode = float_round_up ) ) then begin
  7874. z.high := z.high or ord( a.low <> 0 );
  7875. z.high := z.high+roundBitsMask;
  7876. end;
  7877. end;
  7878. z.high := z.high and not(roundBitsMask);
  7879. end;
  7880. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7881. set_inexact_flag;
  7882. end;
  7883. result := z;
  7884. end;
  7885. {*----------------------------------------------------------------------------
  7886. | Returns the result of adding the absolute values of the quadruple-precision
  7887. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7888. | before being returned. `zSign' is ignored if the result is a NaN.
  7889. | The addition is performed according to the IEC/IEEE Standard for Binary
  7890. | Floating-Point Arithmetic.
  7891. *----------------------------------------------------------------------------*}
  7892. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7893. var
  7894. aExp, bExp, zExp: int32;
  7895. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7896. expDiff: int32;
  7897. label
  7898. shiftRight1,roundAndPack;
  7899. begin
  7900. aSig1 := extractFloat128Frac1( a );
  7901. aSig0 := extractFloat128Frac0( a );
  7902. aExp := extractFloat128Exp( a );
  7903. bSig1 := extractFloat128Frac1( b );
  7904. bSig0 := extractFloat128Frac0( b );
  7905. bExp := extractFloat128Exp( b );
  7906. expDiff := aExp - bExp;
  7907. if ( 0 < expDiff ) then begin
  7908. if ( aExp = $7FFF ) then begin
  7909. if ( aSig0 or aSig1 )<>0 then
  7910. begin
  7911. result := propagateFloat128NaN( a, b );
  7912. exit;
  7913. end;
  7914. result := a;
  7915. exit;
  7916. end;
  7917. if ( bExp = 0 ) then begin
  7918. dec(expDiff);
  7919. end
  7920. else begin
  7921. bSig0 := bSig0 or int64( $0001000000000000 );
  7922. end;
  7923. shift128ExtraRightJamming(
  7924. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7925. zExp := aExp;
  7926. end
  7927. else if ( expDiff < 0 ) then begin
  7928. if ( bExp = $7FFF ) then begin
  7929. if ( bSig0 or bSig1 )<>0 then
  7930. begin
  7931. result := propagateFloat128NaN( a, b );
  7932. exit;
  7933. end;
  7934. result := packFloat128( zSign, $7FFF, 0, 0 );
  7935. exit;
  7936. end;
  7937. if ( aExp = 0 ) then begin
  7938. inc(expDiff);
  7939. end
  7940. else begin
  7941. aSig0 := aSig0 or int64( $0001000000000000 );
  7942. end;
  7943. shift128ExtraRightJamming(
  7944. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7945. zExp := bExp;
  7946. end
  7947. else begin
  7948. if ( aExp = $7FFF ) then begin
  7949. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7950. result := propagateFloat128NaN( a, b );
  7951. exit;
  7952. end;
  7953. result := a;
  7954. exit;
  7955. end;
  7956. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7957. if ( aExp = 0 ) then
  7958. begin
  7959. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7960. exit;
  7961. end;
  7962. zSig2 := 0;
  7963. zSig0 := zSig0 or int64( $0002000000000000 );
  7964. zExp := aExp;
  7965. goto shiftRight1;
  7966. end;
  7967. aSig0 := aSig0 or int64( $0001000000000000 );
  7968. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7969. dec(zExp);
  7970. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7971. inc(zExp);
  7972. shiftRight1:
  7973. shift128ExtraRightJamming(
  7974. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7975. roundAndPack:
  7976. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7977. end;
  7978. {*----------------------------------------------------------------------------
  7979. | Returns the result of subtracting the absolute values of the quadruple-
  7980. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7981. | difference is negated before being returned. `zSign' is ignored if the
  7982. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7983. | Standard for Binary Floating-Point Arithmetic.
  7984. *----------------------------------------------------------------------------*}
  7985. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7986. var
  7987. aExp, bExp, zExp: int32;
  7988. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7989. expDiff: int32;
  7990. z: float128;
  7991. label
  7992. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7993. begin
  7994. aSig1 := extractFloat128Frac1( a );
  7995. aSig0 := extractFloat128Frac0( a );
  7996. aExp := extractFloat128Exp( a );
  7997. bSig1 := extractFloat128Frac1( b );
  7998. bSig0 := extractFloat128Frac0( b );
  7999. bExp := extractFloat128Exp( b );
  8000. expDiff := aExp - bExp;
  8001. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8002. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8003. if ( 0 < expDiff ) then goto aExpBigger;
  8004. if ( expDiff < 0 ) then goto bExpBigger;
  8005. if ( aExp = $7FFF ) then begin
  8006. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8007. result := propagateFloat128NaN( a, b );
  8008. exit;
  8009. end;
  8010. float_raise( float_flag_invalid );
  8011. z.low := float128_default_nan_low;
  8012. z.high := float128_default_nan_high;
  8013. result := z;
  8014. exit;
  8015. end;
  8016. if ( aExp = 0 ) then begin
  8017. aExp := 1;
  8018. bExp := 1;
  8019. end;
  8020. if ( bSig0 < aSig0 ) then goto aBigger;
  8021. if ( aSig0 < bSig0 ) then goto bBigger;
  8022. if ( bSig1 < aSig1 ) then goto aBigger;
  8023. if ( aSig1 < bSig1 ) then goto bBigger;
  8024. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8025. exit;
  8026. bExpBigger:
  8027. if ( bExp = $7FFF ) then begin
  8028. if ( bSig0 or bSig1 )<>0 then
  8029. begin
  8030. result := propagateFloat128NaN( a, b );
  8031. exit;
  8032. end;
  8033. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8034. exit;
  8035. end;
  8036. if ( aExp = 0 ) then begin
  8037. inc(expDiff);
  8038. end
  8039. else begin
  8040. aSig0 := aSig0 or int64( $4000000000000000 );
  8041. end;
  8042. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8043. bSig0 := bSig0 or int64( $4000000000000000 );
  8044. bBigger:
  8045. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8046. zExp := bExp;
  8047. zSign := zSign xor 1;
  8048. goto normalizeRoundAndPack;
  8049. aExpBigger:
  8050. if ( aExp = $7FFF ) then begin
  8051. if ( aSig0 or aSig1 )<>0 then
  8052. begin
  8053. result := propagateFloat128NaN( a, b );
  8054. exit;
  8055. end;
  8056. result := a;
  8057. exit;
  8058. end;
  8059. if ( bExp = 0 ) then begin
  8060. dec(expDiff);
  8061. end
  8062. else begin
  8063. bSig0 := bSig0 or int64( $4000000000000000 );
  8064. end;
  8065. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8066. aSig0 := aSig0 or int64( $4000000000000000 );
  8067. aBigger:
  8068. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8069. zExp := aExp;
  8070. normalizeRoundAndPack:
  8071. dec(zExp);
  8072. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8073. end;
  8074. {*----------------------------------------------------------------------------
  8075. | Returns the result of adding the quadruple-precision floating-point values
  8076. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8077. | for Binary Floating-Point Arithmetic.
  8078. *----------------------------------------------------------------------------*}
  8079. function float128_add(a: float128; b: float128): float128;
  8080. var
  8081. aSign, bSign: flag;
  8082. begin
  8083. aSign := extractFloat128Sign( a );
  8084. bSign := extractFloat128Sign( b );
  8085. if ( aSign = bSign ) then begin
  8086. result := addFloat128Sigs( a, b, aSign );
  8087. end
  8088. else begin
  8089. result := subFloat128Sigs( a, b, aSign );
  8090. end;
  8091. end;
  8092. {*----------------------------------------------------------------------------
  8093. | Returns the result of subtracting the quadruple-precision floating-point
  8094. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8095. | Standard for Binary Floating-Point Arithmetic.
  8096. *----------------------------------------------------------------------------*}
  8097. function float128_sub(a: float128; b: float128): float128;
  8098. var
  8099. aSign, bSign: flag;
  8100. begin
  8101. aSign := extractFloat128Sign( a );
  8102. bSign := extractFloat128Sign( b );
  8103. if ( aSign = bSign ) then begin
  8104. result := subFloat128Sigs( a, b, aSign );
  8105. end
  8106. else begin
  8107. result := addFloat128Sigs( a, b, aSign );
  8108. end;
  8109. end;
  8110. {*----------------------------------------------------------------------------
  8111. | Returns the result of multiplying the quadruple-precision floating-point
  8112. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8113. | Standard for Binary Floating-Point Arithmetic.
  8114. *----------------------------------------------------------------------------*}
  8115. function float128_mul(a: float128; b: float128): float128;
  8116. var
  8117. aSign, bSign, zSign: flag;
  8118. aExp, bExp, zExp: int32;
  8119. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8120. z: float128;
  8121. label
  8122. invalid;
  8123. begin
  8124. aSig1 := extractFloat128Frac1( a );
  8125. aSig0 := extractFloat128Frac0( a );
  8126. aExp := extractFloat128Exp( a );
  8127. aSign := extractFloat128Sign( a );
  8128. bSig1 := extractFloat128Frac1( b );
  8129. bSig0 := extractFloat128Frac0( b );
  8130. bExp := extractFloat128Exp( b );
  8131. bSign := extractFloat128Sign( b );
  8132. zSign := aSign xor bSign;
  8133. if ( aExp = $7FFF ) then begin
  8134. if ( (( aSig0 or aSig1 )<>0)
  8135. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8136. result := propagateFloat128NaN( a, b );
  8137. exit;
  8138. end;
  8139. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8140. result := packFloat128( zSign, $7FFF, 0, 0 );
  8141. exit;
  8142. end;
  8143. if ( bExp = $7FFF ) then begin
  8144. if ( bSig0 or bSig1 )<>0 then
  8145. begin
  8146. result := propagateFloat128NaN( a, b );
  8147. exit;
  8148. end;
  8149. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8150. invalid:
  8151. float_raise( float_flag_invalid );
  8152. z.low := float128_default_nan_low;
  8153. z.high := float128_default_nan_high;
  8154. result := z;
  8155. exit;
  8156. end;
  8157. result := packFloat128( zSign, $7FFF, 0, 0 );
  8158. exit;
  8159. end;
  8160. if ( aExp = 0 ) then begin
  8161. if ( ( aSig0 or aSig1 ) = 0 ) then
  8162. begin
  8163. result := packFloat128( zSign, 0, 0, 0 );
  8164. exit;
  8165. end;
  8166. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8167. end;
  8168. if ( bExp = 0 ) then begin
  8169. if ( ( bSig0 or bSig1 ) = 0 ) then
  8170. begin
  8171. result := packFloat128( zSign, 0, 0, 0 );
  8172. exit;
  8173. end;
  8174. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8175. end;
  8176. zExp := aExp + bExp - $4000;
  8177. aSig0 := aSig0 or int64( $0001000000000000 );
  8178. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8179. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8180. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8181. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8182. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8183. shift128ExtraRightJamming(
  8184. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8185. inc(zExp);
  8186. end;
  8187. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8188. end;
  8189. {*----------------------------------------------------------------------------
  8190. | Returns the result of dividing the quadruple-precision floating-point value
  8191. | `a' by the corresponding value `b'. The operation is performed according to
  8192. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8193. *----------------------------------------------------------------------------*}
  8194. function float128_div(a: float128; b: float128): float128;
  8195. var
  8196. aSign, bSign, zSign: flag;
  8197. aExp, bExp, zExp: int32;
  8198. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8199. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8200. z: float128;
  8201. label
  8202. invalid;
  8203. begin
  8204. aSig1 := extractFloat128Frac1( a );
  8205. aSig0 := extractFloat128Frac0( a );
  8206. aExp := extractFloat128Exp( a );
  8207. aSign := extractFloat128Sign( a );
  8208. bSig1 := extractFloat128Frac1( b );
  8209. bSig0 := extractFloat128Frac0( b );
  8210. bExp := extractFloat128Exp( b );
  8211. bSign := extractFloat128Sign( b );
  8212. zSign := aSign xor bSign;
  8213. if ( aExp = $7FFF ) then begin
  8214. if ( aSig0 or aSig1 )<>0 then
  8215. begin
  8216. result := propagateFloat128NaN( a, b );
  8217. exit;
  8218. end;
  8219. if ( bExp = $7FFF ) then begin
  8220. if ( bSig0 or bSig1 )<>0 then
  8221. begin
  8222. result := propagateFloat128NaN( a, b );
  8223. exit;
  8224. end;
  8225. goto invalid;
  8226. end;
  8227. result := packFloat128( zSign, $7FFF, 0, 0 );
  8228. exit;
  8229. end;
  8230. if ( bExp = $7FFF ) then begin
  8231. if ( bSig0 or bSig1 )<>0 then
  8232. begin
  8233. result := propagateFloat128NaN( a, b );
  8234. exit;
  8235. end;
  8236. result := packFloat128( zSign, 0, 0, 0 );
  8237. exit;
  8238. end;
  8239. if ( bExp = 0 ) then begin
  8240. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8241. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8242. invalid:
  8243. float_raise( float_flag_invalid );
  8244. z.low := float128_default_nan_low;
  8245. z.high := float128_default_nan_high;
  8246. result := z;
  8247. exit;
  8248. end;
  8249. float_raise( float_flag_divbyzero );
  8250. result := packFloat128( zSign, $7FFF, 0, 0 );
  8251. exit;
  8252. end;
  8253. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8254. end;
  8255. if ( aExp = 0 ) then begin
  8256. if ( ( aSig0 or aSig1 ) = 0 ) then
  8257. begin
  8258. result := packFloat128( zSign, 0, 0, 0 );
  8259. exit;
  8260. end;
  8261. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8262. end;
  8263. zExp := aExp - bExp + $3FFD;
  8264. shortShift128Left(
  8265. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8266. shortShift128Left(
  8267. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8268. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8269. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8270. inc(zExp);
  8271. end;
  8272. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8273. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8274. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8275. while ( sbits64(rem0) < 0 ) do begin
  8276. dec(zSig0);
  8277. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8278. end;
  8279. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8280. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8281. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8282. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8283. while ( sbits64(rem1) < 0 ) do begin
  8284. dec(zSig1);
  8285. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8286. end;
  8287. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8288. end;
  8289. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8290. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8291. end;
  8292. {*----------------------------------------------------------------------------
  8293. | Returns the remainder of the quadruple-precision floating-point value `a'
  8294. | with respect to the corresponding value `b'. The operation is performed
  8295. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8296. *----------------------------------------------------------------------------*}
  8297. function float128_rem(a: float128; b: float128): float128;
  8298. var
  8299. aSign, zSign: flag;
  8300. aExp, bExp, expDiff: int32;
  8301. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8302. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8303. sigMean0: sbits64;
  8304. z: float128;
  8305. label
  8306. invalid;
  8307. begin
  8308. aSig1 := extractFloat128Frac1( a );
  8309. aSig0 := extractFloat128Frac0( a );
  8310. aExp := extractFloat128Exp( a );
  8311. aSign := extractFloat128Sign( a );
  8312. bSig1 := extractFloat128Frac1( b );
  8313. bSig0 := extractFloat128Frac0( b );
  8314. bExp := extractFloat128Exp( b );
  8315. if ( aExp = $7FFF ) then begin
  8316. if ( (( aSig0 or aSig1 )<>0)
  8317. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8318. result := propagateFloat128NaN( a, b );
  8319. exit;
  8320. end;
  8321. goto invalid;
  8322. end;
  8323. if ( bExp = $7FFF ) then begin
  8324. if ( bSig0 or bSig1 )<>0 then
  8325. begin
  8326. result := propagateFloat128NaN( a, b );
  8327. exit;
  8328. end;
  8329. result := a;
  8330. exit;
  8331. end;
  8332. if ( bExp = 0 ) then begin
  8333. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8334. invalid:
  8335. float_raise( float_flag_invalid );
  8336. z.low := float128_default_nan_low;
  8337. z.high := float128_default_nan_high;
  8338. result := z;
  8339. exit;
  8340. end;
  8341. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8342. end;
  8343. if ( aExp = 0 ) then begin
  8344. if ( ( aSig0 or aSig1 ) = 0 ) then
  8345. begin
  8346. result := a;
  8347. exit;
  8348. end;
  8349. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8350. end;
  8351. expDiff := aExp - bExp;
  8352. if ( expDiff < -1 ) then
  8353. begin
  8354. result := a;
  8355. exit;
  8356. end;
  8357. shortShift128Left(
  8358. aSig0 or int64( $0001000000000000 ),
  8359. aSig1,
  8360. 15 - ord( expDiff < 0 ),
  8361. aSig0,
  8362. aSig1
  8363. );
  8364. shortShift128Left(
  8365. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8366. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8367. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8368. dec(expDiff,64);
  8369. while ( 0 < expDiff ) do begin
  8370. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8371. if ( 4 < q ) then
  8372. q := q - 4
  8373. else
  8374. q := 0;
  8375. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8376. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8377. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8378. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8379. dec(expDiff,61);
  8380. end;
  8381. if ( -64 < expDiff ) then begin
  8382. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8383. if ( 4 < q ) then
  8384. q := q - 4
  8385. else
  8386. q := 0;
  8387. q := q shr (- expDiff);
  8388. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8389. inc(expDiff,52);
  8390. if ( expDiff < 0 ) then begin
  8391. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8392. end
  8393. else begin
  8394. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8395. end;
  8396. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8397. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8398. end
  8399. else begin
  8400. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8401. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8402. end;
  8403. repeat
  8404. alternateASig0 := aSig0;
  8405. alternateASig1 := aSig1;
  8406. inc(q);
  8407. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8408. until not( 0 <= sbits64(aSig0) );
  8409. add128(
  8410. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8411. if ( ( sigMean0 < 0 )
  8412. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8413. aSig0 := alternateASig0;
  8414. aSig1 := alternateASig1;
  8415. end;
  8416. zSign := ord( sbits64(aSig0) < 0 );
  8417. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8418. result :=
  8419. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8420. end;
  8421. {*----------------------------------------------------------------------------
  8422. | Returns the square root of the quadruple-precision floating-point value `a'.
  8423. | The operation is performed according to the IEC/IEEE Standard for Binary
  8424. | Floating-Point Arithmetic.
  8425. *----------------------------------------------------------------------------*}
  8426. function float128_sqrt(a: float128): float128;
  8427. var
  8428. aSign: flag;
  8429. aExp, zExp: int32;
  8430. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8431. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8432. z: float128;
  8433. label
  8434. invalid;
  8435. begin
  8436. aSig1 := extractFloat128Frac1( a );
  8437. aSig0 := extractFloat128Frac0( a );
  8438. aExp := extractFloat128Exp( a );
  8439. aSign := extractFloat128Sign( a );
  8440. if ( aExp = $7FFF ) then begin
  8441. if ( aSig0 or aSig1 )<>0 then
  8442. begin
  8443. result := propagateFloat128NaN( a, a );
  8444. exit;
  8445. end;
  8446. if ( aSign=0 ) then
  8447. begin
  8448. result := a;
  8449. exit;
  8450. end;
  8451. goto invalid;
  8452. end;
  8453. if ( aSign<>0 ) then begin
  8454. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8455. begin
  8456. result := a;
  8457. exit;
  8458. end;
  8459. invalid:
  8460. float_raise( float_flag_invalid );
  8461. z.low := float128_default_nan_low;
  8462. z.high := float128_default_nan_high;
  8463. result := z;
  8464. exit;
  8465. end;
  8466. if ( aExp = 0 ) then begin
  8467. if ( ( aSig0 or aSig1 ) = 0 ) then
  8468. begin
  8469. result := packFloat128( 0, 0, 0, 0 );
  8470. exit;
  8471. end;
  8472. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8473. end;
  8474. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8475. aSig0 := aSig0 or int64( $0001000000000000 );
  8476. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8477. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8478. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8479. doubleZSig0 := zSig0 shl 1;
  8480. mul64To128( zSig0, zSig0, term0, term1 );
  8481. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8482. while ( sbits64(rem0) < 0 ) do begin
  8483. dec(zSig0);
  8484. dec(doubleZSig0,2);
  8485. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8486. end;
  8487. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8488. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8489. if ( zSig1 = 0 ) then zSig1 := 1;
  8490. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8491. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8492. mul64To128( zSig1, zSig1, term2, term3 );
  8493. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8494. while ( sbits64(rem1) < 0 ) do begin
  8495. dec(zSig1);
  8496. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8497. term3 := term3 or 1;
  8498. term2 := term2 or doubleZSig0;
  8499. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8500. end;
  8501. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8502. end;
  8503. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8504. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8505. end;
  8506. {*----------------------------------------------------------------------------
  8507. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8508. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8509. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8510. *----------------------------------------------------------------------------*}
  8511. function float128_eq(a: float128; b: float128): flag;
  8512. begin
  8513. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8514. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8515. or ( ( extractFloat128Exp( b ) = $7FFF )
  8516. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8517. ) then begin
  8518. if ( (float128_is_signaling_nan( a )<>0)
  8519. or (float128_is_signaling_nan( b )<>0) ) then begin
  8520. float_raise( float_flag_invalid );
  8521. end;
  8522. result := 0;
  8523. exit;
  8524. end;
  8525. result := ord(
  8526. ( a.low = b.low )
  8527. and ( ( a.high = b.high )
  8528. or ( ( a.low = 0 )
  8529. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8530. ));
  8531. end;
  8532. {*----------------------------------------------------------------------------
  8533. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8534. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8535. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8536. | Arithmetic.
  8537. *----------------------------------------------------------------------------*}
  8538. function float128_le(a: float128; b: float128): flag;
  8539. var
  8540. aSign, bSign: flag;
  8541. begin
  8542. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8543. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8544. or ( ( extractFloat128Exp( b ) = $7FFF )
  8545. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8546. ) then begin
  8547. float_raise( float_flag_invalid );
  8548. result := 0;
  8549. exit;
  8550. end;
  8551. aSign := extractFloat128Sign( a );
  8552. bSign := extractFloat128Sign( b );
  8553. if ( aSign <> bSign ) then begin
  8554. result := ord(
  8555. (aSign<>0)
  8556. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8557. = 0 ));
  8558. exit;
  8559. end;
  8560. if aSign<>0 then
  8561. result := le128( b.high, b.low, a.high, a.low )
  8562. else
  8563. result := le128( a.high, a.low, b.high, b.low );
  8564. end;
  8565. {*----------------------------------------------------------------------------
  8566. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8567. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8568. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8569. *----------------------------------------------------------------------------*}
  8570. function float128_lt(a: float128; b: float128): flag;
  8571. var
  8572. aSign, bSign: flag;
  8573. begin
  8574. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8575. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8576. or ( ( extractFloat128Exp( b ) = $7FFF )
  8577. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8578. ) then begin
  8579. float_raise( float_flag_invalid );
  8580. result := 0;
  8581. exit;
  8582. end;
  8583. aSign := extractFloat128Sign( a );
  8584. bSign := extractFloat128Sign( b );
  8585. if ( aSign <> bSign ) then begin
  8586. result := ord(
  8587. (aSign<>0)
  8588. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8589. <> 0 ));
  8590. exit;
  8591. end;
  8592. if aSign<>0 then
  8593. result := lt128( b.high, b.low, a.high, a.low )
  8594. else
  8595. result := lt128( a.high, a.low, b.high, b.low );
  8596. end;
  8597. {*----------------------------------------------------------------------------
  8598. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8599. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8600. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8601. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8602. *----------------------------------------------------------------------------*}
  8603. function float128_eq_signaling(a: float128; b: float128): flag;
  8604. begin
  8605. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8606. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8607. or ( ( extractFloat128Exp( b ) = $7FFF )
  8608. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8609. ) then begin
  8610. float_raise( float_flag_invalid );
  8611. result := 0;
  8612. exit;
  8613. end;
  8614. result := ord(
  8615. ( a.low = b.low )
  8616. and ( ( a.high = b.high )
  8617. or ( ( a.low = 0 )
  8618. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8619. ));
  8620. end;
  8621. {*----------------------------------------------------------------------------
  8622. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8623. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8624. | cause an exception. Otherwise, the comparison is performed according to the
  8625. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8626. *----------------------------------------------------------------------------*}
  8627. function float128_le_quiet(a: float128; b: float128): flag;
  8628. var
  8629. aSign, bSign: flag;
  8630. begin
  8631. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8632. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8633. or ( ( extractFloat128Exp( b ) = $7FFF )
  8634. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8635. ) then begin
  8636. if ( (float128_is_signaling_nan( a )<>0)
  8637. or (float128_is_signaling_nan( b )<>0) ) then begin
  8638. float_raise( float_flag_invalid );
  8639. end;
  8640. result := 0;
  8641. exit;
  8642. end;
  8643. aSign := extractFloat128Sign( a );
  8644. bSign := extractFloat128Sign( b );
  8645. if ( aSign <> bSign ) then begin
  8646. result := ord(
  8647. (aSign<>0)
  8648. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8649. = 0 ));
  8650. exit;
  8651. end;
  8652. if aSign<>0 then
  8653. result := le128( b.high, b.low, a.high, a.low )
  8654. else
  8655. result := le128( a.high, a.low, b.high, b.low );
  8656. end;
  8657. {*----------------------------------------------------------------------------
  8658. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8659. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8660. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8661. | Standard for Binary Floating-Point Arithmetic.
  8662. *----------------------------------------------------------------------------*}
  8663. function float128_lt_quiet(a: float128; b: float128): flag;
  8664. var
  8665. aSign, bSign: flag;
  8666. begin
  8667. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8668. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8669. or ( ( extractFloat128Exp( b ) = $7FFF )
  8670. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8671. ) then begin
  8672. if ( (float128_is_signaling_nan( a )<>0)
  8673. or (float128_is_signaling_nan( b )<>0) ) then begin
  8674. float_raise( float_flag_invalid );
  8675. end;
  8676. result := 0;
  8677. exit;
  8678. end;
  8679. aSign := extractFloat128Sign( a );
  8680. bSign := extractFloat128Sign( b );
  8681. if ( aSign <> bSign ) then begin
  8682. result := ord(
  8683. (aSign<>0)
  8684. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8685. <> 0 ));
  8686. exit;
  8687. end;
  8688. if aSign<>0 then
  8689. result:=lt128( b.high, b.low, a.high, a.low )
  8690. else
  8691. result:=lt128( a.high, a.low, b.high, b.low );
  8692. end;
  8693. {----------------------------------------------------------------------------
  8694. | Returns the result of converting the double-precision floating-point value
  8695. | `a' to the quadruple-precision floating-point format. The conversion is
  8696. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8697. | Arithmetic.
  8698. *----------------------------------------------------------------------------}
  8699. function float64_to_float128( a : float64) : float128;
  8700. var
  8701. aSign : flag;
  8702. aExp : int16;
  8703. aSig, zSig0, zSig1 : bits64;
  8704. begin
  8705. aSig := extractFloat64Frac( a );
  8706. aExp := extractFloat64Exp( a );
  8707. aSign := extractFloat64Sign( a );
  8708. if ( aExp = $7FF ) then begin
  8709. if ( aSig<>0 ) then begin
  8710. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8711. exit;
  8712. end;
  8713. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8714. exit;
  8715. end;
  8716. if ( aExp = 0 ) then begin
  8717. if ( aSig = 0 ) then
  8718. begin
  8719. result:=packFloat128( aSign, 0, 0, 0 );
  8720. exit;
  8721. end;
  8722. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8723. dec(aExp);
  8724. end;
  8725. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8726. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8727. end;
  8728. {$endif FPC_SOFTFLOAT_FLOAT128}
  8729. {$endif not(defined(fpc_softfpu_interface))}
  8730. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8731. end.
  8732. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}