softfpu.pp 297 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. uint8 = byte;
  88. int8 = shortint;
  89. uint16 = word;
  90. int16 = smallint;
  91. uint32 = longword;
  92. int32 = longint;
  93. bits8 = byte;
  94. sbits8 = shortint;
  95. bits16 = word;
  96. sbits16 = smallint;
  97. sbits32 = longint;
  98. bits32 = longword;
  99. {$ifndef fpc}
  100. qword = int64;
  101. {$endif}
  102. { now part of the system unit
  103. uint64 = qword;
  104. }
  105. bits64 = qword;
  106. sbits64 = int64;
  107. {$ifdef ENDIAN_LITTLE}
  108. float64 = record
  109. case byte of
  110. 1: (low,high : bits32);
  111. // force the record to be aligned like a double
  112. // else *_to_double will fail for cpus like sparc
  113. // and avoid expensive unpacking/packing operations
  114. 2: (dummy : double);
  115. end;
  116. int64rec = record
  117. case byte of
  118. 1: (low,high : bits32);
  119. // force the record to be aligned like a double
  120. // else *_to_double will fail for cpus like sparc
  121. // and avoid expensive unpacking/packing operations
  122. 2: (dummy : int64);
  123. end;
  124. floatx80 = record
  125. case byte of
  126. 1: (low : qword;high : word);
  127. // force the record to be aligned like a double
  128. // else *_to_double will fail for cpus like sparc
  129. // and avoid expensive unpacking/packing operations
  130. 2: (dummy : extended);
  131. end;
  132. float128 = record
  133. case byte of
  134. 1: (low,high : qword);
  135. // force the record to be aligned like a double
  136. // else *_to_double will fail for cpus like sparc
  137. // and avoid expensive unpacking/packing operations
  138. 2: (dummy : qword);
  139. end;
  140. {$else}
  141. float64 = record
  142. case byte of
  143. 1: (high,low : bits32);
  144. // force the record to be aligned like a double
  145. // else *_to_double will fail for cpus like sparc
  146. 2: (dummy : double);
  147. end;
  148. int64rec = record
  149. case byte of
  150. 1: high,low : bits32;
  151. // force the record to be aligned like a double
  152. // else *_to_double will fail for cpus like sparc
  153. // and avoid expensive unpacking/packing operations
  154. 2: (dummy : int64);
  155. end;
  156. floatx80 = record
  157. case byte of
  158. 1: (high : word;low : qword);
  159. // force the record to be aligned like a double
  160. // else *_to_double will fail for cpus like sparc
  161. // and avoid expensive unpacking/packing operations
  162. 2: (dummy : qword);
  163. end;
  164. float128 = record
  165. case byte of
  166. 1: (high : qword;low : qword);
  167. // force the record to be aligned like a double
  168. // else *_to_double will fail for cpus like sparc
  169. // and avoid expensive unpacking/packing operations
  170. 2: (dummy : qword);
  171. end;
  172. {$endif}
  173. {$define FPC_SYSTEM_HAS_float64}
  174. {*
  175. -------------------------------------------------------------------------------
  176. Returns 1 if the double-precision floating-point value `a' is less than
  177. the corresponding value `b', and 0 otherwise. The comparison is performed
  178. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  179. -------------------------------------------------------------------------------
  180. *}
  181. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  182. {*
  183. -------------------------------------------------------------------------------
  184. Returns 1 if the double-precision floating-point value `a' is less than
  185. or equal to the corresponding value `b', and 0 otherwise. The comparison
  186. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  187. Arithmetic.
  188. -------------------------------------------------------------------------------
  189. *}
  190. Function float64_le(a: float64;b: float64): flag; compilerproc;
  191. {*
  192. -------------------------------------------------------------------------------
  193. Returns 1 if the double-precision floating-point value `a' is equal to
  194. the corresponding value `b', and 0 otherwise. The comparison is performed
  195. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  196. -------------------------------------------------------------------------------
  197. *}
  198. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  199. {*
  200. -------------------------------------------------------------------------------
  201. Returns the square root of the double-precision floating-point value `a'.
  202. The operation is performed according to the IEC/IEEE Standard for Binary
  203. Floating-Point Arithmetic.
  204. -------------------------------------------------------------------------------
  205. *}
  206. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  207. {*
  208. -------------------------------------------------------------------------------
  209. Returns the remainder of the double-precision floating-point value `a'
  210. with respect to the corresponding value `b'. The operation is performed
  211. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  212. -------------------------------------------------------------------------------
  213. *}
  214. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  215. {*
  216. -------------------------------------------------------------------------------
  217. Returns the result of dividing the double-precision floating-point value `a'
  218. by the corresponding value `b'. The operation is performed according to the
  219. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  220. -------------------------------------------------------------------------------
  221. *}
  222. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  223. {*
  224. -------------------------------------------------------------------------------
  225. Returns the result of multiplying the double-precision floating-point values
  226. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  227. for Binary Floating-Point Arithmetic.
  228. -------------------------------------------------------------------------------
  229. *}
  230. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  231. {*
  232. -------------------------------------------------------------------------------
  233. Returns the result of subtracting the double-precision floating-point values
  234. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  235. for Binary Floating-Point Arithmetic.
  236. -------------------------------------------------------------------------------
  237. *}
  238. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  239. {*
  240. -------------------------------------------------------------------------------
  241. Returns the result of adding the double-precision floating-point values `a'
  242. and `b'. The operation is performed according to the IEC/IEEE Standard for
  243. Binary Floating-Point Arithmetic.
  244. -------------------------------------------------------------------------------
  245. *}
  246. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  247. {*
  248. -------------------------------------------------------------------------------
  249. Rounds the double-precision floating-point value `a' to an integer,
  250. and returns the result as a double-precision floating-point value. The
  251. operation is performed according to the IEC/IEEE Standard for Binary
  252. Floating-Point Arithmetic.
  253. -------------------------------------------------------------------------------
  254. *}
  255. Function float64_round_to_int(a: float64) : float64; compilerproc;
  256. {*
  257. -------------------------------------------------------------------------------
  258. Returns the result of converting the double-precision floating-point value
  259. `a' to the single-precision floating-point format. The conversion is
  260. performed according to the IEC/IEEE Standard for Binary Floating-Point
  261. Arithmetic.
  262. -------------------------------------------------------------------------------
  263. *}
  264. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  265. {*
  266. -------------------------------------------------------------------------------
  267. Returns the result of converting the double-precision floating-point value
  268. `a' to the 32-bit two's complement integer format. The conversion is
  269. performed according to the IEC/IEEE Standard for Binary Floating-Point
  270. Arithmetic, except that the conversion is always rounded toward zero.
  271. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  272. the conversion overflows, the largest integer with the same sign as `a' is
  273. returned.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns the result of converting the double-precision floating-point value
  280. `a' to the 32-bit two's complement integer format. The conversion is
  281. performed according to the IEC/IEEE Standard for Binary Floating-Point
  282. Arithmetic---which means in particular that the conversion is rounded
  283. according to the current rounding mode. If `a' is a NaN, the largest
  284. positive integer is returned. Otherwise, if the conversion overflows, the
  285. largest integer with the same sign as `a' is returned.
  286. -------------------------------------------------------------------------------
  287. *}
  288. Function float64_to_int32(a: float64): int32; compilerproc;
  289. {*
  290. -------------------------------------------------------------------------------
  291. Returns 1 if the single-precision floating-point value `a' is less than
  292. the corresponding value `b', and 0 otherwise. The comparison is performed
  293. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  294. -------------------------------------------------------------------------------
  295. *}
  296. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  297. {*
  298. -------------------------------------------------------------------------------
  299. Returns 1 if the single-precision floating-point value `a' is less than
  300. or equal to the corresponding value `b', and 0 otherwise. The comparison
  301. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  302. Arithmetic.
  303. -------------------------------------------------------------------------------
  304. *}
  305. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  306. {*
  307. -------------------------------------------------------------------------------
  308. Returns 1 if the single-precision floating-point value `a' is equal to
  309. the corresponding value `b', and 0 otherwise. The comparison is performed
  310. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  311. -------------------------------------------------------------------------------
  312. *}
  313. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  314. {*
  315. -------------------------------------------------------------------------------
  316. Returns the square root of the single-precision floating-point value `a'.
  317. The operation is performed according to the IEC/IEEE Standard for Binary
  318. Floating-Point Arithmetic.
  319. -------------------------------------------------------------------------------
  320. *}
  321. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  322. {*
  323. -------------------------------------------------------------------------------
  324. Returns the remainder of the single-precision floating-point value `a'
  325. with respect to the corresponding value `b'. The operation is performed
  326. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  327. -------------------------------------------------------------------------------
  328. *}
  329. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  330. {*
  331. -------------------------------------------------------------------------------
  332. Returns the result of dividing the single-precision floating-point value `a'
  333. by the corresponding value `b'. The operation is performed according to the
  334. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  335. -------------------------------------------------------------------------------
  336. *}
  337. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  338. {*
  339. -------------------------------------------------------------------------------
  340. Returns the result of multiplying the single-precision floating-point values
  341. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  342. for Binary Floating-Point Arithmetic.
  343. -------------------------------------------------------------------------------
  344. *}
  345. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  346. {*
  347. -------------------------------------------------------------------------------
  348. Returns the result of subtracting the single-precision floating-point values
  349. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  350. for Binary Floating-Point Arithmetic.
  351. -------------------------------------------------------------------------------
  352. *}
  353. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  354. {*
  355. -------------------------------------------------------------------------------
  356. Returns the result of adding the single-precision floating-point values `a'
  357. and `b'. The operation is performed according to the IEC/IEEE Standard for
  358. Binary Floating-Point Arithmetic.
  359. -------------------------------------------------------------------------------
  360. *}
  361. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  362. {*
  363. -------------------------------------------------------------------------------
  364. Rounds the single-precision floating-point value `a' to an integer,
  365. and returns the result as a single-precision floating-point value. The
  366. operation is performed according to the IEC/IEEE Standard for Binary
  367. Floating-Point Arithmetic.
  368. -------------------------------------------------------------------------------
  369. *}
  370. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  371. {*
  372. -------------------------------------------------------------------------------
  373. Returns the result of converting the single-precision floating-point value
  374. `a' to the double-precision floating-point format. The conversion is
  375. performed according to the IEC/IEEE Standard for Binary Floating-Point
  376. Arithmetic.
  377. -------------------------------------------------------------------------------
  378. *}
  379. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  380. {*
  381. -------------------------------------------------------------------------------
  382. Returns the result of converting the single-precision floating-point value
  383. `a' to the 32-bit two's complement integer format. The conversion is
  384. performed according to the IEC/IEEE Standard for Binary Floating-Point
  385. Arithmetic, except that the conversion is always rounded toward zero.
  386. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  387. the conversion overflows, the largest integer with the same sign as `a' is
  388. returned.
  389. -------------------------------------------------------------------------------
  390. *}
  391. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  392. {*
  393. -------------------------------------------------------------------------------
  394. Returns the result of converting the single-precision floating-point value
  395. `a' to the 32-bit two's complement integer format. The conversion is
  396. performed according to the IEC/IEEE Standard for Binary Floating-Point
  397. Arithmetic---which means in particular that the conversion is rounded
  398. according to the current rounding mode. If `a' is a NaN, the largest
  399. positive integer is returned. Otherwise, if the conversion overflows, the
  400. largest integer with the same sign as `a' is returned.
  401. -------------------------------------------------------------------------------
  402. *}
  403. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  404. {*
  405. -------------------------------------------------------------------------------
  406. Returns the result of converting the 32-bit two's complement integer `a' to
  407. the double-precision floating-point format. The conversion is performed
  408. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  409. -------------------------------------------------------------------------------
  410. *}
  411. Function int32_to_float64( a: int32) : float64; compilerproc;
  412. {*
  413. -------------------------------------------------------------------------------
  414. Returns the result of converting the 32-bit two's complement integer `a' to
  415. the single-precision floating-point format. The conversion is performed
  416. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  417. -------------------------------------------------------------------------------
  418. *}
  419. Function int32_to_float32( a: int32): float32rec; compilerproc;
  420. {*----------------------------------------------------------------------------
  421. | Returns the result of converting the 64-bit two's complement integer `a'
  422. | to the double-precision floating-point format. The conversion is performed
  423. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  424. *----------------------------------------------------------------------------*}
  425. Function int64_to_float64( a: int64 ): float64; compilerproc;
  426. Function qword_to_float64( a: qword ): float64; compilerproc;
  427. {*----------------------------------------------------------------------------
  428. | Returns the result of converting the 64-bit two's complement integer `a'
  429. | to the single-precision floating-point format. The conversion is performed
  430. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  431. *----------------------------------------------------------------------------*}
  432. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  433. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  434. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  435. function float128_is_nan( a : float128): flag;
  436. function float128_is_signaling_nan( a : float128): flag;
  437. function float128_to_int32(a: float128): int32;
  438. function float128_to_int32_round_to_zero(a: float128): int32;
  439. function float128_to_int64(a: float128): int64;
  440. function float128_to_int64_round_to_zero(a: float128): int64;
  441. function float128_to_float32(a: float128): float32;
  442. function float128_to_float64(a: float128): float64;
  443. function float64_to_float128( a : float64) : float128;
  444. {$ifdef FPC_SOFTFLOAT_FLOAT80}
  445. function float128_to_floatx80(a: float128): floatx80;
  446. {$endif FPC_SOFTFLOAT_FLOAT80}
  447. function float128_round_to_int(a: float128): float128;
  448. function float128_add(a: float128; b: float128): float128;
  449. function float128_sub(a: float128; b: float128): float128;
  450. function float128_mul(a: float128; b: float128): float128;
  451. function float128_div(a: float128; b: float128): float128;
  452. function float128_rem(a: float128; b: float128): float128;
  453. function float128_sqrt(a: float128): float128;
  454. function float128_eq(a: float128; b: float128): flag;
  455. function float128_le(a: float128; b: float128): flag;
  456. function float128_lt(a: float128; b: float128): flag;
  457. function float128_eq_signaling(a: float128; b: float128): flag;
  458. function float128_le_quiet(a: float128; b: float128): flag;
  459. function float128_lt_quiet(a: float128; b: float128): flag;
  460. {$endif FPC_SOFTFLOAT_FLOAT128}
  461. CONST
  462. {-------------------------------------------------------------------------------
  463. Software IEC/IEEE floating-point underflow tininess-detection mode.
  464. -------------------------------------------------------------------------------
  465. *}
  466. float_tininess_after_rounding = 0;
  467. float_tininess_before_rounding = 1;
  468. {*
  469. -------------------------------------------------------------------------------
  470. Underflow tininess-detection mode, statically initialized to default value.
  471. (The declaration in `softfloat.h' must match the `int8' type here.)
  472. -------------------------------------------------------------------------------
  473. *}
  474. const float_detect_tininess: int8 = float_tininess_after_rounding;
  475. {$endif not(defined(fpc_softfpu_implementation))}
  476. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  477. implementation
  478. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  479. {$if not(defined(fpc_softfpu_interface))}
  480. (*****************************************************************************)
  481. (*----------------------------------------------------------------------------*)
  482. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  483. (* division and square root approximations. (Can be specialized to target if *)
  484. (* desired.) *)
  485. (* ---------------------------------------------------------------------------*)
  486. (*****************************************************************************)
  487. {*----------------------------------------------------------------------------
  488. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  489. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  490. | input. If `zSign' is 1, the input is negated before being converted to an
  491. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  492. | is simply rounded to an integer, with the inexact exception raised if the
  493. | input cannot be represented exactly as an integer. However, if the fixed-
  494. | point input is too large, the invalid exception is raised and the largest
  495. | positive or negative integer is returned.
  496. *----------------------------------------------------------------------------*}
  497. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  498. var
  499. roundingMode: int8;
  500. roundNearestEven: flag;
  501. roundIncrement, roundBits: int8;
  502. z: int32;
  503. begin
  504. roundingMode := softfloat_rounding_mode;
  505. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  506. roundIncrement := $40;
  507. if ( roundNearestEven=0 ) then
  508. begin
  509. if ( roundingMode = float_round_to_zero ) then
  510. begin
  511. roundIncrement := 0;
  512. end
  513. else begin
  514. roundIncrement := $7F;
  515. if ( zSign<>0 ) then
  516. begin
  517. if ( roundingMode = float_round_up ) then
  518. roundIncrement := 0;
  519. end
  520. else begin
  521. if ( roundingMode = float_round_down ) then
  522. roundIncrement := 0;
  523. end;
  524. end;
  525. end;
  526. roundBits := absZ and $7F;
  527. absZ := ( absZ + roundIncrement ) shr 7;
  528. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  529. z := absZ;
  530. if ( zSign<>0 ) then
  531. z := - z;
  532. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  533. begin
  534. float_raise( float_flag_invalid );
  535. if zSign<>0 then
  536. result:=sbits32($80000000)
  537. else
  538. result:=$7FFFFFFF;
  539. exit;
  540. end;
  541. if ( roundBits<>0 ) then
  542. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  543. result:=z;
  544. end;
  545. {*----------------------------------------------------------------------------
  546. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  547. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  548. | and returns the properly rounded 64-bit integer corresponding to the input.
  549. | If `zSign' is 1, the input is negated before being converted to an integer.
  550. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  551. | the inexact exception raised if the input cannot be represented exactly as
  552. | an integer. However, if the fixed-point input is too large, the invalid
  553. | exception is raised and the largest positive or negative integer is
  554. | returned.
  555. *----------------------------------------------------------------------------*}
  556. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  557. var
  558. roundingMode: int8;
  559. roundNearestEven, increment: flag;
  560. z: int64;
  561. label
  562. overflow;
  563. begin
  564. roundingMode := softfloat_rounding_mode;
  565. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  566. increment := ord( sbits64(absZ1) < 0 );
  567. if ( roundNearestEven=0 ) then
  568. begin
  569. if ( roundingMode = float_round_to_zero ) then
  570. begin
  571. increment := 0;
  572. end
  573. else begin
  574. if ( zSign<>0 ) then
  575. begin
  576. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  577. end
  578. else begin
  579. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  580. end;
  581. end;
  582. end;
  583. if ( increment<>0 ) then
  584. begin
  585. inc(absZ0);
  586. if ( absZ0 = 0 ) then
  587. goto overflow;
  588. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  589. end;
  590. z := absZ0;
  591. if ( zSign<>0 ) then
  592. z := - z;
  593. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  594. begin
  595. overflow:
  596. float_raise( float_flag_invalid );
  597. if zSign<>0 then
  598. result:=int64($8000000000000000)
  599. else
  600. result:=int64($7FFFFFFFFFFFFFFF);
  601. end;
  602. if ( absZ1<>0 ) then
  603. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  604. result:=z;
  605. end;
  606. {*
  607. -------------------------------------------------------------------------------
  608. Shifts `a' right by the number of bits given in `count'. If any nonzero
  609. bits are shifted off, they are ``jammed'' into the least significant bit of
  610. the result by setting the least significant bit to 1. The value of `count'
  611. can be arbitrarily large; in particular, if `count' is greater than 32, the
  612. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  613. The result is stored in the location pointed to by `zPtr'.
  614. -------------------------------------------------------------------------------
  615. *}
  616. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  617. var
  618. z: Bits32;
  619. Begin
  620. if ( count = 0 ) then
  621. z := a
  622. else
  623. if ( count < 32 ) then
  624. Begin
  625. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  626. End
  627. else
  628. Begin
  629. z := bits32( a <> 0 );
  630. End;
  631. zPtr := z;
  632. End;
  633. {*----------------------------------------------------------------------------
  634. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  635. | number of bits given in `count'. Any bits shifted off are lost. The value
  636. | of `count' can be arbitrarily large; in particular, if `count' is greater
  637. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  638. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  639. *----------------------------------------------------------------------------*}
  640. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  641. var
  642. z0, z1: bits64;
  643. negCount: int8;
  644. begin
  645. negCount := ( - count ) and 63;
  646. if ( count = 0 ) then
  647. begin
  648. z1 := a1;
  649. z0 := a0;
  650. end
  651. else if ( count < 64 ) then
  652. begin
  653. z1 := ( a0 shl negCount ) or ( a1 shr count );
  654. z0 := a0 shr count;
  655. end
  656. else
  657. begin
  658. if ( count shl 64 )<>0 then
  659. z1 := a0 shr ( count and 63 )
  660. else
  661. z1 := 0;
  662. z0 := 0;
  663. end;
  664. z1Ptr := z1;
  665. z0Ptr := z0;
  666. end;
  667. {*----------------------------------------------------------------------------
  668. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  669. | number of bits given in `count'. If any nonzero bits are shifted off, they
  670. | are ``jammed'' into the least significant bit of the result by setting the
  671. | least significant bit to 1. The value of `count' can be arbitrarily large;
  672. | in particular, if `count' is greater than 128, the result will be either
  673. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  674. | nonzero. The result is broken into two 64-bit pieces which are stored at
  675. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  676. *----------------------------------------------------------------------------*}
  677. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  678. var
  679. z0,z1 : bits64;
  680. negCount : int8;
  681. begin
  682. negCount := ( - count ) and 63;
  683. if ( count = 0 ) then begin
  684. z1 := a1;
  685. z0 := a0;
  686. end
  687. else if ( count < 64 ) then begin
  688. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  689. z0 := a0>>count;
  690. end
  691. else begin
  692. if ( count = 64 ) then begin
  693. z1 := a0 or ord( a1 <> 0 );
  694. end
  695. else if ( count < 128 ) then begin
  696. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  697. end
  698. else begin
  699. z1 := ord( ( a0 or a1 ) <> 0 );
  700. end;
  701. z0 := 0;
  702. end;
  703. z1Ptr := z1;
  704. z0Ptr := z0;
  705. end;
  706. {*
  707. -------------------------------------------------------------------------------
  708. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  709. number of bits given in `count'. Any bits shifted off are lost. The value
  710. of `count' can be arbitrarily large; in particular, if `count' is greater
  711. than 64, the result will be 0. The result is broken into two 32-bit pieces
  712. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  713. -------------------------------------------------------------------------------
  714. *}
  715. Procedure
  716. shift64Right(
  717. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  718. Var
  719. z0, z1: bits32;
  720. negCount : int8;
  721. Begin
  722. negCount := ( - count ) AND 31;
  723. if ( count = 0 ) then
  724. Begin
  725. z1 := a1;
  726. z0 := a0;
  727. End
  728. else if ( count < 32 ) then
  729. Begin
  730. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  731. z0 := a0 shr count;
  732. End
  733. else
  734. Begin
  735. if (count < 64) then
  736. z1 := ( a0 shr ( count AND 31 ) )
  737. else
  738. z1 := 0;
  739. z0 := 0;
  740. End;
  741. z1Ptr := z1;
  742. z0Ptr := z0;
  743. End;
  744. {*
  745. -------------------------------------------------------------------------------
  746. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  747. number of bits given in `count'. If any nonzero bits are shifted off, they
  748. are ``jammed'' into the least significant bit of the result by setting the
  749. least significant bit to 1. The value of `count' can be arbitrarily large;
  750. in particular, if `count' is greater than 64, the result will be either 0
  751. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  752. nonzero. The result is broken into two 32-bit pieces which are stored at
  753. the locations pointed to by `z0Ptr' and `z1Ptr'.
  754. -------------------------------------------------------------------------------
  755. *}
  756. Procedure
  757. shift64RightJamming(
  758. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  759. VAR
  760. z0, z1 : bits32;
  761. negCount : int8;
  762. Begin
  763. negCount := ( - count ) AND 31;
  764. if ( count = 0 ) then
  765. Begin
  766. z1 := a1;
  767. z0 := a0;
  768. End
  769. else
  770. if ( count < 32 ) then
  771. Begin
  772. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  773. z0 := a0 shr count;
  774. End
  775. else
  776. Begin
  777. if ( count = 32 ) then
  778. Begin
  779. z1 := a0 OR bits32( a1 <> 0 );
  780. End
  781. else
  782. if ( count < 64 ) Then
  783. Begin
  784. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  785. End
  786. else
  787. Begin
  788. z1 := bits32( ( a0 OR a1 ) <> 0 );
  789. End;
  790. z0 := 0;
  791. End;
  792. z1Ptr := z1;
  793. z0Ptr := z0;
  794. End;
  795. {*----------------------------------------------------------------------------
  796. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  797. | bits are shifted off, they are ``jammed'' into the least significant bit of
  798. | the result by setting the least significant bit to 1. The value of `count'
  799. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  800. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  801. | The result is stored in the location pointed to by `zPtr'.
  802. *----------------------------------------------------------------------------*}
  803. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  804. var
  805. z: bits64;
  806. begin
  807. if ( count = 0 ) then
  808. begin
  809. z := a;
  810. end
  811. else if ( count < 64 ) then
  812. begin
  813. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  814. end
  815. else
  816. begin
  817. z := ord( a <> 0 );
  818. end;
  819. zPtr := z;
  820. end;
  821. {*
  822. -------------------------------------------------------------------------------
  823. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  824. by 32 _plus_ the number of bits given in `count'. The shifted result is
  825. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  826. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  827. off form a third 32-bit result as follows: The _last_ bit shifted off is
  828. the most-significant bit of the extra result, and the other 31 bits of the
  829. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  830. were all zero. This extra result is stored in the location pointed to by
  831. `z2Ptr'. The value of `count' can be arbitrarily large.
  832. (This routine makes more sense if `a0', `a1', and `a2' are considered
  833. to form a fixed-point value with binary point between `a1' and `a2'. This
  834. fixed-point value is shifted right by the number of bits given in `count',
  835. and the integer part of the result is returned at the locations pointed to
  836. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  837. corrupted as described above, and is returned at the location pointed to by
  838. `z2Ptr'.)
  839. -------------------------------------------------------------------------------
  840. }
  841. Procedure
  842. shift64ExtraRightJamming(
  843. a0: bits32;
  844. a1: bits32;
  845. a2: bits32;
  846. count: int16;
  847. VAR z0Ptr: bits32;
  848. VAR z1Ptr: bits32;
  849. VAR z2Ptr: bits32
  850. );
  851. Var
  852. z0, z1, z2: bits32;
  853. negCount : int8;
  854. Begin
  855. negCount := ( - count ) AND 31;
  856. if ( count = 0 ) then
  857. Begin
  858. z2 := a2;
  859. z1 := a1;
  860. z0 := a0;
  861. End
  862. else
  863. Begin
  864. if ( count < 32 ) Then
  865. Begin
  866. z2 := a1 shl negCount;
  867. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  868. z0 := a0 shr count;
  869. End
  870. else
  871. Begin
  872. if ( count = 32 ) then
  873. Begin
  874. z2 := a1;
  875. z1 := a0;
  876. End
  877. else
  878. Begin
  879. a2 := a2 or a1;
  880. if ( count < 64 ) then
  881. Begin
  882. z2 := a0 shl negCount;
  883. z1 := a0 shr ( count AND 31 );
  884. End
  885. else
  886. Begin
  887. if count = 64 then
  888. z2 := a0
  889. else
  890. z2 := bits32(a0 <> 0);
  891. z1 := 0;
  892. End;
  893. End;
  894. z0 := 0;
  895. End;
  896. z2 := z2 or bits32( a2 <> 0 );
  897. End;
  898. z2Ptr := z2;
  899. z1Ptr := z1;
  900. z0Ptr := z0;
  901. End;
  902. {*
  903. -------------------------------------------------------------------------------
  904. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  905. number of bits given in `count'. Any bits shifted off are lost. The value
  906. of `count' must be less than 32. The result is broken into two 32-bit
  907. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  908. -------------------------------------------------------------------------------
  909. *}
  910. Procedure
  911. shortShift64Left(
  912. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  913. Begin
  914. z1Ptr := a1 shl count;
  915. if count = 0 then
  916. z0Ptr := a0
  917. else
  918. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  919. End;
  920. {*
  921. -------------------------------------------------------------------------------
  922. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  923. by the number of bits given in `count'. Any bits shifted off are lost.
  924. The value of `count' must be less than 32. The result is broken into three
  925. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  926. `z1Ptr', and `z2Ptr'.
  927. -------------------------------------------------------------------------------
  928. *}
  929. Procedure
  930. shortShift96Left(
  931. a0: bits32;
  932. a1: bits32;
  933. a2: bits32;
  934. count: int16;
  935. VAR z0Ptr: bits32;
  936. VAR z1Ptr: bits32;
  937. VAR z2Ptr: bits32
  938. );
  939. Var
  940. z0, z1, z2: bits32;
  941. negCount: int8;
  942. Begin
  943. z2 := a2 shl count;
  944. z1 := a1 shl count;
  945. z0 := a0 shl count;
  946. if ( 0 < count ) then
  947. Begin
  948. negCount := ( ( - count ) AND 31 );
  949. z1 := z1 or (a2 shr negCount);
  950. z0 := z0 or (a1 shr negCount);
  951. End;
  952. z2Ptr := z2;
  953. z1Ptr := z1;
  954. z0Ptr := z0;
  955. End;
  956. {*----------------------------------------------------------------------------
  957. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  958. | number of bits given in `count'. Any bits shifted off are lost. The value
  959. | of `count' must be less than 64. The result is broken into two 64-bit
  960. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  961. *----------------------------------------------------------------------------*}
  962. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  963. begin
  964. z1Ptr := a1 shl count;
  965. if count=0 then
  966. z0Ptr:=a0
  967. else
  968. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  969. end;
  970. {*
  971. -------------------------------------------------------------------------------
  972. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  973. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  974. any carry out is lost. The result is broken into two 32-bit pieces which
  975. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  976. -------------------------------------------------------------------------------
  977. *}
  978. Procedure
  979. add64(
  980. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  981. Var
  982. z1: bits32;
  983. Begin
  984. z1 := a1 + b1;
  985. z1Ptr := z1;
  986. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  987. End;
  988. {*
  989. -------------------------------------------------------------------------------
  990. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  991. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  992. modulo 2^96, so any carry out is lost. The result is broken into three
  993. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  994. `z1Ptr', and `z2Ptr'.
  995. -------------------------------------------------------------------------------
  996. *}
  997. Procedure
  998. add96(
  999. a0: bits32;
  1000. a1: bits32;
  1001. a2: bits32;
  1002. b0: bits32;
  1003. b1: bits32;
  1004. b2: bits32;
  1005. VAR z0Ptr: bits32;
  1006. VAR z1Ptr: bits32;
  1007. VAR z2Ptr: bits32
  1008. );
  1009. var
  1010. z0, z1, z2: bits32;
  1011. carry0, carry1: int8;
  1012. Begin
  1013. z2 := a2 + b2;
  1014. carry1 := int8( z2 < a2 );
  1015. z1 := a1 + b1;
  1016. carry0 := int8( z1 < a1 );
  1017. z0 := a0 + b0;
  1018. z1 := z1 + carry1;
  1019. z0 := z0 + bits32( z1 < carry1 );
  1020. z0 := z0 + carry0;
  1021. z2Ptr := z2;
  1022. z1Ptr := z1;
  1023. z0Ptr := z0;
  1024. End;
  1025. {*----------------------------------------------------------------------------
  1026. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1027. | by the number of bits given in `count'. Any bits shifted off are lost.
  1028. | The value of `count' must be less than 64. The result is broken into three
  1029. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1030. | `z1Ptr', and `z2Ptr'.
  1031. *----------------------------------------------------------------------------*}
  1032. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1033. var
  1034. z0, z1, z2 : bits64;
  1035. negCount : int8;
  1036. begin
  1037. z2 := a2 shl count;
  1038. z1 := a1 shl count;
  1039. z0 := a0 shl count;
  1040. if ( 0 < count ) then
  1041. begin
  1042. negCount := ( ( - count ) and 63 );
  1043. z1 := z1 or (a2 shr negCount);
  1044. z0 := z0 or (a1 shr negCount);
  1045. end;
  1046. z2Ptr := z2;
  1047. z1Ptr := z1;
  1048. z0Ptr := z0;
  1049. end;
  1050. {*----------------------------------------------------------------------------
  1051. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1052. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1053. | any carry out is lost. The result is broken into two 64-bit pieces which
  1054. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1055. *----------------------------------------------------------------------------*}
  1056. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1057. var
  1058. z1 : bits64;
  1059. begin
  1060. z1 := a1 + b1;
  1061. z1Ptr := z1;
  1062. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1063. end;
  1064. {*----------------------------------------------------------------------------
  1065. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1066. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1067. | modulo 2^192, so any carry out is lost. The result is broken into three
  1068. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1069. | `z1Ptr', and `z2Ptr'.
  1070. *----------------------------------------------------------------------------*}
  1071. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1072. var
  1073. z0, z1, z2 : bits64;
  1074. carry0, carry1 : int8;
  1075. begin
  1076. z2 := a2 + b2;
  1077. carry1 := ord( z2 < a2 );
  1078. z1 := a1 + b1;
  1079. carry0 := ord( z1 < a1 );
  1080. z0 := a0 + b0;
  1081. inc(z1, carry1);
  1082. inc(z0, ord( z1 < carry1 ));
  1083. inc(z0, carry0);
  1084. z2Ptr := z2;
  1085. z1Ptr := z1;
  1086. z0Ptr := z0;
  1087. end;
  1088. {*
  1089. -------------------------------------------------------------------------------
  1090. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1091. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1092. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1093. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1094. `z1Ptr'.
  1095. -------------------------------------------------------------------------------
  1096. *}
  1097. Procedure
  1098. sub64(
  1099. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1100. Begin
  1101. z1Ptr := a1 - b1;
  1102. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1103. End;
  1104. {*
  1105. -------------------------------------------------------------------------------
  1106. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1107. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1108. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1109. into three 32-bit pieces which are stored at the locations pointed to by
  1110. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1111. -------------------------------------------------------------------------------
  1112. *}
  1113. Procedure
  1114. sub96(
  1115. a0:bits32;
  1116. a1:bits32;
  1117. a2:bits32;
  1118. b0:bits32;
  1119. b1:bits32;
  1120. b2:bits32;
  1121. VAR z0Ptr:bits32;
  1122. VAR z1Ptr:bits32;
  1123. VAR z2Ptr:bits32
  1124. );
  1125. Var
  1126. z0, z1, z2: bits32;
  1127. borrow0, borrow1: int8;
  1128. Begin
  1129. z2 := a2 - b2;
  1130. borrow1 := int8( a2 < b2 );
  1131. z1 := a1 - b1;
  1132. borrow0 := int8( a1 < b1 );
  1133. z0 := a0 - b0;
  1134. z0 := z0 - bits32( z1 < borrow1 );
  1135. z1 := z1 - borrow1;
  1136. z0 := z0 -borrow0;
  1137. z2Ptr := z2;
  1138. z1Ptr := z1;
  1139. z0Ptr := z0;
  1140. End;
  1141. {*----------------------------------------------------------------------------
  1142. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1143. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1144. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1145. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1146. | `z1Ptr'.
  1147. *----------------------------------------------------------------------------*}
  1148. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1149. begin
  1150. z1Ptr := a1 - b1;
  1151. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1152. end;
  1153. {*----------------------------------------------------------------------------
  1154. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1155. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1156. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1157. | result is broken into three 64-bit pieces which are stored at the locations
  1158. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1159. *----------------------------------------------------------------------------*}
  1160. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1161. var
  1162. z0, z1, z2 : bits64;
  1163. borrow0, borrow1 : int8;
  1164. begin
  1165. z2 := a2 - b2;
  1166. borrow1 := ord( a2 < b2 );
  1167. z1 := a1 - b1;
  1168. borrow0 := ord( a1 < b1 );
  1169. z0 := a0 - b0;
  1170. dec(z0, ord( z1 < borrow1 ));
  1171. dec(z1, borrow1);
  1172. dec(z0, borrow0);
  1173. z2Ptr := z2;
  1174. z1Ptr := z1;
  1175. z0Ptr := z0;
  1176. end;
  1177. {*
  1178. -------------------------------------------------------------------------------
  1179. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1180. into two 32-bit pieces which are stored at the locations pointed to by
  1181. `z0Ptr' and `z1Ptr'.
  1182. -------------------------------------------------------------------------------
  1183. *}
  1184. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1185. :bits32 );
  1186. Var
  1187. aHigh, aLow, bHigh, bLow: bits16;
  1188. z0, zMiddleA, zMiddleB, z1: bits32;
  1189. Begin
  1190. aLow := a and $ffff;
  1191. aHigh := a shr 16;
  1192. bLow := b and $ffff;
  1193. bHigh := b shr 16;
  1194. z1 := ( bits32( aLow) ) * bLow;
  1195. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1196. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1197. z0 := ( bits32 (aHigh) ) * bHigh;
  1198. zMiddleA := zMiddleA + zMiddleB;
  1199. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1200. zMiddleA := zmiddleA shl 16;
  1201. z1 := z1 + zMiddleA;
  1202. z0 := z0 + bits32( z1 < zMiddleA );
  1203. z1Ptr := z1;
  1204. z0Ptr := z0;
  1205. End;
  1206. {*
  1207. -------------------------------------------------------------------------------
  1208. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1209. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1210. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1211. `z2Ptr'.
  1212. -------------------------------------------------------------------------------
  1213. *}
  1214. Procedure
  1215. mul64By32To96(
  1216. a0:bits32;
  1217. a1:bits32;
  1218. b:bits32;
  1219. VAR z0Ptr:bits32;
  1220. VAR z1Ptr:bits32;
  1221. VAR z2Ptr:bits32
  1222. );
  1223. Var
  1224. z0, z1, z2, more1: bits32;
  1225. Begin
  1226. mul32To64( a1, b, z1, z2 );
  1227. mul32To64( a0, b, z0, more1 );
  1228. add64( z0, more1, 0, z1, z0, z1 );
  1229. z2Ptr := z2;
  1230. z1Ptr := z1;
  1231. z0Ptr := z0;
  1232. End;
  1233. {*
  1234. -------------------------------------------------------------------------------
  1235. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1236. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1237. product. The product is broken into four 32-bit pieces which are stored at
  1238. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1239. -------------------------------------------------------------------------------
  1240. *}
  1241. Procedure
  1242. mul64To128(
  1243. a0:bits32;
  1244. a1:bits32;
  1245. b0:bits32;
  1246. b1:bits32;
  1247. VAR z0Ptr:bits32;
  1248. VAR z1Ptr:bits32;
  1249. VAR z2Ptr:bits32;
  1250. VAR z3Ptr:bits32
  1251. );
  1252. Var
  1253. z0, z1, z2, z3: bits32;
  1254. more1, more2: bits32;
  1255. Begin
  1256. mul32To64( a1, b1, z2, z3 );
  1257. mul32To64( a1, b0, z1, more2 );
  1258. add64( z1, more2, 0, z2, z1, z2 );
  1259. mul32To64( a0, b0, z0, more1 );
  1260. add64( z0, more1, 0, z1, z0, z1 );
  1261. mul32To64( a0, b1, more1, more2 );
  1262. add64( more1, more2, 0, z2, more1, z2 );
  1263. add64( z0, z1, 0, more1, z0, z1 );
  1264. z3Ptr := z3;
  1265. z2Ptr := z2;
  1266. z1Ptr := z1;
  1267. z0Ptr := z0;
  1268. End;
  1269. {*----------------------------------------------------------------------------
  1270. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1271. | into two 64-bit pieces which are stored at the locations pointed to by
  1272. | `z0Ptr' and `z1Ptr'.
  1273. *----------------------------------------------------------------------------*}
  1274. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1275. var
  1276. aHigh, aLow, bHigh, bLow : bits32;
  1277. z0, zMiddleA, zMiddleB, z1 : bits64;
  1278. begin
  1279. aLow := a;
  1280. aHigh := a shr 32;
  1281. bLow := b;
  1282. bHigh := b shr 32;
  1283. z1 := ( bits64(aLow) ) * bLow;
  1284. zMiddleA := ( bits64( aLow )) * bHigh;
  1285. zMiddleB := ( bits64( aHigh )) * bLow;
  1286. z0 := ( bits64(aHigh) ) * bHigh;
  1287. inc(zMiddleA, zMiddleB);
  1288. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1289. zMiddleA := zMiddleA shl 32;
  1290. inc(z1, zMiddleA);
  1291. inc(z0, ord( z1 < zMiddleA ));
  1292. z1Ptr := z1;
  1293. z0Ptr := z0;
  1294. end;
  1295. {*----------------------------------------------------------------------------
  1296. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1297. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1298. | product. The product is broken into four 64-bit pieces which are stored at
  1299. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1300. *----------------------------------------------------------------------------*}
  1301. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1302. var
  1303. z0,z1,z2,z3,more1,more2 : bits64;
  1304. begin
  1305. mul64To128( a1, b1, z2, z3 );
  1306. mul64To128( a1, b0, z1, more2 );
  1307. add128( z1, more2, 0, z2, z1, z2 );
  1308. mul64To128( a0, b0, z0, more1 );
  1309. add128( z0, more1, 0, z1, z0, z1 );
  1310. mul64To128( a0, b1, more1, more2 );
  1311. add128( more1, more2, 0, z2, more1, z2 );
  1312. add128( z0, z1, 0, more1, z0, z1 );
  1313. z3Ptr := z3;
  1314. z2Ptr := z2;
  1315. z1Ptr := z1;
  1316. z0Ptr := z0;
  1317. end;
  1318. {*----------------------------------------------------------------------------
  1319. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1320. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1321. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1322. | `z2Ptr'.
  1323. *----------------------------------------------------------------------------*}
  1324. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1325. var
  1326. z0, z1, z2, more1 : bits64;
  1327. begin
  1328. mul64To128( a1, b, z1, z2 );
  1329. mul64To128( a0, b, z0, more1 );
  1330. add128( z0, more1, 0, z1, z0, z1 );
  1331. z2Ptr := z2;
  1332. z1Ptr := z1;
  1333. z0Ptr := z0;
  1334. end;
  1335. {*----------------------------------------------------------------------------
  1336. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1337. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1338. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1339. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1340. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1341. | unsigned integer is returned.
  1342. *----------------------------------------------------------------------------*}
  1343. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1344. var
  1345. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1346. begin
  1347. if ( b <= a0 ) then
  1348. begin
  1349. result:=qword( $FFFFFFFFFFFFFFFF );
  1350. exit;
  1351. end;
  1352. b0 := b shr 32;
  1353. if ( b0 shl 32 <= a0 ) then
  1354. z:=qword( $FFFFFFFF00000000 )
  1355. else
  1356. z:=( a0 div b0 ) shl 32;
  1357. mul64To128( b, z, term0, term1 );
  1358. sub128( a0, a1, term0, term1, rem0, rem1 );
  1359. while ( ( sbits64(rem0) ) < 0 ) do begin
  1360. dec(z,qword( $100000000 ));
  1361. b1 := b shl 32;
  1362. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1363. end;
  1364. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1365. if ( b0 shl 32 <= rem0 ) then
  1366. z:=z or $FFFFFFFF
  1367. else
  1368. z:=z or rem0 div b0;
  1369. result:=z;
  1370. end;
  1371. {*
  1372. -------------------------------------------------------------------------------
  1373. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1374. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1375. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1376. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1377. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1378. unsigned integer is returned.
  1379. -------------------------------------------------------------------------------
  1380. *}
  1381. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1382. Var
  1383. b0, b1: bits32;
  1384. rem0, rem1, term0, term1: bits32;
  1385. z: bits32;
  1386. Begin
  1387. if ( b <= a0 ) then
  1388. Begin
  1389. estimateDiv64To32 := $FFFFFFFF;
  1390. exit;
  1391. End;
  1392. b0 := b shr 16;
  1393. if ( b0 shl 16 <= a0 ) then
  1394. z:= $FFFF0000
  1395. else
  1396. z:= ( a0 div b0 ) shl 16;
  1397. mul32To64( b, z, term0, term1 );
  1398. sub64( a0, a1, term0, term1, rem0, rem1 );
  1399. while ( ( sbits32 (rem0) ) < 0 ) do
  1400. Begin
  1401. z := z - $10000;
  1402. b1 := b shl 16;
  1403. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1404. End;
  1405. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1406. if ( b0 shl 16 <= rem0 ) then
  1407. z := z or $FFFF
  1408. else
  1409. z := z or (rem0 div b0);
  1410. estimateDiv64To32 := z;
  1411. End;
  1412. {*
  1413. -------------------------------------------------------------------------------
  1414. Returns an approximation to the square root of the 32-bit significand given
  1415. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1416. `aExp' (the least significant bit) is 1, the integer returned approximates
  1417. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1418. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1419. case, the approximation returned lies strictly within +/-2 of the exact
  1420. value.
  1421. -------------------------------------------------------------------------------
  1422. *}
  1423. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1424. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1425. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1426. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1427. );
  1428. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1429. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1430. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1431. );
  1432. Var
  1433. index: int8;
  1434. z: bits32;
  1435. Begin
  1436. index := ( a shr 27 ) AND 15;
  1437. if ( aExp AND 1 ) <> 0 then
  1438. Begin
  1439. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1440. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1441. a := a shr 1;
  1442. End
  1443. else
  1444. Begin
  1445. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1446. z := a div z + z;
  1447. if ( $20000 <= z ) then
  1448. z := $FFFF8000
  1449. else
  1450. z := ( z shl 15 );
  1451. if ( z <= a ) then
  1452. Begin
  1453. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1454. exit;
  1455. End;
  1456. End;
  1457. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1458. End;
  1459. {*
  1460. -------------------------------------------------------------------------------
  1461. Returns the number of leading 0 bits before the most-significant 1 bit of
  1462. `a'. If `a' is zero, 32 is returned.
  1463. -------------------------------------------------------------------------------
  1464. *}
  1465. Function countLeadingZeros32( a:bits32 ): int8;
  1466. const countLeadingZerosHigh:array[0..255] of int8 = (
  1467. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1468. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1469. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1470. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1471. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1472. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1473. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1474. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1475. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1476. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1477. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1478. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1479. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1480. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1481. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1482. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1483. );
  1484. Var
  1485. shiftCount: int8;
  1486. Begin
  1487. shiftCount := 0;
  1488. if ( a < $10000 ) then
  1489. Begin
  1490. shiftCount := shiftcount + 16;
  1491. a := a shl 16;
  1492. End;
  1493. if ( a < $1000000 ) then
  1494. Begin
  1495. shiftCount := shiftcount + 8;
  1496. a := a shl 8;
  1497. end;
  1498. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1499. countLeadingZeros32:= shiftCount;
  1500. End;
  1501. {*----------------------------------------------------------------------------
  1502. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1503. | `a'. If `a' is zero, 64 is returned.
  1504. *----------------------------------------------------------------------------*}
  1505. function countLeadingZeros64( a : bits64): int8;
  1506. var
  1507. shiftcount : int8;
  1508. Begin
  1509. shiftCount := 0;
  1510. if ( a < bits64(bits64(1) shl 32 )) then
  1511. shiftCount := shiftcount + 32
  1512. else
  1513. a := a shr 32;
  1514. shiftCount := shiftCount + countLeadingZeros32( a );
  1515. countLeadingZeros64:= shiftCount;
  1516. End;
  1517. {*
  1518. -------------------------------------------------------------------------------
  1519. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1520. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1521. returns 0.
  1522. -------------------------------------------------------------------------------
  1523. *}
  1524. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1525. Begin
  1526. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1527. End;
  1528. {*
  1529. -------------------------------------------------------------------------------
  1530. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1531. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1532. Otherwise, returns 0.
  1533. -------------------------------------------------------------------------------
  1534. *}
  1535. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1536. Begin
  1537. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1538. End;
  1539. {*
  1540. -------------------------------------------------------------------------------
  1541. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1542. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1543. returns 0.
  1544. -------------------------------------------------------------------------------
  1545. *}
  1546. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1547. Begin
  1548. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1549. End;
  1550. {*
  1551. -------------------------------------------------------------------------------
  1552. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1553. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1554. returns 0.
  1555. -------------------------------------------------------------------------------
  1556. *}
  1557. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1558. Begin
  1559. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1560. End;
  1561. const
  1562. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1563. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1564. (*****************************************************************************)
  1565. (* End Low-Level arithmetic *)
  1566. (*****************************************************************************)
  1567. {*
  1568. -------------------------------------------------------------------------------
  1569. Functions and definitions to determine: (1) whether tininess for underflow
  1570. is detected before or after rounding by default, (2) what (if anything)
  1571. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1572. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1573. are propagated from function inputs to output. These details are ENDIAN
  1574. specific
  1575. -------------------------------------------------------------------------------
  1576. *}
  1577. {$IFDEF ENDIAN_LITTLE}
  1578. {*
  1579. -------------------------------------------------------------------------------
  1580. Internal canonical NaN format.
  1581. -------------------------------------------------------------------------------
  1582. *}
  1583. TYPE
  1584. commonNaNT = packed record
  1585. sign: flag;
  1586. high, low : bits32;
  1587. end;
  1588. {*
  1589. -------------------------------------------------------------------------------
  1590. The pattern for a default generated single-precision NaN.
  1591. -------------------------------------------------------------------------------
  1592. *}
  1593. const float32_default_nan = $FFC00000;
  1594. {*
  1595. -------------------------------------------------------------------------------
  1596. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1597. otherwise returns 0.
  1598. -------------------------------------------------------------------------------
  1599. *}
  1600. Function float32_is_nan( a : float32 ): flag;
  1601. Begin
  1602. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1603. End;
  1604. {*
  1605. -------------------------------------------------------------------------------
  1606. Returns 1 if the single-precision floating-point value `a' is a signaling
  1607. NaN; otherwise returns 0.
  1608. -------------------------------------------------------------------------------
  1609. *}
  1610. Function float32_is_signaling_nan( a : float32 ): flag;
  1611. Begin
  1612. float32_is_signaling_nan := flag
  1613. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1614. End;
  1615. {*
  1616. -------------------------------------------------------------------------------
  1617. Returns the result of converting the single-precision floating-point NaN
  1618. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1619. exception is raised.
  1620. -------------------------------------------------------------------------------
  1621. *}
  1622. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1623. var
  1624. z : commonNaNT ;
  1625. Begin
  1626. if ( float32_is_signaling_nan( a ) <> 0) then
  1627. float_raise( float_flag_invalid );
  1628. z.sign := a shr 31;
  1629. z.low := 0;
  1630. z.high := a shl 9;
  1631. c := z;
  1632. End;
  1633. {*
  1634. -------------------------------------------------------------------------------
  1635. Returns the result of converting the canonical NaN `a' to the single-
  1636. precision floating-point format.
  1637. -------------------------------------------------------------------------------
  1638. *}
  1639. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1640. Begin
  1641. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1642. End;
  1643. {*
  1644. -------------------------------------------------------------------------------
  1645. Takes two single-precision floating-point values `a' and `b', one of which
  1646. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1647. signaling NaN, the invalid exception is raised.
  1648. -------------------------------------------------------------------------------
  1649. *}
  1650. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1651. Var
  1652. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1653. label returnLargerSignificand;
  1654. Begin
  1655. aIsNaN := float32_is_nan( a );
  1656. aIsSignalingNaN := float32_is_signaling_nan( a );
  1657. bIsNaN := float32_is_nan( b );
  1658. bIsSignalingNaN := float32_is_signaling_nan( b );
  1659. a := a or $00400000;
  1660. b := b or $00400000;
  1661. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1662. float_raise( float_flag_invalid );
  1663. if ( aIsSignalingNaN )<> 0 then
  1664. Begin
  1665. if ( bIsSignalingNaN ) <> 0 then
  1666. goto returnLargerSignificand;
  1667. if bIsNan <> 0 then
  1668. propagateFloat32NaN := b
  1669. else
  1670. propagateFloat32NaN := a;
  1671. exit;
  1672. End
  1673. else if ( aIsNaN <> 0) then
  1674. Begin
  1675. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1676. Begin
  1677. propagateFloat32NaN := a;
  1678. exit;
  1679. End;
  1680. returnLargerSignificand:
  1681. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1682. Begin
  1683. propagateFloat32NaN := b;
  1684. exit;
  1685. End;
  1686. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1687. Begin
  1688. propagateFloat32NaN := a;
  1689. End;
  1690. if a < b then
  1691. propagateFloat32NaN := a
  1692. else
  1693. propagateFloat32NaN := b;
  1694. exit;
  1695. End
  1696. else
  1697. Begin
  1698. propagateFloat32NaN := b;
  1699. exit;
  1700. End;
  1701. End;
  1702. {*
  1703. -------------------------------------------------------------------------------
  1704. The pattern for a default generated double-precision NaN. The `high' and
  1705. `low' values hold the most- and least-significant bits, respectively.
  1706. -------------------------------------------------------------------------------
  1707. *}
  1708. const
  1709. float64_default_nan_high = $FFF80000;
  1710. float64_default_nan_low = $00000000;
  1711. {*
  1712. -------------------------------------------------------------------------------
  1713. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1714. otherwise returns 0.
  1715. -------------------------------------------------------------------------------
  1716. *}
  1717. Function float64_is_nan( a : float64 ) : flag;
  1718. Begin
  1719. float64_is_nan :=
  1720. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1721. and ( a.low or ( a.high and $000FFFFF ) );
  1722. End;
  1723. {*
  1724. -------------------------------------------------------------------------------
  1725. Returns 1 if the double-precision floating-point value `a' is a signaling
  1726. NaN; otherwise returns 0.
  1727. -------------------------------------------------------------------------------
  1728. *}
  1729. Function float64_is_signaling_nan( a : float64 ): flag;
  1730. Begin
  1731. float64_is_signaling_nan :=
  1732. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1733. and ( a.low or ( a.high and $0007FFFF ) );
  1734. End;
  1735. {*
  1736. -------------------------------------------------------------------------------
  1737. Returns the result of converting the double-precision floating-point NaN
  1738. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1739. exception is raised.
  1740. -------------------------------------------------------------------------------
  1741. *}
  1742. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1743. Var
  1744. z : commonNaNT;
  1745. Begin
  1746. if ( float64_is_signaling_nan( a )<>0 ) then
  1747. float_raise( float_flag_invalid );
  1748. z.sign := a.high shr 31;
  1749. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1750. c := z;
  1751. End;
  1752. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1753. Var
  1754. z : commonNaNT;
  1755. Begin
  1756. if ( float64_is_signaling_nan( a )<>0 ) then
  1757. float_raise( float_flag_invalid );
  1758. z.sign := a.high shr 31;
  1759. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1760. result := z;
  1761. End;
  1762. {*
  1763. -------------------------------------------------------------------------------
  1764. Returns the result of converting the canonical NaN `a' to the double-
  1765. precision floating-point format.
  1766. -------------------------------------------------------------------------------
  1767. *}
  1768. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1769. Var
  1770. z: float64;
  1771. Begin
  1772. shift64Right( a.high, a.low, 12, z.high, z.low );
  1773. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1774. c := z;
  1775. End;
  1776. {*
  1777. -------------------------------------------------------------------------------
  1778. Takes two double-precision floating-point values `a' and `b', one of which
  1779. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1780. signaling NaN, the invalid exception is raised.
  1781. -------------------------------------------------------------------------------
  1782. *}
  1783. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1784. Var
  1785. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1786. label returnLargerSignificand;
  1787. Begin
  1788. aIsNaN := float64_is_nan( a );
  1789. aIsSignalingNaN := float64_is_signaling_nan( a );
  1790. bIsNaN := float64_is_nan( b );
  1791. bIsSignalingNaN := float64_is_signaling_nan( b );
  1792. a.high := a.high or $00080000;
  1793. b.high := b.high or $00080000;
  1794. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1795. float_raise( float_flag_invalid );
  1796. if ( aIsSignalingNaN )<>0 then
  1797. Begin
  1798. if ( bIsSignalingNaN )<>0 then
  1799. goto returnLargerSignificand;
  1800. if bIsNan <> 0 then
  1801. c := b
  1802. else
  1803. c := a;
  1804. exit;
  1805. End
  1806. else if ( aIsNaN )<> 0 then
  1807. Begin
  1808. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1809. Begin
  1810. c := a;
  1811. exit;
  1812. End;
  1813. returnLargerSignificand:
  1814. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1815. Begin
  1816. c := b;
  1817. exit;
  1818. End;
  1819. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1820. Begin
  1821. c := a;
  1822. exit;
  1823. End;
  1824. if a.high < b.high then
  1825. c := a
  1826. else
  1827. c := b;
  1828. exit;
  1829. End
  1830. else
  1831. Begin
  1832. c := b;
  1833. exit;
  1834. End;
  1835. End;
  1836. {*----------------------------------------------------------------------------
  1837. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1838. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1839. | returns 0.
  1840. *----------------------------------------------------------------------------*}
  1841. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1842. begin
  1843. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1844. end;
  1845. {*----------------------------------------------------------------------------
  1846. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1847. | otherwise returns 0.
  1848. *----------------------------------------------------------------------------*}
  1849. function float128_is_nan( a : float128): flag;
  1850. begin
  1851. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1852. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1853. end;
  1854. {*----------------------------------------------------------------------------
  1855. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1856. | signaling NaN; otherwise returns 0.
  1857. *----------------------------------------------------------------------------*}
  1858. function float128_is_signaling_nan( a : float128): flag;
  1859. begin
  1860. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1861. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1862. end;
  1863. {*----------------------------------------------------------------------------
  1864. | Returns the result of converting the quadruple-precision floating-point NaN
  1865. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1866. | exception is raised.
  1867. *----------------------------------------------------------------------------*}
  1868. function float128ToCommonNaN( a : float128): commonNaNT;
  1869. var
  1870. z: commonNaNT;
  1871. qhigh,qlow : qword;
  1872. begin
  1873. if ( float128_is_signaling_nan( a )<>0) then
  1874. float_raise( float_flag_invalid );
  1875. z.sign := a.high shr 63;
  1876. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1877. z.high:=qhigh shr 32;
  1878. z.low:=qhigh and $ffffffff;
  1879. result:=z;
  1880. end;
  1881. {*----------------------------------------------------------------------------
  1882. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1883. | precision floating-point format.
  1884. *----------------------------------------------------------------------------*}
  1885. function commonNaNToFloat128( a : commonNaNT): float128;
  1886. var
  1887. z: float128;
  1888. begin
  1889. shift128Right( a.high, a.low, 16, z.high, z.low );
  1890. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1891. result:=z;
  1892. end;
  1893. {*----------------------------------------------------------------------------
  1894. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1895. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1896. | `b' is a signaling NaN, the invalid exception is raised.
  1897. *----------------------------------------------------------------------------*}
  1898. function propagateFloat128NaN( a: float128; b : float128): float128;
  1899. var
  1900. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1901. label
  1902. returnLargerSignificand;
  1903. begin
  1904. aIsNaN := float128_is_nan( a );
  1905. aIsSignalingNaN := float128_is_signaling_nan( a );
  1906. bIsNaN := float128_is_nan( b );
  1907. bIsSignalingNaN := float128_is_signaling_nan( b );
  1908. a.high := a.high or int64( $0000800000000000 );
  1909. b.high := b.high or int64( $0000800000000000 );
  1910. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1911. float_raise( float_flag_invalid );
  1912. if ( aIsSignalingNaN )<>0 then
  1913. begin
  1914. if ( bIsSignalingNaN )<>0 then
  1915. goto returnLargerSignificand;
  1916. if bIsNaN<>0 then
  1917. result := b
  1918. else
  1919. result := a;
  1920. exit;
  1921. end
  1922. else if ( aIsNaN )<>0 then
  1923. begin
  1924. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1925. begin
  1926. result := a;
  1927. exit;
  1928. end;
  1929. returnLargerSignificand:
  1930. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1931. begin
  1932. result := b;
  1933. exit;
  1934. end;
  1935. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1936. begin
  1937. result := a;
  1938. exit
  1939. end;
  1940. if ( a.high < b.high ) then
  1941. result := a
  1942. else
  1943. result := b;
  1944. exit;
  1945. end
  1946. else
  1947. result:=b;
  1948. end;
  1949. {$ELSE}
  1950. { Big endian code }
  1951. (*----------------------------------------------------------------------------
  1952. | Internal canonical NaN format.
  1953. *----------------------------------------------------------------------------*)
  1954. type
  1955. commonNANT = packed record
  1956. sign : flag;
  1957. high, low : bits32;
  1958. end;
  1959. (*----------------------------------------------------------------------------
  1960. | The pattern for a default generated single-precision NaN.
  1961. *----------------------------------------------------------------------------*)
  1962. const float32_default_nan = $7FFFFFFF;
  1963. (*----------------------------------------------------------------------------
  1964. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1965. | otherwise returns 0.
  1966. *----------------------------------------------------------------------------*)
  1967. function float32_is_nan(a: float32): flag;
  1968. begin
  1969. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1970. end;
  1971. (*----------------------------------------------------------------------------
  1972. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1973. | NaN; otherwise returns 0.
  1974. *----------------------------------------------------------------------------*)
  1975. function float32_is_signaling_nan(a: float32):flag;
  1976. begin
  1977. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1978. end;
  1979. (*----------------------------------------------------------------------------
  1980. | Returns the result of converting the single-precision floating-point NaN
  1981. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1982. | exception is raised.
  1983. *----------------------------------------------------------------------------*)
  1984. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1985. var
  1986. z: commonNANT;
  1987. begin
  1988. if float32_is_signaling_nan(a)<>0 then
  1989. float_raise(float_flag_invalid);
  1990. z.sign := a shr 31;
  1991. z.low := 0;
  1992. z.high := a shl 9;
  1993. c:=z;
  1994. end;
  1995. (*----------------------------------------------------------------------------
  1996. | Returns the result of converting the canonical NaN `a' to the single-
  1997. | precision floating-point format.
  1998. *----------------------------------------------------------------------------*)
  1999. function CommonNanToFloat32(a : CommonNaNT): float32;
  2000. begin
  2001. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2002. end;
  2003. (*----------------------------------------------------------------------------
  2004. | Takes two single-precision floating-point values `a' and `b', one of which
  2005. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2006. | signaling NaN, the invalid exception is raised.
  2007. *----------------------------------------------------------------------------*)
  2008. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2009. var
  2010. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2011. begin
  2012. aIsNaN := float32_is_nan( a );
  2013. aIsSignalingNaN := float32_is_signaling_nan( a );
  2014. bIsNaN := float32_is_nan( b );
  2015. bIsSignalingNaN := float32_is_signaling_nan( b );
  2016. a := a or $00400000;
  2017. b := b or $00400000;
  2018. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2019. float_raise( float_flag_invalid );
  2020. if bIsSignalingNaN<>0 then
  2021. propagateFloat32Nan := b
  2022. else if aIsSignalingNan<>0 then
  2023. propagateFloat32Nan := a
  2024. else if bIsNan<>0 then
  2025. propagateFloat32Nan := b
  2026. else
  2027. propagateFloat32Nan := a;
  2028. end;
  2029. (*----------------------------------------------------------------------------
  2030. | The pattern for a default generated double-precision NaN. The `high' and
  2031. | `low' values hold the most- and least-significant bits, respectively.
  2032. *----------------------------------------------------------------------------*)
  2033. const
  2034. float64_default_nan_high = $7FFFFFFF;
  2035. float64_default_nan_low = $FFFFFFFF;
  2036. (*----------------------------------------------------------------------------
  2037. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2038. | otherwise returns 0.
  2039. *----------------------------------------------------------------------------*)
  2040. function float64_is_nan(a: float64): flag;
  2041. begin
  2042. float64_is_nan := flag (
  2043. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2044. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2045. end;
  2046. (*----------------------------------------------------------------------------
  2047. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2048. | NaN; otherwise returns 0.
  2049. *----------------------------------------------------------------------------*)
  2050. function float64_is_signaling_nan( a:float64): flag;
  2051. begin
  2052. float64_is_signaling_nan := flag(
  2053. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2054. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2055. end;
  2056. (*----------------------------------------------------------------------------
  2057. | Returns the result of converting the double-precision floating-point NaN
  2058. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2059. | exception is raised.
  2060. *----------------------------------------------------------------------------*)
  2061. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2062. var
  2063. z : commonNaNT;
  2064. begin
  2065. if ( float64_is_signaling_nan( a )<>0 ) then
  2066. float_raise( float_flag_invalid );
  2067. z.sign := a.high shr 31;
  2068. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2069. c:=z;
  2070. end;
  2071. (*----------------------------------------------------------------------------
  2072. | Returns the result of converting the canonical NaN `a' to the double-
  2073. | precision floating-point format.
  2074. *----------------------------------------------------------------------------*)
  2075. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2076. var
  2077. z: float64;
  2078. begin
  2079. shift64Right( a.high, a.low, 12, z.high, z.low );
  2080. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2081. c:=z;
  2082. end;
  2083. (*----------------------------------------------------------------------------
  2084. | Takes two double-precision floating-point values `a' and `b', one of which
  2085. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2086. | signaling NaN, the invalid exception is raised.
  2087. *----------------------------------------------------------------------------*)
  2088. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2089. var
  2090. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2091. begin
  2092. aIsNaN := float64_is_nan( a );
  2093. aIsSignalingNaN := float64_is_signaling_nan( a );
  2094. bIsNaN := float64_is_nan( b );
  2095. bIsSignalingNaN := float64_is_signaling_nan( b );
  2096. a.high := a.high or $00080000;
  2097. b.high := b.high or $00080000;
  2098. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2099. float_raise( float_flag_invalid );
  2100. if bIsSignalingNaN<>0 then
  2101. c := b
  2102. else if aIsSignalingNan<>0 then
  2103. c := a
  2104. else if bIsNan<>0 then
  2105. c := b
  2106. else
  2107. c := a;
  2108. end;
  2109. {$ENDIF}
  2110. (****************************************************************************)
  2111. (* END ENDIAN SPECIFIC CODE *)
  2112. (****************************************************************************)
  2113. {*
  2114. -------------------------------------------------------------------------------
  2115. Returns the fraction bits of the single-precision floating-point value `a'.
  2116. -------------------------------------------------------------------------------
  2117. *}
  2118. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2119. Begin
  2120. ExtractFloat32Frac := A AND $007FFFFF;
  2121. End;
  2122. {*
  2123. -------------------------------------------------------------------------------
  2124. Returns the exponent bits of the single-precision floating-point value `a'.
  2125. -------------------------------------------------------------------------------
  2126. *}
  2127. Function extractFloat32Exp( a: float32 ): Int16;
  2128. Begin
  2129. extractFloat32Exp := (a shr 23) AND $FF;
  2130. End;
  2131. {*
  2132. -------------------------------------------------------------------------------
  2133. Returns the sign bit of the single-precision floating-point value `a'.
  2134. -------------------------------------------------------------------------------
  2135. *}
  2136. Function extractFloat32Sign( a: float32 ): Flag;
  2137. Begin
  2138. extractFloat32Sign := a shr 31;
  2139. End;
  2140. {*
  2141. -------------------------------------------------------------------------------
  2142. Normalizes the subnormal single-precision floating-point value represented
  2143. by the denormalized significand `aSig'. The normalized exponent and
  2144. significand are stored at the locations pointed to by `zExpPtr' and
  2145. `zSigPtr', respectively.
  2146. -------------------------------------------------------------------------------
  2147. *}
  2148. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2149. Var
  2150. ShiftCount : BYTE;
  2151. Begin
  2152. shiftCount := countLeadingZeros32( aSig ) - 8;
  2153. zSigPtr := aSig shl shiftCount;
  2154. zExpPtr := 1 - shiftCount;
  2155. End;
  2156. {*
  2157. -------------------------------------------------------------------------------
  2158. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2159. single-precision floating-point value, returning the result. After being
  2160. shifted into the proper positions, the three fields are simply added
  2161. together to form the result. This means that any integer portion of `zSig'
  2162. will be added into the exponent. Since a properly normalized significand
  2163. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2164. than the desired result exponent whenever `zSig' is a complete, normalized
  2165. significand.
  2166. -------------------------------------------------------------------------------
  2167. *}
  2168. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2169. Begin
  2170. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2171. + zSig;
  2172. End;
  2173. {*
  2174. -------------------------------------------------------------------------------
  2175. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2176. and significand `zSig', and returns the proper single-precision floating-
  2177. point value corresponding to the abstract input. Ordinarily, the abstract
  2178. value is simply rounded and packed into the single-precision format, with
  2179. the inexact exception raised if the abstract input cannot be represented
  2180. exactly. However, if the abstract value is too large, the overflow and
  2181. inexact exceptions are raised and an infinity or maximal finite value is
  2182. returned. If the abstract value is too small, the input value is rounded to
  2183. a subnormal number, and the underflow and inexact exceptions are raised if
  2184. the abstract input cannot be represented exactly as a subnormal single-
  2185. precision floating-point number.
  2186. The input significand `zSig' has its binary point between bits 30
  2187. and 29, which is 7 bits to the left of the usual location. This shifted
  2188. significand must be normalized or smaller. If `zSig' is not normalized,
  2189. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2190. and it must not require rounding. In the usual case that `zSig' is
  2191. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2192. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2193. Binary Floating-Point Arithmetic.
  2194. -------------------------------------------------------------------------------
  2195. *}
  2196. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2197. Var
  2198. roundingMode : BYTE;
  2199. roundNearestEven : Flag;
  2200. roundIncrement, roundBits : BYTE;
  2201. IsTiny : Flag;
  2202. Begin
  2203. roundingMode := softfloat_rounding_mode;
  2204. if (roundingMode = float_round_nearest_even) then
  2205. Begin
  2206. roundNearestEven := Flag(TRUE);
  2207. end
  2208. else
  2209. roundNearestEven := Flag(FALSE);
  2210. roundIncrement := $40;
  2211. if ( Boolean(roundNearestEven) = FALSE) then
  2212. Begin
  2213. if ( roundingMode = float_round_to_zero ) Then
  2214. Begin
  2215. roundIncrement := 0;
  2216. End
  2217. else
  2218. Begin
  2219. roundIncrement := $7F;
  2220. if ( zSign <> 0 ) then
  2221. Begin
  2222. if roundingMode = float_round_up then roundIncrement := 0;
  2223. End
  2224. else
  2225. Begin
  2226. if roundingMode = float_round_down then roundIncrement := 0;
  2227. End;
  2228. End
  2229. End;
  2230. roundBits := zSig AND $7F;
  2231. if ($FD <= bits16 (zExp) ) then
  2232. Begin
  2233. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2234. Begin
  2235. float_raise( float_flag_overflow OR float_flag_inexact );
  2236. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2237. exit;
  2238. End;
  2239. if ( zExp < 0 ) then
  2240. Begin
  2241. isTiny :=
  2242. flag(( float_detect_tininess = float_tininess_before_rounding )
  2243. OR ( zExp < -1 )
  2244. OR ( (zSig + roundIncrement) < $80000000 ));
  2245. shift32RightJamming( zSig, - zExp, zSig );
  2246. zExp := 0;
  2247. roundBits := zSig AND $7F;
  2248. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2249. float_raise( float_flag_underflow );
  2250. End;
  2251. End;
  2252. if ( roundBits )<> 0 then
  2253. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2254. zSig := ( zSig + roundIncrement ) shr 7;
  2255. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2256. if ( zSig = 0 ) then zExp := 0;
  2257. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2258. exit;
  2259. End;
  2260. {*
  2261. -------------------------------------------------------------------------------
  2262. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2263. and significand `zSig', and returns the proper single-precision floating-
  2264. point value corresponding to the abstract input. This routine is just like
  2265. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2266. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2267. floating-point exponent.
  2268. -------------------------------------------------------------------------------
  2269. *}
  2270. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2271. Var
  2272. ShiftCount : int8;
  2273. Begin
  2274. shiftCount := countLeadingZeros32( zSig ) - 1;
  2275. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2276. End;
  2277. {*
  2278. -------------------------------------------------------------------------------
  2279. Returns the most-significant 20 fraction bits of the double-precision
  2280. floating-point value `a'.
  2281. -------------------------------------------------------------------------------
  2282. *}
  2283. Function extractFloat64Frac0(a: float64): bits32;
  2284. Begin
  2285. extractFloat64Frac0 := a.high and $000FFFFF;
  2286. End;
  2287. {*
  2288. -------------------------------------------------------------------------------
  2289. Returns the least-significant 32 fraction bits of the double-precision
  2290. floating-point value `a'.
  2291. -------------------------------------------------------------------------------
  2292. *}
  2293. Function extractFloat64Frac1(a: float64): bits32;
  2294. Begin
  2295. extractFloat64Frac1 := a.low;
  2296. End;
  2297. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2298. Function extractFloat64Frac(a: float64): bits64;
  2299. Begin
  2300. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2301. End;
  2302. {*
  2303. -------------------------------------------------------------------------------
  2304. Returns the exponent bits of the double-precision floating-point value `a'.
  2305. -------------------------------------------------------------------------------
  2306. *}
  2307. Function extractFloat64Exp(a: float64): int16;
  2308. Begin
  2309. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2310. End;
  2311. {*
  2312. -------------------------------------------------------------------------------
  2313. Returns the sign bit of the double-precision floating-point value `a'.
  2314. -------------------------------------------------------------------------------
  2315. *}
  2316. Function extractFloat64Sign(a: float64) : flag;
  2317. Begin
  2318. extractFloat64Sign := a.high shr 31;
  2319. End;
  2320. {*
  2321. -------------------------------------------------------------------------------
  2322. Normalizes the subnormal double-precision floating-point value represented
  2323. by the denormalized significand formed by the concatenation of `aSig0' and
  2324. `aSig1'. The normalized exponent is stored at the location pointed to by
  2325. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2326. stored at the location pointed to by `zSig0Ptr', and the least significant
  2327. 32 bits of the normalized significand are stored at the location pointed to
  2328. by `zSig1Ptr'.
  2329. -------------------------------------------------------------------------------
  2330. *}
  2331. Procedure normalizeFloat64Subnormal(
  2332. aSig0: bits32;
  2333. aSig1: bits32;
  2334. VAR zExpPtr : Int16;
  2335. VAR zSig0Ptr : Bits32;
  2336. VAR zSig1Ptr : Bits32
  2337. );
  2338. Var
  2339. ShiftCount : Int8;
  2340. Begin
  2341. if ( aSig0 = 0 ) then
  2342. Begin
  2343. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2344. if ( shiftCount < 0 ) then
  2345. Begin
  2346. zSig0Ptr := aSig1 shr ( - shiftCount );
  2347. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2348. End
  2349. else
  2350. Begin
  2351. zSig0Ptr := aSig1 shl shiftCount;
  2352. zSig1Ptr := 0;
  2353. End;
  2354. zExpPtr := - shiftCount - 31;
  2355. End
  2356. else
  2357. Begin
  2358. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2359. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2360. zExpPtr := 1 - shiftCount;
  2361. End;
  2362. End;
  2363. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2364. var
  2365. shiftCount : int8;
  2366. begin
  2367. shiftCount := countLeadingZeros64( aSig ) - 11;
  2368. zSigPtr := aSig shl shiftCount;
  2369. zExpPtr := 1 - shiftCount;
  2370. end;
  2371. {*
  2372. -------------------------------------------------------------------------------
  2373. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2374. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2375. point value, returning the result. After being shifted into the proper
  2376. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2377. together to form the most significant 32 bits of the result. This means
  2378. that any integer portion of `zSig0' will be added into the exponent. Since
  2379. a properly normalized significand will have an integer portion equal to 1,
  2380. the `zExp' input should be 1 less than the desired result exponent whenever
  2381. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2382. -------------------------------------------------------------------------------
  2383. *}
  2384. Procedure
  2385. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2386. var
  2387. z: Float64;
  2388. Begin
  2389. z.low := zSig1;
  2390. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2391. c := z;
  2392. End;
  2393. {*----------------------------------------------------------------------------
  2394. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2395. | double-precision floating-point value, returning the result. After being
  2396. | shifted into the proper positions, the three fields are simply added
  2397. | together to form the result. This means that any integer portion of `zSig'
  2398. | will be added into the exponent. Since a properly normalized significand
  2399. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2400. | than the desired result exponent whenever `zSig' is a complete, normalized
  2401. | significand.
  2402. *----------------------------------------------------------------------------*}
  2403. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2404. begin
  2405. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2406. end;
  2407. {*
  2408. -------------------------------------------------------------------------------
  2409. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2410. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2411. and `zSig2', and returns the proper double-precision floating-point value
  2412. corresponding to the abstract input. Ordinarily, the abstract value is
  2413. simply rounded and packed into the double-precision format, with the inexact
  2414. exception raised if the abstract input cannot be represented exactly.
  2415. However, if the abstract value is too large, the overflow and inexact
  2416. exceptions are raised and an infinity or maximal finite value is returned.
  2417. If the abstract value is too small, the input value is rounded to a
  2418. subnormal number, and the underflow and inexact exceptions are raised if the
  2419. abstract input cannot be represented exactly as a subnormal double-precision
  2420. floating-point number.
  2421. The input significand must be normalized or smaller. If the input
  2422. significand is not normalized, `zExp' must be 0; in that case, the result
  2423. returned is a subnormal number, and it must not require rounding. In the
  2424. usual case that the input significand is normalized, `zExp' must be 1 less
  2425. than the ``true'' floating-point exponent. The handling of underflow and
  2426. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2427. -------------------------------------------------------------------------------
  2428. *}
  2429. Procedure
  2430. roundAndPackFloat64(
  2431. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2432. Var
  2433. roundingMode : Int8;
  2434. roundNearestEven, increment, isTiny : Flag;
  2435. Begin
  2436. roundingMode := softfloat_rounding_mode;
  2437. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2438. increment := flag( sbits32 (zSig2) < 0 );
  2439. if ( roundNearestEven = flag(FALSE) ) then
  2440. Begin
  2441. if ( roundingMode = float_round_to_zero ) then
  2442. increment := 0
  2443. else
  2444. Begin
  2445. if ( zSign )<> 0 then
  2446. Begin
  2447. increment := flag( roundingMode = float_round_down ) and zSig2;
  2448. End
  2449. else
  2450. Begin
  2451. increment := flag( roundingMode = float_round_up ) and zSig2;
  2452. End
  2453. End
  2454. End;
  2455. if ( $7FD <= bits16 (zExp) ) then
  2456. Begin
  2457. if (( $7FD < zExp )
  2458. or (( zExp = $7FD )
  2459. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2460. and (increment<>0)
  2461. )
  2462. ) then
  2463. Begin
  2464. float_raise( float_flag_overflow OR float_flag_inexact );
  2465. if (( roundingMode = float_round_to_zero )
  2466. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2467. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2468. ) then
  2469. Begin
  2470. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2471. exit;
  2472. End;
  2473. packFloat64( zSign, $7FF, 0, 0, c );
  2474. exit;
  2475. End;
  2476. if ( zExp < 0 ) then
  2477. Begin
  2478. isTiny :=
  2479. flag( float_detect_tininess = float_tininess_before_rounding )
  2480. or flag( zExp < -1 )
  2481. or flag(increment = 0)
  2482. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2483. shift64ExtraRightJamming(
  2484. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2485. zExp := 0;
  2486. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2487. if ( roundNearestEven )<>0 then
  2488. Begin
  2489. increment := flag( sbits32 (zSig2) < 0 );
  2490. End
  2491. else
  2492. Begin
  2493. if ( zSign )<>0 then
  2494. Begin
  2495. increment := flag( roundingMode = float_round_down ) and zSig2;
  2496. End
  2497. else
  2498. Begin
  2499. increment := flag( roundingMode = float_round_up ) and zSig2;
  2500. End
  2501. End;
  2502. End;
  2503. End;
  2504. if ( zSig2 )<>0 then
  2505. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2506. if ( increment )<>0 then
  2507. Begin
  2508. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2509. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2510. End
  2511. else
  2512. Begin
  2513. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2514. End;
  2515. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2516. End;
  2517. {*----------------------------------------------------------------------------
  2518. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2519. | and significand `zSig', and returns the proper double-precision floating-
  2520. | point value corresponding to the abstract input. Ordinarily, the abstract
  2521. | value is simply rounded and packed into the double-precision format, with
  2522. | the inexact exception raised if the abstract input cannot be represented
  2523. | exactly. However, if the abstract value is too large, the overflow and
  2524. | inexact exceptions are raised and an infinity or maximal finite value is
  2525. | returned. If the abstract value is too small, the input value is rounded
  2526. | to a subnormal number, and the underflow and inexact exceptions are raised
  2527. | if the abstract input cannot be represented exactly as a subnormal double-
  2528. | precision floating-point number.
  2529. | The input significand `zSig' has its binary point between bits 62
  2530. | and 61, which is 10 bits to the left of the usual location. This shifted
  2531. | significand must be normalized or smaller. If `zSig' is not normalized,
  2532. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2533. | and it must not require rounding. In the usual case that `zSig' is
  2534. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2535. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2536. | Binary Floating-Point Arithmetic.
  2537. *----------------------------------------------------------------------------*}
  2538. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2539. var
  2540. roundingMode: int8;
  2541. roundNearestEven: flag;
  2542. roundIncrement, roundBits: int16;
  2543. isTiny: flag;
  2544. begin
  2545. roundingMode := softfloat_rounding_mode;
  2546. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2547. roundIncrement := $200;
  2548. if ( roundNearestEven=0 ) then
  2549. begin
  2550. if ( roundingMode = float_round_to_zero ) then
  2551. begin
  2552. roundIncrement := 0;
  2553. end
  2554. else begin
  2555. roundIncrement := $3FF;
  2556. if ( zSign<>0 ) then
  2557. begin
  2558. if ( roundingMode = float_round_up ) then
  2559. roundIncrement := 0;
  2560. end
  2561. else begin
  2562. if ( roundingMode = float_round_down ) then
  2563. roundIncrement := 0;
  2564. end
  2565. end
  2566. end;
  2567. roundBits := zSig and $3FF;
  2568. if ( $7FD <= bits16(zExp) ) then
  2569. begin
  2570. if ( ( $7FD < zExp )
  2571. or ( ( zExp = $7FD )
  2572. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2573. ) then
  2574. begin
  2575. float_raise( float_flag_overflow or float_flag_inexact );
  2576. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2577. exit;
  2578. end;
  2579. if ( zExp < 0 ) then
  2580. begin
  2581. isTiny := ord(
  2582. ( float_detect_tininess = float_tininess_before_rounding )
  2583. or ( zExp < -1 )
  2584. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2585. shift64RightJamming( zSig, - zExp, zSig );
  2586. zExp := 0;
  2587. roundBits := zSig and $3FF;
  2588. if ( isTiny and roundBits )<>0 then
  2589. float_raise( float_flag_underflow );
  2590. end
  2591. end;
  2592. if ( roundBits<>0 ) then
  2593. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2594. zSig := ( zSig + roundIncrement ) shr 10;
  2595. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2596. if ( zSig = 0 ) then
  2597. zExp := 0;
  2598. result:=packFloat64( zSign, zExp, zSig );
  2599. end;
  2600. {*
  2601. -------------------------------------------------------------------------------
  2602. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2603. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2604. returns the proper double-precision floating-point value corresponding
  2605. to the abstract input. This routine is just like `roundAndPackFloat64'
  2606. except that the input significand has fewer bits and does not have to be
  2607. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2608. point exponent.
  2609. -------------------------------------------------------------------------------
  2610. *}
  2611. Procedure
  2612. normalizeRoundAndPackFloat64(
  2613. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2614. Var
  2615. shiftCount : int8;
  2616. zSig2 : bits32;
  2617. Begin
  2618. if ( zSig0 = 0 ) then
  2619. Begin
  2620. zSig0 := zSig1;
  2621. zSig1 := 0;
  2622. zExp := zExp -32;
  2623. End;
  2624. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2625. if ( 0 <= shiftCount ) then
  2626. Begin
  2627. zSig2 := 0;
  2628. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2629. End
  2630. else
  2631. Begin
  2632. shift64ExtraRightJamming
  2633. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2634. End;
  2635. zExp := zExp - shiftCount;
  2636. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2637. End;
  2638. {*
  2639. -------------------------------------------------------------------------------
  2640. Returns the result of converting the 32-bit two's complement integer `a' to
  2641. the single-precision floating-point format. The conversion is performed
  2642. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2643. -------------------------------------------------------------------------------
  2644. *}
  2645. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2646. Var
  2647. zSign : Flag;
  2648. Begin
  2649. if ( a = 0 ) then
  2650. Begin
  2651. int32_to_float32.float32 := 0;
  2652. exit;
  2653. End;
  2654. if ( a = sbits32 ($80000000) ) then
  2655. Begin
  2656. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2657. exit;
  2658. end;
  2659. zSign := flag( a < 0 );
  2660. If zSign<>0 then
  2661. a := -a;
  2662. int32_to_float32.float32:=
  2663. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2664. End;
  2665. {*
  2666. -------------------------------------------------------------------------------
  2667. Returns the result of converting the 32-bit two's complement integer `a' to
  2668. the double-precision floating-point format. The conversion is performed
  2669. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2670. -------------------------------------------------------------------------------
  2671. *}
  2672. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2673. var
  2674. zSign : flag;
  2675. absA : bits32;
  2676. shiftCount : int8;
  2677. zSig0, zSig1 : bits32;
  2678. Begin
  2679. if ( a = 0 ) then
  2680. Begin
  2681. packFloat64( 0, 0, 0, 0, result );
  2682. exit;
  2683. end;
  2684. zSign := flag( a < 0 );
  2685. if ZSign<>0 then
  2686. AbsA := -a
  2687. else
  2688. AbsA := a;
  2689. shiftCount := countLeadingZeros32( absA ) - 11;
  2690. if ( 0 <= shiftCount ) then
  2691. Begin
  2692. zSig0 := absA shl shiftCount;
  2693. zSig1 := 0;
  2694. End
  2695. else
  2696. Begin
  2697. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2698. End;
  2699. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2700. End;
  2701. {*
  2702. -------------------------------------------------------------------------------
  2703. Returns the result of converting the single-precision floating-point value
  2704. `a' to the 32-bit two's complement integer format. The conversion is
  2705. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2706. Arithmetic---which means in particular that the conversion is rounded
  2707. according to the current rounding mode. If `a' is a NaN, the largest
  2708. positive integer is returned. Otherwise, if the conversion overflows, the
  2709. largest integer with the same sign as `a' is returned.
  2710. -------------------------------------------------------------------------------
  2711. *}
  2712. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2713. Var
  2714. aSign: flag;
  2715. aExp, shiftCount: int16;
  2716. aSig, aSigExtra: bits32;
  2717. z: int32;
  2718. roundingMode: int8;
  2719. Begin
  2720. aSig := extractFloat32Frac( a.float32 );
  2721. aExp := extractFloat32Exp( a.float32 );
  2722. aSign := extractFloat32Sign( a.float32 );
  2723. shiftCount := aExp - $96;
  2724. if ( 0 <= shiftCount ) then
  2725. Begin
  2726. if ( $9E <= aExp ) then
  2727. Begin
  2728. if ( a.float32 <> $CF000000 ) then
  2729. Begin
  2730. float_raise( float_flag_invalid );
  2731. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2732. Begin
  2733. float32_to_int32 := $7FFFFFFF;
  2734. exit;
  2735. End;
  2736. End;
  2737. float32_to_int32 := sbits32 ($80000000);
  2738. exit;
  2739. End;
  2740. z := ( aSig or $00800000 ) shl shiftCount;
  2741. if ( aSign<>0 ) then z := - z;
  2742. End
  2743. else
  2744. Begin
  2745. if ( aExp < $7E ) then
  2746. Begin
  2747. aSigExtra := aExp OR aSig;
  2748. z := 0;
  2749. End
  2750. else
  2751. Begin
  2752. aSig := aSig OR $00800000;
  2753. aSigExtra := aSig shl ( shiftCount and 31 );
  2754. z := aSig shr ( - shiftCount );
  2755. End;
  2756. if ( aSigExtra<>0 ) then
  2757. softfloat_exception_flags := softfloat_exception_flags
  2758. or float_flag_inexact;
  2759. roundingMode := softfloat_rounding_mode;
  2760. if ( roundingMode = float_round_nearest_even ) then
  2761. Begin
  2762. if ( sbits32 (aSigExtra) < 0 ) then
  2763. Begin
  2764. Inc(z);
  2765. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2766. z := z and not 1;
  2767. End;
  2768. if ( aSign<>0 ) then
  2769. z := - z;
  2770. End
  2771. else
  2772. Begin
  2773. aSigExtra := flag( aSigExtra <> 0 );
  2774. if ( aSign<>0 ) then
  2775. Begin
  2776. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2777. z := - z;
  2778. End
  2779. else
  2780. Begin
  2781. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2782. End
  2783. End;
  2784. End;
  2785. float32_to_int32 := z;
  2786. End;
  2787. {*
  2788. -------------------------------------------------------------------------------
  2789. Returns the result of converting the single-precision floating-point value
  2790. `a' to the 32-bit two's complement integer format. The conversion is
  2791. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2792. Arithmetic, except that the conversion is always rounded toward zero.
  2793. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2794. the conversion overflows, the largest integer with the same sign as `a' is
  2795. returned.
  2796. -------------------------------------------------------------------------------
  2797. *}
  2798. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2799. Var
  2800. aSign : flag;
  2801. aExp, shiftCount : int16;
  2802. aSig : bits32;
  2803. z : int32;
  2804. Begin
  2805. aSig := extractFloat32Frac( a.float32 );
  2806. aExp := extractFloat32Exp( a.float32 );
  2807. aSign := extractFloat32Sign( a.float32 );
  2808. shiftCount := aExp - $9E;
  2809. if ( 0 <= shiftCount ) then
  2810. Begin
  2811. if ( a.float32 <> $CF000000 ) then
  2812. Begin
  2813. float_raise( float_flag_invalid );
  2814. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2815. Begin
  2816. float32_to_int32_round_to_zero := $7FFFFFFF;
  2817. exit;
  2818. end;
  2819. End;
  2820. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2821. exit;
  2822. End
  2823. else
  2824. if ( aExp <= $7E ) then
  2825. Begin
  2826. if ( aExp or aSig )<>0 then
  2827. softfloat_exception_flags :=
  2828. softfloat_exception_flags or float_flag_inexact;
  2829. float32_to_int32_round_to_zero := 0;
  2830. exit;
  2831. End;
  2832. aSig := ( aSig or $00800000 ) shl 8;
  2833. z := aSig shr ( - shiftCount );
  2834. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2835. Begin
  2836. softfloat_exception_flags :=
  2837. softfloat_exception_flags or float_flag_inexact;
  2838. End;
  2839. if ( aSign<>0 ) then z := - z;
  2840. float32_to_int32_round_to_zero := z;
  2841. End;
  2842. {*
  2843. -------------------------------------------------------------------------------
  2844. Returns the result of converting the single-precision floating-point value
  2845. `a' to the double-precision floating-point format. The conversion is
  2846. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2847. Arithmetic.
  2848. -------------------------------------------------------------------------------
  2849. *}
  2850. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2851. Var
  2852. aSign : flag;
  2853. aExp : int16;
  2854. aSig, zSig0, zSig1: bits32;
  2855. tmp : CommonNanT;
  2856. Begin
  2857. aSig := extractFloat32Frac( a.float32 );
  2858. aExp := extractFloat32Exp( a.float32 );
  2859. aSign := extractFloat32Sign( a.float32 );
  2860. if ( aExp = $FF ) then
  2861. Begin
  2862. if ( aSig<>0 ) then
  2863. Begin
  2864. float32ToCommonNaN(a.float32, tmp);
  2865. commonNaNToFloat64(tmp , result);
  2866. exit;
  2867. End;
  2868. packFloat64( aSign, $7FF, 0, 0, result);
  2869. exit;
  2870. End;
  2871. if ( aExp = 0 ) then
  2872. Begin
  2873. if ( aSig = 0 ) then
  2874. Begin
  2875. packFloat64( aSign, 0, 0, 0, result );
  2876. exit;
  2877. end;
  2878. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2879. Dec(aExp);
  2880. End;
  2881. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2882. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2883. End;
  2884. {*
  2885. -------------------------------------------------------------------------------
  2886. Rounds the single-precision floating-point value `a' to an integer,
  2887. and returns the result as a single-precision floating-point value. The
  2888. operation is performed according to the IEC/IEEE Standard for Binary
  2889. Floating-Point Arithmetic.
  2890. -------------------------------------------------------------------------------
  2891. *}
  2892. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2893. Var
  2894. aSign: flag;
  2895. aExp: int16;
  2896. lastBitMask, roundBitsMask: bits32;
  2897. roundingMode: int8;
  2898. z: float32;
  2899. Begin
  2900. aExp := extractFloat32Exp( a.float32 );
  2901. if ( $96 <= aExp ) then
  2902. Begin
  2903. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2904. Begin
  2905. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2906. exit;
  2907. End;
  2908. float32_round_to_int:=a;
  2909. exit;
  2910. End;
  2911. if ( aExp <= $7E ) then
  2912. Begin
  2913. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2914. Begin
  2915. float32_round_to_int:=a;
  2916. exit;
  2917. end;
  2918. softfloat_exception_flags
  2919. := softfloat_exception_flags OR float_flag_inexact;
  2920. aSign := extractFloat32Sign( a.float32 );
  2921. case ( softfloat_rounding_mode ) of
  2922. float_round_nearest_even:
  2923. Begin
  2924. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2925. Begin
  2926. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2927. exit;
  2928. End;
  2929. End;
  2930. float_round_down:
  2931. Begin
  2932. if aSign <> 0 then
  2933. float32_round_to_int.float32 := $BF800000
  2934. else
  2935. float32_round_to_int.float32 := 0;
  2936. exit;
  2937. End;
  2938. float_round_up:
  2939. Begin
  2940. if aSign <> 0 then
  2941. float32_round_to_int.float32 := $80000000
  2942. else
  2943. float32_round_to_int.float32 := $3F800000;
  2944. exit;
  2945. End;
  2946. end;
  2947. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2948. End;
  2949. lastBitMask := 1;
  2950. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2951. lastBitMask := lastBitMask shl ($96 - aExp);
  2952. roundBitsMask := lastBitMask - 1;
  2953. z := a.float32;
  2954. roundingMode := softfloat_rounding_mode;
  2955. if ( roundingMode = float_round_nearest_even ) then
  2956. Begin
  2957. z := z + (lastBitMask shr 1);
  2958. if ( ( z and roundBitsMask ) = 0 ) then
  2959. z := z and not lastBitMask;
  2960. End
  2961. else if ( roundingMode <> float_round_to_zero ) then
  2962. Begin
  2963. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2964. Begin
  2965. z := z + roundBitsMask;
  2966. End;
  2967. End;
  2968. z := z and not roundBitsMask;
  2969. if ( z <> a.float32 ) then
  2970. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2971. float32_round_to_int.float32 := z;
  2972. End;
  2973. {*
  2974. -------------------------------------------------------------------------------
  2975. Returns the result of adding the absolute values of the single-precision
  2976. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2977. before being returned. `zSign' is ignored if the result is a NaN.
  2978. The addition is performed according to the IEC/IEEE Standard for Binary
  2979. Floating-Point Arithmetic.
  2980. -------------------------------------------------------------------------------
  2981. *}
  2982. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2983. Var
  2984. aExp, bExp, zExp: int16;
  2985. aSig, bSig, zSig: bits32;
  2986. expDiff: int16;
  2987. label roundAndPack;
  2988. Begin
  2989. aSig:=extractFloat32Frac( a );
  2990. aExp:=extractFloat32Exp( a );
  2991. bSig:=extractFloat32Frac( b );
  2992. bExp := extractFloat32Exp( b );
  2993. expDiff := aExp - bExp;
  2994. aSig := aSig shl 6;
  2995. bSig := bSig shl 6;
  2996. if ( 0 < expDiff ) then
  2997. Begin
  2998. if ( aExp = $FF ) then
  2999. Begin
  3000. if ( aSig <> 0) then
  3001. Begin
  3002. addFloat32Sigs := propagateFloat32NaN( a, b );
  3003. exit;
  3004. End;
  3005. addFloat32Sigs := a;
  3006. exit;
  3007. End;
  3008. if ( bExp = 0 ) then
  3009. Begin
  3010. Dec(expDiff);
  3011. End
  3012. else
  3013. Begin
  3014. bSig := bSig or $20000000;
  3015. End;
  3016. shift32RightJamming( bSig, expDiff, bSig );
  3017. zExp := aExp;
  3018. End
  3019. else
  3020. If ( expDiff < 0 ) then
  3021. Begin
  3022. if ( bExp = $FF ) then
  3023. Begin
  3024. if ( bSig<>0 ) then
  3025. Begin
  3026. addFloat32Sigs := propagateFloat32NaN( a, b );
  3027. exit;
  3028. end;
  3029. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3030. exit;
  3031. End;
  3032. if ( aExp = 0 ) then
  3033. Begin
  3034. Inc(expDiff);
  3035. End
  3036. else
  3037. Begin
  3038. aSig := aSig OR $20000000;
  3039. End;
  3040. shift32RightJamming( aSig, - expDiff, aSig );
  3041. zExp := bExp;
  3042. End
  3043. else
  3044. Begin
  3045. if ( aExp = $FF ) then
  3046. Begin
  3047. if ( aSig OR bSig )<> 0 then
  3048. Begin
  3049. addFloat32Sigs := propagateFloat32NaN( a, b );
  3050. exit;
  3051. end;
  3052. addFloat32Sigs := a;
  3053. exit;
  3054. End;
  3055. if ( aExp = 0 ) then
  3056. Begin
  3057. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3058. exit;
  3059. end;
  3060. zSig := $40000000 + aSig + bSig;
  3061. zExp := aExp;
  3062. goto roundAndPack;
  3063. End;
  3064. aSig := aSig OR $20000000;
  3065. zSig := ( aSig + bSig ) shl 1;
  3066. Dec(zExp);
  3067. if ( sbits32 (zSig) < 0 ) then
  3068. Begin
  3069. zSig := aSig + bSig;
  3070. Inc(zExp);
  3071. End;
  3072. roundAndPack:
  3073. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3074. End;
  3075. {*
  3076. -------------------------------------------------------------------------------
  3077. Returns the result of subtracting the absolute values of the single-
  3078. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3079. difference is negated before being returned. `zSign' is ignored if the
  3080. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3081. Standard for Binary Floating-Point Arithmetic.
  3082. -------------------------------------------------------------------------------
  3083. *}
  3084. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3085. Var
  3086. aExp, bExp, zExp: int16;
  3087. aSig, bSig, zSig: bits32;
  3088. expDiff : int16;
  3089. label aExpBigger;
  3090. label bExpBigger;
  3091. label aBigger;
  3092. label bBigger;
  3093. label normalizeRoundAndPack;
  3094. Begin
  3095. aSig := extractFloat32Frac( a );
  3096. aExp := extractFloat32Exp( a );
  3097. bSig := extractFloat32Frac( b );
  3098. bExp := extractFloat32Exp( b );
  3099. expDiff := aExp - bExp;
  3100. aSig := aSig shl 7;
  3101. bSig := bSig shl 7;
  3102. if ( 0 < expDiff ) then goto aExpBigger;
  3103. if ( expDiff < 0 ) then goto bExpBigger;
  3104. if ( aExp = $FF ) then
  3105. Begin
  3106. if ( aSig OR bSig )<> 0 then
  3107. Begin
  3108. subFloat32Sigs := propagateFloat32NaN( a, b );
  3109. exit;
  3110. End;
  3111. float_raise( float_flag_invalid );
  3112. subFloat32Sigs := float32_default_nan;
  3113. exit;
  3114. End;
  3115. if ( aExp = 0 ) then
  3116. Begin
  3117. aExp := 1;
  3118. bExp := 1;
  3119. End;
  3120. if ( bSig < aSig ) Then goto aBigger;
  3121. if ( aSig < bSig ) Then goto bBigger;
  3122. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3123. exit;
  3124. bExpBigger:
  3125. if ( bExp = $FF ) then
  3126. Begin
  3127. if ( bSig<>0 ) then
  3128. Begin
  3129. subFloat32Sigs := propagateFloat32NaN( a, b );
  3130. exit;
  3131. End;
  3132. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3133. exit;
  3134. End;
  3135. if ( aExp = 0 ) then
  3136. Begin
  3137. Inc(expDiff);
  3138. End
  3139. else
  3140. Begin
  3141. aSig := aSig OR $40000000;
  3142. End;
  3143. shift32RightJamming( aSig, - expDiff, aSig );
  3144. bSig := bSig OR $40000000;
  3145. bBigger:
  3146. zSig := bSig - aSig;
  3147. zExp := bExp;
  3148. zSign := zSign xor 1;
  3149. goto normalizeRoundAndPack;
  3150. aExpBigger:
  3151. if ( aExp = $FF ) then
  3152. Begin
  3153. if ( aSig <> 0) then
  3154. Begin
  3155. subFloat32Sigs := propagateFloat32NaN( a, b );
  3156. exit;
  3157. End;
  3158. subFloat32Sigs := a;
  3159. exit;
  3160. End;
  3161. if ( bExp = 0 ) then
  3162. Begin
  3163. Dec(expDiff);
  3164. End
  3165. else
  3166. Begin
  3167. bSig := bSig OR $40000000;
  3168. End;
  3169. shift32RightJamming( bSig, expDiff, bSig );
  3170. aSig := aSig OR $40000000;
  3171. aBigger:
  3172. zSig := aSig - bSig;
  3173. zExp := aExp;
  3174. normalizeRoundAndPack:
  3175. Dec(zExp);
  3176. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3177. End;
  3178. {*
  3179. -------------------------------------------------------------------------------
  3180. Returns the result of adding the single-precision floating-point values `a'
  3181. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3182. Binary Floating-Point Arithmetic.
  3183. -------------------------------------------------------------------------------
  3184. *}
  3185. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3186. Var
  3187. aSign, bSign: Flag;
  3188. Begin
  3189. aSign := extractFloat32Sign( a.float32 );
  3190. bSign := extractFloat32Sign( b.float32 );
  3191. if ( aSign = bSign ) then
  3192. Begin
  3193. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3194. End
  3195. else
  3196. Begin
  3197. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3198. End;
  3199. End;
  3200. {*
  3201. -------------------------------------------------------------------------------
  3202. Returns the result of subtracting the single-precision floating-point values
  3203. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3204. for Binary Floating-Point Arithmetic.
  3205. -------------------------------------------------------------------------------
  3206. *}
  3207. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3208. Var
  3209. aSign, bSign: flag;
  3210. Begin
  3211. aSign := extractFloat32Sign( a.float32 );
  3212. bSign := extractFloat32Sign( b.float32 );
  3213. if ( aSign = bSign ) then
  3214. Begin
  3215. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3216. End
  3217. else
  3218. Begin
  3219. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3220. End;
  3221. End;
  3222. {*
  3223. -------------------------------------------------------------------------------
  3224. Returns the result of multiplying the single-precision floating-point values
  3225. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3226. for Binary Floating-Point Arithmetic.
  3227. -------------------------------------------------------------------------------
  3228. *}
  3229. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3230. Var
  3231. aSign, bSign, zSign: flag;
  3232. aExp, bExp, zExp : int16;
  3233. aSig, bSig, zSig0, zSig1: bits32;
  3234. Begin
  3235. aSig := extractFloat32Frac( a.float32 );
  3236. aExp := extractFloat32Exp( a.float32 );
  3237. aSign := extractFloat32Sign( a.float32 );
  3238. bSig := extractFloat32Frac( b.float32 );
  3239. bExp := extractFloat32Exp( b.float32 );
  3240. bSign := extractFloat32Sign( b.float32 );
  3241. zSign := aSign xor bSign;
  3242. if ( aExp = $FF ) then
  3243. Begin
  3244. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3245. Begin
  3246. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3247. End;
  3248. if ( ( bExp OR bSig ) = 0 ) then
  3249. Begin
  3250. float_raise( float_flag_invalid );
  3251. float32_mul.float32 := float32_default_nan;
  3252. exit;
  3253. End;
  3254. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3255. exit;
  3256. End;
  3257. if ( bExp = $FF ) then
  3258. Begin
  3259. if ( bSig <> 0 ) then
  3260. Begin
  3261. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3262. exit;
  3263. End;
  3264. if ( ( aExp OR aSig ) = 0 ) then
  3265. Begin
  3266. float_raise( float_flag_invalid );
  3267. float32_mul.float32 := float32_default_nan;
  3268. exit;
  3269. End;
  3270. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3271. exit;
  3272. End;
  3273. if ( aExp = 0 ) then
  3274. Begin
  3275. if ( aSig = 0 ) then
  3276. Begin
  3277. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3278. exit;
  3279. End;
  3280. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3281. End;
  3282. if ( bExp = 0 ) then
  3283. Begin
  3284. if ( bSig = 0 ) then
  3285. Begin
  3286. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3287. exit;
  3288. End;
  3289. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3290. End;
  3291. zExp := aExp + bExp - $7F;
  3292. aSig := ( aSig OR $00800000 ) shl 7;
  3293. bSig := ( bSig OR $00800000 ) shl 8;
  3294. mul32To64( aSig, bSig, zSig0, zSig1 );
  3295. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3296. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3297. Begin
  3298. zSig0 := zSig0 shl 1;
  3299. Dec(zExp);
  3300. End;
  3301. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3302. End;
  3303. {*
  3304. -------------------------------------------------------------------------------
  3305. Returns the result of dividing the single-precision floating-point value `a'
  3306. by the corresponding value `b'. The operation is performed according to the
  3307. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3308. -------------------------------------------------------------------------------
  3309. *}
  3310. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3311. Var
  3312. aSign, bSign, zSign: flag;
  3313. aExp, bExp, zExp: int16;
  3314. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3315. Begin
  3316. aSig := extractFloat32Frac( a.float32 );
  3317. aExp := extractFloat32Exp( a.float32 );
  3318. aSign := extractFloat32Sign( a.float32 );
  3319. bSig := extractFloat32Frac( b.float32 );
  3320. bExp := extractFloat32Exp( b.float32 );
  3321. bSign := extractFloat32Sign( b.float32 );
  3322. zSign := aSign xor bSign;
  3323. if ( aExp = $FF ) then
  3324. Begin
  3325. if ( aSig <> 0 ) then
  3326. Begin
  3327. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3328. exit;
  3329. End;
  3330. if ( bExp = $FF ) then
  3331. Begin
  3332. if ( bSig <> 0) then
  3333. Begin
  3334. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3335. End;
  3336. float_raise( float_flag_invalid );
  3337. float32_div.float32 := float32_default_nan;
  3338. exit;
  3339. End;
  3340. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3341. exit;
  3342. End;
  3343. if ( bExp = $FF ) then
  3344. Begin
  3345. if ( bSig <> 0) then
  3346. Begin
  3347. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3348. exit;
  3349. End;
  3350. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3351. exit;
  3352. End;
  3353. if ( bExp = 0 ) Then
  3354. Begin
  3355. if ( bSig = 0 ) Then
  3356. Begin
  3357. if ( ( aExp OR aSig ) = 0 ) then
  3358. Begin
  3359. float_raise( float_flag_invalid );
  3360. float32_div.float32 := float32_default_nan;
  3361. exit;
  3362. End;
  3363. float_raise( float_flag_divbyzero );
  3364. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3365. exit;
  3366. End;
  3367. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3368. End;
  3369. if ( aExp = 0 ) Then
  3370. Begin
  3371. if ( aSig = 0 ) Then
  3372. Begin
  3373. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3374. exit;
  3375. End;
  3376. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3377. End;
  3378. zExp := aExp - bExp + $7D;
  3379. aSig := ( aSig OR $00800000 ) shl 7;
  3380. bSig := ( bSig OR $00800000 ) shl 8;
  3381. if ( bSig <= ( aSig + aSig ) ) then
  3382. Begin
  3383. aSig := aSig shr 1;
  3384. Inc(zExp);
  3385. End;
  3386. zSig := estimateDiv64To32( aSig, 0, bSig );
  3387. if ( ( zSig and $3F ) <= 2 ) then
  3388. Begin
  3389. mul32To64( bSig, zSig, term0, term1 );
  3390. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3391. while ( sbits32 (rem0) < 0 ) do
  3392. Begin
  3393. Dec(zSig);
  3394. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3395. End;
  3396. zSig := zSig or bits32( rem1 <> 0 );
  3397. End;
  3398. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3399. End;
  3400. {*
  3401. -------------------------------------------------------------------------------
  3402. Returns the remainder of the single-precision floating-point value `a'
  3403. with respect to the corresponding value `b'. The operation is performed
  3404. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3405. -------------------------------------------------------------------------------
  3406. *}
  3407. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3408. Var
  3409. aSign, bSign, zSign: flag;
  3410. aExp, bExp, expDiff: int16;
  3411. aSig, bSig, q, allZero, alternateASig: bits32;
  3412. sigMean: sbits32;
  3413. Begin
  3414. aSig := extractFloat32Frac( a.float32 );
  3415. aExp := extractFloat32Exp( a.float32 );
  3416. aSign := extractFloat32Sign( a.float32 );
  3417. bSig := extractFloat32Frac( b.float32 );
  3418. bExp := extractFloat32Exp( b.float32 );
  3419. bSign := extractFloat32Sign( b.float32 );
  3420. if ( aExp = $FF ) then
  3421. Begin
  3422. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3423. Begin
  3424. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3425. exit;
  3426. End;
  3427. float_raise( float_flag_invalid );
  3428. float32_rem.float32 := float32_default_nan;
  3429. exit;
  3430. End;
  3431. if ( bExp = $FF ) then
  3432. Begin
  3433. if ( bSig <> 0 ) then
  3434. Begin
  3435. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3436. exit;
  3437. End;
  3438. float32_rem := a;
  3439. exit;
  3440. End;
  3441. if ( bExp = 0 ) then
  3442. Begin
  3443. if ( bSig = 0 ) then
  3444. Begin
  3445. float_raise( float_flag_invalid );
  3446. float32_rem.float32 := float32_default_nan;
  3447. exit;
  3448. End;
  3449. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3450. End;
  3451. if ( aExp = 0 ) then
  3452. Begin
  3453. if ( aSig = 0 ) then
  3454. Begin
  3455. float32_rem := a;
  3456. exit;
  3457. End;
  3458. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3459. End;
  3460. expDiff := aExp - bExp;
  3461. aSig := ( aSig OR $00800000 ) shl 8;
  3462. bSig := ( bSig OR $00800000 ) shl 8;
  3463. if ( expDiff < 0 ) then
  3464. Begin
  3465. if ( expDiff < -1 ) then
  3466. Begin
  3467. float32_rem := a;
  3468. exit;
  3469. End;
  3470. aSig := aSig shr 1;
  3471. End;
  3472. q := bits32( bSig <= aSig );
  3473. if ( q <> 0) then
  3474. aSig := aSig - bSig;
  3475. expDiff := expDiff - 32;
  3476. while ( 0 < expDiff ) do
  3477. Begin
  3478. q := estimateDiv64To32( aSig, 0, bSig );
  3479. if (2 < q) then
  3480. q := q - 2
  3481. else
  3482. q := 0;
  3483. aSig := - ( ( bSig shr 2 ) * q );
  3484. expDiff := expDiff - 30;
  3485. End;
  3486. expDiff := expDiff + 32;
  3487. if ( 0 < expDiff ) then
  3488. Begin
  3489. q := estimateDiv64To32( aSig, 0, bSig );
  3490. if (2 < q) then
  3491. q := q - 2
  3492. else
  3493. q := 0;
  3494. q := q shr (32 - expDiff);
  3495. bSig := bSig shr 2;
  3496. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3497. End
  3498. else
  3499. Begin
  3500. aSig := aSig shr 2;
  3501. bSig := bSig shr 2;
  3502. End;
  3503. Repeat
  3504. alternateASig := aSig;
  3505. Inc(q);
  3506. aSig := aSig - bSig;
  3507. Until not ( 0 <= sbits32 (aSig) );
  3508. sigMean := aSig + alternateASig;
  3509. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3510. Begin
  3511. aSig := alternateASig;
  3512. End;
  3513. zSign := flag( sbits32 (aSig) < 0 );
  3514. if ( zSign<>0 ) then
  3515. aSig := - aSig;
  3516. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3517. End;
  3518. {*
  3519. -------------------------------------------------------------------------------
  3520. Returns the square root of the single-precision floating-point value `a'.
  3521. The operation is performed according to the IEC/IEEE Standard for Binary
  3522. Floating-Point Arithmetic.
  3523. -------------------------------------------------------------------------------
  3524. *}
  3525. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3526. Var
  3527. aSign : flag;
  3528. aExp, zExp : int16;
  3529. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3530. label roundAndPack;
  3531. Begin
  3532. aSig := extractFloat32Frac( a.float32 );
  3533. aExp := extractFloat32Exp( a.float32 );
  3534. aSign := extractFloat32Sign( a.float32 );
  3535. if ( aExp = $FF ) then
  3536. Begin
  3537. if ( aSig <> 0) then
  3538. Begin
  3539. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3540. exit;
  3541. End;
  3542. if ( aSign = 0) then
  3543. Begin
  3544. float32_sqrt := a;
  3545. exit;
  3546. End;
  3547. float_raise( float_flag_invalid );
  3548. float32_sqrt.float32 := float32_default_nan;
  3549. exit;
  3550. End;
  3551. if ( aSign <> 0) then
  3552. Begin
  3553. if ( ( aExp OR aSig ) = 0 ) then
  3554. Begin
  3555. float32_sqrt := a;
  3556. exit;
  3557. End;
  3558. float_raise( float_flag_invalid );
  3559. float32_sqrt.float32 := float32_default_nan;
  3560. exit;
  3561. End;
  3562. if ( aExp = 0 ) then
  3563. Begin
  3564. if ( aSig = 0 ) then
  3565. Begin
  3566. float32_sqrt.float32 := 0;
  3567. exit;
  3568. End;
  3569. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3570. End;
  3571. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3572. aSig := ( aSig OR $00800000 ) shl 8;
  3573. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3574. if ( ( zSig and $7F ) <= 5 ) then
  3575. Begin
  3576. if ( zSig < 2 ) then
  3577. Begin
  3578. zSig := $7FFFFFFF;
  3579. goto roundAndPack;
  3580. End
  3581. else
  3582. Begin
  3583. aSig := aSig shr (aExp and 1);
  3584. mul32To64( zSig, zSig, term0, term1 );
  3585. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3586. while ( sbits32 (rem0) < 0 ) do
  3587. Begin
  3588. Dec(zSig);
  3589. shortShift64Left( 0, zSig, 1, term0, term1 );
  3590. term1 := term1 or 1;
  3591. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3592. End;
  3593. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3594. End;
  3595. End;
  3596. shift32RightJamming( zSig, 1, zSig );
  3597. roundAndPack:
  3598. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3599. End;
  3600. {*
  3601. -------------------------------------------------------------------------------
  3602. Returns 1 if the single-precision floating-point value `a' is equal to
  3603. the corresponding value `b', and 0 otherwise. The comparison is performed
  3604. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3605. -------------------------------------------------------------------------------
  3606. *}
  3607. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3608. Begin
  3609. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3610. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3611. ) then
  3612. Begin
  3613. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3614. Begin
  3615. float_raise( float_flag_invalid );
  3616. End;
  3617. float32_eq := 0;
  3618. exit;
  3619. End;
  3620. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3621. End;
  3622. {*
  3623. -------------------------------------------------------------------------------
  3624. Returns 1 if the single-precision floating-point value `a' is less than
  3625. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3626. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3627. Arithmetic.
  3628. -------------------------------------------------------------------------------
  3629. *}
  3630. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3631. var
  3632. aSign, bSign: flag;
  3633. Begin
  3634. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3635. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3636. ) then
  3637. Begin
  3638. float_raise( float_flag_invalid );
  3639. float32_le := 0;
  3640. exit;
  3641. End;
  3642. aSign := extractFloat32Sign( a.float32 );
  3643. bSign := extractFloat32Sign( b.float32 );
  3644. if ( aSign <> bSign ) then
  3645. Begin
  3646. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3647. exit;
  3648. End;
  3649. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3650. End;
  3651. {*
  3652. -------------------------------------------------------------------------------
  3653. Returns 1 if the single-precision floating-point value `a' is less than
  3654. the corresponding value `b', and 0 otherwise. The comparison is performed
  3655. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3656. -------------------------------------------------------------------------------
  3657. *}
  3658. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3659. var
  3660. aSign, bSign: flag;
  3661. Begin
  3662. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3663. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3664. ) then
  3665. Begin
  3666. float_raise( float_flag_invalid );
  3667. float32_lt :=0;
  3668. exit;
  3669. End;
  3670. aSign := extractFloat32Sign( a.float32 );
  3671. bSign := extractFloat32Sign( b.float32 );
  3672. if ( aSign <> bSign ) then
  3673. Begin
  3674. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3675. exit;
  3676. End;
  3677. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3678. End;
  3679. {*
  3680. -------------------------------------------------------------------------------
  3681. Returns 1 if the single-precision floating-point value `a' is equal to
  3682. the corresponding value `b', and 0 otherwise. The invalid exception is
  3683. raised if either operand is a NaN. Otherwise, the comparison is performed
  3684. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3685. -------------------------------------------------------------------------------
  3686. *}
  3687. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3688. Begin
  3689. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3690. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3691. ) then
  3692. Begin
  3693. float_raise( float_flag_invalid );
  3694. float32_eq_signaling := 0;
  3695. exit;
  3696. End;
  3697. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3698. End;
  3699. {*
  3700. -------------------------------------------------------------------------------
  3701. Returns 1 if the single-precision floating-point value `a' is less than or
  3702. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3703. cause an exception. Otherwise, the comparison is performed according to the
  3704. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3705. -------------------------------------------------------------------------------
  3706. *}
  3707. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3708. Var
  3709. aSign, bSign: flag;
  3710. aExp, bExp: int16;
  3711. Begin
  3712. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3713. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3714. ) then
  3715. Begin
  3716. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3717. Begin
  3718. float_raise( float_flag_invalid );
  3719. End;
  3720. float32_le_quiet := 0;
  3721. exit;
  3722. End;
  3723. aSign := extractFloat32Sign( a );
  3724. bSign := extractFloat32Sign( b );
  3725. if ( aSign <> bSign ) then
  3726. Begin
  3727. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3728. exit;
  3729. End;
  3730. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3731. End;
  3732. {*
  3733. -------------------------------------------------------------------------------
  3734. Returns 1 if the single-precision floating-point value `a' is less than
  3735. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3736. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3737. Standard for Binary Floating-Point Arithmetic.
  3738. -------------------------------------------------------------------------------
  3739. *}
  3740. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3741. Var
  3742. aSign, bSign: flag;
  3743. Begin
  3744. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3745. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3746. ) then
  3747. Begin
  3748. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3749. Begin
  3750. float_raise( float_flag_invalid );
  3751. End;
  3752. float32_lt_quiet := 0;
  3753. exit;
  3754. End;
  3755. aSign := extractFloat32Sign( a );
  3756. bSign := extractFloat32Sign( b );
  3757. if ( aSign <> bSign ) then
  3758. Begin
  3759. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3760. exit;
  3761. End;
  3762. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3763. End;
  3764. {*
  3765. -------------------------------------------------------------------------------
  3766. Returns the result of converting the double-precision floating-point value
  3767. `a' to the 32-bit two's complement integer format. The conversion is
  3768. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3769. Arithmetic---which means in particular that the conversion is rounded
  3770. according to the current rounding mode. If `a' is a NaN, the largest
  3771. positive integer is returned. Otherwise, if the conversion overflows, the
  3772. largest integer with the same sign as `a' is returned.
  3773. -------------------------------------------------------------------------------
  3774. *}
  3775. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3776. var
  3777. aSign: flag;
  3778. aExp, shiftCount: int16;
  3779. aSig0, aSig1, absZ, aSigExtra: bits32;
  3780. z: int32;
  3781. roundingMode: int8;
  3782. label invalid;
  3783. Begin
  3784. aSig1 := extractFloat64Frac1( a );
  3785. aSig0 := extractFloat64Frac0( a );
  3786. aExp := extractFloat64Exp( a );
  3787. aSign := extractFloat64Sign( a );
  3788. shiftCount := aExp - $413;
  3789. if ( 0 <= shiftCount ) then
  3790. Begin
  3791. if ( $41E < aExp ) then
  3792. Begin
  3793. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3794. aSign := 0;
  3795. goto invalid;
  3796. End;
  3797. shortShift64Left(
  3798. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3799. if ( $80000000 < absZ ) then
  3800. goto invalid;
  3801. End
  3802. else
  3803. Begin
  3804. aSig1 := flag( aSig1 <> 0 );
  3805. if ( aExp < $3FE ) then
  3806. Begin
  3807. aSigExtra := aExp OR aSig0 OR aSig1;
  3808. absZ := 0;
  3809. End
  3810. else
  3811. Begin
  3812. aSig0 := aSig0 OR $00100000;
  3813. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3814. absZ := aSig0 shr ( - shiftCount );
  3815. End;
  3816. End;
  3817. roundingMode := softfloat_rounding_mode;
  3818. if ( roundingMode = float_round_nearest_even ) then
  3819. Begin
  3820. if ( sbits32(aSigExtra) < 0 ) then
  3821. Begin
  3822. Inc(absZ);
  3823. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3824. absZ := absZ and not 1;
  3825. End;
  3826. if aSign <> 0 then
  3827. z := - absZ
  3828. else
  3829. z := absZ;
  3830. End
  3831. else
  3832. Begin
  3833. aSigExtra := bits32( aSigExtra <> 0 );
  3834. if ( aSign <> 0) then
  3835. Begin
  3836. z := - ( absZ
  3837. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3838. End
  3839. else
  3840. Begin
  3841. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3842. End
  3843. End;
  3844. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3845. Begin
  3846. invalid:
  3847. float_raise( float_flag_invalid );
  3848. if (aSign <> 0 ) then
  3849. float64_to_int32 := sbits32 ($80000000)
  3850. else
  3851. float64_to_int32 := $7FFFFFFF;
  3852. exit;
  3853. End;
  3854. if ( aSigExtra <> 0) then
  3855. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3856. float64_to_int32 := z;
  3857. End;
  3858. {*
  3859. -------------------------------------------------------------------------------
  3860. Returns the result of converting the double-precision floating-point value
  3861. `a' to the 32-bit two's complement integer format. The conversion is
  3862. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3863. Arithmetic, except that the conversion is always rounded toward zero.
  3864. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3865. the conversion overflows, the largest integer with the same sign as `a' is
  3866. returned.
  3867. -------------------------------------------------------------------------------
  3868. *}
  3869. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3870. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3871. Var
  3872. aSign: flag;
  3873. aExp, shiftCount: int16;
  3874. aSig0, aSig1, absZ, aSigExtra: bits32;
  3875. z: int32;
  3876. label invalid;
  3877. Begin
  3878. aSig1 := extractFloat64Frac1( a );
  3879. aSig0 := extractFloat64Frac0( a );
  3880. aExp := extractFloat64Exp( a );
  3881. aSign := extractFloat64Sign( a );
  3882. shiftCount := aExp - $413;
  3883. if ( 0 <= shiftCount ) then
  3884. Begin
  3885. if ( $41E < aExp ) then
  3886. Begin
  3887. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3888. aSign := 0;
  3889. goto invalid;
  3890. End;
  3891. shortShift64Left(
  3892. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3893. End
  3894. else
  3895. Begin
  3896. if ( aExp < $3FF ) then
  3897. Begin
  3898. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3899. Begin
  3900. softfloat_exception_flags :=
  3901. softfloat_exception_flags or float_flag_inexact;
  3902. End;
  3903. float64_to_int32_round_to_zero := 0;
  3904. exit;
  3905. End;
  3906. aSig0 := aSig0 or $00100000;
  3907. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3908. absZ := aSig0 shr ( - shiftCount );
  3909. End;
  3910. if aSign <> 0 then
  3911. z := - absZ
  3912. else
  3913. z := absZ;
  3914. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3915. Begin
  3916. invalid:
  3917. float_raise( float_flag_invalid );
  3918. if (aSign <> 0) then
  3919. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3920. else
  3921. float64_to_int32_round_to_zero := $7FFFFFFF;
  3922. exit;
  3923. End;
  3924. if ( aSigExtra <> 0) then
  3925. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3926. float64_to_int32_round_to_zero := z;
  3927. End;
  3928. {*
  3929. -------------------------------------------------------------------------------
  3930. Returns the result of converting the double-precision floating-point value
  3931. `a' to the single-precision floating-point format. The conversion is
  3932. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3933. Arithmetic.
  3934. -------------------------------------------------------------------------------
  3935. *}
  3936. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3937. Var
  3938. aSign: flag;
  3939. aExp: int16;
  3940. aSig0, aSig1, zSig: bits32;
  3941. allZero: bits32;
  3942. tmp : CommonNanT;
  3943. Begin
  3944. aSig1 := extractFloat64Frac1( a );
  3945. aSig0 := extractFloat64Frac0( a );
  3946. aExp := extractFloat64Exp( a );
  3947. aSign := extractFloat64Sign( a );
  3948. if ( aExp = $7FF ) then
  3949. Begin
  3950. if ( aSig0 OR aSig1 ) <> 0 then
  3951. Begin
  3952. float64ToCommonNaN( a, tmp );
  3953. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3954. exit;
  3955. End;
  3956. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3957. exit;
  3958. End;
  3959. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3960. if ( aExp <> 0) then
  3961. zSig := zSig OR $40000000;
  3962. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3963. End;
  3964. {*
  3965. -------------------------------------------------------------------------------
  3966. Rounds the double-precision floating-point value `a' to an integer,
  3967. and returns the result as a double-precision floating-point value. The
  3968. operation is performed according to the IEC/IEEE Standard for Binary
  3969. Floating-Point Arithmetic.
  3970. -------------------------------------------------------------------------------
  3971. *}
  3972. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3973. Var
  3974. aSign: flag;
  3975. aExp: int16;
  3976. lastBitMask, roundBitsMask: bits32;
  3977. roundingMode: int8;
  3978. z: float64;
  3979. Begin
  3980. aExp := extractFloat64Exp( a );
  3981. if ( $413 <= aExp ) then
  3982. Begin
  3983. if ( $433 <= aExp ) then
  3984. Begin
  3985. if ( ( aExp = $7FF )
  3986. AND
  3987. (
  3988. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3989. ) <>0)
  3990. ) then
  3991. Begin
  3992. propagateFloat64NaN( a, a, result );
  3993. exit;
  3994. End;
  3995. result := a;
  3996. exit;
  3997. End;
  3998. lastBitMask := 1;
  3999. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4000. roundBitsMask := lastBitMask - 1;
  4001. z := a;
  4002. roundingMode := softfloat_rounding_mode;
  4003. if ( roundingMode = float_round_nearest_even ) then
  4004. Begin
  4005. if ( lastBitMask <> 0) then
  4006. Begin
  4007. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4008. if ( ( z.low and roundBitsMask ) = 0 ) then
  4009. z.low := z.low and not lastBitMask;
  4010. End
  4011. else
  4012. Begin
  4013. if ( sbits32 (z.low) < 0 ) then
  4014. Begin
  4015. Inc(z.high);
  4016. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4017. z.high := z.high and not 1;
  4018. End;
  4019. End;
  4020. End
  4021. else if ( roundingMode <> float_round_to_zero ) then
  4022. Begin
  4023. if ( extractFloat64Sign( z )
  4024. xor flag( roundingMode = float_round_up ) )<> 0 then
  4025. Begin
  4026. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4027. End;
  4028. End;
  4029. z.low := z.low and not roundBitsMask;
  4030. End
  4031. else
  4032. Begin
  4033. if ( aExp <= $3FE ) then
  4034. Begin
  4035. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4036. Begin
  4037. result := a;
  4038. exit;
  4039. End;
  4040. softfloat_exception_flags := softfloat_exception_flags or
  4041. float_flag_inexact;
  4042. aSign := extractFloat64Sign( a );
  4043. case ( softfloat_rounding_mode ) of
  4044. float_round_nearest_even:
  4045. Begin
  4046. if ( ( aExp = $3FE )
  4047. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4048. ) then
  4049. Begin
  4050. packFloat64( aSign, $3FF, 0, 0, result );
  4051. exit;
  4052. End;
  4053. End;
  4054. float_round_down:
  4055. Begin
  4056. if aSign<>0 then
  4057. packFloat64( 1, $3FF, 0, 0, result )
  4058. else
  4059. packFloat64( 0, 0, 0, 0, result );
  4060. exit;
  4061. End;
  4062. float_round_up:
  4063. Begin
  4064. if aSign <> 0 then
  4065. packFloat64( 1, 0, 0, 0, result )
  4066. else
  4067. packFloat64( 0, $3FF, 0, 0, result );
  4068. exit;
  4069. End;
  4070. end;
  4071. packFloat64( aSign, 0, 0, 0, result );
  4072. exit;
  4073. End;
  4074. lastBitMask := 1;
  4075. lastBitMask := lastBitMask shl ($413 - aExp);
  4076. roundBitsMask := lastBitMask - 1;
  4077. z.low := 0;
  4078. z.high := a.high;
  4079. roundingMode := softfloat_rounding_mode;
  4080. if ( roundingMode = float_round_nearest_even ) then
  4081. Begin
  4082. z.high := z.high + lastBitMask shr 1;
  4083. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4084. Begin
  4085. z.high := z.high and not lastBitMask;
  4086. End;
  4087. End
  4088. else if ( roundingMode <> float_round_to_zero ) then
  4089. Begin
  4090. if ( extractFloat64Sign( z )
  4091. xor flag( roundingMode = float_round_up ) )<> 0 then
  4092. Begin
  4093. z.high := z.high or bits32( a.low <> 0 );
  4094. z.high := z.high + roundBitsMask;
  4095. End;
  4096. End;
  4097. z.high := z.high and not roundBitsMask;
  4098. End;
  4099. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4100. Begin
  4101. softfloat_exception_flags :=
  4102. softfloat_exception_flags or float_flag_inexact;
  4103. End;
  4104. result := z;
  4105. End;
  4106. {*
  4107. -------------------------------------------------------------------------------
  4108. Returns the result of adding the absolute values of the double-precision
  4109. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4110. before being returned. `zSign' is ignored if the result is a NaN.
  4111. The addition is performed according to the IEC/IEEE Standard for Binary
  4112. Floating-Point Arithmetic.
  4113. -------------------------------------------------------------------------------
  4114. *}
  4115. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4116. Var
  4117. aExp, bExp, zExp: int16;
  4118. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4119. expDiff: int16;
  4120. label shiftRight1;
  4121. label roundAndPack;
  4122. Begin
  4123. aSig1 := extractFloat64Frac1( a );
  4124. aSig0 := extractFloat64Frac0( a );
  4125. aExp := extractFloat64Exp( a );
  4126. bSig1 := extractFloat64Frac1( b );
  4127. bSig0 := extractFloat64Frac0( b );
  4128. bExp := extractFloat64Exp( b );
  4129. expDiff := aExp - bExp;
  4130. if ( 0 < expDiff ) then
  4131. Begin
  4132. if ( aExp = $7FF ) then
  4133. Begin
  4134. if ( aSig0 OR aSig1 ) <> 0 then
  4135. Begin
  4136. propagateFloat64NaN( a, b, out );
  4137. exit;
  4138. end;
  4139. out := a;
  4140. exit;
  4141. End;
  4142. if ( bExp = 0 ) then
  4143. Begin
  4144. Dec(expDiff);
  4145. End
  4146. else
  4147. Begin
  4148. bSig0 := bSig0 or $00100000;
  4149. End;
  4150. shift64ExtraRightJamming(
  4151. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4152. zExp := aExp;
  4153. End
  4154. else if ( expDiff < 0 ) then
  4155. Begin
  4156. if ( bExp = $7FF ) then
  4157. Begin
  4158. if ( bSig0 OR bSig1 ) <> 0 then
  4159. Begin
  4160. propagateFloat64NaN( a, b, out );
  4161. exit;
  4162. End;
  4163. packFloat64( zSign, $7FF, 0, 0, out );
  4164. End;
  4165. if ( aExp = 0 ) then
  4166. Begin
  4167. Inc(expDiff);
  4168. End
  4169. else
  4170. Begin
  4171. aSig0 := aSig0 or $00100000;
  4172. End;
  4173. shift64ExtraRightJamming(
  4174. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4175. zExp := bExp;
  4176. End
  4177. else
  4178. Begin
  4179. if ( aExp = $7FF ) then
  4180. Begin
  4181. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4182. Begin
  4183. propagateFloat64NaN( a, b, out );
  4184. exit;
  4185. End;
  4186. out := a;
  4187. exit;
  4188. End;
  4189. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4190. if ( aExp = 0 ) then
  4191. Begin
  4192. packFloat64( zSign, 0, zSig0, zSig1, out );
  4193. exit;
  4194. End;
  4195. zSig2 := 0;
  4196. zSig0 := zSig0 or $00200000;
  4197. zExp := aExp;
  4198. goto shiftRight1;
  4199. End;
  4200. aSig0 := aSig0 or $00100000;
  4201. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4202. Dec(zExp);
  4203. if ( zSig0 < $00200000 ) then
  4204. goto roundAndPack;
  4205. Inc(zExp);
  4206. shiftRight1:
  4207. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4208. roundAndPack:
  4209. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4210. End;
  4211. {*
  4212. -------------------------------------------------------------------------------
  4213. Returns the result of subtracting the absolute values of the double-
  4214. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4215. difference is negated before being returned. `zSign' is ignored if the
  4216. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4217. Standard for Binary Floating-Point Arithmetic.
  4218. -------------------------------------------------------------------------------
  4219. *}
  4220. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4221. Var
  4222. aExp, bExp, zExp: int16;
  4223. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4224. expDiff: int16;
  4225. z: float64;
  4226. label aExpBigger;
  4227. label bExpBigger;
  4228. label aBigger;
  4229. label bBigger;
  4230. label normalizeRoundAndPack;
  4231. Begin
  4232. aSig1 := extractFloat64Frac1( a );
  4233. aSig0 := extractFloat64Frac0( a );
  4234. aExp := extractFloat64Exp( a );
  4235. bSig1 := extractFloat64Frac1( b );
  4236. bSig0 := extractFloat64Frac0( b );
  4237. bExp := extractFloat64Exp( b );
  4238. expDiff := aExp - bExp;
  4239. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4240. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4241. if ( 0 < expDiff ) then goto aExpBigger;
  4242. if ( expDiff < 0 ) then goto bExpBigger;
  4243. if ( aExp = $7FF ) then
  4244. Begin
  4245. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4246. Begin
  4247. propagateFloat64NaN( a, b, out );
  4248. exit;
  4249. End;
  4250. float_raise( float_flag_invalid );
  4251. z.low := float64_default_nan_low;
  4252. z.high := float64_default_nan_high;
  4253. out := z;
  4254. exit;
  4255. End;
  4256. if ( aExp = 0 ) then
  4257. Begin
  4258. aExp := 1;
  4259. bExp := 1;
  4260. End;
  4261. if ( bSig0 < aSig0 ) then goto aBigger;
  4262. if ( aSig0 < bSig0 ) then goto bBigger;
  4263. if ( bSig1 < aSig1 ) then goto aBigger;
  4264. if ( aSig1 < bSig1 ) then goto bBigger;
  4265. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4266. exit;
  4267. bExpBigger:
  4268. if ( bExp = $7FF ) then
  4269. Begin
  4270. if ( bSig0 OR bSig1 ) <> 0 then
  4271. Begin
  4272. propagateFloat64NaN( a, b, out );
  4273. exit;
  4274. End;
  4275. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4276. exit;
  4277. End;
  4278. if ( aExp = 0 ) then
  4279. Begin
  4280. Inc(expDiff);
  4281. End
  4282. else
  4283. Begin
  4284. aSig0 := aSig0 or $40000000;
  4285. End;
  4286. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4287. bSig0 := bSig0 or $40000000;
  4288. bBigger:
  4289. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4290. zExp := bExp;
  4291. zSign := zSign xor 1;
  4292. goto normalizeRoundAndPack;
  4293. aExpBigger:
  4294. if ( aExp = $7FF ) then
  4295. Begin
  4296. if ( aSig0 OR aSig1 ) <> 0 then
  4297. Begin
  4298. propagateFloat64NaN( a, b, out );
  4299. exit;
  4300. End;
  4301. out := a;
  4302. exit;
  4303. End;
  4304. if ( bExp = 0 ) then
  4305. Begin
  4306. Dec(expDiff);
  4307. End
  4308. else
  4309. Begin
  4310. bSig0 := bSig0 or $40000000;
  4311. End;
  4312. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4313. aSig0 := aSig0 or $40000000;
  4314. aBigger:
  4315. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4316. zExp := aExp;
  4317. normalizeRoundAndPack:
  4318. Dec(zExp);
  4319. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4320. End;
  4321. {*
  4322. -------------------------------------------------------------------------------
  4323. Returns the result of adding the double-precision floating-point values `a'
  4324. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4325. Binary Floating-Point Arithmetic.
  4326. -------------------------------------------------------------------------------
  4327. *}
  4328. Function float64_add( a: float64; b : float64) : Float64;
  4329. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4330. Var
  4331. aSign, bSign: flag;
  4332. Begin
  4333. aSign := extractFloat64Sign( a );
  4334. bSign := extractFloat64Sign( b );
  4335. if ( aSign = bSign ) then
  4336. Begin
  4337. addFloat64Sigs( a, b, aSign, result );
  4338. End
  4339. else
  4340. Begin
  4341. subFloat64Sigs( a, b, aSign, result );
  4342. End;
  4343. End;
  4344. {*
  4345. -------------------------------------------------------------------------------
  4346. Returns the result of subtracting the double-precision floating-point values
  4347. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4348. for Binary Floating-Point Arithmetic.
  4349. -------------------------------------------------------------------------------
  4350. *}
  4351. Function float64_sub(a: float64; b : float64) : Float64;
  4352. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4353. Var
  4354. aSign, bSign: flag;
  4355. Begin
  4356. aSign := extractFloat64Sign( a );
  4357. bSign := extractFloat64Sign( b );
  4358. if ( aSign = bSign ) then
  4359. Begin
  4360. subFloat64Sigs( a, b, aSign, result );
  4361. End
  4362. else
  4363. Begin
  4364. addFloat64Sigs( a, b, aSign, result );
  4365. End;
  4366. End;
  4367. {*
  4368. -------------------------------------------------------------------------------
  4369. Returns the result of multiplying the double-precision floating-point values
  4370. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4371. for Binary Floating-Point Arithmetic.
  4372. -------------------------------------------------------------------------------
  4373. *}
  4374. Function float64_mul( a: float64; b:float64) : Float64;
  4375. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4376. Var
  4377. aSign, bSign, zSign: flag;
  4378. aExp, bExp, zExp: int16;
  4379. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4380. z: float64;
  4381. label invalid;
  4382. Begin
  4383. aSig1 := extractFloat64Frac1( a );
  4384. aSig0 := extractFloat64Frac0( a );
  4385. aExp := extractFloat64Exp( a );
  4386. aSign := extractFloat64Sign( a );
  4387. bSig1 := extractFloat64Frac1( b );
  4388. bSig0 := extractFloat64Frac0( b );
  4389. bExp := extractFloat64Exp( b );
  4390. bSign := extractFloat64Sign( b );
  4391. zSign := aSign xor bSign;
  4392. if ( aExp = $7FF ) then
  4393. Begin
  4394. if ( (( aSig0 OR aSig1 ) <>0)
  4395. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4396. Begin
  4397. propagateFloat64NaN( a, b, result );
  4398. exit;
  4399. End;
  4400. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4401. packFloat64( zSign, $7FF, 0, 0, result );
  4402. exit;
  4403. End;
  4404. if ( bExp = $7FF ) then
  4405. Begin
  4406. if ( bSig0 OR bSig1 )<> 0 then
  4407. Begin
  4408. propagateFloat64NaN( a, b, result );
  4409. exit;
  4410. End;
  4411. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4412. Begin
  4413. invalid:
  4414. float_raise( float_flag_invalid );
  4415. z.low := float64_default_nan_low;
  4416. z.high := float64_default_nan_high;
  4417. result := z;
  4418. exit;
  4419. End;
  4420. packFloat64( zSign, $7FF, 0, 0, result );
  4421. exit;
  4422. End;
  4423. if ( aExp = 0 ) then
  4424. Begin
  4425. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4426. Begin
  4427. packFloat64( zSign, 0, 0, 0, result );
  4428. exit;
  4429. End;
  4430. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4431. End;
  4432. if ( bExp = 0 ) then
  4433. Begin
  4434. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4435. Begin
  4436. packFloat64( zSign, 0, 0, 0, result );
  4437. exit;
  4438. End;
  4439. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4440. End;
  4441. zExp := aExp + bExp - $400;
  4442. aSig0 := aSig0 or $00100000;
  4443. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4444. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4445. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4446. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4447. if ( $00200000 <= zSig0 ) then
  4448. Begin
  4449. shift64ExtraRightJamming(
  4450. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4451. Inc(zExp);
  4452. End;
  4453. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4454. End;
  4455. {*
  4456. -------------------------------------------------------------------------------
  4457. Returns the result of dividing the double-precision floating-point value `a'
  4458. by the corresponding value `b'. The operation is performed according to the
  4459. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4460. -------------------------------------------------------------------------------
  4461. *}
  4462. Function float64_div(a: float64; b : float64) : Float64;
  4463. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4464. Var
  4465. aSign, bSign, zSign: flag;
  4466. aExp, bExp, zExp: int16;
  4467. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4468. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4469. z: float64;
  4470. label invalid;
  4471. Begin
  4472. aSig1 := extractFloat64Frac1( a );
  4473. aSig0 := extractFloat64Frac0( a );
  4474. aExp := extractFloat64Exp( a );
  4475. aSign := extractFloat64Sign( a );
  4476. bSig1 := extractFloat64Frac1( b );
  4477. bSig0 := extractFloat64Frac0( b );
  4478. bExp := extractFloat64Exp( b );
  4479. bSign := extractFloat64Sign( b );
  4480. zSign := aSign xor bSign;
  4481. if ( aExp = $7FF ) then
  4482. Begin
  4483. if ( aSig0 OR aSig1 )<> 0 then
  4484. Begin
  4485. propagateFloat64NaN( a, b, result );
  4486. exit;
  4487. end;
  4488. if ( bExp = $7FF ) then
  4489. Begin
  4490. if ( bSig0 OR bSig1 )<>0 then
  4491. Begin
  4492. propagateFloat64NaN( a, b, result );
  4493. exit;
  4494. End;
  4495. goto invalid;
  4496. End;
  4497. packFloat64( zSign, $7FF, 0, 0, result );
  4498. exit;
  4499. End;
  4500. if ( bExp = $7FF ) then
  4501. Begin
  4502. if ( bSig0 OR bSig1 )<> 0 then
  4503. Begin
  4504. propagateFloat64NaN( a, b, result );
  4505. exit;
  4506. End;
  4507. packFloat64( zSign, 0, 0, 0, result );
  4508. exit;
  4509. End;
  4510. if ( bExp = 0 ) then
  4511. Begin
  4512. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4513. Begin
  4514. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4515. Begin
  4516. invalid:
  4517. float_raise( float_flag_invalid );
  4518. z.low := float64_default_nan_low;
  4519. z.high := float64_default_nan_high;
  4520. result := z;
  4521. exit;
  4522. End;
  4523. float_raise( float_flag_divbyzero );
  4524. packFloat64( zSign, $7FF, 0, 0, result );
  4525. exit;
  4526. End;
  4527. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4528. End;
  4529. if ( aExp = 0 ) then
  4530. Begin
  4531. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4532. Begin
  4533. packFloat64( zSign, 0, 0, 0, result );
  4534. exit;
  4535. End;
  4536. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4537. End;
  4538. zExp := aExp - bExp + $3FD;
  4539. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4540. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4541. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4542. Begin
  4543. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4544. Inc(zExp);
  4545. End;
  4546. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4547. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4548. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4549. while ( sbits32 (rem0) < 0 ) do
  4550. Begin
  4551. Dec(zSig0);
  4552. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4553. End;
  4554. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4555. if ( ( zSig1 and $3FF ) <= 4 ) then
  4556. Begin
  4557. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4558. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4559. while ( sbits32 (rem1) < 0 ) do
  4560. Begin
  4561. Dec(zSig1);
  4562. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4563. End;
  4564. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4565. End;
  4566. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4567. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4568. End;
  4569. {*
  4570. -------------------------------------------------------------------------------
  4571. Returns the remainder of the double-precision floating-point value `a'
  4572. with respect to the corresponding value `b'. The operation is performed
  4573. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4574. -------------------------------------------------------------------------------
  4575. *}
  4576. Function float64_rem(a: float64; b : float64) : float64;
  4577. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4578. Var
  4579. aSign, bSign, zSign: flag;
  4580. aExp, bExp, expDiff: int16;
  4581. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4582. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4583. sigMean0: sbits32;
  4584. z: float64;
  4585. label invalid;
  4586. Begin
  4587. aSig1 := extractFloat64Frac1( a );
  4588. aSig0 := extractFloat64Frac0( a );
  4589. aExp := extractFloat64Exp( a );
  4590. aSign := extractFloat64Sign( a );
  4591. bSig1 := extractFloat64Frac1( b );
  4592. bSig0 := extractFloat64Frac0( b );
  4593. bExp := extractFloat64Exp( b );
  4594. bSign := extractFloat64Sign( b );
  4595. if ( aExp = $7FF ) then
  4596. Begin
  4597. if ((( aSig0 OR aSig1 )<>0)
  4598. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4599. Begin
  4600. propagateFloat64NaN( a, b, result );
  4601. exit;
  4602. End;
  4603. goto invalid;
  4604. End;
  4605. if ( bExp = $7FF ) then
  4606. Begin
  4607. if ( bSig0 OR bSig1 ) <> 0 then
  4608. Begin
  4609. propagateFloat64NaN( a, b, result );
  4610. exit;
  4611. End;
  4612. result := a;
  4613. exit;
  4614. End;
  4615. if ( bExp = 0 ) then
  4616. Begin
  4617. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4618. Begin
  4619. invalid:
  4620. float_raise( float_flag_invalid );
  4621. z.low := float64_default_nan_low;
  4622. z.high := float64_default_nan_high;
  4623. result := z;
  4624. exit;
  4625. End;
  4626. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4627. End;
  4628. if ( aExp = 0 ) then
  4629. Begin
  4630. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4631. Begin
  4632. result := a;
  4633. exit;
  4634. End;
  4635. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4636. End;
  4637. expDiff := aExp - bExp;
  4638. if ( expDiff < -1 ) then
  4639. Begin
  4640. result := a;
  4641. exit;
  4642. End;
  4643. shortShift64Left(
  4644. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4645. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4646. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4647. if ( q )<>0 then
  4648. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4649. expDiff := expDiff - 32;
  4650. while ( 0 < expDiff ) do
  4651. Begin
  4652. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4653. if 4 < q then
  4654. q:= q - 4
  4655. else
  4656. q := 0;
  4657. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4658. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4659. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4660. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4661. expDiff := expDiff - 29;
  4662. End;
  4663. if ( -32 < expDiff ) then
  4664. Begin
  4665. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4666. if 4 < q then
  4667. q := q - 4
  4668. else
  4669. q := 0;
  4670. q := q shr (- expDiff);
  4671. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4672. expDiff := expDiff + 24;
  4673. if ( expDiff < 0 ) then
  4674. Begin
  4675. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4676. End
  4677. else
  4678. Begin
  4679. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4680. End;
  4681. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4682. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4683. End
  4684. else
  4685. Begin
  4686. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4687. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4688. End;
  4689. Repeat
  4690. alternateASig0 := aSig0;
  4691. alternateASig1 := aSig1;
  4692. Inc(q);
  4693. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4694. Until not ( 0 <= sbits32 (aSig0) );
  4695. add64(
  4696. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4697. if ( ( sigMean0 < 0 )
  4698. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4699. Begin
  4700. aSig0 := alternateASig0;
  4701. aSig1 := alternateASig1;
  4702. End;
  4703. zSign := flag( sbits32 (aSig0) < 0 );
  4704. if ( zSign <> 0 ) then
  4705. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4706. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4707. End;
  4708. {*
  4709. -------------------------------------------------------------------------------
  4710. Returns the square root of the double-precision floating-point value `a'.
  4711. The operation is performed according to the IEC/IEEE Standard for Binary
  4712. Floating-Point Arithmetic.
  4713. -------------------------------------------------------------------------------
  4714. *}
  4715. Procedure float64_sqrt( a: float64; var out: float64 );
  4716. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4717. Var
  4718. aSign: flag;
  4719. aExp, zExp: int16;
  4720. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4721. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4722. z: float64;
  4723. label invalid;
  4724. Begin
  4725. aSig1 := extractFloat64Frac1( a );
  4726. aSig0 := extractFloat64Frac0( a );
  4727. aExp := extractFloat64Exp( a );
  4728. aSign := extractFloat64Sign( a );
  4729. if ( aExp = $7FF ) then
  4730. Begin
  4731. if ( aSig0 OR aSig1 ) <> 0 then
  4732. Begin
  4733. propagateFloat64NaN( a, a, out );
  4734. exit;
  4735. End;
  4736. if ( aSign = 0) then
  4737. Begin
  4738. out := a;
  4739. exit;
  4740. End;
  4741. goto invalid;
  4742. End;
  4743. if ( aSign <> 0 ) then
  4744. Begin
  4745. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4746. Begin
  4747. out := a;
  4748. exit;
  4749. End;
  4750. invalid:
  4751. float_raise( float_flag_invalid );
  4752. z.low := float64_default_nan_low;
  4753. z.high := float64_default_nan_high;
  4754. out := z;
  4755. exit;
  4756. End;
  4757. if ( aExp = 0 ) then
  4758. Begin
  4759. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4760. Begin
  4761. packFloat64( 0, 0, 0, 0, out );
  4762. exit;
  4763. End;
  4764. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4765. End;
  4766. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4767. aSig0 := aSig0 or $00100000;
  4768. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4769. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4770. if ( zSig0 = 0 ) then
  4771. zSig0 := $7FFFFFFF;
  4772. doubleZSig0 := zSig0 + zSig0;
  4773. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4774. mul32To64( zSig0, zSig0, term0, term1 );
  4775. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4776. while ( sbits32 (rem0) < 0 ) do
  4777. Begin
  4778. Dec(zSig0);
  4779. doubleZSig0 := doubleZSig0 - 2;
  4780. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4781. End;
  4782. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4783. if ( ( zSig1 and $1FF ) <= 5 ) then
  4784. Begin
  4785. if ( zSig1 = 0 ) then
  4786. zSig1 := 1;
  4787. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4788. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4789. mul32To64( zSig1, zSig1, term2, term3 );
  4790. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4791. while ( sbits32 (rem1) < 0 ) do
  4792. Begin
  4793. Dec(zSig1);
  4794. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4795. term3 := term3 or 1;
  4796. term2 := term2 or doubleZSig0;
  4797. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4798. End;
  4799. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4800. End;
  4801. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4802. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4803. End;
  4804. {*
  4805. -------------------------------------------------------------------------------
  4806. Returns 1 if the double-precision floating-point value `a' is equal to
  4807. the corresponding value `b', and 0 otherwise. The comparison is performed
  4808. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4809. -------------------------------------------------------------------------------
  4810. *}
  4811. Function float64_eq(a: float64; b: float64): flag;
  4812. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4813. Begin
  4814. if
  4815. (
  4816. ( extractFloat64Exp( a ) = $7FF )
  4817. AND
  4818. (
  4819. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4820. )
  4821. )
  4822. OR (
  4823. ( extractFloat64Exp( b ) = $7FF )
  4824. AND (
  4825. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4826. )
  4827. )
  4828. ) then
  4829. Begin
  4830. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4831. float_raise( float_flag_invalid );
  4832. float64_eq := 0;
  4833. exit;
  4834. End;
  4835. float64_eq := flag(
  4836. ( a.low = b.low )
  4837. AND ( ( a.high = b.high )
  4838. OR ( ( a.low = 0 )
  4839. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4840. ));
  4841. End;
  4842. {*
  4843. -------------------------------------------------------------------------------
  4844. Returns 1 if the double-precision floating-point value `a' is less than
  4845. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4846. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4847. Arithmetic.
  4848. -------------------------------------------------------------------------------
  4849. *}
  4850. Function float64_le(a: float64;b: float64): flag;
  4851. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4852. Var
  4853. aSign, bSign: flag;
  4854. Begin
  4855. if
  4856. (
  4857. ( extractFloat64Exp( a ) = $7FF )
  4858. AND
  4859. (
  4860. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4861. )
  4862. )
  4863. OR (
  4864. ( extractFloat64Exp( b ) = $7FF )
  4865. AND (
  4866. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4867. )
  4868. )
  4869. ) then
  4870. Begin
  4871. float_raise( float_flag_invalid );
  4872. float64_le := 0;
  4873. exit;
  4874. End;
  4875. aSign := extractFloat64Sign( a );
  4876. bSign := extractFloat64Sign( b );
  4877. if ( aSign <> bSign ) then
  4878. Begin
  4879. float64_le := flag(
  4880. (aSign <> 0)
  4881. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4882. = 0 ));
  4883. exit;
  4884. End;
  4885. if aSign <> 0 then
  4886. float64_le := le64( b.high, b.low, a.high, a.low )
  4887. else
  4888. float64_le := le64( a.high, a.low, b.high, b.low );
  4889. End;
  4890. {*
  4891. -------------------------------------------------------------------------------
  4892. Returns 1 if the double-precision floating-point value `a' is less than
  4893. the corresponding value `b', and 0 otherwise. The comparison is performed
  4894. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4895. -------------------------------------------------------------------------------
  4896. *}
  4897. Function float64_lt(a: float64;b: float64): flag;
  4898. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4899. Var
  4900. aSign, bSign: flag;
  4901. Begin
  4902. if
  4903. (
  4904. ( extractFloat64Exp( a ) = $7FF )
  4905. AND
  4906. (
  4907. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4908. )
  4909. )
  4910. OR (
  4911. ( extractFloat64Exp( b ) = $7FF )
  4912. AND (
  4913. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4914. )
  4915. )
  4916. ) then
  4917. Begin
  4918. float_raise( float_flag_invalid );
  4919. float64_lt := 0;
  4920. exit;
  4921. End;
  4922. aSign := extractFloat64Sign( a );
  4923. bSign := extractFloat64Sign( b );
  4924. if ( aSign <> bSign ) then
  4925. Begin
  4926. float64_lt := flag(
  4927. (aSign <> 0)
  4928. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4929. <> 0 ));
  4930. exit;
  4931. End;
  4932. if aSign <> 0 then
  4933. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4934. else
  4935. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4936. End;
  4937. {*
  4938. -------------------------------------------------------------------------------
  4939. Returns 1 if the double-precision floating-point value `a' is equal to
  4940. the corresponding value `b', and 0 otherwise. The invalid exception is
  4941. raised if either operand is a NaN. Otherwise, the comparison is performed
  4942. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4943. -------------------------------------------------------------------------------
  4944. *}
  4945. Function float64_eq_signaling( a: float64; b: float64): flag;
  4946. Begin
  4947. if
  4948. (
  4949. ( extractFloat64Exp( a ) = $7FF )
  4950. AND
  4951. (
  4952. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4953. )
  4954. )
  4955. OR (
  4956. ( extractFloat64Exp( b ) = $7FF )
  4957. AND (
  4958. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4959. )
  4960. )
  4961. ) then
  4962. Begin
  4963. float_raise( float_flag_invalid );
  4964. float64_eq_signaling := 0;
  4965. exit;
  4966. End;
  4967. float64_eq_signaling := flag(
  4968. ( a.low = b.low )
  4969. AND ( ( a.high = b.high )
  4970. OR ( ( a.low = 0 )
  4971. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4972. ));
  4973. End;
  4974. {*
  4975. -------------------------------------------------------------------------------
  4976. Returns 1 if the double-precision floating-point value `a' is less than or
  4977. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4978. cause an exception. Otherwise, the comparison is performed according to the
  4979. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4980. -------------------------------------------------------------------------------
  4981. *}
  4982. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4983. Var
  4984. aSign, bSign : flag;
  4985. Begin
  4986. if
  4987. (
  4988. ( extractFloat64Exp( a ) = $7FF )
  4989. AND
  4990. (
  4991. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4992. )
  4993. )
  4994. OR (
  4995. ( extractFloat64Exp( b ) = $7FF )
  4996. AND (
  4997. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4998. )
  4999. )
  5000. ) then
  5001. Begin
  5002. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5003. float_raise( float_flag_invalid );
  5004. float64_le_quiet := 0;
  5005. exit;
  5006. End;
  5007. aSign := extractFloat64Sign( a );
  5008. bSign := extractFloat64Sign( b );
  5009. if ( aSign <> bSign ) then
  5010. Begin
  5011. float64_le_quiet := flag
  5012. ((aSign <> 0)
  5013. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5014. = 0 ));
  5015. exit;
  5016. End;
  5017. if aSign <> 0 then
  5018. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5019. else
  5020. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5021. End;
  5022. {*
  5023. -------------------------------------------------------------------------------
  5024. Returns 1 if the double-precision floating-point value `a' is less than
  5025. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5026. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5027. Standard for Binary Floating-Point Arithmetic.
  5028. -------------------------------------------------------------------------------
  5029. *}
  5030. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5031. Var
  5032. aSign, bSign: flag;
  5033. Begin
  5034. if
  5035. (
  5036. ( extractFloat64Exp( a ) = $7FF )
  5037. AND
  5038. (
  5039. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5040. )
  5041. )
  5042. OR (
  5043. ( extractFloat64Exp( b ) = $7FF )
  5044. AND (
  5045. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5046. )
  5047. )
  5048. ) then
  5049. Begin
  5050. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5051. float_raise( float_flag_invalid );
  5052. float64_lt_quiet := 0;
  5053. exit;
  5054. End;
  5055. aSign := extractFloat64Sign( a );
  5056. bSign := extractFloat64Sign( b );
  5057. if ( aSign <> bSign ) then
  5058. Begin
  5059. float64_lt_quiet := flag(
  5060. (aSign<>0)
  5061. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5062. <> 0 ));
  5063. exit;
  5064. End;
  5065. If aSign <> 0 then
  5066. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5067. else
  5068. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5069. End;
  5070. {*----------------------------------------------------------------------------
  5071. | Returns the result of converting the 64-bit two's complement integer `a'
  5072. | to the single-precision floating-point format. The conversion is performed
  5073. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5074. *----------------------------------------------------------------------------*}
  5075. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5076. var
  5077. zSign : flag;
  5078. absA : uint64;
  5079. shiftCount: int8;
  5080. zSig : bits32;
  5081. intval : int64rec;
  5082. Begin
  5083. if ( a = 0 ) then
  5084. begin
  5085. int64_to_float32.float32 := 0;
  5086. exit;
  5087. end;
  5088. if a < 0 then
  5089. zSign := flag(TRUE)
  5090. else
  5091. zSign := flag(FALSE);
  5092. if zSign<>0 then
  5093. absA := -a
  5094. else
  5095. absA := a;
  5096. shiftCount := countLeadingZeros64( absA ) - 40;
  5097. if ( 0 <= shiftCount ) then
  5098. begin
  5099. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5100. end
  5101. else
  5102. begin
  5103. shiftCount := shiftCount + 7;
  5104. if ( shiftCount < 0 ) then
  5105. begin
  5106. intval.low := int64rec(AbsA).low;
  5107. intval.high := int64rec(AbsA).high;
  5108. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5109. intval.low, intval.high);
  5110. int64rec(absA).low := intval.low;
  5111. int64rec(absA).high := intval.high;
  5112. end
  5113. else
  5114. absA := absA shl shiftCount;
  5115. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5116. end;
  5117. End;
  5118. {*----------------------------------------------------------------------------
  5119. | Returns the result of converting the 64-bit two's complement integer `a'
  5120. | to the single-precision floating-point format. The conversion is performed
  5121. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5122. | Unisgned version.
  5123. *----------------------------------------------------------------------------*}
  5124. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5125. var
  5126. zSign : flag;
  5127. absA : uint64;
  5128. shiftCount: int8;
  5129. zSig : bits32;
  5130. intval : int64rec;
  5131. Begin
  5132. if ( a = 0 ) then
  5133. begin
  5134. qword_to_float32.float32 := 0;
  5135. exit;
  5136. end;
  5137. zSign := flag(FALSE);
  5138. absA := a;
  5139. shiftCount := countLeadingZeros64( absA ) - 40;
  5140. if ( 0 <= shiftCount ) then
  5141. begin
  5142. qword_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5143. end
  5144. else
  5145. begin
  5146. shiftCount := shiftCount + 7;
  5147. if ( shiftCount < 0 ) then
  5148. begin
  5149. intval.low := int64rec(AbsA).low;
  5150. intval.high := int64rec(AbsA).high;
  5151. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5152. intval.low, intval.high);
  5153. int64rec(absA).low := intval.low;
  5154. int64rec(absA).high := intval.high;
  5155. end
  5156. else
  5157. absA := absA shl shiftCount;
  5158. qword_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5159. end;
  5160. End;
  5161. {*----------------------------------------------------------------------------
  5162. | Returns the result of converting the 64-bit two's complement integer `a'
  5163. | to the double-precision floating-point format. The conversion is performed
  5164. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5165. *----------------------------------------------------------------------------*}
  5166. function qword_to_float64( a: qword ): float64;
  5167. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5168. var
  5169. zSign : flag;
  5170. float_result : float64;
  5171. intval : int64rec;
  5172. AbsA : bits64;
  5173. shiftcount : int8;
  5174. zSig0, zSig1 : bits32;
  5175. Begin
  5176. if ( a = 0 ) then
  5177. Begin
  5178. packFloat64( 0, 0, 0, 0, result );
  5179. exit;
  5180. end;
  5181. zSign := flag(FALSE);
  5182. AbsA := a;
  5183. shiftCount := countLeadingZeros64( absA ) - 11;
  5184. if ( 0 <= shiftCount ) then
  5185. Begin
  5186. absA := absA shl shiftcount;
  5187. zSig0:=int64rec(absA).high;
  5188. zSig1:=int64rec(absA).low;
  5189. End
  5190. else
  5191. Begin
  5192. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5193. - shiftCount, zSig0, zSig1 );
  5194. End;
  5195. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5196. qword_to_float64:= float_result;
  5197. End;
  5198. {*----------------------------------------------------------------------------
  5199. | Returns the result of converting the 64-bit two's complement integer `a'
  5200. | to the double-precision floating-point format. The conversion is performed
  5201. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5202. *----------------------------------------------------------------------------*}
  5203. function int64_to_float64( a: int64 ): float64;
  5204. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5205. var
  5206. zSign : flag;
  5207. float_result : float64;
  5208. intval : int64rec;
  5209. AbsA : bits64;
  5210. shiftcount : int8;
  5211. zSig0, zSig1 : bits32;
  5212. Begin
  5213. if ( a = 0 ) then
  5214. Begin
  5215. packFloat64( 0, 0, 0, 0, result );
  5216. exit;
  5217. end;
  5218. zSign := flag( a < 0 );
  5219. if ZSign<>0 then
  5220. AbsA := -a
  5221. else
  5222. AbsA := a;
  5223. shiftCount := countLeadingZeros64( absA ) - 11;
  5224. if ( 0 <= shiftCount ) then
  5225. Begin
  5226. absA := absA shl shiftcount;
  5227. zSig0:=int64rec(absA).high;
  5228. zSig1:=int64rec(absA).low;
  5229. End
  5230. else
  5231. Begin
  5232. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5233. - shiftCount, zSig0, zSig1 );
  5234. End;
  5235. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5236. int64_to_float64:= float_result;
  5237. End;
  5238. {*----------------------------------------------------------------------------
  5239. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5240. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5241. | Otherwise, returns 0.
  5242. *----------------------------------------------------------------------------*}
  5243. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5244. begin
  5245. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5246. end;
  5247. {*----------------------------------------------------------------------------
  5248. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5249. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5250. | Otherwise, returns 0.
  5251. *----------------------------------------------------------------------------*}
  5252. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5253. begin
  5254. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5255. end;
  5256. {*----------------------------------------------------------------------------
  5257. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5258. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5259. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5260. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5261. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5262. | the most-significant bit of the extra result, and the other 63 bits of the
  5263. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5264. | were all zero. This extra result is stored in the location pointed to by
  5265. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5266. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5267. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5268. | fixed-point value is shifted right by the number of bits given in `count',
  5269. | and the integer part of the result is returned at the locations pointed to
  5270. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5271. | corrupted as described above, and is returned at the location pointed to by
  5272. | `z2Ptr'.)
  5273. *----------------------------------------------------------------------------*}
  5274. procedure shift128ExtraRightJamming(
  5275. a0: bits64;
  5276. a1: bits64;
  5277. a2: bits64;
  5278. count: int16;
  5279. var z0Ptr: bits64;
  5280. var z1Ptr: bits64;
  5281. var z2Ptr: bits64);
  5282. var
  5283. z0, z1, z2: bits64;
  5284. negCount: int8;
  5285. begin
  5286. negCount := ( - count ) and 63;
  5287. if ( count = 0 ) then
  5288. begin
  5289. z2 := a2;
  5290. z1 := a1;
  5291. z0 := a0;
  5292. end
  5293. else begin
  5294. if ( count < 64 ) then
  5295. begin
  5296. z2 := a1 shr negCount;
  5297. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5298. z0 := a0 shr count;
  5299. end
  5300. else begin
  5301. if ( count = 64 ) then
  5302. begin
  5303. z2 := a1;
  5304. z1 := a0;
  5305. end
  5306. else begin
  5307. a2 := a2 or a1;
  5308. if ( count < 128 ) then
  5309. begin
  5310. z2 := a0 shl negCount;
  5311. z1 := a0 shr ( count and 63 );
  5312. end
  5313. else begin
  5314. if ( count = 128 ) then
  5315. z2 := a0
  5316. else
  5317. z2 := ord( a0 <> 0 );
  5318. z1 := 0;
  5319. end;
  5320. end;
  5321. z0 := 0;
  5322. end;
  5323. z2 := z2 or ord( a2 <> 0 );
  5324. end;
  5325. z2Ptr := z2;
  5326. z1Ptr := z1;
  5327. z0Ptr := z0;
  5328. end;
  5329. {*----------------------------------------------------------------------------
  5330. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5331. | _plus_ the number of bits given in `count'. The shifted result is at most
  5332. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5333. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5334. | shifted off is the most-significant bit of the extra result, and the other
  5335. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5336. | bits shifted off were all zero. This extra result is stored in the location
  5337. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5338. | (This routine makes more sense if `a0' and `a1' are considered to form
  5339. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5340. | point value is shifted right by the number of bits given in `count', and
  5341. | the integer part of the result is returned at the location pointed to by
  5342. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5343. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5344. *----------------------------------------------------------------------------*}
  5345. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5346. var
  5347. z0, z1: bits64;
  5348. negCount: int8;
  5349. begin
  5350. negCount := ( - count ) and 63;
  5351. if ( count = 0 ) then
  5352. begin
  5353. z1 := a1;
  5354. z0 := a0;
  5355. end
  5356. else if ( count < 64 ) then
  5357. begin
  5358. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5359. z0 := a0 shr count;
  5360. end
  5361. else begin
  5362. if ( count = 64 ) then
  5363. begin
  5364. z1 := a0 or ord( a1 <> 0 );
  5365. end
  5366. else begin
  5367. z1 := ord( ( a0 or a1 ) <> 0 );
  5368. end;
  5369. z0 := 0;
  5370. end;
  5371. z1Ptr := z1;
  5372. z0Ptr := z0;
  5373. end;
  5374. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5375. {*----------------------------------------------------------------------------
  5376. | Returns the fraction bits of the extended double-precision floating-point
  5377. | value `a'.
  5378. *----------------------------------------------------------------------------*}
  5379. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5380. begin
  5381. result:=a.low;
  5382. end;
  5383. {*----------------------------------------------------------------------------
  5384. | Returns the exponent bits of the extended double-precision floating-point
  5385. | value `a'.
  5386. *----------------------------------------------------------------------------*}
  5387. function extractFloatx80Exp(a : floatx80): int32;inline;
  5388. begin
  5389. result:=a.high and $7FFF;
  5390. end;
  5391. {*----------------------------------------------------------------------------
  5392. | Returns the sign bit of the extended double-precision floating-point value
  5393. | `a'.
  5394. *----------------------------------------------------------------------------*}
  5395. function extractFloatx80Sign(a : floatx80): flag;inline;
  5396. begin
  5397. result:=a.high shr 15;
  5398. end;
  5399. {*----------------------------------------------------------------------------
  5400. | Normalizes the subnormal extended double-precision floating-point value
  5401. | represented by the denormalized significand `aSig'. The normalized exponent
  5402. | and significand are stored at the locations pointed to by `zExpPtr' and
  5403. | `zSigPtr', respectively.
  5404. *----------------------------------------------------------------------------*}
  5405. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5406. var
  5407. shiftCount: int8;
  5408. begin
  5409. shiftCount := countLeadingZeros64( aSig );
  5410. zSigPtr := aSig shl shiftCount;
  5411. zExpPtr := 1 - shiftCount;
  5412. end;
  5413. {*----------------------------------------------------------------------------
  5414. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5415. | extended double-precision floating-point value, returning the result.
  5416. *----------------------------------------------------------------------------*}
  5417. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5418. var
  5419. z: floatx80;
  5420. begin
  5421. z.low := zSig;
  5422. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5423. result:=z;
  5424. end;
  5425. {*----------------------------------------------------------------------------
  5426. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5427. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5428. | and returns the proper extended double-precision floating-point value
  5429. | corresponding to the abstract input. Ordinarily, the abstract value is
  5430. | rounded and packed into the extended double-precision format, with the
  5431. | inexact exception raised if the abstract input cannot be represented
  5432. | exactly. However, if the abstract value is too large, the overflow and
  5433. | inexact exceptions are raised and an infinity or maximal finite value is
  5434. | returned. If the abstract value is too small, the input value is rounded to
  5435. | a subnormal number, and the underflow and inexact exceptions are raised if
  5436. | the abstract input cannot be represented exactly as a subnormal extended
  5437. | double-precision floating-point number.
  5438. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5439. | number of bits as single or double precision, respectively. Otherwise, the
  5440. | result is rounded to the full precision of the extended double-precision
  5441. | format.
  5442. | The input significand must be normalized or smaller. If the input
  5443. | significand is not normalized, `zExp' must be 0; in that case, the result
  5444. | returned is a subnormal number, and it must not require rounding. The
  5445. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5446. | Floating-Point Arithmetic.
  5447. *----------------------------------------------------------------------------*}
  5448. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5449. var
  5450. roundingMode: int8;
  5451. roundNearestEven, increment, isTiny: flag;
  5452. roundIncrement, roundMask, roundBits: int64;
  5453. label
  5454. precision80;
  5455. begin
  5456. roundingMode := softfloat_rounding_mode;
  5457. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5458. if ( roundingPrecision = 80 ) then
  5459. goto precision80;
  5460. if ( roundingPrecision = 64 ) then
  5461. begin
  5462. roundIncrement := int64( $0000000000000400 );
  5463. roundMask := int64( $00000000000007FF );
  5464. end
  5465. else if ( roundingPrecision = 32 ) then
  5466. begin
  5467. roundIncrement := int64( $0000008000000000 );
  5468. roundMask := int64( $000000FFFFFFFFFF );
  5469. end
  5470. else begin
  5471. goto precision80;
  5472. end;
  5473. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5474. if ( not (roundNearestEven<>0) ) then
  5475. begin
  5476. if ( roundingMode = float_round_to_zero ) then
  5477. begin
  5478. roundIncrement := 0;
  5479. end
  5480. else begin
  5481. roundIncrement := roundMask;
  5482. if ( zSign<>0 ) then
  5483. begin
  5484. if ( roundingMode = float_round_up ) then
  5485. roundIncrement := 0;
  5486. end
  5487. else begin
  5488. if ( roundingMode = float_round_down ) then
  5489. roundIncrement := 0;
  5490. end;
  5491. end;
  5492. end;
  5493. roundBits := zSig0 and roundMask;
  5494. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5495. if ( ( $7FFE < zExp )
  5496. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5497. ) begin
  5498. goto overflow;
  5499. end;
  5500. if ( zExp <= 0 ) begin
  5501. isTiny =
  5502. ( float_detect_tininess = float_tininess_before_rounding )
  5503. or ( zExp < 0 )
  5504. or ( zSig0 <= zSig0 + roundIncrement );
  5505. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5506. zExp := 0;
  5507. roundBits := zSig0 and roundMask;
  5508. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5509. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5510. zSig0 += roundIncrement;
  5511. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5512. roundIncrement := roundMask + 1;
  5513. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5514. roundMask |= roundIncrement;
  5515. end;
  5516. zSig0 = ~ roundMask;
  5517. result:=packFloatx80( zSign, zExp, zSig0 );
  5518. end;
  5519. end;
  5520. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5521. zSig0 += roundIncrement;
  5522. if ( zSig0 < roundIncrement ) begin
  5523. ++zExp;
  5524. zSig0 := LIT64( $8000000000000000 );
  5525. end;
  5526. roundIncrement := roundMask + 1;
  5527. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5528. roundMask |= roundIncrement;
  5529. end;
  5530. zSig0 = ~ roundMask;
  5531. if ( zSig0 = 0 ) zExp := 0;
  5532. result:=packFloatx80( zSign, zExp, zSig0 );
  5533. precision80:
  5534. increment := ( (sbits64) zSig1 < 0 );
  5535. if ( ! roundNearestEven ) begin
  5536. if ( roundingMode = float_round_to_zero ) begin
  5537. increment := 0;
  5538. end;
  5539. else begin
  5540. if ( zSign ) begin
  5541. increment := ( roundingMode = float_round_down ) and zSig1;
  5542. end;
  5543. else begin
  5544. increment := ( roundingMode = float_round_up ) and zSig1;
  5545. end;
  5546. end;
  5547. end;
  5548. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5549. if ( ( $7FFE < zExp )
  5550. or ( ( zExp = $7FFE )
  5551. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5552. and increment
  5553. )
  5554. ) begin
  5555. roundMask := 0;
  5556. overflow:
  5557. float_raise( float_flag_overflow or float_flag_inexact );
  5558. if ( ( roundingMode = float_round_to_zero )
  5559. or ( zSign and ( roundingMode = float_round_up ) )
  5560. or ( ! zSign and ( roundingMode = float_round_down ) )
  5561. ) begin
  5562. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5563. end;
  5564. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5565. end;
  5566. if ( zExp <= 0 ) begin
  5567. isTiny =
  5568. ( float_detect_tininess = float_tininess_before_rounding )
  5569. or ( zExp < 0 )
  5570. or ! increment
  5571. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5572. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5573. zExp := 0;
  5574. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5575. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5576. if ( roundNearestEven ) begin
  5577. increment := ( (sbits64) zSig1 < 0 );
  5578. end;
  5579. else begin
  5580. if ( zSign ) begin
  5581. increment := ( roundingMode = float_round_down ) and zSig1;
  5582. end;
  5583. else begin
  5584. increment := ( roundingMode = float_round_up ) and zSig1;
  5585. end;
  5586. end;
  5587. if ( increment ) begin
  5588. ++zSig0;
  5589. zSig0 =
  5590. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5591. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5592. end;
  5593. result:=packFloatx80( zSign, zExp, zSig0 );
  5594. end;
  5595. end;
  5596. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5597. if ( increment ) begin
  5598. ++zSig0;
  5599. if ( zSig0 = 0 ) begin
  5600. ++zExp;
  5601. zSig0 := LIT64( $8000000000000000 );
  5602. end;
  5603. else begin
  5604. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5605. end;
  5606. end;
  5607. else begin
  5608. if ( zSig0 = 0 ) zExp := 0;
  5609. end;
  5610. result:=packFloatx80( zSign, zExp, zSig0 );
  5611. end;
  5612. {*----------------------------------------------------------------------------
  5613. | Takes an abstract floating-point value having sign `zSign', exponent
  5614. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5615. | and returns the proper extended double-precision floating-point value
  5616. | corresponding to the abstract input. This routine is just like
  5617. | `roundAndPackFloatx80' except that the input significand does not have to be
  5618. | normalized.
  5619. *----------------------------------------------------------------------------*}
  5620. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5621. var
  5622. shiftCount: int8;
  5623. begin
  5624. if ( zSig0 = 0 ) begin
  5625. zSig0 := zSig1;
  5626. zSig1 := 0;
  5627. zExp -= 64;
  5628. end;
  5629. shiftCount := countLeadingZeros64( zSig0 );
  5630. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5631. zExp := eExp - shiftCount;
  5632. return
  5633. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5634. end;
  5635. {*----------------------------------------------------------------------------
  5636. | Returns the result of converting the extended double-precision floating-
  5637. | point value `a' to the 32-bit two's complement integer format. The
  5638. | conversion is performed according to the IEC/IEEE Standard for Binary
  5639. | Floating-Point Arithmetic---which means in particular that the conversion
  5640. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5641. | largest positive integer is returned. Otherwise, if the conversion
  5642. | overflows, the largest integer with the same sign as `a' is returned.
  5643. *----------------------------------------------------------------------------*}
  5644. function floatx80_to_int32(a: floatx80): int32;
  5645. var
  5646. aSign: flag;
  5647. aExp, shiftCount: int32;
  5648. aSig: bits64;
  5649. begin
  5650. aSig := extractFloatx80Frac( a );
  5651. aExp := extractFloatx80Exp( a );
  5652. aSign := extractFloatx80Sign( a );
  5653. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5654. shiftCount := $4037 - aExp;
  5655. if ( shiftCount <= 0 ) shiftCount := 1;
  5656. shift64RightJamming( aSig, shiftCount, aSig );
  5657. result := roundAndPackInt32( aSign, aSig );
  5658. end;
  5659. {*----------------------------------------------------------------------------
  5660. | Returns the result of converting the extended double-precision floating-
  5661. | point value `a' to the 32-bit two's complement integer format. The
  5662. | conversion is performed according to the IEC/IEEE Standard for Binary
  5663. | Floating-Point Arithmetic, except that the conversion is always rounded
  5664. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5665. | Otherwise, if the conversion overflows, the largest integer with the same
  5666. | sign as `a' is returned.
  5667. *----------------------------------------------------------------------------*}
  5668. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5669. var
  5670. aSign: flag;
  5671. aExp, shiftCount: int32;
  5672. aSig, savedASig: bits64;
  5673. z: int32;
  5674. begin
  5675. aSig := extractFloatx80Frac( a );
  5676. aExp := extractFloatx80Exp( a );
  5677. aSign := extractFloatx80Sign( a );
  5678. if ( $401E < aExp ) begin
  5679. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5680. goto invalid;
  5681. end;
  5682. else if ( aExp < $3FFF ) begin
  5683. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5684. result := 0;
  5685. end;
  5686. shiftCount := $403E - aExp;
  5687. savedASig := aSig;
  5688. aSig >>= shiftCount;
  5689. z := aSig;
  5690. if ( aSign ) z := - z;
  5691. if ( ( z < 0 ) xor aSign ) begin
  5692. invalid:
  5693. float_raise( float_flag_invalid );
  5694. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5695. end;
  5696. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5697. softfloat_exception_flags or= float_flag_inexact;
  5698. end;
  5699. result := z;
  5700. end;
  5701. {*----------------------------------------------------------------------------
  5702. | Returns the result of converting the extended double-precision floating-
  5703. | point value `a' to the 64-bit two's complement integer format. The
  5704. | conversion is performed according to the IEC/IEEE Standard for Binary
  5705. | Floating-Point Arithmetic---which means in particular that the conversion
  5706. | is rounded according to the current rounding mode. If `a' is a NaN,
  5707. | the largest positive integer is returned. Otherwise, if the conversion
  5708. | overflows, the largest integer with the same sign as `a' is returned.
  5709. *----------------------------------------------------------------------------*}
  5710. function floatx80_to_int64(a: floatx80): int64;
  5711. var
  5712. aSign: flag;
  5713. aExp, shiftCount: int32;
  5714. aSig, aSigExtra: bits64;
  5715. begin
  5716. aSig := extractFloatx80Frac( a );
  5717. aExp := extractFloatx80Exp( a );
  5718. aSign := extractFloatx80Sign( a );
  5719. shiftCount := $403E - aExp;
  5720. if ( shiftCount <= 0 ) begin
  5721. if ( shiftCount ) begin
  5722. float_raise( float_flag_invalid );
  5723. if ( ! aSign
  5724. or ( ( aExp = $7FFF )
  5725. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5726. ) begin
  5727. result := LIT64( $7FFFFFFFFFFFFFFF );
  5728. end;
  5729. result := (sbits64) LIT64( $8000000000000000 );
  5730. end;
  5731. aSigExtra := 0;
  5732. end;
  5733. else begin
  5734. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5735. end;
  5736. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5737. end;
  5738. {*----------------------------------------------------------------------------
  5739. | Returns the result of converting the extended double-precision floating-
  5740. | point value `a' to the 64-bit two's complement integer format. The
  5741. | conversion is performed according to the IEC/IEEE Standard for Binary
  5742. | Floating-Point Arithmetic, except that the conversion is always rounded
  5743. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5744. | Otherwise, if the conversion overflows, the largest integer with the same
  5745. | sign as `a' is returned.
  5746. *----------------------------------------------------------------------------*}
  5747. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5748. var
  5749. aSign: flag;
  5750. aExp, shiftCount: int32;
  5751. aSig: bits64;
  5752. z: int64;
  5753. begin
  5754. aSig := extractFloatx80Frac( a );
  5755. aExp := extractFloatx80Exp( a );
  5756. aSign := extractFloatx80Sign( a );
  5757. shiftCount := aExp - $403E;
  5758. if ( 0 <= shiftCount ) begin
  5759. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5760. if ( ( a.high <> $C03E ) or aSig ) begin
  5761. float_raise( float_flag_invalid );
  5762. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5763. result := LIT64( $7FFFFFFFFFFFFFFF );
  5764. end;
  5765. end;
  5766. result := (sbits64) LIT64( $8000000000000000 );
  5767. end;
  5768. else if ( aExp < $3FFF ) begin
  5769. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5770. result := 0;
  5771. end;
  5772. z := aSig>>( - shiftCount );
  5773. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5774. softfloat_exception_flags or= float_flag_inexact;
  5775. end;
  5776. if ( aSign ) z := - z;
  5777. result := z;
  5778. end;
  5779. {*----------------------------------------------------------------------------
  5780. | Returns the result of converting the extended double-precision floating-
  5781. | point value `a' to the single-precision floating-point format. The
  5782. | conversion is performed according to the IEC/IEEE Standard for Binary
  5783. | Floating-Point Arithmetic.
  5784. *----------------------------------------------------------------------------*}
  5785. function floatx80_to_float32(a: floatx80): float32;
  5786. var
  5787. aSign: flag;
  5788. aExp: int32;
  5789. aSig: bits64;
  5790. begin
  5791. aSig := extractFloatx80Frac( a );
  5792. aExp := extractFloatx80Exp( a );
  5793. aSign := extractFloatx80Sign( a );
  5794. if ( aExp = $7FFF ) begin
  5795. if ( (bits64) ( aSig shl 1 ) ) begin
  5796. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5797. end;
  5798. result := packFloat32( aSign, $FF, 0 );
  5799. end;
  5800. shift64RightJamming( aSig, 33, aSig );
  5801. if ( aExp or aSig ) aExp -= $3F81;
  5802. result := roundAndPackFloat32( aSign, aExp, aSig );
  5803. end;
  5804. {*----------------------------------------------------------------------------
  5805. | Returns the result of converting the extended double-precision floating-
  5806. | point value `a' to the double-precision floating-point format. The
  5807. | conversion is performed according to the IEC/IEEE Standard for Binary
  5808. | Floating-Point Arithmetic.
  5809. *----------------------------------------------------------------------------*}
  5810. function floatx80_to_float64(a: floatx80): float64;
  5811. var
  5812. aSign: flag;
  5813. aExp: int32;
  5814. aSig, zSig: bits64;
  5815. begin
  5816. aSig := extractFloatx80Frac( a );
  5817. aExp := extractFloatx80Exp( a );
  5818. aSign := extractFloatx80Sign( a );
  5819. if ( aExp = $7FFF ) begin
  5820. if ( (bits64) ( aSig shl 1 ) ) begin
  5821. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5822. end;
  5823. result := packFloat64( aSign, $7FF, 0 );
  5824. end;
  5825. shift64RightJamming( aSig, 1, zSig );
  5826. if ( aExp or aSig ) aExp -= $3C01;
  5827. result := roundAndPackFloat64( aSign, aExp, zSig );
  5828. end;
  5829. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5830. {*----------------------------------------------------------------------------
  5831. | Returns the result of converting the extended double-precision floating-
  5832. | point value `a' to the quadruple-precision floating-point format. The
  5833. | conversion is performed according to the IEC/IEEE Standard for Binary
  5834. | Floating-Point Arithmetic.
  5835. *----------------------------------------------------------------------------*}
  5836. function floatx80_to_float128(a: floatx80): float128;
  5837. var
  5838. aSign: flag;
  5839. aExp: int16;
  5840. aSig, zSig0, zSig1: bits64;
  5841. begin
  5842. aSig := extractFloatx80Frac( a );
  5843. aExp := extractFloatx80Exp( a );
  5844. aSign := extractFloatx80Sign( a );
  5845. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5846. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5847. end;
  5848. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5849. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5850. end;
  5851. {$endif FPC_SOFTFLOAT_FLOAT128}
  5852. {*----------------------------------------------------------------------------
  5853. | Rounds the extended double-precision floating-point value `a' to an integer,
  5854. | and Returns the result as an extended quadruple-precision floating-point
  5855. | value. The operation is performed according to the IEC/IEEE Standard for
  5856. | Binary Floating-Point Arithmetic.
  5857. *----------------------------------------------------------------------------*}
  5858. function floatx80_round_to_int(a: floatx80): floatx80;
  5859. var
  5860. aSign: flag;
  5861. aExp: int32;
  5862. lastBitMask, roundBitsMask: bits64;
  5863. roundingMode: int8;
  5864. z: floatx80;
  5865. begin
  5866. aExp := extractFloatx80Exp( a );
  5867. if ( $403E <= aExp ) begin
  5868. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5869. result := propagateFloatx80NaN( a, a );
  5870. end;
  5871. result := a;
  5872. end;
  5873. if ( aExp < $3FFF ) begin
  5874. if ( ( aExp = 0 )
  5875. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5876. result := a;
  5877. end;
  5878. softfloat_exception_flags or= float_flag_inexact;
  5879. aSign := extractFloatx80Sign( a );
  5880. switch ( softfloat_rounding_mode ) begin
  5881. case float_round_nearest_even:
  5882. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5883. ) begin
  5884. result :=
  5885. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5886. end;
  5887. break;
  5888. case float_round_down:
  5889. result :=
  5890. aSign ?
  5891. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5892. : packFloatx80( 0, 0, 0 );
  5893. case float_round_up:
  5894. result :=
  5895. aSign ? packFloatx80( 1, 0, 0 )
  5896. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5897. end;
  5898. result := packFloatx80( aSign, 0, 0 );
  5899. end;
  5900. lastBitMask := 1;
  5901. lastBitMask shl = $403E - aExp;
  5902. roundBitsMask := lastBitMask - 1;
  5903. z := a;
  5904. roundingMode := softfloat_rounding_mode;
  5905. if ( roundingMode = float_round_nearest_even ) begin
  5906. z.low += lastBitMask>>1;
  5907. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5908. end;
  5909. else if ( roundingMode <> float_round_to_zero ) begin
  5910. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5911. z.low += roundBitsMask;
  5912. end;
  5913. end;
  5914. z.low = ~ roundBitsMask;
  5915. if ( z.low = 0 ) begin
  5916. ++z.high;
  5917. z.low := LIT64( $8000000000000000 );
  5918. end;
  5919. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5920. result := z;
  5921. end;
  5922. {*----------------------------------------------------------------------------
  5923. | Returns the result of adding the absolute values of the extended double-
  5924. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5925. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5926. | The addition is performed according to the IEC/IEEE Standard for Binary
  5927. | Floating-Point Arithmetic.
  5928. *----------------------------------------------------------------------------*}
  5929. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5930. var
  5931. aExp, bExp, zExp: int32;
  5932. aSig, bSig, zSig0, zSig1: bits64;
  5933. expDiff: int32;
  5934. begin
  5935. aSig := extractFloatx80Frac( a );
  5936. aExp := extractFloatx80Exp( a );
  5937. bSig := extractFloatx80Frac( b );
  5938. bExp := extractFloatx80Exp( b );
  5939. expDiff := aExp - bExp;
  5940. if ( 0 < expDiff ) begin
  5941. if ( aExp = $7FFF ) begin
  5942. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5943. result := a;
  5944. end;
  5945. if ( bExp = 0 ) --expDiff;
  5946. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5947. zExp := aExp;
  5948. end;
  5949. else if ( expDiff < 0 ) begin
  5950. if ( bExp = $7FFF ) begin
  5951. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5952. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5953. end;
  5954. if ( aExp = 0 ) ++expDiff;
  5955. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5956. zExp := bExp;
  5957. end;
  5958. else begin
  5959. if ( aExp = $7FFF ) begin
  5960. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5961. result := propagateFloatx80NaN( a, b );
  5962. end;
  5963. result := a;
  5964. end;
  5965. zSig1 := 0;
  5966. zSig0 := aSig + bSig;
  5967. if ( aExp = 0 ) begin
  5968. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5969. goto roundAndPack;
  5970. end;
  5971. zExp := aExp;
  5972. goto shiftRight1;
  5973. end;
  5974. zSig0 := aSig + bSig;
  5975. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5976. shiftRight1:
  5977. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5978. zSig0 or= LIT64( $8000000000000000 );
  5979. ++zExp;
  5980. roundAndPack:
  5981. result :=
  5982. roundAndPackFloatx80(
  5983. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5984. end;
  5985. {*----------------------------------------------------------------------------
  5986. | Returns the result of subtracting the absolute values of the extended
  5987. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5988. | difference is negated before being returned. `zSign' is ignored if the
  5989. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5990. | Standard for Binary Floating-Point Arithmetic.
  5991. *----------------------------------------------------------------------------*}
  5992. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5993. var
  5994. aExp, bExp, zExp: int32;
  5995. aSig, bSig, zSig0, zSig1: bits64;
  5996. expDiff: int32;
  5997. z: floatx80;
  5998. begin
  5999. aSig := extractFloatx80Frac( a );
  6000. aExp := extractFloatx80Exp( a );
  6001. bSig := extractFloatx80Frac( b );
  6002. bExp := extractFloatx80Exp( b );
  6003. expDiff := aExp - bExp;
  6004. if ( 0 < expDiff ) goto aExpBigger;
  6005. if ( expDiff < 0 ) goto bExpBigger;
  6006. if ( aExp = $7FFF ) begin
  6007. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  6008. result := propagateFloatx80NaN( a, b );
  6009. end;
  6010. float_raise( float_flag_invalid );
  6011. z.low := floatx80_default_nan_low;
  6012. z.high := floatx80_default_nan_high;
  6013. result := z;
  6014. end;
  6015. if ( aExp = 0 ) begin
  6016. aExp := 1;
  6017. bExp := 1;
  6018. end;
  6019. zSig1 := 0;
  6020. if ( bSig < aSig ) goto aBigger;
  6021. if ( aSig < bSig ) goto bBigger;
  6022. result := packFloatx80( softfloat_rounding_mode = float_round_down, 0, 0 );
  6023. bExpBigger:
  6024. if ( bExp = $7FFF ) begin
  6025. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6026. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  6027. end;
  6028. if ( aExp = 0 ) ++expDiff;
  6029. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6030. bBigger:
  6031. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6032. zExp := bExp;
  6033. zSign xor = 1;
  6034. goto normalizeRoundAndPack;
  6035. aExpBigger:
  6036. if ( aExp = $7FFF ) begin
  6037. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6038. result := a;
  6039. end;
  6040. if ( bExp = 0 ) --expDiff;
  6041. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6042. aBigger:
  6043. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6044. zExp := aExp;
  6045. normalizeRoundAndPack:
  6046. result :=
  6047. normalizeRoundAndPackFloatx80(
  6048. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6049. end;
  6050. {*----------------------------------------------------------------------------
  6051. | Returns the result of adding the extended double-precision floating-point
  6052. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6053. | Standard for Binary Floating-Point Arithmetic.
  6054. *----------------------------------------------------------------------------*}
  6055. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6056. var
  6057. aSign, bSign: flag;
  6058. begin
  6059. aSign := extractFloatx80Sign( a );
  6060. bSign := extractFloatx80Sign( b );
  6061. if ( aSign = bSign ) begin
  6062. result := addFloatx80Sigs( a, b, aSign );
  6063. end;
  6064. else begin
  6065. result := subFloatx80Sigs( a, b, aSign );
  6066. end;
  6067. end;
  6068. {*----------------------------------------------------------------------------
  6069. | Returns the result of subtracting the extended double-precision floating-
  6070. | point values `a' and `b'. The operation is performed according to the
  6071. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6072. *----------------------------------------------------------------------------*}
  6073. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6074. var
  6075. aSign, bSign: flag;
  6076. begin
  6077. aSign := extractFloatx80Sign( a );
  6078. bSign := extractFloatx80Sign( b );
  6079. if ( aSign = bSign ) begin
  6080. result := subFloatx80Sigs( a, b, aSign );
  6081. end;
  6082. else begin
  6083. result := addFloatx80Sigs( a, b, aSign );
  6084. end;
  6085. end;
  6086. {*----------------------------------------------------------------------------
  6087. | Returns the result of multiplying the extended double-precision floating-
  6088. | point values `a' and `b'. The operation is performed according to the
  6089. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6090. *----------------------------------------------------------------------------*}
  6091. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6092. var
  6093. aSign, bSign, zSign: flag;
  6094. aExp, bExp, zExp: int32;
  6095. aSig, bSig, zSig0, zSig1: bits64;
  6096. z: floatx80;
  6097. begin
  6098. aSig := extractFloatx80Frac( a );
  6099. aExp := extractFloatx80Exp( a );
  6100. aSign := extractFloatx80Sign( a );
  6101. bSig := extractFloatx80Frac( b );
  6102. bExp := extractFloatx80Exp( b );
  6103. bSign := extractFloatx80Sign( b );
  6104. zSign := aSign xor bSign;
  6105. if ( aExp = $7FFF ) begin
  6106. if ( (bits64) ( aSig shl 1 )
  6107. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6108. result := propagateFloatx80NaN( a, b );
  6109. end;
  6110. if ( ( bExp or bSig ) = 0 ) goto invalid;
  6111. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6112. end;
  6113. if ( bExp = $7FFF ) begin
  6114. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6115. if ( ( aExp or aSig ) = 0 ) begin
  6116. invalid:
  6117. float_raise( float_flag_invalid );
  6118. z.low := floatx80_default_nan_low;
  6119. z.high := floatx80_default_nan_high;
  6120. result := z;
  6121. end;
  6122. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6123. end;
  6124. if ( aExp = 0 ) begin
  6125. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6126. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6127. end;
  6128. if ( bExp = 0 ) begin
  6129. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6130. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6131. end;
  6132. zExp := aExp + bExp - $3FFE;
  6133. mul64To128( aSig, bSig, zSig0, zSig1 );
  6134. if ( 0 < (sbits64) zSig0 ) begin
  6135. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6136. --zExp;
  6137. end;
  6138. result :=
  6139. roundAndPackFloatx80(
  6140. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6141. end;
  6142. {*----------------------------------------------------------------------------
  6143. | Returns the result of dividing the extended double-precision floating-point
  6144. | value `a' by the corresponding value `b'. The operation is performed
  6145. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6146. *----------------------------------------------------------------------------*}
  6147. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6148. var
  6149. aSign, bSign, zSign: flag;
  6150. aExp, bExp, zExp: int32;
  6151. aSig, bSig, zSig0, zSig1: bits64;
  6152. rem0, rem1, rem2, term0, term1, term2: bits64;
  6153. z: floatx80;
  6154. begin
  6155. aSig := extractFloatx80Frac( a );
  6156. aExp := extractFloatx80Exp( a );
  6157. aSign := extractFloatx80Sign( a );
  6158. bSig := extractFloatx80Frac( b );
  6159. bExp := extractFloatx80Exp( b );
  6160. bSign := extractFloatx80Sign( b );
  6161. zSign := aSign xor bSign;
  6162. if ( aExp = $7FFF ) begin
  6163. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6164. if ( bExp = $7FFF ) begin
  6165. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6166. goto invalid;
  6167. end;
  6168. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6169. end;
  6170. if ( bExp = $7FFF ) begin
  6171. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6172. result := packFloatx80( zSign, 0, 0 );
  6173. end;
  6174. if ( bExp = 0 ) begin
  6175. if ( bSig = 0 ) begin
  6176. if ( ( aExp or aSig ) = 0 ) begin
  6177. invalid:
  6178. float_raise( float_flag_invalid );
  6179. z.low := floatx80_default_nan_low;
  6180. z.high := floatx80_default_nan_high;
  6181. result := z;
  6182. end;
  6183. float_raise( float_flag_divbyzero );
  6184. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6185. end;
  6186. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6187. end;
  6188. if ( aExp = 0 ) begin
  6189. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6190. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6191. end;
  6192. zExp := aExp - bExp + $3FFE;
  6193. rem1 := 0;
  6194. if ( bSig <= aSig ) begin
  6195. shift128Right( aSig, 0, 1, aSig, rem1 );
  6196. ++zExp;
  6197. end;
  6198. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6199. mul64To128( bSig, zSig0, term0, term1 );
  6200. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6201. while ( (sbits64) rem0 < 0 ) begin
  6202. --zSig0;
  6203. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6204. end;
  6205. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6206. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6207. mul64To128( bSig, zSig1, term1, term2 );
  6208. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6209. while ( (sbits64) rem1 < 0 ) begin
  6210. --zSig1;
  6211. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6212. end;
  6213. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6214. end;
  6215. result :=
  6216. roundAndPackFloatx80(
  6217. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6218. end;
  6219. {*----------------------------------------------------------------------------
  6220. | Returns the remainder of the extended double-precision floating-point value
  6221. | `a' with respect to the corresponding value `b'. The operation is performed
  6222. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6223. *----------------------------------------------------------------------------*}
  6224. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6225. var
  6226. aSign, bSign, zSign: flag;
  6227. aExp, bExp, expDiff: int32;
  6228. aSig0, aSig1, bSig: bits64;
  6229. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6230. z: floatx80;
  6231. begin
  6232. aSig0 := extractFloatx80Frac( a );
  6233. aExp := extractFloatx80Exp( a );
  6234. aSign := extractFloatx80Sign( a );
  6235. bSig := extractFloatx80Frac( b );
  6236. bExp := extractFloatx80Exp( b );
  6237. bSign := extractFloatx80Sign( b );
  6238. if ( aExp = $7FFF ) begin
  6239. if ( (bits64) ( aSig0 shl 1 )
  6240. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6241. result := propagateFloatx80NaN( a, b );
  6242. end;
  6243. goto invalid;
  6244. end;
  6245. if ( bExp = $7FFF ) begin
  6246. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6247. result := a;
  6248. end;
  6249. if ( bExp = 0 ) begin
  6250. if ( bSig = 0 ) begin
  6251. invalid:
  6252. float_raise( float_flag_invalid );
  6253. z.low := floatx80_default_nan_low;
  6254. z.high := floatx80_default_nan_high;
  6255. result := z;
  6256. end;
  6257. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6258. end;
  6259. if ( aExp = 0 ) begin
  6260. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6261. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6262. end;
  6263. bSig or= LIT64( $8000000000000000 );
  6264. zSign := aSign;
  6265. expDiff := aExp - bExp;
  6266. aSig1 := 0;
  6267. if ( expDiff < 0 ) begin
  6268. if ( expDiff < -1 ) result := a;
  6269. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6270. expDiff := 0;
  6271. end;
  6272. q := ( bSig <= aSig0 );
  6273. if ( q ) aSig0 -= bSig;
  6274. expDiff -= 64;
  6275. while ( 0 < expDiff ) begin
  6276. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6277. q := ( 2 < q ) ? q - 2 : 0;
  6278. mul64To128( bSig, q, term0, term1 );
  6279. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6280. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6281. expDiff -= 62;
  6282. end;
  6283. expDiff += 64;
  6284. if ( 0 < expDiff ) begin
  6285. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6286. q := ( 2 < q ) ? q - 2 : 0;
  6287. q >>= 64 - expDiff;
  6288. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6289. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6290. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6291. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6292. ++q;
  6293. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6294. end;
  6295. end;
  6296. else begin
  6297. term1 := 0;
  6298. term0 := bSig;
  6299. end;
  6300. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6301. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6302. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6303. and ( q and 1 ) )
  6304. ) begin
  6305. aSig0 := alternateASig0;
  6306. aSig1 := alternateASig1;
  6307. zSign := ! zSign;
  6308. end;
  6309. result :=
  6310. normalizeRoundAndPackFloatx80(
  6311. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6312. end;
  6313. {*----------------------------------------------------------------------------
  6314. | Returns the square root of the extended double-precision floating-point
  6315. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6316. | for Binary Floating-Point Arithmetic.
  6317. *----------------------------------------------------------------------------*}
  6318. function floatx80_sqrt(a: floatx80): floatx80;
  6319. var
  6320. aSign: flag;
  6321. aExp, zExp: int32;
  6322. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6323. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6324. z: floatx80;
  6325. label
  6326. invalid;
  6327. begin
  6328. aSig0 := extractFloatx80Frac( a );
  6329. aExp := extractFloatx80Exp( a );
  6330. aSign := extractFloatx80Sign( a );
  6331. if ( aExp = $7FFF ) begin
  6332. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6333. if ( ! aSign ) result := a;
  6334. goto invalid;
  6335. end;
  6336. if ( aSign ) begin
  6337. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6338. invalid:
  6339. float_raise( float_flag_invalid );
  6340. z.low := floatx80_default_nan_low;
  6341. z.high := floatx80_default_nan_high;
  6342. result := z;
  6343. end;
  6344. if ( aExp = 0 ) begin
  6345. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6346. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6347. end;
  6348. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6349. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6350. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6351. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6352. doubleZSig0 := zSig0 shl 1;
  6353. mul64To128( zSig0, zSig0, term0, term1 );
  6354. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6355. while ( (sbits64) rem0 < 0 ) begin
  6356. --zSig0;
  6357. doubleZSig0 -= 2;
  6358. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6359. end;
  6360. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6361. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6362. if ( zSig1 = 0 ) zSig1 := 1;
  6363. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6364. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6365. mul64To128( zSig1, zSig1, term2, term3 );
  6366. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6367. while ( (sbits64) rem1 < 0 ) begin
  6368. --zSig1;
  6369. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6370. term3 or= 1;
  6371. term2 or= doubleZSig0;
  6372. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6373. end;
  6374. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6375. end;
  6376. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6377. zSig0 or= doubleZSig0;
  6378. result :=
  6379. roundAndPackFloatx80(
  6380. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6381. end;
  6382. {*----------------------------------------------------------------------------
  6383. | Returns 1 if the extended double-precision floating-point value `a' is
  6384. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6385. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6386. | Arithmetic.
  6387. *----------------------------------------------------------------------------*}
  6388. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6389. begin
  6390. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6391. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6392. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6393. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6394. ) begin
  6395. if ( floatx80_is_signaling_nan( a )
  6396. or floatx80_is_signaling_nan( b ) ) begin
  6397. float_raise( float_flag_invalid );
  6398. end;
  6399. result := 0;
  6400. end;
  6401. result :=
  6402. ( a.low = b.low )
  6403. and ( ( a.high = b.high )
  6404. or ( ( a.low = 0 )
  6405. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6406. );
  6407. end;
  6408. {*----------------------------------------------------------------------------
  6409. | Returns 1 if the extended double-precision floating-point value `a' is
  6410. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6411. | comparison is performed according to the IEC/IEEE Standard for Binary
  6412. | Floating-Point Arithmetic.
  6413. *----------------------------------------------------------------------------*}
  6414. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6415. var
  6416. aSign, bSign: flag;
  6417. begin
  6418. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6419. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6420. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6421. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6422. ) begin
  6423. float_raise( float_flag_invalid );
  6424. result := 0;
  6425. end;
  6426. aSign := extractFloatx80Sign( a );
  6427. bSign := extractFloatx80Sign( b );
  6428. if ( aSign <> bSign ) begin
  6429. result :=
  6430. aSign
  6431. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6432. = 0 );
  6433. end;
  6434. result :=
  6435. aSign ? le128( b.high, b.low, a.high, a.low )
  6436. : le128( a.high, a.low, b.high, b.low );
  6437. end;
  6438. {*----------------------------------------------------------------------------
  6439. | Returns 1 if the extended double-precision floating-point value `a' is
  6440. | less than the corresponding value `b', and 0 otherwise. The comparison
  6441. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6442. | Arithmetic.
  6443. *----------------------------------------------------------------------------*}
  6444. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6445. var
  6446. aSign, bSign: flag;
  6447. begin
  6448. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6449. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6450. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6451. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6452. ) begin
  6453. float_raise( float_flag_invalid );
  6454. result := 0;
  6455. end;
  6456. aSign := extractFloatx80Sign( a );
  6457. bSign := extractFloatx80Sign( b );
  6458. if ( aSign <> bSign ) begin
  6459. result :=
  6460. aSign
  6461. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6462. <> 0 );
  6463. end;
  6464. result :=
  6465. aSign ? lt128( b.high, b.low, a.high, a.low )
  6466. : lt128( a.high, a.low, b.high, b.low );
  6467. end;
  6468. {*----------------------------------------------------------------------------
  6469. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6470. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6471. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6472. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6473. *----------------------------------------------------------------------------*}
  6474. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6475. begin
  6476. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6477. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6478. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6479. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6480. ) begin
  6481. float_raise( float_flag_invalid );
  6482. result := 0;
  6483. end;
  6484. result :=
  6485. ( a.low = b.low )
  6486. and ( ( a.high = b.high )
  6487. or ( ( a.low = 0 )
  6488. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6489. );
  6490. end;
  6491. {*----------------------------------------------------------------------------
  6492. | Returns 1 if the extended double-precision floating-point value `a' is less
  6493. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6494. | do not cause an exception. Otherwise, the comparison is performed according
  6495. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6496. *----------------------------------------------------------------------------*}
  6497. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6498. var
  6499. aSign, bSign: flag;
  6500. begin
  6501. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6502. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6503. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6504. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6505. ) begin
  6506. if ( floatx80_is_signaling_nan( a )
  6507. or floatx80_is_signaling_nan( b ) ) begin
  6508. float_raise( float_flag_invalid );
  6509. end;
  6510. result := 0;
  6511. end;
  6512. aSign := extractFloatx80Sign( a );
  6513. bSign := extractFloatx80Sign( b );
  6514. if ( aSign <> bSign ) begin
  6515. result :=
  6516. aSign
  6517. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6518. = 0 );
  6519. end;
  6520. result :=
  6521. aSign ? le128( b.high, b.low, a.high, a.low )
  6522. : le128( a.high, a.low, b.high, b.low );
  6523. end;
  6524. {*----------------------------------------------------------------------------
  6525. | Returns 1 if the extended double-precision floating-point value `a' is less
  6526. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6527. | an exception. Otherwise, the comparison is performed according to the
  6528. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6529. *----------------------------------------------------------------------------*}
  6530. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6531. var
  6532. aSign, bSign: flag;
  6533. begin
  6534. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6535. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6536. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6537. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6538. ) begin
  6539. if ( floatx80_is_signaling_nan( a )
  6540. or floatx80_is_signaling_nan( b ) ) begin
  6541. float_raise( float_flag_invalid );
  6542. end;
  6543. result := 0;
  6544. end;
  6545. aSign := extractFloatx80Sign( a );
  6546. bSign := extractFloatx80Sign( b );
  6547. if ( aSign <> bSign ) begin
  6548. result :=
  6549. aSign
  6550. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6551. <> 0 );
  6552. end;
  6553. result :=
  6554. aSign ? lt128( b.high, b.low, a.high, a.low )
  6555. : lt128( a.high, a.low, b.high, b.low );
  6556. end;
  6557. {$endif FPC_SOFTFLOAT_FLOATX80}
  6558. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6559. {*----------------------------------------------------------------------------
  6560. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6561. | floating-point value `a'.
  6562. *----------------------------------------------------------------------------*}
  6563. function extractFloat128Frac1(a : float128): bits64;
  6564. begin
  6565. result:=a.low;
  6566. end;
  6567. {*----------------------------------------------------------------------------
  6568. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6569. | floating-point value `a'.
  6570. *----------------------------------------------------------------------------*}
  6571. function extractFloat128Frac0(a : float128): bits64;
  6572. begin
  6573. result:=a.high and int64($0000FFFFFFFFFFFF);
  6574. end;
  6575. {*----------------------------------------------------------------------------
  6576. | Returns the exponent bits of the quadruple-precision floating-point value
  6577. | `a'.
  6578. *----------------------------------------------------------------------------*}
  6579. function extractFloat128Exp(a : float128): int32;
  6580. begin
  6581. result:=( a.high shr 48 ) and $7FFF;
  6582. end;
  6583. {*----------------------------------------------------------------------------
  6584. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6585. *----------------------------------------------------------------------------*}
  6586. function extractFloat128Sign(a : float128): flag;
  6587. begin
  6588. result:=a.high shr 63;
  6589. end;
  6590. {*----------------------------------------------------------------------------
  6591. | Normalizes the subnormal quadruple-precision floating-point value
  6592. | represented by the denormalized significand formed by the concatenation of
  6593. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6594. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6595. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6596. | least significant 64 bits of the normalized significand are stored at the
  6597. | location pointed to by `zSig1Ptr'.
  6598. *----------------------------------------------------------------------------*}
  6599. procedure normalizeFloat128Subnormal(
  6600. aSig0: bits64;
  6601. aSig1: bits64;
  6602. var zExpPtr: int32;
  6603. var zSig0Ptr: bits64;
  6604. var zSig1Ptr: bits64);
  6605. var
  6606. shiftCount: int8;
  6607. begin
  6608. if ( aSig0 = 0 ) then
  6609. begin
  6610. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6611. if ( shiftCount < 0 ) then
  6612. begin
  6613. zSig0Ptr := aSig1 shr ( - shiftCount );
  6614. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6615. end
  6616. else begin
  6617. zSig0Ptr := aSig1 shl shiftCount;
  6618. zSig1Ptr := 0;
  6619. end;
  6620. zExpPtr := - shiftCount - 63;
  6621. end
  6622. else begin
  6623. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6624. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6625. zExpPtr := 1 - shiftCount;
  6626. end;
  6627. end;
  6628. {*----------------------------------------------------------------------------
  6629. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6630. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6631. | floating-point value, returning the result. After being shifted into the
  6632. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6633. | added together to form the most significant 32 bits of the result. This
  6634. | means that any integer portion of `zSig0' will be added into the exponent.
  6635. | Since a properly normalized significand will have an integer portion equal
  6636. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6637. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6638. | significand.
  6639. *----------------------------------------------------------------------------*}
  6640. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6641. var
  6642. z: float128;
  6643. begin
  6644. z.low := zSig1;
  6645. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6646. result:=z;
  6647. end;
  6648. {*----------------------------------------------------------------------------
  6649. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6650. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6651. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6652. | corresponding to the abstract input. Ordinarily, the abstract value is
  6653. | simply rounded and packed into the quadruple-precision format, with the
  6654. | inexact exception raised if the abstract input cannot be represented
  6655. | exactly. However, if the abstract value is too large, the overflow and
  6656. | inexact exceptions are raised and an infinity or maximal finite value is
  6657. | returned. If the abstract value is too small, the input value is rounded to
  6658. | a subnormal number, and the underflow and inexact exceptions are raised if
  6659. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6660. | precision floating-point number.
  6661. | The input significand must be normalized or smaller. If the input
  6662. | significand is not normalized, `zExp' must be 0; in that case, the result
  6663. | returned is a subnormal number, and it must not require rounding. In the
  6664. | usual case that the input significand is normalized, `zExp' must be 1 less
  6665. | than the ``true'' floating-point exponent. The handling of underflow and
  6666. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6667. *----------------------------------------------------------------------------*}
  6668. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6669. var
  6670. roundingMode: int8;
  6671. roundNearestEven, increment, isTiny: flag;
  6672. begin
  6673. roundingMode := softfloat_rounding_mode;
  6674. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6675. increment := ord( sbits64(zSig2) < 0 );
  6676. if ( roundNearestEven=0 ) then
  6677. begin
  6678. if ( roundingMode = float_round_to_zero ) then
  6679. begin
  6680. increment := 0;
  6681. end
  6682. else begin
  6683. if ( zSign<>0 ) then
  6684. begin
  6685. increment := ord( roundingMode = float_round_down ) and zSig2;
  6686. end
  6687. else begin
  6688. increment := ord( roundingMode = float_round_up ) and zSig2;
  6689. end;
  6690. end;
  6691. end;
  6692. if ( $7FFD <= bits32(zExp) ) then
  6693. begin
  6694. if ( ord( $7FFD < zExp )
  6695. or ( ord( zExp = $7FFD )
  6696. and eq128(
  6697. int64( $0001FFFFFFFFFFFF ),
  6698. int64( $FFFFFFFFFFFFFFFF ),
  6699. zSig0,
  6700. zSig1
  6701. )
  6702. and increment
  6703. )
  6704. )<>0 then
  6705. begin
  6706. float_raise( float_flag_overflow or float_flag_inexact );
  6707. if ( ord( roundingMode = float_round_to_zero )
  6708. or ( zSign and ord( roundingMode = float_round_up ) )
  6709. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6710. )<>0 then
  6711. begin
  6712. result :=
  6713. packFloat128(
  6714. zSign,
  6715. $7FFE,
  6716. int64( $0000FFFFFFFFFFFF ),
  6717. int64( $FFFFFFFFFFFFFFFF )
  6718. );
  6719. end;
  6720. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6721. end;
  6722. if ( zExp < 0 ) then
  6723. begin
  6724. isTiny :=
  6725. ord(( float_detect_tininess = float_tininess_before_rounding )
  6726. or ( zExp < -1 )
  6727. or not( increment<>0 )
  6728. or boolean(lt128(
  6729. zSig0,
  6730. zSig1,
  6731. int64( $0001FFFFFFFFFFFF ),
  6732. int64( $FFFFFFFFFFFFFFFF )
  6733. )));
  6734. shift128ExtraRightJamming(
  6735. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6736. zExp := 0;
  6737. if ( isTiny and zSig2 )<>0 then
  6738. float_raise( float_flag_underflow );
  6739. if ( roundNearestEven<>0 ) then
  6740. begin
  6741. increment := ord( sbits64(zSig2) < 0 );
  6742. end
  6743. else begin
  6744. if ( zSign<>0 ) then
  6745. begin
  6746. increment := ord( roundingMode = float_round_down ) and zSig2;
  6747. end
  6748. else begin
  6749. increment := ord( roundingMode = float_round_up ) and zSig2;
  6750. end;
  6751. end;
  6752. end;
  6753. end;
  6754. if ( zSig2<>0 ) then
  6755. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6756. if ( increment<>0 ) then
  6757. begin
  6758. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6759. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6760. end
  6761. else begin
  6762. if ( ( zSig0 or zSig1 ) = 0 ) then
  6763. zExp := 0;
  6764. end;
  6765. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6766. end;
  6767. {*----------------------------------------------------------------------------
  6768. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6769. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6770. | returns the proper quadruple-precision floating-point value corresponding
  6771. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6772. | except that the input significand has fewer bits and does not have to be
  6773. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6774. | point exponent.
  6775. *----------------------------------------------------------------------------*}
  6776. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6777. var
  6778. shiftCount: int8;
  6779. zSig2: bits64;
  6780. begin
  6781. if ( zSig0 = 0 ) then
  6782. begin
  6783. zSig0 := zSig1;
  6784. zSig1 := 0;
  6785. dec(zExp, 64);
  6786. end;
  6787. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6788. if ( 0 <= shiftCount ) then
  6789. begin
  6790. zSig2 := 0;
  6791. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6792. end
  6793. else begin
  6794. shift128ExtraRightJamming(
  6795. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6796. end;
  6797. dec(zExp, shiftCount);
  6798. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6799. end;
  6800. {*----------------------------------------------------------------------------
  6801. | Returns the result of converting the quadruple-precision floating-point
  6802. | value `a' to the 32-bit two's complement integer format. The conversion
  6803. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6804. | Arithmetic---which means in particular that the conversion is rounded
  6805. | according to the current rounding mode. If `a' is a NaN, the largest
  6806. | positive integer is returned. Otherwise, if the conversion overflows, the
  6807. | largest integer with the same sign as `a' is returned.
  6808. *----------------------------------------------------------------------------*}
  6809. function float128_to_int32(a: float128): int32;
  6810. var
  6811. aSign: flag;
  6812. aExp, shiftCount: int32;
  6813. aSig0, aSig1: bits64;
  6814. begin
  6815. aSig1 := extractFloat128Frac1( a );
  6816. aSig0 := extractFloat128Frac0( a );
  6817. aExp := extractFloat128Exp( a );
  6818. aSign := extractFloat128Sign( a );
  6819. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6820. aSign := 0;
  6821. if ( aExp<>0 ) then
  6822. aSig0 := aSig0 or int64( $0001000000000000 );
  6823. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6824. shiftCount := $4028 - aExp;
  6825. if ( 0 < shiftCount ) then
  6826. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6827. result := roundAndPackInt32( aSign, aSig0 );
  6828. end;
  6829. {*----------------------------------------------------------------------------
  6830. | Returns the result of converting the quadruple-precision floating-point
  6831. | value `a' to the 32-bit two's complement integer format. The conversion
  6832. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6833. | Arithmetic, except that the conversion is always rounded toward zero. If
  6834. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6835. | conversion overflows, the largest integer with the same sign as `a' is
  6836. | returned.
  6837. *----------------------------------------------------------------------------*}
  6838. function float128_to_int32_round_to_zero(a: float128): int32;
  6839. var
  6840. aSign: flag;
  6841. aExp, shiftCount: int32;
  6842. aSig0, aSig1, savedASig: bits64;
  6843. z: int32;
  6844. label
  6845. invalid;
  6846. begin
  6847. aSig1 := extractFloat128Frac1( a );
  6848. aSig0 := extractFloat128Frac0( a );
  6849. aExp := extractFloat128Exp( a );
  6850. aSign := extractFloat128Sign( a );
  6851. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6852. if ( $401E < aExp ) then
  6853. begin
  6854. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6855. aSign := 0;
  6856. goto invalid;
  6857. end
  6858. else if ( aExp < $3FFF ) then
  6859. begin
  6860. if ( aExp or aSig0 )<>0 then
  6861. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6862. result := 0;
  6863. exit;
  6864. end;
  6865. aSig0 := aSig0 or int64( $0001000000000000 );
  6866. shiftCount := $402F - aExp;
  6867. savedASig := aSig0;
  6868. aSig0 := aSig0 shr shiftCount;
  6869. z := aSig0;
  6870. if ( aSign )<>0 then
  6871. z := - z;
  6872. if ( ord( z < 0 ) xor aSign )<>0 then
  6873. begin
  6874. invalid:
  6875. float_raise( float_flag_invalid );
  6876. if aSign<>0 then
  6877. result:=$80000000
  6878. else
  6879. result:=$7FFFFFFF;
  6880. exit;
  6881. end;
  6882. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6883. begin
  6884. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6885. end;
  6886. result := z;
  6887. end;
  6888. {*----------------------------------------------------------------------------
  6889. | Returns the result of converting the quadruple-precision floating-point
  6890. | value `a' to the 64-bit two's complement integer format. The conversion
  6891. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6892. | Arithmetic---which means in particular that the conversion is rounded
  6893. | according to the current rounding mode. If `a' is a NaN, the largest
  6894. | positive integer is returned. Otherwise, if the conversion overflows, the
  6895. | largest integer with the same sign as `a' is returned.
  6896. *----------------------------------------------------------------------------*}
  6897. function float128_to_int64(a: float128): int64;
  6898. var
  6899. aSign: flag;
  6900. aExp, shiftCount: int32;
  6901. aSig0, aSig1: bits64;
  6902. begin
  6903. aSig1 := extractFloat128Frac1( a );
  6904. aSig0 := extractFloat128Frac0( a );
  6905. aExp := extractFloat128Exp( a );
  6906. aSign := extractFloat128Sign( a );
  6907. if ( aExp<>0 ) then
  6908. aSig0 := aSig0 or int64( $0001000000000000 );
  6909. shiftCount := $402F - aExp;
  6910. if ( shiftCount <= 0 ) then
  6911. begin
  6912. if ( $403E < aExp ) then
  6913. begin
  6914. float_raise( float_flag_invalid );
  6915. if ( (aSign=0)
  6916. or ( ( aExp = $7FFF )
  6917. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6918. )
  6919. ) then
  6920. begin
  6921. result := int64( $7FFFFFFFFFFFFFFF );
  6922. end;
  6923. result := int64( $8000000000000000 );
  6924. end;
  6925. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6926. end
  6927. else begin
  6928. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6929. end;
  6930. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6931. end;
  6932. {*----------------------------------------------------------------------------
  6933. | Returns the result of converting the quadruple-precision floating-point
  6934. | value `a' to the 64-bit two's complement integer format. The conversion
  6935. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6936. | Arithmetic, except that the conversion is always rounded toward zero.
  6937. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6938. | the conversion overflows, the largest integer with the same sign as `a' is
  6939. | returned.
  6940. *----------------------------------------------------------------------------*}
  6941. function float128_to_int64_round_to_zero(a: float128): int64;
  6942. var
  6943. aSign: flag;
  6944. aExp, shiftCount: int32;
  6945. aSig0, aSig1: bits64;
  6946. z: int64;
  6947. begin
  6948. aSig1 := extractFloat128Frac1( a );
  6949. aSig0 := extractFloat128Frac0( a );
  6950. aExp := extractFloat128Exp( a );
  6951. aSign := extractFloat128Sign( a );
  6952. if ( aExp<>0 ) then
  6953. aSig0 := aSig0 or int64( $0001000000000000 );
  6954. shiftCount := aExp - $402F;
  6955. if ( 0 < shiftCount ) then
  6956. begin
  6957. if ( $403E <= aExp ) then
  6958. begin
  6959. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6960. if ( ( a.high = int64( $C03E000000000000 ) )
  6961. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6962. begin
  6963. if ( aSig1<>0 ) then
  6964. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6965. end
  6966. else begin
  6967. float_raise( float_flag_invalid );
  6968. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6969. begin
  6970. result := int64( $7FFFFFFFFFFFFFFF );
  6971. exit;
  6972. end;
  6973. end;
  6974. result := int64( $8000000000000000 );
  6975. exit;
  6976. end;
  6977. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6978. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6979. begin
  6980. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6981. end;
  6982. end
  6983. else begin
  6984. if ( aExp < $3FFF ) then
  6985. begin
  6986. if ( aExp or aSig0 or aSig1 )<>0 then
  6987. begin
  6988. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6989. end;
  6990. result := 0;
  6991. exit;
  6992. end;
  6993. z := aSig0 shr ( - shiftCount );
  6994. if ( (aSig1<>0)
  6995. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6996. begin
  6997. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6998. end;
  6999. end;
  7000. if ( aSign<>0 ) then
  7001. z := - z;
  7002. result := z;
  7003. end;
  7004. {*----------------------------------------------------------------------------
  7005. | Returns the result of converting the quadruple-precision floating-point
  7006. | value `a' to the single-precision floating-point format. The conversion
  7007. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7008. | Arithmetic.
  7009. *----------------------------------------------------------------------------*}
  7010. function float128_to_float32(a: float128): float32;
  7011. var
  7012. aSign: flag;
  7013. aExp: int32;
  7014. aSig0, aSig1: bits64;
  7015. zSig: bits32;
  7016. begin
  7017. aSig1 := extractFloat128Frac1( a );
  7018. aSig0 := extractFloat128Frac0( a );
  7019. aExp := extractFloat128Exp( a );
  7020. aSign := extractFloat128Sign( a );
  7021. if ( aExp = $7FFF ) then
  7022. begin
  7023. if ( aSig0 or aSig1 )<>0 then
  7024. begin
  7025. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7026. exit;
  7027. end;
  7028. result := packFloat32( aSign, $FF, 0 );
  7029. exit;
  7030. end;
  7031. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7032. shift64RightJamming( aSig0, 18, aSig0 );
  7033. zSig := aSig0;
  7034. if ( aExp or zSig )<>0 then
  7035. begin
  7036. zSig := zSig or $40000000;
  7037. dec(aExp,$3F81);
  7038. end;
  7039. result := roundAndPackFloat32( aSign, aExp, zSig );
  7040. end;
  7041. {*----------------------------------------------------------------------------
  7042. | Returns the result of converting the quadruple-precision floating-point
  7043. | value `a' to the double-precision floating-point format. The conversion
  7044. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7045. | Arithmetic.
  7046. *----------------------------------------------------------------------------*}
  7047. function float128_to_float64(a: float128): float64;
  7048. var
  7049. aSign: flag;
  7050. aExp: int32;
  7051. aSig0, aSig1: bits64;
  7052. begin
  7053. aSig1 := extractFloat128Frac1( a );
  7054. aSig0 := extractFloat128Frac0( a );
  7055. aExp := extractFloat128Exp( a );
  7056. aSign := extractFloat128Sign( a );
  7057. if ( aExp = $7FFF ) then
  7058. begin
  7059. if ( aSig0 or aSig1 )<>0 then
  7060. begin
  7061. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7062. exit;
  7063. end;
  7064. result:=packFloat64( aSign, $7FF, 0);
  7065. exit;
  7066. end;
  7067. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7068. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7069. if ( aExp or aSig0 )<>0 then
  7070. begin
  7071. aSig0 := aSig0 or int64( $4000000000000000 );
  7072. dec(aExp,$3C01);
  7073. end;
  7074. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7075. end;
  7076. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7077. {*----------------------------------------------------------------------------
  7078. | Returns the result of converting the quadruple-precision floating-point
  7079. | value `a' to the extended double-precision floating-point format. The
  7080. | conversion is performed according to the IEC/IEEE Standard for Binary
  7081. | Floating-Point Arithmetic.
  7082. *----------------------------------------------------------------------------*}
  7083. function float128_to_floatx80(a: float128): floatx80;
  7084. var
  7085. aSign: flag;
  7086. aExp: int32;
  7087. aSig0, aSig1: bits64;
  7088. begin
  7089. aSig1 := extractFloat128Frac1( a );
  7090. aSig0 := extractFloat128Frac0( a );
  7091. aExp := extractFloat128Exp( a );
  7092. aSign := extractFloat128Sign( a );
  7093. if ( aExp = $7FFF ) begin
  7094. if ( aSig0 or aSig1 ) begin
  7095. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7096. exit;
  7097. end;
  7098. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  7099. exit;
  7100. end;
  7101. if ( aExp = 0 ) begin
  7102. if ( ( aSig0 or aSig1 ) = 0 ) then
  7103. begin
  7104. result := packFloatx80( aSign, 0, 0 );
  7105. exit;
  7106. end;
  7107. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7108. end;
  7109. else begin
  7110. aSig0 or= int64( $0001000000000000 );
  7111. end;
  7112. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7113. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7114. end;
  7115. {$endif FPC_SOFTFLOAT_FLOATX80}
  7116. {*----------------------------------------------------------------------------
  7117. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7118. | Returns the result as a quadruple-precision floating-point value. The
  7119. | operation is performed according to the IEC/IEEE Standard for Binary
  7120. | Floating-Point Arithmetic.
  7121. *----------------------------------------------------------------------------*}
  7122. function float128_round_to_int(a: float128): float128;
  7123. var
  7124. aSign: flag;
  7125. aExp: int32;
  7126. lastBitMask, roundBitsMask: bits64;
  7127. roundingMode: int8;
  7128. z: float128;
  7129. begin
  7130. aExp := extractFloat128Exp( a );
  7131. if ( $402F <= aExp ) then
  7132. begin
  7133. if ( $406F <= aExp ) then
  7134. begin
  7135. if ( ( aExp = $7FFF )
  7136. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7137. ) then
  7138. begin
  7139. result := propagateFloat128NaN( a, a );
  7140. exit;
  7141. end;
  7142. result := a;
  7143. exit;
  7144. end;
  7145. lastBitMask := 1;
  7146. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7147. roundBitsMask := lastBitMask - 1;
  7148. z := a;
  7149. roundingMode := softfloat_rounding_mode;
  7150. if ( roundingMode = float_round_nearest_even ) then
  7151. begin
  7152. if ( lastBitMask )<>0 then
  7153. begin
  7154. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7155. if ( ( z.low and roundBitsMask ) = 0 ) then
  7156. z.low := z.low and not(lastBitMask);
  7157. end
  7158. else begin
  7159. if ( sbits64(z.low) < 0 ) then
  7160. begin
  7161. inc(z.high);
  7162. if ( bits64( z.low shl 1 ) = 0 ) then
  7163. z.high := z.high and not(1);
  7164. end;
  7165. end;
  7166. end
  7167. else if ( roundingMode <> float_round_to_zero ) then
  7168. begin
  7169. if ( extractFloat128Sign( z )
  7170. xor ord( roundingMode = float_round_up ) )<>0 then
  7171. begin
  7172. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7173. end;
  7174. end;
  7175. z.low := z.low and not(roundBitsMask);
  7176. end
  7177. else begin
  7178. if ( aExp < $3FFF ) then
  7179. begin
  7180. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7181. begin
  7182. result := a;
  7183. exit;
  7184. end;
  7185. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7186. aSign := extractFloat128Sign( a );
  7187. case softfloat_rounding_mode of
  7188. float_round_nearest_even:
  7189. if ( ( aExp = $3FFE )
  7190. and ( (extractFloat128Frac0( a )<>0)
  7191. or (extractFloat128Frac1( a )<>0) )
  7192. ) then begin
  7193. begin
  7194. result := packFloat128( aSign, $3FFF, 0, 0 );
  7195. exit;
  7196. end;
  7197. end;
  7198. float_round_down:
  7199. begin
  7200. if aSign<>0 then
  7201. result:=packFloat128( 1, $3FFF, 0, 0 )
  7202. else
  7203. result:=packFloat128( 0, 0, 0, 0 );
  7204. exit;
  7205. end;
  7206. float_round_up:
  7207. begin
  7208. if aSign<>0 then
  7209. result := packFloat128( 1, 0, 0, 0 )
  7210. else
  7211. result:=packFloat128( 0, $3FFF, 0, 0 );
  7212. exit;
  7213. end;
  7214. end;
  7215. result := packFloat128( aSign, 0, 0, 0 );
  7216. exit;
  7217. end;
  7218. lastBitMask := 1;
  7219. lastBitMask := lastBitMask shl ($402F - aExp);
  7220. roundBitsMask := lastBitMask - 1;
  7221. z.low := 0;
  7222. z.high := a.high;
  7223. roundingMode := softfloat_rounding_mode;
  7224. if ( roundingMode = float_round_nearest_even ) then begin
  7225. inc(z.high,lastBitMask shr 1);
  7226. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7227. z.high := z.high and not(lastBitMask);
  7228. end;
  7229. end
  7230. else if ( roundingMode <> float_round_to_zero ) then begin
  7231. if ( (extractFloat128Sign( z )<>0)
  7232. xor ( roundingMode = float_round_up ) ) then begin
  7233. z.high := z.high or ord( a.low <> 0 );
  7234. z.high := z.high+roundBitsMask;
  7235. end;
  7236. end;
  7237. z.high := z.high and not(roundBitsMask);
  7238. end;
  7239. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7240. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7241. end;
  7242. result := z;
  7243. end;
  7244. {*----------------------------------------------------------------------------
  7245. | Returns the result of adding the absolute values of the quadruple-precision
  7246. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7247. | before being returned. `zSign' is ignored if the result is a NaN.
  7248. | The addition is performed according to the IEC/IEEE Standard for Binary
  7249. | Floating-Point Arithmetic.
  7250. *----------------------------------------------------------------------------*}
  7251. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7252. var
  7253. aExp, bExp, zExp: int32;
  7254. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7255. expDiff: int32;
  7256. label
  7257. shiftRight1,roundAndPack;
  7258. begin
  7259. aSig1 := extractFloat128Frac1( a );
  7260. aSig0 := extractFloat128Frac0( a );
  7261. aExp := extractFloat128Exp( a );
  7262. bSig1 := extractFloat128Frac1( b );
  7263. bSig0 := extractFloat128Frac0( b );
  7264. bExp := extractFloat128Exp( b );
  7265. expDiff := aExp - bExp;
  7266. if ( 0 < expDiff ) then begin
  7267. if ( aExp = $7FFF ) then begin
  7268. if ( aSig0 or aSig1 )<>0 then
  7269. begin
  7270. result := propagateFloat128NaN( a, b );
  7271. exit;
  7272. end;
  7273. result := a;
  7274. exit;
  7275. end;
  7276. if ( bExp = 0 ) then begin
  7277. dec(expDiff);
  7278. end
  7279. else begin
  7280. bSig0 := bSig0 or int64( $0001000000000000 );
  7281. end;
  7282. shift128ExtraRightJamming(
  7283. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7284. zExp := aExp;
  7285. end
  7286. else if ( expDiff < 0 ) then begin
  7287. if ( bExp = $7FFF ) then begin
  7288. if ( bSig0 or bSig1 )<>0 then
  7289. begin
  7290. result := propagateFloat128NaN( a, b );
  7291. exit;
  7292. end;
  7293. result := packFloat128( zSign, $7FFF, 0, 0 );
  7294. exit;
  7295. end;
  7296. if ( aExp = 0 ) then begin
  7297. inc(expDiff);
  7298. end
  7299. else begin
  7300. aSig0 := aSig0 or int64( $0001000000000000 );
  7301. end;
  7302. shift128ExtraRightJamming(
  7303. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7304. zExp := bExp;
  7305. end
  7306. else begin
  7307. if ( aExp = $7FFF ) then begin
  7308. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7309. result := propagateFloat128NaN( a, b );
  7310. exit;
  7311. end;
  7312. result := a;
  7313. exit;
  7314. end;
  7315. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7316. if ( aExp = 0 ) then
  7317. begin
  7318. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7319. exit;
  7320. end;
  7321. zSig2 := 0;
  7322. zSig0 := zSig0 or int64( $0002000000000000 );
  7323. zExp := aExp;
  7324. goto shiftRight1;
  7325. end;
  7326. aSig0 := aSig0 or int64( $0001000000000000 );
  7327. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7328. dec(zExp);
  7329. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7330. inc(zExp);
  7331. shiftRight1:
  7332. shift128ExtraRightJamming(
  7333. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7334. roundAndPack:
  7335. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7336. end;
  7337. {*----------------------------------------------------------------------------
  7338. | Returns the result of subtracting the absolute values of the quadruple-
  7339. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7340. | difference is negated before being returned. `zSign' is ignored if the
  7341. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7342. | Standard for Binary Floating-Point Arithmetic.
  7343. *----------------------------------------------------------------------------*}
  7344. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7345. var
  7346. aExp, bExp, zExp: int32;
  7347. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7348. expDiff: int32;
  7349. z: float128;
  7350. label
  7351. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7352. begin
  7353. aSig1 := extractFloat128Frac1( a );
  7354. aSig0 := extractFloat128Frac0( a );
  7355. aExp := extractFloat128Exp( a );
  7356. bSig1 := extractFloat128Frac1( b );
  7357. bSig0 := extractFloat128Frac0( b );
  7358. bExp := extractFloat128Exp( b );
  7359. expDiff := aExp - bExp;
  7360. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7361. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7362. if ( 0 < expDiff ) then goto aExpBigger;
  7363. if ( expDiff < 0 ) then goto bExpBigger;
  7364. if ( aExp = $7FFF ) then begin
  7365. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7366. result := propagateFloat128NaN( a, b );
  7367. exit;
  7368. end;
  7369. float_raise( float_flag_invalid );
  7370. z.low := float128_default_nan_low;
  7371. z.high := float128_default_nan_high;
  7372. result := z;
  7373. exit;
  7374. end;
  7375. if ( aExp = 0 ) then begin
  7376. aExp := 1;
  7377. bExp := 1;
  7378. end;
  7379. if ( bSig0 < aSig0 ) then goto aBigger;
  7380. if ( aSig0 < bSig0 ) then goto bBigger;
  7381. if ( bSig1 < aSig1 ) then goto aBigger;
  7382. if ( aSig1 < bSig1 ) then goto bBigger;
  7383. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  7384. exit;
  7385. bExpBigger:
  7386. if ( bExp = $7FFF ) then begin
  7387. if ( bSig0 or bSig1 )<>0 then
  7388. begin
  7389. result := propagateFloat128NaN( a, b );
  7390. exit;
  7391. end;
  7392. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7393. exit;
  7394. end;
  7395. if ( aExp = 0 ) then begin
  7396. inc(expDiff);
  7397. end
  7398. else begin
  7399. aSig0 := aSig0 or int64( $4000000000000000 );
  7400. end;
  7401. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7402. bSig0 := bSig0 or int64( $4000000000000000 );
  7403. bBigger:
  7404. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7405. zExp := bExp;
  7406. zSign := zSign xor 1;
  7407. goto normalizeRoundAndPack;
  7408. aExpBigger:
  7409. if ( aExp = $7FFF ) then begin
  7410. if ( aSig0 or aSig1 )<>0 then
  7411. begin
  7412. result := propagateFloat128NaN( a, b );
  7413. exit;
  7414. end;
  7415. result := a;
  7416. exit;
  7417. end;
  7418. if ( bExp = 0 ) then begin
  7419. dec(expDiff);
  7420. end
  7421. else begin
  7422. bSig0 := bSig0 or int64( $4000000000000000 );
  7423. end;
  7424. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7425. aSig0 := aSig0 or int64( $4000000000000000 );
  7426. aBigger:
  7427. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7428. zExp := aExp;
  7429. normalizeRoundAndPack:
  7430. dec(zExp);
  7431. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7432. end;
  7433. {*----------------------------------------------------------------------------
  7434. | Returns the result of adding the quadruple-precision floating-point values
  7435. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7436. | for Binary Floating-Point Arithmetic.
  7437. *----------------------------------------------------------------------------*}
  7438. function float128_add(a: float128; b: float128): float128;
  7439. var
  7440. aSign, bSign: flag;
  7441. begin
  7442. aSign := extractFloat128Sign( a );
  7443. bSign := extractFloat128Sign( b );
  7444. if ( aSign = bSign ) then begin
  7445. result := addFloat128Sigs( a, b, aSign );
  7446. end
  7447. else begin
  7448. result := subFloat128Sigs( a, b, aSign );
  7449. end;
  7450. end;
  7451. {*----------------------------------------------------------------------------
  7452. | Returns the result of subtracting the quadruple-precision floating-point
  7453. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7454. | Standard for Binary Floating-Point Arithmetic.
  7455. *----------------------------------------------------------------------------*}
  7456. function float128_sub(a: float128; b: float128): float128;
  7457. var
  7458. aSign, bSign: flag;
  7459. begin
  7460. aSign := extractFloat128Sign( a );
  7461. bSign := extractFloat128Sign( b );
  7462. if ( aSign = bSign ) then begin
  7463. result := subFloat128Sigs( a, b, aSign );
  7464. end
  7465. else begin
  7466. result := addFloat128Sigs( a, b, aSign );
  7467. end;
  7468. end;
  7469. {*----------------------------------------------------------------------------
  7470. | Returns the result of multiplying the quadruple-precision floating-point
  7471. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7472. | Standard for Binary Floating-Point Arithmetic.
  7473. *----------------------------------------------------------------------------*}
  7474. function float128_mul(a: float128; b: float128): float128;
  7475. var
  7476. aSign, bSign, zSign: flag;
  7477. aExp, bExp, zExp: int32;
  7478. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7479. z: float128;
  7480. label
  7481. invalid;
  7482. begin
  7483. aSig1 := extractFloat128Frac1( a );
  7484. aSig0 := extractFloat128Frac0( a );
  7485. aExp := extractFloat128Exp( a );
  7486. aSign := extractFloat128Sign( a );
  7487. bSig1 := extractFloat128Frac1( b );
  7488. bSig0 := extractFloat128Frac0( b );
  7489. bExp := extractFloat128Exp( b );
  7490. bSign := extractFloat128Sign( b );
  7491. zSign := aSign xor bSign;
  7492. if ( aExp = $7FFF ) then begin
  7493. if ( (( aSig0 or aSig1 )<>0)
  7494. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7495. result := propagateFloat128NaN( a, b );
  7496. exit;
  7497. end;
  7498. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7499. result := packFloat128( zSign, $7FFF, 0, 0 );
  7500. exit;
  7501. end;
  7502. if ( bExp = $7FFF ) then begin
  7503. if ( bSig0 or bSig1 )<>0 then
  7504. begin
  7505. result := propagateFloat128NaN( a, b );
  7506. exit;
  7507. end;
  7508. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7509. invalid:
  7510. float_raise( float_flag_invalid );
  7511. z.low := float128_default_nan_low;
  7512. z.high := float128_default_nan_high;
  7513. result := z;
  7514. exit;
  7515. end;
  7516. result := packFloat128( zSign, $7FFF, 0, 0 );
  7517. exit;
  7518. end;
  7519. if ( aExp = 0 ) then begin
  7520. if ( ( aSig0 or aSig1 ) = 0 ) then
  7521. begin
  7522. result := packFloat128( zSign, 0, 0, 0 );
  7523. exit;
  7524. end;
  7525. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7526. end;
  7527. if ( bExp = 0 ) then begin
  7528. if ( ( bSig0 or bSig1 ) = 0 ) then
  7529. begin
  7530. result := packFloat128( zSign, 0, 0, 0 );
  7531. exit;
  7532. end;
  7533. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7534. end;
  7535. zExp := aExp + bExp - $4000;
  7536. aSig0 := aSig0 or int64( $0001000000000000 );
  7537. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7538. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7539. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7540. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7541. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7542. shift128ExtraRightJamming(
  7543. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7544. inc(zExp);
  7545. end;
  7546. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7547. end;
  7548. {*----------------------------------------------------------------------------
  7549. | Returns the result of dividing the quadruple-precision floating-point value
  7550. | `a' by the corresponding value `b'. The operation is performed according to
  7551. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7552. *----------------------------------------------------------------------------*}
  7553. function float128_div(a: float128; b: float128): float128;
  7554. var
  7555. aSign, bSign, zSign: flag;
  7556. aExp, bExp, zExp: int32;
  7557. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7558. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7559. z: float128;
  7560. label
  7561. invalid;
  7562. begin
  7563. aSig1 := extractFloat128Frac1( a );
  7564. aSig0 := extractFloat128Frac0( a );
  7565. aExp := extractFloat128Exp( a );
  7566. aSign := extractFloat128Sign( a );
  7567. bSig1 := extractFloat128Frac1( b );
  7568. bSig0 := extractFloat128Frac0( b );
  7569. bExp := extractFloat128Exp( b );
  7570. bSign := extractFloat128Sign( b );
  7571. zSign := aSign xor bSign;
  7572. if ( aExp = $7FFF ) then begin
  7573. if ( aSig0 or aSig1 )<>0 then
  7574. begin
  7575. result := propagateFloat128NaN( a, b );
  7576. exit;
  7577. end;
  7578. if ( bExp = $7FFF ) then begin
  7579. if ( bSig0 or bSig1 )<>0 then
  7580. begin
  7581. result := propagateFloat128NaN( a, b );
  7582. exit;
  7583. end;
  7584. goto invalid;
  7585. end;
  7586. result := packFloat128( zSign, $7FFF, 0, 0 );
  7587. exit;
  7588. end;
  7589. if ( bExp = $7FFF ) then begin
  7590. if ( bSig0 or bSig1 )<>0 then
  7591. begin
  7592. result := propagateFloat128NaN( a, b );
  7593. exit;
  7594. end;
  7595. result := packFloat128( zSign, 0, 0, 0 );
  7596. exit;
  7597. end;
  7598. if ( bExp = 0 ) then begin
  7599. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7600. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7601. invalid:
  7602. float_raise( float_flag_invalid );
  7603. z.low := float128_default_nan_low;
  7604. z.high := float128_default_nan_high;
  7605. result := z;
  7606. exit;
  7607. end;
  7608. float_raise( float_flag_divbyzero );
  7609. result := packFloat128( zSign, $7FFF, 0, 0 );
  7610. exit;
  7611. end;
  7612. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7613. end;
  7614. if ( aExp = 0 ) then begin
  7615. if ( ( aSig0 or aSig1 ) = 0 ) then
  7616. begin
  7617. result := packFloat128( zSign, 0, 0, 0 );
  7618. exit;
  7619. end;
  7620. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7621. end;
  7622. zExp := aExp - bExp + $3FFD;
  7623. shortShift128Left(
  7624. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7625. shortShift128Left(
  7626. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7627. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7628. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7629. inc(zExp);
  7630. end;
  7631. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7632. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7633. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7634. while ( sbits64(rem0) < 0 ) do begin
  7635. dec(zSig0);
  7636. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7637. end;
  7638. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7639. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7640. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7641. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7642. while ( sbits64(rem1) < 0 ) do begin
  7643. dec(zSig1);
  7644. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7645. end;
  7646. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7647. end;
  7648. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7649. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7650. end;
  7651. {*----------------------------------------------------------------------------
  7652. | Returns the remainder of the quadruple-precision floating-point value `a'
  7653. | with respect to the corresponding value `b'. The operation is performed
  7654. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7655. *----------------------------------------------------------------------------*}
  7656. function float128_rem(a: float128; b: float128): float128;
  7657. var
  7658. aSign, bSign, zSign: flag;
  7659. aExp, bExp, expDiff: int32;
  7660. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7661. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7662. sigMean0: sbits64;
  7663. z: float128;
  7664. label
  7665. invalid;
  7666. begin
  7667. aSig1 := extractFloat128Frac1( a );
  7668. aSig0 := extractFloat128Frac0( a );
  7669. aExp := extractFloat128Exp( a );
  7670. aSign := extractFloat128Sign( a );
  7671. bSig1 := extractFloat128Frac1( b );
  7672. bSig0 := extractFloat128Frac0( b );
  7673. bExp := extractFloat128Exp( b );
  7674. bSign := extractFloat128Sign( b );
  7675. if ( aExp = $7FFF ) then begin
  7676. if ( (( aSig0 or aSig1 )<>0)
  7677. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7678. result := propagateFloat128NaN( a, b );
  7679. exit;
  7680. end;
  7681. goto invalid;
  7682. end;
  7683. if ( bExp = $7FFF ) then begin
  7684. if ( bSig0 or bSig1 )<>0 then
  7685. begin
  7686. result := propagateFloat128NaN( a, b );
  7687. exit;
  7688. end;
  7689. result := a;
  7690. exit;
  7691. end;
  7692. if ( bExp = 0 ) then begin
  7693. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7694. invalid:
  7695. float_raise( float_flag_invalid );
  7696. z.low := float128_default_nan_low;
  7697. z.high := float128_default_nan_high;
  7698. result := z;
  7699. exit;
  7700. end;
  7701. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7702. end;
  7703. if ( aExp = 0 ) then begin
  7704. if ( ( aSig0 or aSig1 ) = 0 ) then
  7705. begin
  7706. result := a;
  7707. exit;
  7708. end;
  7709. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7710. end;
  7711. expDiff := aExp - bExp;
  7712. if ( expDiff < -1 ) then
  7713. begin
  7714. result := a;
  7715. exit;
  7716. end;
  7717. shortShift128Left(
  7718. aSig0 or int64( $0001000000000000 ),
  7719. aSig1,
  7720. 15 - ord( expDiff < 0 ),
  7721. aSig0,
  7722. aSig1
  7723. );
  7724. shortShift128Left(
  7725. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7726. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7727. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7728. dec(expDiff,64);
  7729. while ( 0 < expDiff ) do begin
  7730. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7731. if ( 4 < q ) then
  7732. q := q - 4
  7733. else
  7734. q := 0;
  7735. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7736. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7737. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7738. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7739. dec(expDiff,61);
  7740. end;
  7741. if ( -64 < expDiff ) then begin
  7742. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7743. if ( 4 < q ) then
  7744. q := q - 4
  7745. else
  7746. q := 0;
  7747. q := q shr (- expDiff);
  7748. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7749. inc(expDiff,52);
  7750. if ( expDiff < 0 ) then begin
  7751. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7752. end
  7753. else begin
  7754. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7755. end;
  7756. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7757. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7758. end
  7759. else begin
  7760. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7761. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7762. end;
  7763. repeat
  7764. alternateASig0 := aSig0;
  7765. alternateASig1 := aSig1;
  7766. inc(q);
  7767. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7768. until not( 0 <= sbits64(aSig0) );
  7769. add128(
  7770. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7771. if ( ( sigMean0 < 0 )
  7772. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7773. aSig0 := alternateASig0;
  7774. aSig1 := alternateASig1;
  7775. end;
  7776. zSign := ord( sbits64(aSig0) < 0 );
  7777. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7778. result :=
  7779. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7780. end;
  7781. {*----------------------------------------------------------------------------
  7782. | Returns the square root of the quadruple-precision floating-point value `a'.
  7783. | The operation is performed according to the IEC/IEEE Standard for Binary
  7784. | Floating-Point Arithmetic.
  7785. *----------------------------------------------------------------------------*}
  7786. function float128_sqrt(a: float128): float128;
  7787. var
  7788. aSign: flag;
  7789. aExp, zExp: int32;
  7790. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7791. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7792. z: float128;
  7793. label
  7794. invalid;
  7795. begin
  7796. aSig1 := extractFloat128Frac1( a );
  7797. aSig0 := extractFloat128Frac0( a );
  7798. aExp := extractFloat128Exp( a );
  7799. aSign := extractFloat128Sign( a );
  7800. if ( aExp = $7FFF ) then begin
  7801. if ( aSig0 or aSig1 )<>0 then
  7802. begin
  7803. result := propagateFloat128NaN( a, a );
  7804. exit;
  7805. end;
  7806. if ( aSign=0 ) then
  7807. begin
  7808. result := a;
  7809. exit;
  7810. end;
  7811. goto invalid;
  7812. end;
  7813. if ( aSign<>0 ) then begin
  7814. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7815. begin
  7816. result := a;
  7817. exit;
  7818. end;
  7819. invalid:
  7820. float_raise( float_flag_invalid );
  7821. z.low := float128_default_nan_low;
  7822. z.high := float128_default_nan_high;
  7823. result := z;
  7824. exit;
  7825. end;
  7826. if ( aExp = 0 ) then begin
  7827. if ( ( aSig0 or aSig1 ) = 0 ) then
  7828. begin
  7829. result := packFloat128( 0, 0, 0, 0 );
  7830. exit;
  7831. end;
  7832. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7833. end;
  7834. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7835. aSig0 := aSig0 or int64( $0001000000000000 );
  7836. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7837. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7838. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7839. doubleZSig0 := zSig0 shl 1;
  7840. mul64To128( zSig0, zSig0, term0, term1 );
  7841. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7842. while ( sbits64(rem0) < 0 ) do begin
  7843. dec(zSig0);
  7844. dec(doubleZSig0,2);
  7845. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7846. end;
  7847. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7848. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7849. if ( zSig1 = 0 ) then zSig1 := 1;
  7850. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7851. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7852. mul64To128( zSig1, zSig1, term2, term3 );
  7853. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7854. while ( sbits64(rem1) < 0 ) do begin
  7855. dec(zSig1);
  7856. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7857. term3 := term3 or 1;
  7858. term2 := term2 or doubleZSig0;
  7859. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7860. end;
  7861. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7862. end;
  7863. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7864. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7865. end;
  7866. {*----------------------------------------------------------------------------
  7867. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7868. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7869. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7870. *----------------------------------------------------------------------------*}
  7871. function float128_eq(a: float128; b: float128): flag;
  7872. begin
  7873. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7874. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7875. or ( ( extractFloat128Exp( b ) = $7FFF )
  7876. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7877. ) then begin
  7878. if ( (float128_is_signaling_nan( a )<>0)
  7879. or (float128_is_signaling_nan( b )<>0) ) then begin
  7880. float_raise( float_flag_invalid );
  7881. end;
  7882. result := 0;
  7883. exit;
  7884. end;
  7885. result := ord(
  7886. ( a.low = b.low )
  7887. and ( ( a.high = b.high )
  7888. or ( ( a.low = 0 )
  7889. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7890. ));
  7891. end;
  7892. {*----------------------------------------------------------------------------
  7893. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7894. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7895. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7896. | Arithmetic.
  7897. *----------------------------------------------------------------------------*}
  7898. function float128_le(a: float128; b: float128): flag;
  7899. var
  7900. aSign, bSign: flag;
  7901. begin
  7902. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7903. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7904. or ( ( extractFloat128Exp( b ) = $7FFF )
  7905. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7906. ) then begin
  7907. float_raise( float_flag_invalid );
  7908. result := 0;
  7909. exit;
  7910. end;
  7911. aSign := extractFloat128Sign( a );
  7912. bSign := extractFloat128Sign( b );
  7913. if ( aSign <> bSign ) then begin
  7914. result := ord(
  7915. (aSign<>0)
  7916. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7917. = 0 ));
  7918. exit;
  7919. end;
  7920. if aSign<>0 then
  7921. result := le128( b.high, b.low, a.high, a.low )
  7922. else
  7923. result := le128( a.high, a.low, b.high, b.low );
  7924. end;
  7925. {*----------------------------------------------------------------------------
  7926. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7927. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7928. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7929. *----------------------------------------------------------------------------*}
  7930. function float128_lt(a: float128; b: float128): flag;
  7931. var
  7932. aSign, bSign: flag;
  7933. begin
  7934. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7935. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7936. or ( ( extractFloat128Exp( b ) = $7FFF )
  7937. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7938. ) then begin
  7939. float_raise( float_flag_invalid );
  7940. result := 0;
  7941. exit;
  7942. end;
  7943. aSign := extractFloat128Sign( a );
  7944. bSign := extractFloat128Sign( b );
  7945. if ( aSign <> bSign ) then begin
  7946. result := ord(
  7947. (aSign<>0)
  7948. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7949. <> 0 ));
  7950. exit;
  7951. end;
  7952. if aSign<>0 then
  7953. result := lt128( b.high, b.low, a.high, a.low )
  7954. else
  7955. result := lt128( a.high, a.low, b.high, b.low );
  7956. end;
  7957. {*----------------------------------------------------------------------------
  7958. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7959. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7960. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7961. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7962. *----------------------------------------------------------------------------*}
  7963. function float128_eq_signaling(a: float128; b: float128): flag;
  7964. begin
  7965. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7966. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7967. or ( ( extractFloat128Exp( b ) = $7FFF )
  7968. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7969. ) then begin
  7970. float_raise( float_flag_invalid );
  7971. result := 0;
  7972. exit;
  7973. end;
  7974. result := ord(
  7975. ( a.low = b.low )
  7976. and ( ( a.high = b.high )
  7977. or ( ( a.low = 0 )
  7978. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7979. ));
  7980. end;
  7981. {*----------------------------------------------------------------------------
  7982. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7983. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7984. | cause an exception. Otherwise, the comparison is performed according to the
  7985. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7986. *----------------------------------------------------------------------------*}
  7987. function float128_le_quiet(a: float128; b: float128): flag;
  7988. var
  7989. aSign, bSign: flag;
  7990. begin
  7991. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7992. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7993. or ( ( extractFloat128Exp( b ) = $7FFF )
  7994. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7995. ) then begin
  7996. if ( (float128_is_signaling_nan( a )<>0)
  7997. or (float128_is_signaling_nan( b )<>0) ) then begin
  7998. float_raise( float_flag_invalid );
  7999. end;
  8000. result := 0;
  8001. exit;
  8002. end;
  8003. aSign := extractFloat128Sign( a );
  8004. bSign := extractFloat128Sign( b );
  8005. if ( aSign <> bSign ) then begin
  8006. result := ord(
  8007. (aSign<>0)
  8008. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8009. = 0 ));
  8010. exit;
  8011. end;
  8012. if aSign<>0 then
  8013. result := le128( b.high, b.low, a.high, a.low )
  8014. else
  8015. result := le128( a.high, a.low, b.high, b.low );
  8016. end;
  8017. {*----------------------------------------------------------------------------
  8018. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8019. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8020. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8021. | Standard for Binary Floating-Point Arithmetic.
  8022. *----------------------------------------------------------------------------*}
  8023. function float128_lt_quiet(a: float128; b: float128): flag;
  8024. var
  8025. aSign, bSign: flag;
  8026. begin
  8027. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8028. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8029. or ( ( extractFloat128Exp( b ) = $7FFF )
  8030. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8031. ) then begin
  8032. if ( (float128_is_signaling_nan( a )<>0)
  8033. or (float128_is_signaling_nan( b )<>0) ) then begin
  8034. float_raise( float_flag_invalid );
  8035. end;
  8036. result := 0;
  8037. exit;
  8038. end;
  8039. aSign := extractFloat128Sign( a );
  8040. bSign := extractFloat128Sign( b );
  8041. if ( aSign <> bSign ) then begin
  8042. result := ord(
  8043. (aSign<>0)
  8044. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8045. <> 0 ));
  8046. exit;
  8047. end;
  8048. if aSign<>0 then
  8049. result:=lt128( b.high, b.low, a.high, a.low )
  8050. else
  8051. result:=lt128( a.high, a.low, b.high, b.low );
  8052. end;
  8053. {----------------------------------------------------------------------------
  8054. | Returns the result of converting the double-precision floating-point value
  8055. | `a' to the quadruple-precision floating-point format. The conversion is
  8056. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8057. | Arithmetic.
  8058. *----------------------------------------------------------------------------}
  8059. function float64_to_float128( a : float64) : float128;
  8060. var
  8061. aSign : flag;
  8062. aExp : int16;
  8063. aSig, zSig0, zSig1 : bits64;
  8064. begin
  8065. aSig := extractFloat64Frac( a );
  8066. aExp := extractFloat64Exp( a );
  8067. aSign := extractFloat64Sign( a );
  8068. if ( aExp = $7FF ) then begin
  8069. if ( aSig<>0 ) then
  8070. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8071. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8072. exit;
  8073. end;
  8074. if ( aExp = 0 ) then begin
  8075. if ( aSig = 0 ) then
  8076. begin
  8077. result:=packFloat128( aSign, 0, 0, 0 );
  8078. exit;
  8079. end;
  8080. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8081. dec(aExp);
  8082. end;
  8083. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8084. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8085. end;
  8086. {$endif FPC_SOFTFLOAT_FLOAT128}
  8087. {$endif not(defined(fpc_softfpu_interface))}
  8088. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8089. end.
  8090. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}