softfpu.pp 292 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. { we use here a record in the function header because
  79. the record allows bitwise conversion to single }
  80. float32rec = record
  81. float32 : float32;
  82. end;
  83. flag = byte;
  84. uint8 = byte;
  85. int8 = shortint;
  86. uint16 = word;
  87. int16 = smallint;
  88. uint32 = longword;
  89. int32 = longint;
  90. bits8 = byte;
  91. sbits8 = shortint;
  92. bits16 = word;
  93. sbits16 = smallint;
  94. sbits32 = longint;
  95. bits32 = longword;
  96. {$ifndef fpc}
  97. qword = int64;
  98. {$endif}
  99. { now part of the system unit
  100. uint64 = qword;
  101. }
  102. bits64 = qword;
  103. sbits64 = int64;
  104. {$ifdef ENDIAN_LITTLE}
  105. float64 = packed record
  106. low: bits32;
  107. high: bits32;
  108. end;
  109. int64rec = packed record
  110. low: bits32;
  111. high: bits32;
  112. end;
  113. floatx80 = packed record
  114. low : qword;
  115. high : word;
  116. end;
  117. float128 = packed record
  118. low : qword;
  119. high : qword;
  120. end;
  121. {$else}
  122. float64 = packed record
  123. high,low : bits32;
  124. end;
  125. int64rec = packed record
  126. high,low : bits32;
  127. end;
  128. floatx80 = packed record
  129. high : word;
  130. low : qword;
  131. end;
  132. float128 = packed record
  133. high : qword;
  134. low : qword;
  135. end;
  136. {$endif}
  137. {*
  138. -------------------------------------------------------------------------------
  139. Returns 1 if the double-precision floating-point value `a' is less than
  140. the corresponding value `b', and 0 otherwise. The comparison is performed
  141. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  142. -------------------------------------------------------------------------------
  143. *}
  144. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  145. {*
  146. -------------------------------------------------------------------------------
  147. Returns 1 if the double-precision floating-point value `a' is less than
  148. or equal to the corresponding value `b', and 0 otherwise. The comparison
  149. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  150. Arithmetic.
  151. -------------------------------------------------------------------------------
  152. *}
  153. Function float64_le(a: float64;b: float64): flag; compilerproc;
  154. {*
  155. -------------------------------------------------------------------------------
  156. Returns 1 if the double-precision floating-point value `a' is equal to
  157. the corresponding value `b', and 0 otherwise. The comparison is performed
  158. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  159. -------------------------------------------------------------------------------
  160. *}
  161. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  162. {*
  163. -------------------------------------------------------------------------------
  164. Returns the square root of the double-precision floating-point value `a'.
  165. The operation is performed according to the IEC/IEEE Standard for Binary
  166. Floating-Point Arithmetic.
  167. -------------------------------------------------------------------------------
  168. *}
  169. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  170. {*
  171. -------------------------------------------------------------------------------
  172. Returns the remainder of the double-precision floating-point value `a'
  173. with respect to the corresponding value `b'. The operation is performed
  174. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  175. -------------------------------------------------------------------------------
  176. *}
  177. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  178. {*
  179. -------------------------------------------------------------------------------
  180. Returns the result of dividing the double-precision floating-point value `a'
  181. by the corresponding value `b'. The operation is performed according to the
  182. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of multiplying the double-precision floating-point values
  189. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  190. for Binary Floating-Point Arithmetic.
  191. -------------------------------------------------------------------------------
  192. *}
  193. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  194. {*
  195. -------------------------------------------------------------------------------
  196. Returns the result of subtracting the double-precision floating-point values
  197. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  198. for Binary Floating-Point Arithmetic.
  199. -------------------------------------------------------------------------------
  200. *}
  201. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  202. {*
  203. -------------------------------------------------------------------------------
  204. Returns the result of adding the double-precision floating-point values `a'
  205. and `b'. The operation is performed according to the IEC/IEEE Standard for
  206. Binary Floating-Point Arithmetic.
  207. -------------------------------------------------------------------------------
  208. *}
  209. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  210. {*
  211. -------------------------------------------------------------------------------
  212. Rounds the double-precision floating-point value `a' to an integer,
  213. and returns the result as a double-precision floating-point value. The
  214. operation is performed according to the IEC/IEEE Standard for Binary
  215. Floating-Point Arithmetic.
  216. -------------------------------------------------------------------------------
  217. *}
  218. Function float64_round_to_int(a: float64) : float64; compilerproc;
  219. {*
  220. -------------------------------------------------------------------------------
  221. Returns the result of converting the double-precision floating-point value
  222. `a' to the single-precision floating-point format. The conversion is
  223. performed according to the IEC/IEEE Standard for Binary Floating-Point
  224. Arithmetic.
  225. -------------------------------------------------------------------------------
  226. *}
  227. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  228. {*
  229. -------------------------------------------------------------------------------
  230. Returns the result of converting the double-precision floating-point value
  231. `a' to the 32-bit two's complement integer format. The conversion is
  232. performed according to the IEC/IEEE Standard for Binary Floating-Point
  233. Arithmetic, except that the conversion is always rounded toward zero.
  234. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  235. the conversion overflows, the largest integer with the same sign as `a' is
  236. returned.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the 32-bit two's complement integer format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic---which means in particular that the conversion is rounded
  246. according to the current rounding mode. If `a' is a NaN, the largest
  247. positive integer is returned. Otherwise, if the conversion overflows, the
  248. largest integer with the same sign as `a' is returned.
  249. -------------------------------------------------------------------------------
  250. *}
  251. Function float64_to_int32(a: float64): int32; compilerproc;
  252. {*
  253. -------------------------------------------------------------------------------
  254. Returns 1 if the single-precision floating-point value `a' is less than
  255. the corresponding value `b', and 0 otherwise. The comparison is performed
  256. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  257. -------------------------------------------------------------------------------
  258. *}
  259. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  260. {*
  261. -------------------------------------------------------------------------------
  262. Returns 1 if the single-precision floating-point value `a' is less than
  263. or equal to the corresponding value `b', and 0 otherwise. The comparison
  264. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  265. Arithmetic.
  266. -------------------------------------------------------------------------------
  267. *}
  268. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  269. {*
  270. -------------------------------------------------------------------------------
  271. Returns 1 if the single-precision floating-point value `a' is equal to
  272. the corresponding value `b', and 0 otherwise. The comparison is performed
  273. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns the square root of the single-precision floating-point value `a'.
  280. The operation is performed according to the IEC/IEEE Standard for Binary
  281. Floating-Point Arithmetic.
  282. -------------------------------------------------------------------------------
  283. *}
  284. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  285. {*
  286. -------------------------------------------------------------------------------
  287. Returns the remainder of the single-precision floating-point value `a'
  288. with respect to the corresponding value `b'. The operation is performed
  289. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  290. -------------------------------------------------------------------------------
  291. *}
  292. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  293. {*
  294. -------------------------------------------------------------------------------
  295. Returns the result of dividing the single-precision floating-point value `a'
  296. by the corresponding value `b'. The operation is performed according to the
  297. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of multiplying the single-precision floating-point values
  304. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  305. for Binary Floating-Point Arithmetic.
  306. -------------------------------------------------------------------------------
  307. *}
  308. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  309. {*
  310. -------------------------------------------------------------------------------
  311. Returns the result of subtracting the single-precision floating-point values
  312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  313. for Binary Floating-Point Arithmetic.
  314. -------------------------------------------------------------------------------
  315. *}
  316. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  317. {*
  318. -------------------------------------------------------------------------------
  319. Returns the result of adding the single-precision floating-point values `a'
  320. and `b'. The operation is performed according to the IEC/IEEE Standard for
  321. Binary Floating-Point Arithmetic.
  322. -------------------------------------------------------------------------------
  323. *}
  324. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  325. {*
  326. -------------------------------------------------------------------------------
  327. Rounds the single-precision floating-point value `a' to an integer,
  328. and returns the result as a single-precision floating-point value. The
  329. operation is performed according to the IEC/IEEE Standard for Binary
  330. Floating-Point Arithmetic.
  331. -------------------------------------------------------------------------------
  332. *}
  333. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  334. {*
  335. -------------------------------------------------------------------------------
  336. Returns the result of converting the single-precision floating-point value
  337. `a' to the double-precision floating-point format. The conversion is
  338. performed according to the IEC/IEEE Standard for Binary Floating-Point
  339. Arithmetic.
  340. -------------------------------------------------------------------------------
  341. *}
  342. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  343. {*
  344. -------------------------------------------------------------------------------
  345. Returns the result of converting the single-precision floating-point value
  346. `a' to the 32-bit two's complement integer format. The conversion is
  347. performed according to the IEC/IEEE Standard for Binary Floating-Point
  348. Arithmetic, except that the conversion is always rounded toward zero.
  349. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  350. the conversion overflows, the largest integer with the same sign as `a' is
  351. returned.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the 32-bit two's complement integer format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic---which means in particular that the conversion is rounded
  361. according to the current rounding mode. If `a' is a NaN, the largest
  362. positive integer is returned. Otherwise, if the conversion overflows, the
  363. largest integer with the same sign as `a' is returned.
  364. -------------------------------------------------------------------------------
  365. *}
  366. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  367. {*
  368. -------------------------------------------------------------------------------
  369. Returns the result of converting the 32-bit two's complement integer `a' to
  370. the double-precision floating-point format. The conversion is performed
  371. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  372. -------------------------------------------------------------------------------
  373. *}
  374. Function int32_to_float64( a: int32) : float64; compilerproc;
  375. {*
  376. -------------------------------------------------------------------------------
  377. Returns the result of converting the 32-bit two's complement integer `a' to
  378. the single-precision floating-point format. The conversion is performed
  379. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  380. -------------------------------------------------------------------------------
  381. *}
  382. Function int32_to_float32( a: int32): float32rec; compilerproc;
  383. {*----------------------------------------------------------------------------
  384. | Returns the result of converting the 64-bit two's complement integer `a'
  385. | to the double-precision floating-point format. The conversion is performed
  386. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. *----------------------------------------------------------------------------*}
  388. Function int64_to_float64( a: int64 ): float64; compilerproc;
  389. {*----------------------------------------------------------------------------
  390. | Returns the result of converting the 64-bit two's complement integer `a'
  391. | to the single-precision floating-point format. The conversion is performed
  392. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. *----------------------------------------------------------------------------*}
  394. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  395. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  396. function float128_is_nan( a : float128): flag;
  397. function float128_is_signaling_nan( a : float128): flag;
  398. function float128_to_int32(a: float128): int32;
  399. function float128_to_int32_round_to_zero(a: float128): int32;
  400. function float128_to_int64(a: float128): int64;
  401. function float128_to_int64_round_to_zero(a: float128): int64;
  402. function float128_to_float32(a: float128): float32;
  403. function float128_to_float64(a: float128): float64;
  404. function float64_to_float128( a : float64) : float128;
  405. {$ifdef FPC_SOFTFLOAT_FLOAT80}
  406. function float128_to_floatx80(a: float128): floatx80;
  407. {$endif FPC_SOFTFLOAT_FLOAT80}
  408. function float128_round_to_int(a: float128): float128;
  409. function float128_add(a: float128; b: float128): float128;
  410. function float128_sub(a: float128; b: float128): float128;
  411. function float128_mul(a: float128; b: float128): float128;
  412. function float128_div(a: float128; b: float128): float128;
  413. function float128_rem(a: float128; b: float128): float128;
  414. function float128_sqrt(a: float128): float128;
  415. function float128_eq(a: float128; b: float128): flag;
  416. function float128_le(a: float128; b: float128): flag;
  417. function float128_lt(a: float128; b: float128): flag;
  418. function float128_eq_signaling(a: float128; b: float128): flag;
  419. function float128_le_quiet(a: float128; b: float128): flag;
  420. function float128_lt_quiet(a: float128; b: float128): flag;
  421. {$endif FPC_SOFTFLOAT_FLOAT128}
  422. CONST
  423. {-------------------------------------------------------------------------------
  424. Software IEC/IEEE floating-point underflow tininess-detection mode.
  425. -------------------------------------------------------------------------------
  426. *}
  427. float_tininess_after_rounding = 0;
  428. float_tininess_before_rounding = 1;
  429. {*
  430. -------------------------------------------------------------------------------
  431. Underflow tininess-detection mode, statically initialized to default value.
  432. (The declaration in `softfloat.h' must match the `int8' type here.)
  433. -------------------------------------------------------------------------------
  434. *}
  435. const float_detect_tininess: int8 = float_tininess_after_rounding;
  436. {$endif not(defined(fpc_softfpu_implementation))}
  437. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  438. implementation
  439. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  440. {$if not(defined(fpc_softfpu_interface))}
  441. (*****************************************************************************)
  442. (*----------------------------------------------------------------------------*)
  443. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  444. (* division and square root approximations. (Can be specialized to target if *)
  445. (* desired.) *)
  446. (* ---------------------------------------------------------------------------*)
  447. (*****************************************************************************)
  448. {*----------------------------------------------------------------------------
  449. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  450. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  451. | input. If `zSign' is 1, the input is negated before being converted to an
  452. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  453. | is simply rounded to an integer, with the inexact exception raised if the
  454. | input cannot be represented exactly as an integer. However, if the fixed-
  455. | point input is too large, the invalid exception is raised and the largest
  456. | positive or negative integer is returned.
  457. *----------------------------------------------------------------------------*}
  458. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  459. var
  460. roundingMode: int8;
  461. roundNearestEven: flag;
  462. roundIncrement, roundBits: int8;
  463. z: int32;
  464. begin
  465. roundingMode := softfloat_rounding_mode;
  466. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  467. roundIncrement := $40;
  468. if ( roundNearestEven=0 ) then
  469. begin
  470. if ( roundingMode = float_round_to_zero ) then
  471. begin
  472. roundIncrement := 0;
  473. end
  474. else begin
  475. roundIncrement := $7F;
  476. if ( zSign<>0 ) then
  477. begin
  478. if ( roundingMode = float_round_up ) then
  479. roundIncrement := 0;
  480. end
  481. else begin
  482. if ( roundingMode = float_round_down ) then
  483. roundIncrement := 0;
  484. end;
  485. end;
  486. end;
  487. roundBits := absZ and $7F;
  488. absZ := ( absZ + roundIncrement ) shr 7;
  489. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  490. z := absZ;
  491. if ( zSign<>0 ) then
  492. z := - z;
  493. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  494. begin
  495. float_raise( float_flag_invalid );
  496. if zSign<>0 then
  497. result:=sbits32($80000000)
  498. else
  499. result:=$7FFFFFFF;
  500. exit;
  501. end;
  502. if ( roundBits<>0 ) then
  503. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  504. result:=z;
  505. end;
  506. {*----------------------------------------------------------------------------
  507. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  508. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  509. | and returns the properly rounded 64-bit integer corresponding to the input.
  510. | If `zSign' is 1, the input is negated before being converted to an integer.
  511. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  512. | the inexact exception raised if the input cannot be represented exactly as
  513. | an integer. However, if the fixed-point input is too large, the invalid
  514. | exception is raised and the largest positive or negative integer is
  515. | returned.
  516. *----------------------------------------------------------------------------*}
  517. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  518. var
  519. roundingMode: int8;
  520. roundNearestEven, increment: flag;
  521. z: int64;
  522. label
  523. overflow;
  524. begin
  525. roundingMode := softfloat_rounding_mode;
  526. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  527. increment := ord( sbits64(absZ1) < 0 );
  528. if ( roundNearestEven=0 ) then
  529. begin
  530. if ( roundingMode = float_round_to_zero ) then
  531. begin
  532. increment := 0;
  533. end
  534. else begin
  535. if ( zSign<>0 ) then
  536. begin
  537. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  538. end
  539. else begin
  540. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  541. end;
  542. end;
  543. end;
  544. if ( increment<>0 ) then
  545. begin
  546. inc(absZ0);
  547. if ( absZ0 = 0 ) then
  548. goto overflow;
  549. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  550. end;
  551. z := absZ0;
  552. if ( zSign<>0 ) then
  553. z := - z;
  554. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  555. begin
  556. overflow:
  557. float_raise( float_flag_invalid );
  558. if zSign<>0 then
  559. result:=int64($8000000000000000)
  560. else
  561. result:=int64($7FFFFFFFFFFFFFFF);
  562. end;
  563. if ( absZ1<>0 ) then
  564. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  565. result:=z;
  566. end;
  567. {*
  568. -------------------------------------------------------------------------------
  569. Shifts `a' right by the number of bits given in `count'. If any nonzero
  570. bits are shifted off, they are ``jammed'' into the least significant bit of
  571. the result by setting the least significant bit to 1. The value of `count'
  572. can be arbitrarily large; in particular, if `count' is greater than 32, the
  573. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  574. The result is stored in the location pointed to by `zPtr'.
  575. -------------------------------------------------------------------------------
  576. *}
  577. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  578. var
  579. z: Bits32;
  580. Begin
  581. if ( count = 0 ) then
  582. z := a
  583. else
  584. if ( count < 32 ) then
  585. Begin
  586. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  587. End
  588. else
  589. Begin
  590. z := bits32( a <> 0 );
  591. End;
  592. zPtr := z;
  593. End;
  594. {*----------------------------------------------------------------------------
  595. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  596. | number of bits given in `count'. Any bits shifted off are lost. The value
  597. | of `count' can be arbitrarily large; in particular, if `count' is greater
  598. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  599. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  600. *----------------------------------------------------------------------------*}
  601. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  602. var
  603. z0, z1: bits64;
  604. negCount: int8;
  605. begin
  606. negCount := ( - count ) and 63;
  607. if ( count = 0 ) then
  608. begin
  609. z1 := a1;
  610. z0 := a0;
  611. end
  612. else if ( count < 64 ) then
  613. begin
  614. z1 := ( a0 shl negCount ) or ( a1 shr count );
  615. z0 := a0 shr count;
  616. end
  617. else
  618. begin
  619. if ( count shl 64 )<>0 then
  620. z1 := a0 shr ( count and 63 )
  621. else
  622. z1 := 0;
  623. z0 := 0;
  624. end;
  625. z1Ptr := z1;
  626. z0Ptr := z0;
  627. end;
  628. {*----------------------------------------------------------------------------
  629. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  630. | number of bits given in `count'. If any nonzero bits are shifted off, they
  631. | are ``jammed'' into the least significant bit of the result by setting the
  632. | least significant bit to 1. The value of `count' can be arbitrarily large;
  633. | in particular, if `count' is greater than 128, the result will be either
  634. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  635. | nonzero. The result is broken into two 64-bit pieces which are stored at
  636. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  637. *----------------------------------------------------------------------------*}
  638. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  639. var
  640. z0,z1 : bits64;
  641. negCount : int8;
  642. begin
  643. negCount := ( - count ) and 63;
  644. if ( count = 0 ) then begin
  645. z1 := a1;
  646. z0 := a0;
  647. end
  648. else if ( count < 64 ) then begin
  649. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  650. z0 := a0>>count;
  651. end
  652. else begin
  653. if ( count = 64 ) then begin
  654. z1 := a0 or ord( a1 <> 0 );
  655. end
  656. else if ( count < 128 ) then begin
  657. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  658. end
  659. else begin
  660. z1 := ord( ( a0 or a1 ) <> 0 );
  661. end;
  662. z0 := 0;
  663. end;
  664. z1Ptr := z1;
  665. z0Ptr := z0;
  666. end;
  667. {*
  668. -------------------------------------------------------------------------------
  669. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  670. number of bits given in `count'. Any bits shifted off are lost. The value
  671. of `count' can be arbitrarily large; in particular, if `count' is greater
  672. than 64, the result will be 0. The result is broken into two 32-bit pieces
  673. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  674. -------------------------------------------------------------------------------
  675. *}
  676. Procedure
  677. shift64Right(
  678. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  679. Var
  680. z0, z1: bits32;
  681. negCount : int8;
  682. Begin
  683. negCount := ( - count ) AND 31;
  684. if ( count = 0 ) then
  685. Begin
  686. z1 := a1;
  687. z0 := a0;
  688. End
  689. else if ( count < 32 ) then
  690. Begin
  691. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  692. z0 := a0 shr count;
  693. End
  694. else
  695. Begin
  696. if (count < 64) then
  697. z1 := ( a0 shr ( count AND 31 ) )
  698. else
  699. z1 := 0;
  700. z0 := 0;
  701. End;
  702. z1Ptr := z1;
  703. z0Ptr := z0;
  704. End;
  705. {*
  706. -------------------------------------------------------------------------------
  707. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  708. number of bits given in `count'. If any nonzero bits are shifted off, they
  709. are ``jammed'' into the least significant bit of the result by setting the
  710. least significant bit to 1. The value of `count' can be arbitrarily large;
  711. in particular, if `count' is greater than 64, the result will be either 0
  712. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  713. nonzero. The result is broken into two 32-bit pieces which are stored at
  714. the locations pointed to by `z0Ptr' and `z1Ptr'.
  715. -------------------------------------------------------------------------------
  716. *}
  717. Procedure
  718. shift64RightJamming(
  719. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  720. VAR
  721. z0, z1 : bits32;
  722. negCount : int8;
  723. Begin
  724. negCount := ( - count ) AND 31;
  725. if ( count = 0 ) then
  726. Begin
  727. z1 := a1;
  728. z0 := a0;
  729. End
  730. else
  731. if ( count < 32 ) then
  732. Begin
  733. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  734. z0 := a0 shr count;
  735. End
  736. else
  737. Begin
  738. if ( count = 32 ) then
  739. Begin
  740. z1 := a0 OR bits32( a1 <> 0 );
  741. End
  742. else
  743. if ( count < 64 ) Then
  744. Begin
  745. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  746. End
  747. else
  748. Begin
  749. z1 := bits32( ( a0 OR a1 ) <> 0 );
  750. End;
  751. z0 := 0;
  752. End;
  753. z1Ptr := z1;
  754. z0Ptr := z0;
  755. End;
  756. {*----------------------------------------------------------------------------
  757. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  758. | bits are shifted off, they are ``jammed'' into the least significant bit of
  759. | the result by setting the least significant bit to 1. The value of `count'
  760. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  761. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  762. | The result is stored in the location pointed to by `zPtr'.
  763. *----------------------------------------------------------------------------*}
  764. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  765. var
  766. z: bits64;
  767. begin
  768. if ( count = 0 ) then
  769. begin
  770. z := a;
  771. end
  772. else if ( count < 64 ) then
  773. begin
  774. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  775. end
  776. else
  777. begin
  778. z := ord( a <> 0 );
  779. end;
  780. zPtr := z;
  781. end;
  782. {*
  783. -------------------------------------------------------------------------------
  784. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  785. by 32 _plus_ the number of bits given in `count'. The shifted result is
  786. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  787. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  788. off form a third 32-bit result as follows: The _last_ bit shifted off is
  789. the most-significant bit of the extra result, and the other 31 bits of the
  790. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  791. were all zero. This extra result is stored in the location pointed to by
  792. `z2Ptr'. The value of `count' can be arbitrarily large.
  793. (This routine makes more sense if `a0', `a1', and `a2' are considered
  794. to form a fixed-point value with binary point between `a1' and `a2'. This
  795. fixed-point value is shifted right by the number of bits given in `count',
  796. and the integer part of the result is returned at the locations pointed to
  797. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  798. corrupted as described above, and is returned at the location pointed to by
  799. `z2Ptr'.)
  800. -------------------------------------------------------------------------------
  801. }
  802. Procedure
  803. shift64ExtraRightJamming(
  804. a0: bits32;
  805. a1: bits32;
  806. a2: bits32;
  807. count: int16;
  808. VAR z0Ptr: bits32;
  809. VAR z1Ptr: bits32;
  810. VAR z2Ptr: bits32
  811. );
  812. Var
  813. z0, z1, z2: bits32;
  814. negCount : int8;
  815. Begin
  816. negCount := ( - count ) AND 31;
  817. if ( count = 0 ) then
  818. Begin
  819. z2 := a2;
  820. z1 := a1;
  821. z0 := a0;
  822. End
  823. else
  824. Begin
  825. if ( count < 32 ) Then
  826. Begin
  827. z2 := a1 shl negCount;
  828. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  829. z0 := a0 shr count;
  830. End
  831. else
  832. Begin
  833. if ( count = 32 ) then
  834. Begin
  835. z2 := a1;
  836. z1 := a0;
  837. End
  838. else
  839. Begin
  840. a2 := a2 or a1;
  841. if ( count < 64 ) then
  842. Begin
  843. z2 := a0 shl negCount;
  844. z1 := a0 shr ( count AND 31 );
  845. End
  846. else
  847. Begin
  848. if count = 64 then
  849. z2 := a0
  850. else
  851. z2 := bits32(a0 <> 0);
  852. z1 := 0;
  853. End;
  854. End;
  855. z0 := 0;
  856. End;
  857. z2 := z2 or bits32( a2 <> 0 );
  858. End;
  859. z2Ptr := z2;
  860. z1Ptr := z1;
  861. z0Ptr := z0;
  862. End;
  863. {*
  864. -------------------------------------------------------------------------------
  865. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  866. number of bits given in `count'. Any bits shifted off are lost. The value
  867. of `count' must be less than 32. The result is broken into two 32-bit
  868. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  869. -------------------------------------------------------------------------------
  870. *}
  871. Procedure
  872. shortShift64Left(
  873. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  874. Begin
  875. z1Ptr := a1 shl count;
  876. if count = 0 then
  877. z0Ptr := a0
  878. else
  879. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  880. End;
  881. {*
  882. -------------------------------------------------------------------------------
  883. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  884. by the number of bits given in `count'. Any bits shifted off are lost.
  885. The value of `count' must be less than 32. The result is broken into three
  886. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  887. `z1Ptr', and `z2Ptr'.
  888. -------------------------------------------------------------------------------
  889. *}
  890. Procedure
  891. shortShift96Left(
  892. a0: bits32;
  893. a1: bits32;
  894. a2: bits32;
  895. count: int16;
  896. VAR z0Ptr: bits32;
  897. VAR z1Ptr: bits32;
  898. VAR z2Ptr: bits32
  899. );
  900. Var
  901. z0, z1, z2: bits32;
  902. negCount: int8;
  903. Begin
  904. z2 := a2 shl count;
  905. z1 := a1 shl count;
  906. z0 := a0 shl count;
  907. if ( 0 < count ) then
  908. Begin
  909. negCount := ( ( - count ) AND 31 );
  910. z1 := z1 or (a2 shr negCount);
  911. z0 := z0 or (a1 shr negCount);
  912. End;
  913. z2Ptr := z2;
  914. z1Ptr := z1;
  915. z0Ptr := z0;
  916. End;
  917. {*----------------------------------------------------------------------------
  918. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  919. | number of bits given in `count'. Any bits shifted off are lost. The value
  920. | of `count' must be less than 64. The result is broken into two 64-bit
  921. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  922. *----------------------------------------------------------------------------*}
  923. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  924. begin
  925. z1Ptr := a1 shl count;
  926. if count=0 then
  927. z0Ptr:=a0
  928. else
  929. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  930. end;
  931. {*
  932. -------------------------------------------------------------------------------
  933. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  934. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  935. any carry out is lost. The result is broken into two 32-bit pieces which
  936. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  937. -------------------------------------------------------------------------------
  938. *}
  939. Procedure
  940. add64(
  941. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  942. Var
  943. z1: bits32;
  944. Begin
  945. z1 := a1 + b1;
  946. z1Ptr := z1;
  947. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  948. End;
  949. {*
  950. -------------------------------------------------------------------------------
  951. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  952. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  953. modulo 2^96, so any carry out is lost. The result is broken into three
  954. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  955. `z1Ptr', and `z2Ptr'.
  956. -------------------------------------------------------------------------------
  957. *}
  958. Procedure
  959. add96(
  960. a0: bits32;
  961. a1: bits32;
  962. a2: bits32;
  963. b0: bits32;
  964. b1: bits32;
  965. b2: bits32;
  966. VAR z0Ptr: bits32;
  967. VAR z1Ptr: bits32;
  968. VAR z2Ptr: bits32
  969. );
  970. var
  971. z0, z1, z2: bits32;
  972. carry0, carry1: int8;
  973. Begin
  974. z2 := a2 + b2;
  975. carry1 := int8( z2 < a2 );
  976. z1 := a1 + b1;
  977. carry0 := int8( z1 < a1 );
  978. z0 := a0 + b0;
  979. z1 := z1 + carry1;
  980. z0 := z0 + bits32( z1 < carry1 );
  981. z0 := z0 + carry0;
  982. z2Ptr := z2;
  983. z1Ptr := z1;
  984. z0Ptr := z0;
  985. End;
  986. {*----------------------------------------------------------------------------
  987. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  988. | by the number of bits given in `count'. Any bits shifted off are lost.
  989. | The value of `count' must be less than 64. The result is broken into three
  990. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  991. | `z1Ptr', and `z2Ptr'.
  992. *----------------------------------------------------------------------------*}
  993. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  994. var
  995. z0, z1, z2 : bits64;
  996. negCount : int8;
  997. begin
  998. z2 := a2 shl count;
  999. z1 := a1 shl count;
  1000. z0 := a0 shl count;
  1001. if ( 0 < count ) then
  1002. begin
  1003. negCount := ( ( - count ) and 63 );
  1004. z1 := z1 or (a2 shr negCount);
  1005. z0 := z0 or (a1 shr negCount);
  1006. end;
  1007. z2Ptr := z2;
  1008. z1Ptr := z1;
  1009. z0Ptr := z0;
  1010. end;
  1011. {*----------------------------------------------------------------------------
  1012. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1013. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1014. | any carry out is lost. The result is broken into two 64-bit pieces which
  1015. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1016. *----------------------------------------------------------------------------*}
  1017. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1018. var
  1019. z1 : bits64;
  1020. begin
  1021. z1 := a1 + b1;
  1022. z1Ptr := z1;
  1023. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1024. end;
  1025. {*----------------------------------------------------------------------------
  1026. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1027. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1028. | modulo 2^192, so any carry out is lost. The result is broken into three
  1029. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1030. | `z1Ptr', and `z2Ptr'.
  1031. *----------------------------------------------------------------------------*}
  1032. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1033. var
  1034. z0, z1, z2 : bits64;
  1035. carry0, carry1 : int8;
  1036. begin
  1037. z2 := a2 + b2;
  1038. carry1 := ord( z2 < a2 );
  1039. z1 := a1 + b1;
  1040. carry0 := ord( z1 < a1 );
  1041. z0 := a0 + b0;
  1042. inc(z1, carry1);
  1043. inc(z0, ord( z1 < carry1 ));
  1044. inc(z0, carry0);
  1045. z2Ptr := z2;
  1046. z1Ptr := z1;
  1047. z0Ptr := z0;
  1048. end;
  1049. {*
  1050. -------------------------------------------------------------------------------
  1051. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1052. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1053. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1054. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1055. `z1Ptr'.
  1056. -------------------------------------------------------------------------------
  1057. *}
  1058. Procedure
  1059. sub64(
  1060. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1061. Begin
  1062. z1Ptr := a1 - b1;
  1063. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1064. End;
  1065. {*
  1066. -------------------------------------------------------------------------------
  1067. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1068. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1069. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1070. into three 32-bit pieces which are stored at the locations pointed to by
  1071. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1072. -------------------------------------------------------------------------------
  1073. *}
  1074. Procedure
  1075. sub96(
  1076. a0:bits32;
  1077. a1:bits32;
  1078. a2:bits32;
  1079. b0:bits32;
  1080. b1:bits32;
  1081. b2:bits32;
  1082. VAR z0Ptr:bits32;
  1083. VAR z1Ptr:bits32;
  1084. VAR z2Ptr:bits32
  1085. );
  1086. Var
  1087. z0, z1, z2: bits32;
  1088. borrow0, borrow1: int8;
  1089. Begin
  1090. z2 := a2 - b2;
  1091. borrow1 := int8( a2 < b2 );
  1092. z1 := a1 - b1;
  1093. borrow0 := int8( a1 < b1 );
  1094. z0 := a0 - b0;
  1095. z0 := z0 - bits32( z1 < borrow1 );
  1096. z1 := z1 - borrow1;
  1097. z0 := z0 -borrow0;
  1098. z2Ptr := z2;
  1099. z1Ptr := z1;
  1100. z0Ptr := z0;
  1101. End;
  1102. {*----------------------------------------------------------------------------
  1103. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1104. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1105. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1106. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1107. | `z1Ptr'.
  1108. *----------------------------------------------------------------------------*}
  1109. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1110. begin
  1111. z1Ptr := a1 - b1;
  1112. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1113. end;
  1114. {*----------------------------------------------------------------------------
  1115. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1116. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1117. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1118. | result is broken into three 64-bit pieces which are stored at the locations
  1119. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1120. *----------------------------------------------------------------------------*}
  1121. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1122. var
  1123. z0, z1, z2 : bits64;
  1124. borrow0, borrow1 : int8;
  1125. begin
  1126. z2 := a2 - b2;
  1127. borrow1 := ord( a2 < b2 );
  1128. z1 := a1 - b1;
  1129. borrow0 := ord( a1 < b1 );
  1130. z0 := a0 - b0;
  1131. dec(z0, ord( z1 < borrow1 ));
  1132. dec(z1, borrow1);
  1133. dec(z0, borrow0);
  1134. z2Ptr := z2;
  1135. z1Ptr := z1;
  1136. z0Ptr := z0;
  1137. end;
  1138. {*
  1139. -------------------------------------------------------------------------------
  1140. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1141. into two 32-bit pieces which are stored at the locations pointed to by
  1142. `z0Ptr' and `z1Ptr'.
  1143. -------------------------------------------------------------------------------
  1144. *}
  1145. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1146. :bits32 );
  1147. Var
  1148. aHigh, aLow, bHigh, bLow: bits16;
  1149. z0, zMiddleA, zMiddleB, z1: bits32;
  1150. Begin
  1151. aLow := a and $ffff;
  1152. aHigh := a shr 16;
  1153. bLow := b and $ffff;
  1154. bHigh := b shr 16;
  1155. z1 := ( bits32( aLow) ) * bLow;
  1156. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1157. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1158. z0 := ( bits32 (aHigh) ) * bHigh;
  1159. zMiddleA := zMiddleA + zMiddleB;
  1160. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1161. zMiddleA := zmiddleA shl 16;
  1162. z1 := z1 + zMiddleA;
  1163. z0 := z0 + bits32( z1 < zMiddleA );
  1164. z1Ptr := z1;
  1165. z0Ptr := z0;
  1166. End;
  1167. {*
  1168. -------------------------------------------------------------------------------
  1169. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1170. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1171. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1172. `z2Ptr'.
  1173. -------------------------------------------------------------------------------
  1174. *}
  1175. Procedure
  1176. mul64By32To96(
  1177. a0:bits32;
  1178. a1:bits32;
  1179. b:bits32;
  1180. VAR z0Ptr:bits32;
  1181. VAR z1Ptr:bits32;
  1182. VAR z2Ptr:bits32
  1183. );
  1184. Var
  1185. z0, z1, z2, more1: bits32;
  1186. Begin
  1187. mul32To64( a1, b, z1, z2 );
  1188. mul32To64( a0, b, z0, more1 );
  1189. add64( z0, more1, 0, z1, z0, z1 );
  1190. z2Ptr := z2;
  1191. z1Ptr := z1;
  1192. z0Ptr := z0;
  1193. End;
  1194. {*
  1195. -------------------------------------------------------------------------------
  1196. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1197. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1198. product. The product is broken into four 32-bit pieces which are stored at
  1199. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1200. -------------------------------------------------------------------------------
  1201. *}
  1202. Procedure
  1203. mul64To128(
  1204. a0:bits32;
  1205. a1:bits32;
  1206. b0:bits32;
  1207. b1:bits32;
  1208. VAR z0Ptr:bits32;
  1209. VAR z1Ptr:bits32;
  1210. VAR z2Ptr:bits32;
  1211. VAR z3Ptr:bits32
  1212. );
  1213. Var
  1214. z0, z1, z2, z3: bits32;
  1215. more1, more2: bits32;
  1216. Begin
  1217. mul32To64( a1, b1, z2, z3 );
  1218. mul32To64( a1, b0, z1, more2 );
  1219. add64( z1, more2, 0, z2, z1, z2 );
  1220. mul32To64( a0, b0, z0, more1 );
  1221. add64( z0, more1, 0, z1, z0, z1 );
  1222. mul32To64( a0, b1, more1, more2 );
  1223. add64( more1, more2, 0, z2, more1, z2 );
  1224. add64( z0, z1, 0, more1, z0, z1 );
  1225. z3Ptr := z3;
  1226. z2Ptr := z2;
  1227. z1Ptr := z1;
  1228. z0Ptr := z0;
  1229. End;
  1230. {*----------------------------------------------------------------------------
  1231. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1232. | into two 64-bit pieces which are stored at the locations pointed to by
  1233. | `z0Ptr' and `z1Ptr'.
  1234. *----------------------------------------------------------------------------*}
  1235. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1236. var
  1237. aHigh, aLow, bHigh, bLow : bits32;
  1238. z0, zMiddleA, zMiddleB, z1 : bits64;
  1239. begin
  1240. aLow := a;
  1241. aHigh := a shr 32;
  1242. bLow := b;
  1243. bHigh := b shr 32;
  1244. z1 := ( bits64(aLow) ) * bLow;
  1245. zMiddleA := ( bits64( aLow )) * bHigh;
  1246. zMiddleB := ( bits64( aHigh )) * bLow;
  1247. z0 := ( bits64(aHigh) ) * bHigh;
  1248. inc(zMiddleA, zMiddleB);
  1249. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1250. zMiddleA := zMiddleA shl 32;
  1251. inc(z1, zMiddleA);
  1252. inc(z0, ord( z1 < zMiddleA ));
  1253. z1Ptr := z1;
  1254. z0Ptr := z0;
  1255. end;
  1256. {*----------------------------------------------------------------------------
  1257. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1258. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1259. | product. The product is broken into four 64-bit pieces which are stored at
  1260. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1261. *----------------------------------------------------------------------------*}
  1262. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1263. var
  1264. z0,z1,z2,z3,more1,more2 : bits64;
  1265. begin
  1266. mul64To128( a1, b1, z2, z3 );
  1267. mul64To128( a1, b0, z1, more2 );
  1268. add128( z1, more2, 0, z2, z1, z2 );
  1269. mul64To128( a0, b0, z0, more1 );
  1270. add128( z0, more1, 0, z1, z0, z1 );
  1271. mul64To128( a0, b1, more1, more2 );
  1272. add128( more1, more2, 0, z2, more1, z2 );
  1273. add128( z0, z1, 0, more1, z0, z1 );
  1274. z3Ptr := z3;
  1275. z2Ptr := z2;
  1276. z1Ptr := z1;
  1277. z0Ptr := z0;
  1278. end;
  1279. {*----------------------------------------------------------------------------
  1280. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1281. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1282. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1283. | `z2Ptr'.
  1284. *----------------------------------------------------------------------------*}
  1285. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1286. var
  1287. z0, z1, z2, more1 : bits64;
  1288. begin
  1289. mul64To128( a1, b, z1, z2 );
  1290. mul64To128( a0, b, z0, more1 );
  1291. add128( z0, more1, 0, z1, z0, z1 );
  1292. z2Ptr := z2;
  1293. z1Ptr := z1;
  1294. z0Ptr := z0;
  1295. end;
  1296. {*----------------------------------------------------------------------------
  1297. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1298. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1299. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1300. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1301. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1302. | unsigned integer is returned.
  1303. *----------------------------------------------------------------------------*}
  1304. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1305. var
  1306. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1307. begin
  1308. if ( b <= a0 ) then
  1309. begin
  1310. result:=qword( $FFFFFFFFFFFFFFFF );
  1311. exit;
  1312. end;
  1313. b0 := b shr 32;
  1314. if ( b0 shl 32 <= a0 ) then
  1315. z:=qword( $FFFFFFFF00000000 )
  1316. else
  1317. z:=( a0 div b0 ) shl 32;
  1318. mul64To128( b, z, term0, term1 );
  1319. sub128( a0, a1, term0, term1, rem0, rem1 );
  1320. while ( ( sbits64(rem0) ) < 0 ) do begin
  1321. dec(z,qword( $100000000 ));
  1322. b1 := b shl 32;
  1323. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1324. end;
  1325. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1326. if ( b0 shl 32 <= rem0 ) then
  1327. z:=z or $FFFFFFFF
  1328. else
  1329. z:=z or rem0 div b0;
  1330. result:=z;
  1331. end;
  1332. {*
  1333. -------------------------------------------------------------------------------
  1334. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1335. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1336. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1337. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1338. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1339. unsigned integer is returned.
  1340. -------------------------------------------------------------------------------
  1341. *}
  1342. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1343. Var
  1344. b0, b1: bits32;
  1345. rem0, rem1, term0, term1: bits32;
  1346. z: bits32;
  1347. Begin
  1348. if ( b <= a0 ) then
  1349. Begin
  1350. estimateDiv64To32 := $FFFFFFFF;
  1351. exit;
  1352. End;
  1353. b0 := b shr 16;
  1354. if ( b0 shl 16 <= a0 ) then
  1355. z:= $FFFF0000
  1356. else
  1357. z:= ( a0 div b0 ) shl 16;
  1358. mul32To64( b, z, term0, term1 );
  1359. sub64( a0, a1, term0, term1, rem0, rem1 );
  1360. while ( ( sbits32 (rem0) ) < 0 ) do
  1361. Begin
  1362. z := z - $10000;
  1363. b1 := b shl 16;
  1364. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1365. End;
  1366. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1367. if ( b0 shl 16 <= rem0 ) then
  1368. z := z or $FFFF
  1369. else
  1370. z := z or (rem0 div b0);
  1371. estimateDiv64To32 := z;
  1372. End;
  1373. {*
  1374. -------------------------------------------------------------------------------
  1375. Returns an approximation to the square root of the 32-bit significand given
  1376. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1377. `aExp' (the least significant bit) is 1, the integer returned approximates
  1378. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1379. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1380. case, the approximation returned lies strictly within +/-2 of the exact
  1381. value.
  1382. -------------------------------------------------------------------------------
  1383. *}
  1384. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1385. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1386. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1387. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1388. );
  1389. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1390. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1391. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1392. );
  1393. Var
  1394. index: int8;
  1395. z: bits32;
  1396. Begin
  1397. index := ( a shr 27 ) AND 15;
  1398. if ( aExp AND 1 ) <> 0 then
  1399. Begin
  1400. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1401. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1402. a := a shr 1;
  1403. End
  1404. else
  1405. Begin
  1406. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1407. z := a div z + z;
  1408. if ( $20000 <= z ) then
  1409. z := $FFFF8000
  1410. else
  1411. z := ( z shl 15 );
  1412. if ( z <= a ) then
  1413. Begin
  1414. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1415. exit;
  1416. End;
  1417. End;
  1418. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1419. End;
  1420. {*
  1421. -------------------------------------------------------------------------------
  1422. Returns the number of leading 0 bits before the most-significant 1 bit of
  1423. `a'. If `a' is zero, 32 is returned.
  1424. -------------------------------------------------------------------------------
  1425. *}
  1426. Function countLeadingZeros32( a:bits32 ): int8;
  1427. const countLeadingZerosHigh:array[0..255] of int8 = (
  1428. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1429. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1430. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1431. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1432. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1433. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1434. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1435. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1436. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1437. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1438. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1439. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1440. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1441. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1442. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1443. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1444. );
  1445. Var
  1446. shiftCount: int8;
  1447. Begin
  1448. shiftCount := 0;
  1449. if ( a < $10000 ) then
  1450. Begin
  1451. shiftCount := shiftcount + 16;
  1452. a := a shl 16;
  1453. End;
  1454. if ( a < $1000000 ) then
  1455. Begin
  1456. shiftCount := shiftcount + 8;
  1457. a := a shl 8;
  1458. end;
  1459. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1460. countLeadingZeros32:= shiftCount;
  1461. End;
  1462. {*----------------------------------------------------------------------------
  1463. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1464. | `a'. If `a' is zero, 64 is returned.
  1465. *----------------------------------------------------------------------------*}
  1466. function countLeadingZeros64( a : bits64): int8;
  1467. var
  1468. shiftcount : int8;
  1469. Begin
  1470. shiftCount := 0;
  1471. if ( a < (bits64(1) shl 32 )) then
  1472. shiftCount := shiftcount + 32
  1473. else
  1474. a := a shr 32;
  1475. shiftCount := shiftCount + countLeadingZeros32( a );
  1476. countLeadingZeros64:= shiftCount;
  1477. End;
  1478. {*
  1479. -------------------------------------------------------------------------------
  1480. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1481. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1482. returns 0.
  1483. -------------------------------------------------------------------------------
  1484. *}
  1485. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1486. Begin
  1487. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1488. End;
  1489. {*
  1490. -------------------------------------------------------------------------------
  1491. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1492. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1493. Otherwise, returns 0.
  1494. -------------------------------------------------------------------------------
  1495. *}
  1496. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1497. Begin
  1498. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1499. End;
  1500. {*
  1501. -------------------------------------------------------------------------------
  1502. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1503. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1504. returns 0.
  1505. -------------------------------------------------------------------------------
  1506. *}
  1507. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1508. Begin
  1509. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1510. End;
  1511. {*
  1512. -------------------------------------------------------------------------------
  1513. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1514. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1515. returns 0.
  1516. -------------------------------------------------------------------------------
  1517. *}
  1518. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1519. Begin
  1520. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1521. End;
  1522. const
  1523. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1524. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1525. (*****************************************************************************)
  1526. (* End Low-Level arithmetic *)
  1527. (*****************************************************************************)
  1528. {*
  1529. -------------------------------------------------------------------------------
  1530. Functions and definitions to determine: (1) whether tininess for underflow
  1531. is detected before or after rounding by default, (2) what (if anything)
  1532. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1533. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1534. are propagated from function inputs to output. These details are ENDIAN
  1535. specific
  1536. -------------------------------------------------------------------------------
  1537. *}
  1538. {$IFDEF ENDIAN_LITTLE}
  1539. {*
  1540. -------------------------------------------------------------------------------
  1541. Internal canonical NaN format.
  1542. -------------------------------------------------------------------------------
  1543. *}
  1544. TYPE
  1545. commonNaNT = packed record
  1546. sign: flag;
  1547. high, low : bits32;
  1548. end;
  1549. {*
  1550. -------------------------------------------------------------------------------
  1551. The pattern for a default generated single-precision NaN.
  1552. -------------------------------------------------------------------------------
  1553. *}
  1554. const float32_default_nan = $FFC00000;
  1555. {*
  1556. -------------------------------------------------------------------------------
  1557. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1558. otherwise returns 0.
  1559. -------------------------------------------------------------------------------
  1560. *}
  1561. Function float32_is_nan( a : float32 ): flag;
  1562. Begin
  1563. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1564. End;
  1565. {*
  1566. -------------------------------------------------------------------------------
  1567. Returns 1 if the single-precision floating-point value `a' is a signaling
  1568. NaN; otherwise returns 0.
  1569. -------------------------------------------------------------------------------
  1570. *}
  1571. Function float32_is_signaling_nan( a : float32 ): flag;
  1572. Begin
  1573. float32_is_signaling_nan := flag
  1574. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1575. End;
  1576. {*
  1577. -------------------------------------------------------------------------------
  1578. Returns the result of converting the single-precision floating-point NaN
  1579. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1580. exception is raised.
  1581. -------------------------------------------------------------------------------
  1582. *}
  1583. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1584. var
  1585. z : commonNaNT ;
  1586. Begin
  1587. if ( float32_is_signaling_nan( a ) <> 0) then
  1588. float_raise( float_flag_invalid );
  1589. z.sign := a shr 31;
  1590. z.low := 0;
  1591. z.high := a shl 9;
  1592. c := z;
  1593. End;
  1594. {*
  1595. -------------------------------------------------------------------------------
  1596. Returns the result of converting the canonical NaN `a' to the single-
  1597. precision floating-point format.
  1598. -------------------------------------------------------------------------------
  1599. *}
  1600. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1601. Begin
  1602. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1603. End;
  1604. {*
  1605. -------------------------------------------------------------------------------
  1606. Takes two single-precision floating-point values `a' and `b', one of which
  1607. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1608. signaling NaN, the invalid exception is raised.
  1609. -------------------------------------------------------------------------------
  1610. *}
  1611. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1612. Var
  1613. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1614. label returnLargerSignificand;
  1615. Begin
  1616. aIsNaN := float32_is_nan( a );
  1617. aIsSignalingNaN := float32_is_signaling_nan( a );
  1618. bIsNaN := float32_is_nan( b );
  1619. bIsSignalingNaN := float32_is_signaling_nan( b );
  1620. a := a or $00400000;
  1621. b := b or $00400000;
  1622. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1623. float_raise( float_flag_invalid );
  1624. if ( aIsSignalingNaN )<> 0 then
  1625. Begin
  1626. if ( bIsSignalingNaN ) <> 0 then
  1627. goto returnLargerSignificand;
  1628. if bIsNan <> 0 then
  1629. propagateFloat32NaN := b
  1630. else
  1631. propagateFloat32NaN := a;
  1632. exit;
  1633. End
  1634. else if ( aIsNaN <> 0) then
  1635. Begin
  1636. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1637. Begin
  1638. propagateFloat32NaN := a;
  1639. exit;
  1640. End;
  1641. returnLargerSignificand:
  1642. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1643. Begin
  1644. propagateFloat32NaN := b;
  1645. exit;
  1646. End;
  1647. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1648. Begin
  1649. propagateFloat32NaN := a;
  1650. End;
  1651. if a < b then
  1652. propagateFloat32NaN := a
  1653. else
  1654. propagateFloat32NaN := b;
  1655. exit;
  1656. End
  1657. else
  1658. Begin
  1659. propagateFloat32NaN := b;
  1660. exit;
  1661. End;
  1662. End;
  1663. {*
  1664. -------------------------------------------------------------------------------
  1665. The pattern for a default generated double-precision NaN. The `high' and
  1666. `low' values hold the most- and least-significant bits, respectively.
  1667. -------------------------------------------------------------------------------
  1668. *}
  1669. const
  1670. float64_default_nan_high = $FFF80000;
  1671. float64_default_nan_low = $00000000;
  1672. {*
  1673. -------------------------------------------------------------------------------
  1674. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1675. otherwise returns 0.
  1676. -------------------------------------------------------------------------------
  1677. *}
  1678. Function float64_is_nan( a : float64 ) : flag;
  1679. Begin
  1680. float64_is_nan :=
  1681. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1682. and ( a.low or ( a.high and $000FFFFF ) );
  1683. End;
  1684. {*
  1685. -------------------------------------------------------------------------------
  1686. Returns 1 if the double-precision floating-point value `a' is a signaling
  1687. NaN; otherwise returns 0.
  1688. -------------------------------------------------------------------------------
  1689. *}
  1690. Function float64_is_signaling_nan( a : float64 ): flag;
  1691. Begin
  1692. float64_is_signaling_nan :=
  1693. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1694. and ( a.low or ( a.high and $0007FFFF ) );
  1695. End;
  1696. {*
  1697. -------------------------------------------------------------------------------
  1698. Returns the result of converting the double-precision floating-point NaN
  1699. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1700. exception is raised.
  1701. -------------------------------------------------------------------------------
  1702. *}
  1703. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1704. Var
  1705. z : commonNaNT;
  1706. Begin
  1707. if ( float64_is_signaling_nan( a )<>0 ) then
  1708. float_raise( float_flag_invalid );
  1709. z.sign := a.high shr 31;
  1710. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1711. c := z;
  1712. End;
  1713. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1714. Var
  1715. z : commonNaNT;
  1716. Begin
  1717. if ( float64_is_signaling_nan( a )<>0 ) then
  1718. float_raise( float_flag_invalid );
  1719. z.sign := a.high shr 31;
  1720. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1721. result := z;
  1722. End;
  1723. {*
  1724. -------------------------------------------------------------------------------
  1725. Returns the result of converting the canonical NaN `a' to the double-
  1726. precision floating-point format.
  1727. -------------------------------------------------------------------------------
  1728. *}
  1729. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1730. Var
  1731. z: float64;
  1732. Begin
  1733. shift64Right( a.high, a.low, 12, z.high, z.low );
  1734. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1735. c := z;
  1736. End;
  1737. {*
  1738. -------------------------------------------------------------------------------
  1739. Takes two double-precision floating-point values `a' and `b', one of which
  1740. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1741. signaling NaN, the invalid exception is raised.
  1742. -------------------------------------------------------------------------------
  1743. *}
  1744. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1745. Var
  1746. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1747. label returnLargerSignificand;
  1748. Begin
  1749. aIsNaN := float64_is_nan( a );
  1750. aIsSignalingNaN := float64_is_signaling_nan( a );
  1751. bIsNaN := float64_is_nan( b );
  1752. bIsSignalingNaN := float64_is_signaling_nan( b );
  1753. a.high := a.high or $00080000;
  1754. b.high := b.high or $00080000;
  1755. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1756. float_raise( float_flag_invalid );
  1757. if ( aIsSignalingNaN )<>0 then
  1758. Begin
  1759. if ( bIsSignalingNaN )<>0 then
  1760. goto returnLargerSignificand;
  1761. if bIsNan <> 0 then
  1762. c := b
  1763. else
  1764. c := a;
  1765. exit;
  1766. End
  1767. else if ( aIsNaN )<> 0 then
  1768. Begin
  1769. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1770. Begin
  1771. c := a;
  1772. exit;
  1773. End;
  1774. returnLargerSignificand:
  1775. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1776. Begin
  1777. c := b;
  1778. exit;
  1779. End;
  1780. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1781. Begin
  1782. c := a;
  1783. exit;
  1784. End;
  1785. if a.high < b.high then
  1786. c := a
  1787. else
  1788. c := b;
  1789. exit;
  1790. End
  1791. else
  1792. Begin
  1793. c := b;
  1794. exit;
  1795. End;
  1796. End;
  1797. {*----------------------------------------------------------------------------
  1798. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1799. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1800. | returns 0.
  1801. *----------------------------------------------------------------------------*}
  1802. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1803. begin
  1804. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1805. end;
  1806. {*----------------------------------------------------------------------------
  1807. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1808. | otherwise returns 0.
  1809. *----------------------------------------------------------------------------*}
  1810. function float128_is_nan( a : float128): flag;
  1811. begin
  1812. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1813. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1814. end;
  1815. {*----------------------------------------------------------------------------
  1816. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1817. | signaling NaN; otherwise returns 0.
  1818. *----------------------------------------------------------------------------*}
  1819. function float128_is_signaling_nan( a : float128): flag;
  1820. begin
  1821. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1822. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1823. end;
  1824. {*----------------------------------------------------------------------------
  1825. | Returns the result of converting the quadruple-precision floating-point NaN
  1826. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1827. | exception is raised.
  1828. *----------------------------------------------------------------------------*}
  1829. function float128ToCommonNaN( a : float128): commonNaNT;
  1830. var
  1831. z: commonNaNT;
  1832. qhigh,qlow : qword;
  1833. begin
  1834. if ( float128_is_signaling_nan( a )<>0) then
  1835. float_raise( float_flag_invalid );
  1836. z.sign := a.high shr 63;
  1837. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1838. z.high:=qhigh shr 32;
  1839. z.low:=qhigh and $ffffffff;
  1840. result:=z;
  1841. end;
  1842. {*----------------------------------------------------------------------------
  1843. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1844. | precision floating-point format.
  1845. *----------------------------------------------------------------------------*}
  1846. function commonNaNToFloat128( a : commonNaNT): float128;
  1847. var
  1848. z: float128;
  1849. begin
  1850. shift128Right( a.high, a.low, 16, z.high, z.low );
  1851. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1852. result:=z;
  1853. end;
  1854. {*----------------------------------------------------------------------------
  1855. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1856. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1857. | `b' is a signaling NaN, the invalid exception is raised.
  1858. *----------------------------------------------------------------------------*}
  1859. function propagateFloat128NaN( a: float128; b : float128): float128;
  1860. var
  1861. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1862. label
  1863. returnLargerSignificand;
  1864. begin
  1865. aIsNaN := float128_is_nan( a );
  1866. aIsSignalingNaN := float128_is_signaling_nan( a );
  1867. bIsNaN := float128_is_nan( b );
  1868. bIsSignalingNaN := float128_is_signaling_nan( b );
  1869. a.high := a.high or int64( $0000800000000000 );
  1870. b.high := b.high or int64( $0000800000000000 );
  1871. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1872. float_raise( float_flag_invalid );
  1873. if ( aIsSignalingNaN )<>0 then
  1874. begin
  1875. if ( bIsSignalingNaN )<>0 then
  1876. goto returnLargerSignificand;
  1877. if bIsNaN<>0 then
  1878. result := b
  1879. else
  1880. result := a;
  1881. exit;
  1882. end
  1883. else if ( aIsNaN )<>0 then
  1884. begin
  1885. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1886. begin
  1887. result := a;
  1888. exit;
  1889. end;
  1890. returnLargerSignificand:
  1891. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1892. begin
  1893. result := b;
  1894. exit;
  1895. end;
  1896. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1897. begin
  1898. result := a;
  1899. exit
  1900. end;
  1901. if ( a.high < b.high ) then
  1902. result := a
  1903. else
  1904. result := b;
  1905. exit;
  1906. end
  1907. else
  1908. result:=b;
  1909. end;
  1910. {$ELSE}
  1911. { Big endian code }
  1912. (*----------------------------------------------------------------------------
  1913. | Internal canonical NaN format.
  1914. *----------------------------------------------------------------------------*)
  1915. type
  1916. commonNANT = packed record
  1917. sign : flag;
  1918. high, low : bits32;
  1919. end;
  1920. (*----------------------------------------------------------------------------
  1921. | The pattern for a default generated single-precision NaN.
  1922. *----------------------------------------------------------------------------*)
  1923. const float32_default_nan = $7FFFFFFF;
  1924. (*----------------------------------------------------------------------------
  1925. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1926. | otherwise returns 0.
  1927. *----------------------------------------------------------------------------*)
  1928. function float32_is_nan(a: float32): flag;
  1929. begin
  1930. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1931. end;
  1932. (*----------------------------------------------------------------------------
  1933. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1934. | NaN; otherwise returns 0.
  1935. *----------------------------------------------------------------------------*)
  1936. function float32_is_signaling_nan(a: float32):flag;
  1937. begin
  1938. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1939. end;
  1940. (*----------------------------------------------------------------------------
  1941. | Returns the result of converting the single-precision floating-point NaN
  1942. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1943. | exception is raised.
  1944. *----------------------------------------------------------------------------*)
  1945. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1946. var
  1947. z: commonNANT;
  1948. begin
  1949. if float32_is_signaling_nan(a)<>0 then
  1950. float_raise(float_flag_invalid);
  1951. z.sign := a shr 31;
  1952. z.low := 0;
  1953. z.high := a shl 9;
  1954. c:=z;
  1955. end;
  1956. (*----------------------------------------------------------------------------
  1957. | Returns the result of converting the canonical NaN `a' to the single-
  1958. | precision floating-point format.
  1959. *----------------------------------------------------------------------------*)
  1960. function CommonNanToFloat32(a : CommonNaNT): float32;
  1961. begin
  1962. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1963. end;
  1964. (*----------------------------------------------------------------------------
  1965. | Takes two single-precision floating-point values `a' and `b', one of which
  1966. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1967. | signaling NaN, the invalid exception is raised.
  1968. *----------------------------------------------------------------------------*)
  1969. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1970. var
  1971. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1972. begin
  1973. aIsNaN := float32_is_nan( a );
  1974. aIsSignalingNaN := float32_is_signaling_nan( a );
  1975. bIsNaN := float32_is_nan( b );
  1976. bIsSignalingNaN := float32_is_signaling_nan( b );
  1977. a := a or $00400000;
  1978. b := b or $00400000;
  1979. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1980. float_raise( float_flag_invalid );
  1981. if bIsSignalingNaN<>0 then
  1982. propagateFloat32Nan := b
  1983. else if aIsSignalingNan<>0 then
  1984. propagateFloat32Nan := a
  1985. else if bIsNan<>0 then
  1986. propagateFloat32Nan := b
  1987. else
  1988. propagateFloat32Nan := a;
  1989. end;
  1990. (*----------------------------------------------------------------------------
  1991. | The pattern for a default generated double-precision NaN. The `high' and
  1992. | `low' values hold the most- and least-significant bits, respectively.
  1993. *----------------------------------------------------------------------------*)
  1994. const
  1995. float64_default_nan_high = $7FFFFFFF;
  1996. float64_default_nan_low = $FFFFFFFF;
  1997. (*----------------------------------------------------------------------------
  1998. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  1999. | otherwise returns 0.
  2000. *----------------------------------------------------------------------------*)
  2001. function float64_is_nan(a: float64): flag;
  2002. begin
  2003. float64_is_nan := flag (
  2004. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2005. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2006. end;
  2007. (*----------------------------------------------------------------------------
  2008. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2009. | NaN; otherwise returns 0.
  2010. *----------------------------------------------------------------------------*)
  2011. function float64_is_signaling_nan( a:float64): flag;
  2012. begin
  2013. float64_is_signaling_nan := flag(
  2014. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2015. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2016. end;
  2017. (*----------------------------------------------------------------------------
  2018. | Returns the result of converting the double-precision floating-point NaN
  2019. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2020. | exception is raised.
  2021. *----------------------------------------------------------------------------*)
  2022. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2023. var
  2024. z : commonNaNT;
  2025. begin
  2026. if ( float64_is_signaling_nan( a )<>0 ) then
  2027. float_raise( float_flag_invalid );
  2028. z.sign := a.high shr 31;
  2029. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2030. c:=z;
  2031. end;
  2032. (*----------------------------------------------------------------------------
  2033. | Returns the result of converting the canonical NaN `a' to the double-
  2034. | precision floating-point format.
  2035. *----------------------------------------------------------------------------*)
  2036. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2037. var
  2038. z: float64;
  2039. begin
  2040. shift64Right( a.high, a.low, 12, z.high, z.low );
  2041. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2042. c:=z;
  2043. end;
  2044. (*----------------------------------------------------------------------------
  2045. | Takes two double-precision floating-point values `a' and `b', one of which
  2046. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2047. | signaling NaN, the invalid exception is raised.
  2048. *----------------------------------------------------------------------------*)
  2049. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2050. var
  2051. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2052. begin
  2053. aIsNaN := float64_is_nan( a );
  2054. aIsSignalingNaN := float64_is_signaling_nan( a );
  2055. bIsNaN := float64_is_nan( b );
  2056. bIsSignalingNaN := float64_is_signaling_nan( b );
  2057. a.high := a.high or $00080000;
  2058. b.high := b.high or $00080000;
  2059. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2060. float_raise( float_flag_invalid );
  2061. if bIsSignalingNaN<>0 then
  2062. c := b
  2063. else if aIsSignalingNan<>0 then
  2064. c := a
  2065. else if bIsNan<>0 then
  2066. c := b
  2067. else
  2068. c := a;
  2069. end;
  2070. {$ENDIF}
  2071. (****************************************************************************)
  2072. (* END ENDIAN SPECIFIC CODE *)
  2073. (****************************************************************************)
  2074. {*
  2075. -------------------------------------------------------------------------------
  2076. Returns the fraction bits of the single-precision floating-point value `a'.
  2077. -------------------------------------------------------------------------------
  2078. *}
  2079. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2080. Begin
  2081. ExtractFloat32Frac := A AND $007FFFFF;
  2082. End;
  2083. {*
  2084. -------------------------------------------------------------------------------
  2085. Returns the exponent bits of the single-precision floating-point value `a'.
  2086. -------------------------------------------------------------------------------
  2087. *}
  2088. Function extractFloat32Exp( a: float32 ): Int16;
  2089. Begin
  2090. extractFloat32Exp := (a shr 23) AND $FF;
  2091. End;
  2092. {*
  2093. -------------------------------------------------------------------------------
  2094. Returns the sign bit of the single-precision floating-point value `a'.
  2095. -------------------------------------------------------------------------------
  2096. *}
  2097. Function extractFloat32Sign( a: float32 ): Flag;
  2098. Begin
  2099. extractFloat32Sign := a shr 31;
  2100. End;
  2101. {*
  2102. -------------------------------------------------------------------------------
  2103. Normalizes the subnormal single-precision floating-point value represented
  2104. by the denormalized significand `aSig'. The normalized exponent and
  2105. significand are stored at the locations pointed to by `zExpPtr' and
  2106. `zSigPtr', respectively.
  2107. -------------------------------------------------------------------------------
  2108. *}
  2109. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2110. Var
  2111. ShiftCount : BYTE;
  2112. Begin
  2113. shiftCount := countLeadingZeros32( aSig ) - 8;
  2114. zSigPtr := aSig shl shiftCount;
  2115. zExpPtr := 1 - shiftCount;
  2116. End;
  2117. {*
  2118. -------------------------------------------------------------------------------
  2119. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2120. single-precision floating-point value, returning the result. After being
  2121. shifted into the proper positions, the three fields are simply added
  2122. together to form the result. This means that any integer portion of `zSig'
  2123. will be added into the exponent. Since a properly normalized significand
  2124. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2125. than the desired result exponent whenever `zSig' is a complete, normalized
  2126. significand.
  2127. -------------------------------------------------------------------------------
  2128. *}
  2129. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2130. Begin
  2131. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2132. + zSig;
  2133. End;
  2134. {*
  2135. -------------------------------------------------------------------------------
  2136. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2137. and significand `zSig', and returns the proper single-precision floating-
  2138. point value corresponding to the abstract input. Ordinarily, the abstract
  2139. value is simply rounded and packed into the single-precision format, with
  2140. the inexact exception raised if the abstract input cannot be represented
  2141. exactly. However, if the abstract value is too large, the overflow and
  2142. inexact exceptions are raised and an infinity or maximal finite value is
  2143. returned. If the abstract value is too small, the input value is rounded to
  2144. a subnormal number, and the underflow and inexact exceptions are raised if
  2145. the abstract input cannot be represented exactly as a subnormal single-
  2146. precision floating-point number.
  2147. The input significand `zSig' has its binary point between bits 30
  2148. and 29, which is 7 bits to the left of the usual location. This shifted
  2149. significand must be normalized or smaller. If `zSig' is not normalized,
  2150. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2151. and it must not require rounding. In the usual case that `zSig' is
  2152. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2153. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2154. Binary Floating-Point Arithmetic.
  2155. -------------------------------------------------------------------------------
  2156. *}
  2157. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2158. Var
  2159. roundingMode : BYTE;
  2160. roundNearestEven : Flag;
  2161. roundIncrement, roundBits : BYTE;
  2162. IsTiny : Flag;
  2163. Begin
  2164. roundingMode := softfloat_rounding_mode;
  2165. if (roundingMode = float_round_nearest_even) then
  2166. Begin
  2167. roundNearestEven := Flag(TRUE);
  2168. end
  2169. else
  2170. roundNearestEven := Flag(FALSE);
  2171. roundIncrement := $40;
  2172. if ( Boolean(roundNearestEven) = FALSE) then
  2173. Begin
  2174. if ( roundingMode = float_round_to_zero ) Then
  2175. Begin
  2176. roundIncrement := 0;
  2177. End
  2178. else
  2179. Begin
  2180. roundIncrement := $7F;
  2181. if ( zSign <> 0 ) then
  2182. Begin
  2183. if roundingMode = float_round_up then roundIncrement := 0;
  2184. End
  2185. else
  2186. Begin
  2187. if roundingMode = float_round_down then roundIncrement := 0;
  2188. End;
  2189. End
  2190. End;
  2191. roundBits := zSig AND $7F;
  2192. if ($FD <= bits16 (zExp) ) then
  2193. Begin
  2194. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2195. Begin
  2196. float_raise( float_flag_overflow OR float_flag_inexact );
  2197. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2198. exit;
  2199. End;
  2200. if ( zExp < 0 ) then
  2201. Begin
  2202. isTiny :=
  2203. flag(( float_detect_tininess = float_tininess_before_rounding )
  2204. OR ( zExp < -1 )
  2205. OR ( (zSig + roundIncrement) < $80000000 ));
  2206. shift32RightJamming( zSig, - zExp, zSig );
  2207. zExp := 0;
  2208. roundBits := zSig AND $7F;
  2209. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2210. float_raise( float_flag_underflow );
  2211. End;
  2212. End;
  2213. if ( roundBits )<> 0 then
  2214. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2215. zSig := ( zSig + roundIncrement ) shr 7;
  2216. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2217. if ( zSig = 0 ) then zExp := 0;
  2218. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2219. exit;
  2220. End;
  2221. {*
  2222. -------------------------------------------------------------------------------
  2223. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2224. and significand `zSig', and returns the proper single-precision floating-
  2225. point value corresponding to the abstract input. This routine is just like
  2226. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2227. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2228. floating-point exponent.
  2229. -------------------------------------------------------------------------------
  2230. *}
  2231. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2232. Var
  2233. ShiftCount : int8;
  2234. Begin
  2235. shiftCount := countLeadingZeros32( zSig ) - 1;
  2236. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2237. End;
  2238. {*
  2239. -------------------------------------------------------------------------------
  2240. Returns the most-significant 20 fraction bits of the double-precision
  2241. floating-point value `a'.
  2242. -------------------------------------------------------------------------------
  2243. *}
  2244. Function extractFloat64Frac0(a: float64): bits32;
  2245. Begin
  2246. extractFloat64Frac0 := a.high and $000FFFFF;
  2247. End;
  2248. {*
  2249. -------------------------------------------------------------------------------
  2250. Returns the least-significant 32 fraction bits of the double-precision
  2251. floating-point value `a'.
  2252. -------------------------------------------------------------------------------
  2253. *}
  2254. Function extractFloat64Frac1(a: float64): bits32;
  2255. Begin
  2256. extractFloat64Frac1 := a.low;
  2257. End;
  2258. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2259. Function extractFloat64Frac(a: float64): bits64;
  2260. Begin
  2261. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2262. End;
  2263. {*
  2264. -------------------------------------------------------------------------------
  2265. Returns the exponent bits of the double-precision floating-point value `a'.
  2266. -------------------------------------------------------------------------------
  2267. *}
  2268. Function extractFloat64Exp(a: float64): int16;
  2269. Begin
  2270. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2271. End;
  2272. {*
  2273. -------------------------------------------------------------------------------
  2274. Returns the sign bit of the double-precision floating-point value `a'.
  2275. -------------------------------------------------------------------------------
  2276. *}
  2277. Function extractFloat64Sign(a: float64) : flag;
  2278. Begin
  2279. extractFloat64Sign := a.high shr 31;
  2280. End;
  2281. {*
  2282. -------------------------------------------------------------------------------
  2283. Normalizes the subnormal double-precision floating-point value represented
  2284. by the denormalized significand formed by the concatenation of `aSig0' and
  2285. `aSig1'. The normalized exponent is stored at the location pointed to by
  2286. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2287. stored at the location pointed to by `zSig0Ptr', and the least significant
  2288. 32 bits of the normalized significand are stored at the location pointed to
  2289. by `zSig1Ptr'.
  2290. -------------------------------------------------------------------------------
  2291. *}
  2292. Procedure normalizeFloat64Subnormal(
  2293. aSig0: bits32;
  2294. aSig1: bits32;
  2295. VAR zExpPtr : Int16;
  2296. VAR zSig0Ptr : Bits32;
  2297. VAR zSig1Ptr : Bits32
  2298. );
  2299. Var
  2300. ShiftCount : Int8;
  2301. Begin
  2302. if ( aSig0 = 0 ) then
  2303. Begin
  2304. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2305. if ( shiftCount < 0 ) then
  2306. Begin
  2307. zSig0Ptr := aSig1 shr ( - shiftCount );
  2308. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2309. End
  2310. else
  2311. Begin
  2312. zSig0Ptr := aSig1 shl shiftCount;
  2313. zSig1Ptr := 0;
  2314. End;
  2315. zExpPtr := - shiftCount - 31;
  2316. End
  2317. else
  2318. Begin
  2319. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2320. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2321. zExpPtr := 1 - shiftCount;
  2322. End;
  2323. End;
  2324. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2325. var
  2326. shiftCount : int8;
  2327. begin
  2328. shiftCount := countLeadingZeros64( aSig ) - 11;
  2329. zSigPtr := aSig shl shiftCount;
  2330. zExpPtr := 1 - shiftCount;
  2331. end;
  2332. {*
  2333. -------------------------------------------------------------------------------
  2334. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2335. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2336. point value, returning the result. After being shifted into the proper
  2337. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2338. together to form the most significant 32 bits of the result. This means
  2339. that any integer portion of `zSig0' will be added into the exponent. Since
  2340. a properly normalized significand will have an integer portion equal to 1,
  2341. the `zExp' input should be 1 less than the desired result exponent whenever
  2342. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2343. -------------------------------------------------------------------------------
  2344. *}
  2345. Procedure
  2346. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2347. var
  2348. z: Float64;
  2349. Begin
  2350. z.low := zSig1;
  2351. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2352. c := z;
  2353. End;
  2354. {*----------------------------------------------------------------------------
  2355. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2356. | double-precision floating-point value, returning the result. After being
  2357. | shifted into the proper positions, the three fields are simply added
  2358. | together to form the result. This means that any integer portion of `zSig'
  2359. | will be added into the exponent. Since a properly normalized significand
  2360. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2361. | than the desired result exponent whenever `zSig' is a complete, normalized
  2362. | significand.
  2363. *----------------------------------------------------------------------------*}
  2364. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2365. begin
  2366. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2367. end;
  2368. {*
  2369. -------------------------------------------------------------------------------
  2370. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2371. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2372. and `zSig2', and returns the proper double-precision floating-point value
  2373. corresponding to the abstract input. Ordinarily, the abstract value is
  2374. simply rounded and packed into the double-precision format, with the inexact
  2375. exception raised if the abstract input cannot be represented exactly.
  2376. However, if the abstract value is too large, the overflow and inexact
  2377. exceptions are raised and an infinity or maximal finite value is returned.
  2378. If the abstract value is too small, the input value is rounded to a
  2379. subnormal number, and the underflow and inexact exceptions are raised if the
  2380. abstract input cannot be represented exactly as a subnormal double-precision
  2381. floating-point number.
  2382. The input significand must be normalized or smaller. If the input
  2383. significand is not normalized, `zExp' must be 0; in that case, the result
  2384. returned is a subnormal number, and it must not require rounding. In the
  2385. usual case that the input significand is normalized, `zExp' must be 1 less
  2386. than the ``true'' floating-point exponent. The handling of underflow and
  2387. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2388. -------------------------------------------------------------------------------
  2389. *}
  2390. Procedure
  2391. roundAndPackFloat64(
  2392. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2393. Var
  2394. roundingMode : Int8;
  2395. roundNearestEven, increment, isTiny : Flag;
  2396. Begin
  2397. roundingMode := softfloat_rounding_mode;
  2398. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2399. increment := flag( sbits32 (zSig2) < 0 );
  2400. if ( roundNearestEven = flag(FALSE) ) then
  2401. Begin
  2402. if ( roundingMode = float_round_to_zero ) then
  2403. increment := 0
  2404. else
  2405. Begin
  2406. if ( zSign )<> 0 then
  2407. Begin
  2408. increment := flag( roundingMode = float_round_down ) and zSig2;
  2409. End
  2410. else
  2411. Begin
  2412. increment := flag( roundingMode = float_round_up ) and zSig2;
  2413. End
  2414. End
  2415. End;
  2416. if ( $7FD <= bits16 (zExp) ) then
  2417. Begin
  2418. if (( $7FD < zExp )
  2419. or (( zExp = $7FD )
  2420. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2421. and (increment<>0)
  2422. )
  2423. ) then
  2424. Begin
  2425. float_raise( float_flag_overflow OR float_flag_inexact );
  2426. if (( roundingMode = float_round_to_zero )
  2427. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2428. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2429. ) then
  2430. Begin
  2431. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2432. exit;
  2433. End;
  2434. packFloat64( zSign, $7FF, 0, 0, c );
  2435. exit;
  2436. End;
  2437. if ( zExp < 0 ) then
  2438. Begin
  2439. isTiny :=
  2440. flag( float_detect_tininess = float_tininess_before_rounding )
  2441. or flag( zExp < -1 )
  2442. or flag(increment = 0)
  2443. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2444. shift64ExtraRightJamming(
  2445. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2446. zExp := 0;
  2447. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2448. if ( roundNearestEven )<>0 then
  2449. Begin
  2450. increment := flag( sbits32 (zSig2) < 0 );
  2451. End
  2452. else
  2453. Begin
  2454. if ( zSign )<>0 then
  2455. Begin
  2456. increment := flag( roundingMode = float_round_down ) and zSig2;
  2457. End
  2458. else
  2459. Begin
  2460. increment := flag( roundingMode = float_round_up ) and zSig2;
  2461. End
  2462. End;
  2463. End;
  2464. End;
  2465. if ( zSig2 )<>0 then
  2466. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2467. if ( increment )<>0 then
  2468. Begin
  2469. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2470. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2471. End
  2472. else
  2473. Begin
  2474. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2475. End;
  2476. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2477. End;
  2478. {*----------------------------------------------------------------------------
  2479. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2480. | and significand `zSig', and returns the proper double-precision floating-
  2481. | point value corresponding to the abstract input. Ordinarily, the abstract
  2482. | value is simply rounded and packed into the double-precision format, with
  2483. | the inexact exception raised if the abstract input cannot be represented
  2484. | exactly. However, if the abstract value is too large, the overflow and
  2485. | inexact exceptions are raised and an infinity or maximal finite value is
  2486. | returned. If the abstract value is too small, the input value is rounded
  2487. | to a subnormal number, and the underflow and inexact exceptions are raised
  2488. | if the abstract input cannot be represented exactly as a subnormal double-
  2489. | precision floating-point number.
  2490. | The input significand `zSig' has its binary point between bits 62
  2491. | and 61, which is 10 bits to the left of the usual location. This shifted
  2492. | significand must be normalized or smaller. If `zSig' is not normalized,
  2493. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2494. | and it must not require rounding. In the usual case that `zSig' is
  2495. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2496. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2497. | Binary Floating-Point Arithmetic.
  2498. *----------------------------------------------------------------------------*}
  2499. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2500. var
  2501. roundingMode: int8;
  2502. roundNearestEven: flag;
  2503. roundIncrement, roundBits: int16;
  2504. isTiny: flag;
  2505. begin
  2506. roundingMode := softfloat_rounding_mode;
  2507. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2508. roundIncrement := $200;
  2509. if ( roundNearestEven=0 ) then
  2510. begin
  2511. if ( roundingMode = float_round_to_zero ) then
  2512. begin
  2513. roundIncrement := 0;
  2514. end
  2515. else begin
  2516. roundIncrement := $3FF;
  2517. if ( zSign<>0 ) then
  2518. begin
  2519. if ( roundingMode = float_round_up ) then
  2520. roundIncrement := 0;
  2521. end
  2522. else begin
  2523. if ( roundingMode = float_round_down ) then
  2524. roundIncrement := 0;
  2525. end
  2526. end
  2527. end;
  2528. roundBits := zSig and $3FF;
  2529. if ( $7FD <= bits16(zExp) ) then
  2530. begin
  2531. if ( ( $7FD < zExp )
  2532. or ( ( zExp = $7FD )
  2533. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2534. ) then
  2535. begin
  2536. float_raise( float_flag_overflow or float_flag_inexact );
  2537. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2538. exit;
  2539. end;
  2540. if ( zExp < 0 ) then
  2541. begin
  2542. isTiny := ord(
  2543. ( float_detect_tininess = float_tininess_before_rounding )
  2544. or ( zExp < -1 )
  2545. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2546. shift64RightJamming( zSig, - zExp, zSig );
  2547. zExp := 0;
  2548. roundBits := zSig and $3FF;
  2549. if ( isTiny and roundBits )<>0 then
  2550. float_raise( float_flag_underflow );
  2551. end
  2552. end;
  2553. if ( roundBits<>0 ) then
  2554. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2555. zSig := ( zSig + roundIncrement ) shr 10;
  2556. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2557. if ( zSig = 0 ) then
  2558. zExp := 0;
  2559. result:=packFloat64( zSign, zExp, zSig );
  2560. end;
  2561. {*
  2562. -------------------------------------------------------------------------------
  2563. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2564. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2565. returns the proper double-precision floating-point value corresponding
  2566. to the abstract input. This routine is just like `roundAndPackFloat64'
  2567. except that the input significand has fewer bits and does not have to be
  2568. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2569. point exponent.
  2570. -------------------------------------------------------------------------------
  2571. *}
  2572. Procedure
  2573. normalizeRoundAndPackFloat64(
  2574. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2575. Var
  2576. shiftCount : int8;
  2577. zSig2 : bits32;
  2578. Begin
  2579. if ( zSig0 = 0 ) then
  2580. Begin
  2581. zSig0 := zSig1;
  2582. zSig1 := 0;
  2583. zExp := zExp -32;
  2584. End;
  2585. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2586. if ( 0 <= shiftCount ) then
  2587. Begin
  2588. zSig2 := 0;
  2589. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2590. End
  2591. else
  2592. Begin
  2593. shift64ExtraRightJamming
  2594. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2595. End;
  2596. zExp := zExp - shiftCount;
  2597. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2598. End;
  2599. {*
  2600. -------------------------------------------------------------------------------
  2601. Returns the result of converting the 32-bit two's complement integer `a' to
  2602. the single-precision floating-point format. The conversion is performed
  2603. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2604. -------------------------------------------------------------------------------
  2605. *}
  2606. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2607. Var
  2608. zSign : Flag;
  2609. Begin
  2610. if ( a = 0 ) then
  2611. Begin
  2612. int32_to_float32.float32 := 0;
  2613. exit;
  2614. End;
  2615. if ( a = sbits32 ($80000000) ) then
  2616. Begin
  2617. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2618. exit;
  2619. end;
  2620. zSign := flag( a < 0 );
  2621. If zSign<>0 then
  2622. a := -a;
  2623. int32_to_float32.float32:=
  2624. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2625. End;
  2626. {*
  2627. -------------------------------------------------------------------------------
  2628. Returns the result of converting the 32-bit two's complement integer `a' to
  2629. the double-precision floating-point format. The conversion is performed
  2630. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2631. -------------------------------------------------------------------------------
  2632. *}
  2633. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2634. var
  2635. zSign : flag;
  2636. absA : bits32;
  2637. shiftCount : int8;
  2638. zSig0, zSig1 : bits32;
  2639. Begin
  2640. if ( a = 0 ) then
  2641. Begin
  2642. packFloat64( 0, 0, 0, 0, result );
  2643. exit;
  2644. end;
  2645. zSign := flag( a < 0 );
  2646. if ZSign<>0 then
  2647. AbsA := -a
  2648. else
  2649. AbsA := a;
  2650. shiftCount := countLeadingZeros32( absA ) - 11;
  2651. if ( 0 <= shiftCount ) then
  2652. Begin
  2653. zSig0 := absA shl shiftCount;
  2654. zSig1 := 0;
  2655. End
  2656. else
  2657. Begin
  2658. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2659. End;
  2660. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2661. End;
  2662. {*
  2663. -------------------------------------------------------------------------------
  2664. Returns the result of converting the single-precision floating-point value
  2665. `a' to the 32-bit two's complement integer format. The conversion is
  2666. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2667. Arithmetic---which means in particular that the conversion is rounded
  2668. according to the current rounding mode. If `a' is a NaN, the largest
  2669. positive integer is returned. Otherwise, if the conversion overflows, the
  2670. largest integer with the same sign as `a' is returned.
  2671. -------------------------------------------------------------------------------
  2672. *}
  2673. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2674. Var
  2675. aSign: flag;
  2676. aExp, shiftCount: int16;
  2677. aSig, aSigExtra: bits32;
  2678. z: int32;
  2679. roundingMode: int8;
  2680. Begin
  2681. aSig := extractFloat32Frac( a.float32 );
  2682. aExp := extractFloat32Exp( a.float32 );
  2683. aSign := extractFloat32Sign( a.float32 );
  2684. shiftCount := aExp - $96;
  2685. if ( 0 <= shiftCount ) then
  2686. Begin
  2687. if ( $9E <= aExp ) then
  2688. Begin
  2689. if ( a.float32 <> $CF000000 ) then
  2690. Begin
  2691. float_raise( float_flag_invalid );
  2692. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2693. Begin
  2694. float32_to_int32 := $7FFFFFFF;
  2695. exit;
  2696. End;
  2697. End;
  2698. float32_to_int32 := sbits32 ($80000000);
  2699. exit;
  2700. End;
  2701. z := ( aSig or $00800000 ) shl shiftCount;
  2702. if ( aSign<>0 ) then z := - z;
  2703. End
  2704. else
  2705. Begin
  2706. if ( aExp < $7E ) then
  2707. Begin
  2708. aSigExtra := aExp OR aSig;
  2709. z := 0;
  2710. End
  2711. else
  2712. Begin
  2713. aSig := aSig OR $00800000;
  2714. aSigExtra := aSig shl ( shiftCount and 31 );
  2715. z := aSig shr ( - shiftCount );
  2716. End;
  2717. if ( aSigExtra<>0 ) then
  2718. softfloat_exception_flags := softfloat_exception_flags
  2719. or float_flag_inexact;
  2720. roundingMode := softfloat_rounding_mode;
  2721. if ( roundingMode = float_round_nearest_even ) then
  2722. Begin
  2723. if ( sbits32 (aSigExtra) < 0 ) then
  2724. Begin
  2725. Inc(z);
  2726. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2727. z := z and not 1;
  2728. End;
  2729. if ( aSign<>0 ) then
  2730. z := - z;
  2731. End
  2732. else
  2733. Begin
  2734. aSigExtra := flag( aSigExtra <> 0 );
  2735. if ( aSign<>0 ) then
  2736. Begin
  2737. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2738. z := - z;
  2739. End
  2740. else
  2741. Begin
  2742. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2743. End
  2744. End;
  2745. End;
  2746. float32_to_int32 := z;
  2747. End;
  2748. {*
  2749. -------------------------------------------------------------------------------
  2750. Returns the result of converting the single-precision floating-point value
  2751. `a' to the 32-bit two's complement integer format. The conversion is
  2752. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2753. Arithmetic, except that the conversion is always rounded toward zero.
  2754. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2755. the conversion overflows, the largest integer with the same sign as `a' is
  2756. returned.
  2757. -------------------------------------------------------------------------------
  2758. *}
  2759. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2760. Var
  2761. aSign : flag;
  2762. aExp, shiftCount : int16;
  2763. aSig : bits32;
  2764. z : int32;
  2765. Begin
  2766. aSig := extractFloat32Frac( a.float32 );
  2767. aExp := extractFloat32Exp( a.float32 );
  2768. aSign := extractFloat32Sign( a.float32 );
  2769. shiftCount := aExp - $9E;
  2770. if ( 0 <= shiftCount ) then
  2771. Begin
  2772. if ( a.float32 <> $CF000000 ) then
  2773. Begin
  2774. float_raise( float_flag_invalid );
  2775. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2776. Begin
  2777. float32_to_int32_round_to_zero := $7FFFFFFF;
  2778. exit;
  2779. end;
  2780. End;
  2781. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2782. exit;
  2783. End
  2784. else
  2785. if ( aExp <= $7E ) then
  2786. Begin
  2787. if ( aExp or aSig )<>0 then
  2788. softfloat_exception_flags :=
  2789. softfloat_exception_flags or float_flag_inexact;
  2790. float32_to_int32_round_to_zero := 0;
  2791. exit;
  2792. End;
  2793. aSig := ( aSig or $00800000 ) shl 8;
  2794. z := aSig shr ( - shiftCount );
  2795. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2796. Begin
  2797. softfloat_exception_flags :=
  2798. softfloat_exception_flags or float_flag_inexact;
  2799. End;
  2800. if ( aSign<>0 ) then z := - z;
  2801. float32_to_int32_round_to_zero := z;
  2802. End;
  2803. {*
  2804. -------------------------------------------------------------------------------
  2805. Returns the result of converting the single-precision floating-point value
  2806. `a' to the double-precision floating-point format. The conversion is
  2807. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2808. Arithmetic.
  2809. -------------------------------------------------------------------------------
  2810. *}
  2811. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2812. Var
  2813. aSign : flag;
  2814. aExp : int16;
  2815. aSig, zSig0, zSig1: bits32;
  2816. tmp : CommonNanT;
  2817. Begin
  2818. aSig := extractFloat32Frac( a.float32 );
  2819. aExp := extractFloat32Exp( a.float32 );
  2820. aSign := extractFloat32Sign( a.float32 );
  2821. if ( aExp = $FF ) then
  2822. Begin
  2823. if ( aSig<>0 ) then
  2824. Begin
  2825. float32ToCommonNaN(a.float32, tmp);
  2826. commonNaNToFloat64(tmp , result);
  2827. exit;
  2828. End;
  2829. packFloat64( aSign, $7FF, 0, 0, result);
  2830. exit;
  2831. End;
  2832. if ( aExp = 0 ) then
  2833. Begin
  2834. if ( aSig = 0 ) then
  2835. Begin
  2836. packFloat64( aSign, 0, 0, 0, result );
  2837. exit;
  2838. end;
  2839. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2840. Dec(aExp);
  2841. End;
  2842. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2843. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2844. End;
  2845. {*
  2846. -------------------------------------------------------------------------------
  2847. Rounds the single-precision floating-point value `a' to an integer,
  2848. and returns the result as a single-precision floating-point value. The
  2849. operation is performed according to the IEC/IEEE Standard for Binary
  2850. Floating-Point Arithmetic.
  2851. -------------------------------------------------------------------------------
  2852. *}
  2853. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2854. Var
  2855. aSign: flag;
  2856. aExp: int16;
  2857. lastBitMask, roundBitsMask: bits32;
  2858. roundingMode: int8;
  2859. z: float32;
  2860. Begin
  2861. aExp := extractFloat32Exp( a.float32 );
  2862. if ( $96 <= aExp ) then
  2863. Begin
  2864. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2865. Begin
  2866. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2867. exit;
  2868. End;
  2869. float32_round_to_int:=a;
  2870. exit;
  2871. End;
  2872. if ( aExp <= $7E ) then
  2873. Begin
  2874. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2875. Begin
  2876. float32_round_to_int:=a;
  2877. exit;
  2878. end;
  2879. softfloat_exception_flags
  2880. := softfloat_exception_flags OR float_flag_inexact;
  2881. aSign := extractFloat32Sign( a.float32 );
  2882. case ( softfloat_rounding_mode ) of
  2883. float_round_nearest_even:
  2884. Begin
  2885. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2886. Begin
  2887. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2888. exit;
  2889. End;
  2890. End;
  2891. float_round_down:
  2892. Begin
  2893. if aSign <> 0 then
  2894. float32_round_to_int.float32 := $BF800000
  2895. else
  2896. float32_round_to_int.float32 := 0;
  2897. exit;
  2898. End;
  2899. float_round_up:
  2900. Begin
  2901. if aSign <> 0 then
  2902. float32_round_to_int.float32 := $80000000
  2903. else
  2904. float32_round_to_int.float32 := $3F800000;
  2905. exit;
  2906. End;
  2907. end;
  2908. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2909. End;
  2910. lastBitMask := 1;
  2911. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2912. lastBitMask := lastBitMask shl ($96 - aExp);
  2913. roundBitsMask := lastBitMask - 1;
  2914. z := a.float32;
  2915. roundingMode := softfloat_rounding_mode;
  2916. if ( roundingMode = float_round_nearest_even ) then
  2917. Begin
  2918. z := z + (lastBitMask shr 1);
  2919. if ( ( z and roundBitsMask ) = 0 ) then
  2920. z := z and not lastBitMask;
  2921. End
  2922. else if ( roundingMode <> float_round_to_zero ) then
  2923. Begin
  2924. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2925. Begin
  2926. z := z + roundBitsMask;
  2927. End;
  2928. End;
  2929. z := z and not roundBitsMask;
  2930. if ( z <> a.float32 ) then
  2931. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2932. float32_round_to_int.float32 := z;
  2933. End;
  2934. {*
  2935. -------------------------------------------------------------------------------
  2936. Returns the result of adding the absolute values of the single-precision
  2937. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2938. before being returned. `zSign' is ignored if the result is a NaN.
  2939. The addition is performed according to the IEC/IEEE Standard for Binary
  2940. Floating-Point Arithmetic.
  2941. -------------------------------------------------------------------------------
  2942. *}
  2943. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2944. Var
  2945. aExp, bExp, zExp: int16;
  2946. aSig, bSig, zSig: bits32;
  2947. expDiff: int16;
  2948. label roundAndPack;
  2949. Begin
  2950. aSig:=extractFloat32Frac( a );
  2951. aExp:=extractFloat32Exp( a );
  2952. bSig:=extractFloat32Frac( b );
  2953. bExp := extractFloat32Exp( b );
  2954. expDiff := aExp - bExp;
  2955. aSig := aSig shl 6;
  2956. bSig := bSig shl 6;
  2957. if ( 0 < expDiff ) then
  2958. Begin
  2959. if ( aExp = $FF ) then
  2960. Begin
  2961. if ( aSig <> 0) then
  2962. Begin
  2963. addFloat32Sigs := propagateFloat32NaN( a, b );
  2964. exit;
  2965. End;
  2966. addFloat32Sigs := a;
  2967. exit;
  2968. End;
  2969. if ( bExp = 0 ) then
  2970. Begin
  2971. Dec(expDiff);
  2972. End
  2973. else
  2974. Begin
  2975. bSig := bSig or $20000000;
  2976. End;
  2977. shift32RightJamming( bSig, expDiff, bSig );
  2978. zExp := aExp;
  2979. End
  2980. else
  2981. If ( expDiff < 0 ) then
  2982. Begin
  2983. if ( bExp = $FF ) then
  2984. Begin
  2985. if ( bSig<>0 ) then
  2986. Begin
  2987. addFloat32Sigs := propagateFloat32NaN( a, b );
  2988. exit;
  2989. end;
  2990. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2991. exit;
  2992. End;
  2993. if ( aExp = 0 ) then
  2994. Begin
  2995. Inc(expDiff);
  2996. End
  2997. else
  2998. Begin
  2999. aSig := aSig OR $20000000;
  3000. End;
  3001. shift32RightJamming( aSig, - expDiff, aSig );
  3002. zExp := bExp;
  3003. End
  3004. else
  3005. Begin
  3006. if ( aExp = $FF ) then
  3007. Begin
  3008. if ( aSig OR bSig )<> 0 then
  3009. Begin
  3010. addFloat32Sigs := propagateFloat32NaN( a, b );
  3011. exit;
  3012. end;
  3013. addFloat32Sigs := a;
  3014. exit;
  3015. End;
  3016. if ( aExp = 0 ) then
  3017. Begin
  3018. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3019. exit;
  3020. end;
  3021. zSig := $40000000 + aSig + bSig;
  3022. zExp := aExp;
  3023. goto roundAndPack;
  3024. End;
  3025. aSig := aSig OR $20000000;
  3026. zSig := ( aSig + bSig ) shl 1;
  3027. Dec(zExp);
  3028. if ( sbits32 (zSig) < 0 ) then
  3029. Begin
  3030. zSig := aSig + bSig;
  3031. Inc(zExp);
  3032. End;
  3033. roundAndPack:
  3034. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3035. End;
  3036. {*
  3037. -------------------------------------------------------------------------------
  3038. Returns the result of subtracting the absolute values of the single-
  3039. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3040. difference is negated before being returned. `zSign' is ignored if the
  3041. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3042. Standard for Binary Floating-Point Arithmetic.
  3043. -------------------------------------------------------------------------------
  3044. *}
  3045. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3046. Var
  3047. aExp, bExp, zExp: int16;
  3048. aSig, bSig, zSig: bits32;
  3049. expDiff : int16;
  3050. label aExpBigger;
  3051. label bExpBigger;
  3052. label aBigger;
  3053. label bBigger;
  3054. label normalizeRoundAndPack;
  3055. Begin
  3056. aSig := extractFloat32Frac( a );
  3057. aExp := extractFloat32Exp( a );
  3058. bSig := extractFloat32Frac( b );
  3059. bExp := extractFloat32Exp( b );
  3060. expDiff := aExp - bExp;
  3061. aSig := aSig shl 7;
  3062. bSig := bSig shl 7;
  3063. if ( 0 < expDiff ) then goto aExpBigger;
  3064. if ( expDiff < 0 ) then goto bExpBigger;
  3065. if ( aExp = $FF ) then
  3066. Begin
  3067. if ( aSig OR bSig )<> 0 then
  3068. Begin
  3069. subFloat32Sigs := propagateFloat32NaN( a, b );
  3070. exit;
  3071. End;
  3072. float_raise( float_flag_invalid );
  3073. subFloat32Sigs := float32_default_nan;
  3074. exit;
  3075. End;
  3076. if ( aExp = 0 ) then
  3077. Begin
  3078. aExp := 1;
  3079. bExp := 1;
  3080. End;
  3081. if ( bSig < aSig ) Then goto aBigger;
  3082. if ( aSig < bSig ) Then goto bBigger;
  3083. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3084. exit;
  3085. bExpBigger:
  3086. if ( bExp = $FF ) then
  3087. Begin
  3088. if ( bSig<>0 ) then
  3089. Begin
  3090. subFloat32Sigs := propagateFloat32NaN( a, b );
  3091. exit;
  3092. End;
  3093. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3094. exit;
  3095. End;
  3096. if ( aExp = 0 ) then
  3097. Begin
  3098. Inc(expDiff);
  3099. End
  3100. else
  3101. Begin
  3102. aSig := aSig OR $40000000;
  3103. End;
  3104. shift32RightJamming( aSig, - expDiff, aSig );
  3105. bSig := bSig OR $40000000;
  3106. bBigger:
  3107. zSig := bSig - aSig;
  3108. zExp := bExp;
  3109. zSign := zSign xor 1;
  3110. goto normalizeRoundAndPack;
  3111. aExpBigger:
  3112. if ( aExp = $FF ) then
  3113. Begin
  3114. if ( aSig <> 0) then
  3115. Begin
  3116. subFloat32Sigs := propagateFloat32NaN( a, b );
  3117. exit;
  3118. End;
  3119. subFloat32Sigs := a;
  3120. exit;
  3121. End;
  3122. if ( bExp = 0 ) then
  3123. Begin
  3124. Dec(expDiff);
  3125. End
  3126. else
  3127. Begin
  3128. bSig := bSig OR $40000000;
  3129. End;
  3130. shift32RightJamming( bSig, expDiff, bSig );
  3131. aSig := aSig OR $40000000;
  3132. aBigger:
  3133. zSig := aSig - bSig;
  3134. zExp := aExp;
  3135. normalizeRoundAndPack:
  3136. Dec(zExp);
  3137. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3138. End;
  3139. {*
  3140. -------------------------------------------------------------------------------
  3141. Returns the result of adding the single-precision floating-point values `a'
  3142. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3143. Binary Floating-Point Arithmetic.
  3144. -------------------------------------------------------------------------------
  3145. *}
  3146. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3147. Var
  3148. aSign, bSign: Flag;
  3149. Begin
  3150. aSign := extractFloat32Sign( a.float32 );
  3151. bSign := extractFloat32Sign( b.float32 );
  3152. if ( aSign = bSign ) then
  3153. Begin
  3154. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3155. End
  3156. else
  3157. Begin
  3158. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3159. End;
  3160. End;
  3161. {*
  3162. -------------------------------------------------------------------------------
  3163. Returns the result of subtracting the single-precision floating-point values
  3164. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3165. for Binary Floating-Point Arithmetic.
  3166. -------------------------------------------------------------------------------
  3167. *}
  3168. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3169. Var
  3170. aSign, bSign: flag;
  3171. Begin
  3172. aSign := extractFloat32Sign( a.float32 );
  3173. bSign := extractFloat32Sign( b.float32 );
  3174. if ( aSign = bSign ) then
  3175. Begin
  3176. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3177. End
  3178. else
  3179. Begin
  3180. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3181. End;
  3182. End;
  3183. {*
  3184. -------------------------------------------------------------------------------
  3185. Returns the result of multiplying the single-precision floating-point values
  3186. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3187. for Binary Floating-Point Arithmetic.
  3188. -------------------------------------------------------------------------------
  3189. *}
  3190. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3191. Var
  3192. aSign, bSign, zSign: flag;
  3193. aExp, bExp, zExp : int16;
  3194. aSig, bSig, zSig0, zSig1: bits32;
  3195. Begin
  3196. aSig := extractFloat32Frac( a.float32 );
  3197. aExp := extractFloat32Exp( a.float32 );
  3198. aSign := extractFloat32Sign( a.float32 );
  3199. bSig := extractFloat32Frac( b.float32 );
  3200. bExp := extractFloat32Exp( b.float32 );
  3201. bSign := extractFloat32Sign( b.float32 );
  3202. zSign := aSign xor bSign;
  3203. if ( aExp = $FF ) then
  3204. Begin
  3205. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3206. Begin
  3207. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3208. End;
  3209. if ( ( bExp OR bSig ) = 0 ) then
  3210. Begin
  3211. float_raise( float_flag_invalid );
  3212. float32_mul.float32 := float32_default_nan;
  3213. exit;
  3214. End;
  3215. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3216. exit;
  3217. End;
  3218. if ( bExp = $FF ) then
  3219. Begin
  3220. if ( bSig <> 0 ) then
  3221. Begin
  3222. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3223. exit;
  3224. End;
  3225. if ( ( aExp OR aSig ) = 0 ) then
  3226. Begin
  3227. float_raise( float_flag_invalid );
  3228. float32_mul.float32 := float32_default_nan;
  3229. exit;
  3230. End;
  3231. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3232. exit;
  3233. End;
  3234. if ( aExp = 0 ) then
  3235. Begin
  3236. if ( aSig = 0 ) then
  3237. Begin
  3238. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3239. exit;
  3240. End;
  3241. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3242. End;
  3243. if ( bExp = 0 ) then
  3244. Begin
  3245. if ( bSig = 0 ) then
  3246. Begin
  3247. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3248. exit;
  3249. End;
  3250. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3251. End;
  3252. zExp := aExp + bExp - $7F;
  3253. aSig := ( aSig OR $00800000 ) shl 7;
  3254. bSig := ( bSig OR $00800000 ) shl 8;
  3255. mul32To64( aSig, bSig, zSig0, zSig1 );
  3256. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3257. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3258. Begin
  3259. zSig0 := zSig0 shl 1;
  3260. Dec(zExp);
  3261. End;
  3262. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3263. End;
  3264. {*
  3265. -------------------------------------------------------------------------------
  3266. Returns the result of dividing the single-precision floating-point value `a'
  3267. by the corresponding value `b'. The operation is performed according to the
  3268. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3269. -------------------------------------------------------------------------------
  3270. *}
  3271. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3272. Var
  3273. aSign, bSign, zSign: flag;
  3274. aExp, bExp, zExp: int16;
  3275. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3276. Begin
  3277. aSig := extractFloat32Frac( a.float32 );
  3278. aExp := extractFloat32Exp( a.float32 );
  3279. aSign := extractFloat32Sign( a.float32 );
  3280. bSig := extractFloat32Frac( b.float32 );
  3281. bExp := extractFloat32Exp( b.float32 );
  3282. bSign := extractFloat32Sign( b.float32 );
  3283. zSign := aSign xor bSign;
  3284. if ( aExp = $FF ) then
  3285. Begin
  3286. if ( aSig <> 0 ) then
  3287. Begin
  3288. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3289. exit;
  3290. End;
  3291. if ( bExp = $FF ) then
  3292. Begin
  3293. if ( bSig <> 0) then
  3294. Begin
  3295. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3296. End;
  3297. float_raise( float_flag_invalid );
  3298. float32_div.float32 := float32_default_nan;
  3299. exit;
  3300. End;
  3301. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3302. exit;
  3303. End;
  3304. if ( bExp = $FF ) then
  3305. Begin
  3306. if ( bSig <> 0) then
  3307. Begin
  3308. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3309. exit;
  3310. End;
  3311. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3312. exit;
  3313. End;
  3314. if ( bExp = 0 ) Then
  3315. Begin
  3316. if ( bSig = 0 ) Then
  3317. Begin
  3318. if ( ( aExp OR aSig ) = 0 ) then
  3319. Begin
  3320. float_raise( float_flag_invalid );
  3321. float32_div.float32 := float32_default_nan;
  3322. exit;
  3323. End;
  3324. float_raise( float_flag_divbyzero );
  3325. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3326. exit;
  3327. End;
  3328. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3329. End;
  3330. if ( aExp = 0 ) Then
  3331. Begin
  3332. if ( aSig = 0 ) Then
  3333. Begin
  3334. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3335. exit;
  3336. End;
  3337. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3338. End;
  3339. zExp := aExp - bExp + $7D;
  3340. aSig := ( aSig OR $00800000 ) shl 7;
  3341. bSig := ( bSig OR $00800000 ) shl 8;
  3342. if ( bSig <= ( aSig + aSig ) ) then
  3343. Begin
  3344. aSig := aSig shr 1;
  3345. Inc(zExp);
  3346. End;
  3347. zSig := estimateDiv64To32( aSig, 0, bSig );
  3348. if ( ( zSig and $3F ) <= 2 ) then
  3349. Begin
  3350. mul32To64( bSig, zSig, term0, term1 );
  3351. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3352. while ( sbits32 (rem0) < 0 ) do
  3353. Begin
  3354. Dec(zSig);
  3355. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3356. End;
  3357. zSig := zSig or bits32( rem1 <> 0 );
  3358. End;
  3359. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3360. End;
  3361. {*
  3362. -------------------------------------------------------------------------------
  3363. Returns the remainder of the single-precision floating-point value `a'
  3364. with respect to the corresponding value `b'. The operation is performed
  3365. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3366. -------------------------------------------------------------------------------
  3367. *}
  3368. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3369. Var
  3370. aSign, bSign, zSign: flag;
  3371. aExp, bExp, expDiff: int16;
  3372. aSig, bSig, q, allZero, alternateASig: bits32;
  3373. sigMean: sbits32;
  3374. Begin
  3375. aSig := extractFloat32Frac( a.float32 );
  3376. aExp := extractFloat32Exp( a.float32 );
  3377. aSign := extractFloat32Sign( a.float32 );
  3378. bSig := extractFloat32Frac( b.float32 );
  3379. bExp := extractFloat32Exp( b.float32 );
  3380. bSign := extractFloat32Sign( b.float32 );
  3381. if ( aExp = $FF ) then
  3382. Begin
  3383. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3384. Begin
  3385. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3386. exit;
  3387. End;
  3388. float_raise( float_flag_invalid );
  3389. float32_rem.float32 := float32_default_nan;
  3390. exit;
  3391. End;
  3392. if ( bExp = $FF ) then
  3393. Begin
  3394. if ( bSig <> 0 ) then
  3395. Begin
  3396. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3397. exit;
  3398. End;
  3399. float32_rem := a;
  3400. exit;
  3401. End;
  3402. if ( bExp = 0 ) then
  3403. Begin
  3404. if ( bSig = 0 ) then
  3405. Begin
  3406. float_raise( float_flag_invalid );
  3407. float32_rem.float32 := float32_default_nan;
  3408. exit;
  3409. End;
  3410. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3411. End;
  3412. if ( aExp = 0 ) then
  3413. Begin
  3414. if ( aSig = 0 ) then
  3415. Begin
  3416. float32_rem := a;
  3417. exit;
  3418. End;
  3419. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3420. End;
  3421. expDiff := aExp - bExp;
  3422. aSig := ( aSig OR $00800000 ) shl 8;
  3423. bSig := ( bSig OR $00800000 ) shl 8;
  3424. if ( expDiff < 0 ) then
  3425. Begin
  3426. if ( expDiff < -1 ) then
  3427. Begin
  3428. float32_rem := a;
  3429. exit;
  3430. End;
  3431. aSig := aSig shr 1;
  3432. End;
  3433. q := bits32( bSig <= aSig );
  3434. if ( q <> 0) then
  3435. aSig := aSig - bSig;
  3436. expDiff := expDiff - 32;
  3437. while ( 0 < expDiff ) do
  3438. Begin
  3439. q := estimateDiv64To32( aSig, 0, bSig );
  3440. if (2 < q) then
  3441. q := q - 2
  3442. else
  3443. q := 0;
  3444. aSig := - ( ( bSig shr 2 ) * q );
  3445. expDiff := expDiff - 30;
  3446. End;
  3447. expDiff := expDiff + 32;
  3448. if ( 0 < expDiff ) then
  3449. Begin
  3450. q := estimateDiv64To32( aSig, 0, bSig );
  3451. if (2 < q) then
  3452. q := q - 2
  3453. else
  3454. q := 0;
  3455. q := q shr (32 - expDiff);
  3456. bSig := bSig shr 2;
  3457. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3458. End
  3459. else
  3460. Begin
  3461. aSig := aSig shr 2;
  3462. bSig := bSig shr 2;
  3463. End;
  3464. Repeat
  3465. alternateASig := aSig;
  3466. Inc(q);
  3467. aSig := aSig - bSig;
  3468. Until not ( 0 <= sbits32 (aSig) );
  3469. sigMean := aSig + alternateASig;
  3470. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3471. Begin
  3472. aSig := alternateASig;
  3473. End;
  3474. zSign := flag( sbits32 (aSig) < 0 );
  3475. if ( zSign<>0 ) then
  3476. aSig := - aSig;
  3477. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3478. End;
  3479. {*
  3480. -------------------------------------------------------------------------------
  3481. Returns the square root of the single-precision floating-point value `a'.
  3482. The operation is performed according to the IEC/IEEE Standard for Binary
  3483. Floating-Point Arithmetic.
  3484. -------------------------------------------------------------------------------
  3485. *}
  3486. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3487. Var
  3488. aSign : flag;
  3489. aExp, zExp : int16;
  3490. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3491. label roundAndPack;
  3492. Begin
  3493. aSig := extractFloat32Frac( a.float32 );
  3494. aExp := extractFloat32Exp( a.float32 );
  3495. aSign := extractFloat32Sign( a.float32 );
  3496. if ( aExp = $FF ) then
  3497. Begin
  3498. if ( aSig <> 0) then
  3499. Begin
  3500. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3501. exit;
  3502. End;
  3503. if ( aSign = 0) then
  3504. Begin
  3505. float32_sqrt := a;
  3506. exit;
  3507. End;
  3508. float_raise( float_flag_invalid );
  3509. float32_sqrt.float32 := float32_default_nan;
  3510. exit;
  3511. End;
  3512. if ( aSign <> 0) then
  3513. Begin
  3514. if ( ( aExp OR aSig ) = 0 ) then
  3515. Begin
  3516. float32_sqrt := a;
  3517. exit;
  3518. End;
  3519. float_raise( float_flag_invalid );
  3520. float32_sqrt.float32 := float32_default_nan;
  3521. exit;
  3522. End;
  3523. if ( aExp = 0 ) then
  3524. Begin
  3525. if ( aSig = 0 ) then
  3526. Begin
  3527. float32_sqrt.float32 := 0;
  3528. exit;
  3529. End;
  3530. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3531. End;
  3532. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3533. aSig := ( aSig OR $00800000 ) shl 8;
  3534. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3535. if ( ( zSig and $7F ) <= 5 ) then
  3536. Begin
  3537. if ( zSig < 2 ) then
  3538. Begin
  3539. zSig := $7FFFFFFF;
  3540. goto roundAndPack;
  3541. End
  3542. else
  3543. Begin
  3544. aSig := aSig shr (aExp and 1);
  3545. mul32To64( zSig, zSig, term0, term1 );
  3546. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3547. while ( sbits32 (rem0) < 0 ) do
  3548. Begin
  3549. Dec(zSig);
  3550. shortShift64Left( 0, zSig, 1, term0, term1 );
  3551. term1 := term1 or 1;
  3552. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3553. End;
  3554. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3555. End;
  3556. End;
  3557. shift32RightJamming( zSig, 1, zSig );
  3558. roundAndPack:
  3559. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3560. End;
  3561. {*
  3562. -------------------------------------------------------------------------------
  3563. Returns 1 if the single-precision floating-point value `a' is equal to
  3564. the corresponding value `b', and 0 otherwise. The comparison is performed
  3565. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3566. -------------------------------------------------------------------------------
  3567. *}
  3568. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3569. Begin
  3570. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3571. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3572. ) then
  3573. Begin
  3574. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3575. Begin
  3576. float_raise( float_flag_invalid );
  3577. End;
  3578. float32_eq := 0;
  3579. exit;
  3580. End;
  3581. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3582. End;
  3583. {*
  3584. -------------------------------------------------------------------------------
  3585. Returns 1 if the single-precision floating-point value `a' is less than
  3586. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3587. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3588. Arithmetic.
  3589. -------------------------------------------------------------------------------
  3590. *}
  3591. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3592. var
  3593. aSign, bSign: flag;
  3594. Begin
  3595. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3596. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3597. ) then
  3598. Begin
  3599. float_raise( float_flag_invalid );
  3600. float32_le := 0;
  3601. exit;
  3602. End;
  3603. aSign := extractFloat32Sign( a.float32 );
  3604. bSign := extractFloat32Sign( b.float32 );
  3605. if ( aSign <> bSign ) then
  3606. Begin
  3607. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3608. exit;
  3609. End;
  3610. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3611. End;
  3612. {*
  3613. -------------------------------------------------------------------------------
  3614. Returns 1 if the single-precision floating-point value `a' is less than
  3615. the corresponding value `b', and 0 otherwise. The comparison is performed
  3616. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3617. -------------------------------------------------------------------------------
  3618. *}
  3619. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3620. var
  3621. aSign, bSign: flag;
  3622. Begin
  3623. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3624. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3625. ) then
  3626. Begin
  3627. float_raise( float_flag_invalid );
  3628. float32_lt :=0;
  3629. exit;
  3630. End;
  3631. aSign := extractFloat32Sign( a.float32 );
  3632. bSign := extractFloat32Sign( b.float32 );
  3633. if ( aSign <> bSign ) then
  3634. Begin
  3635. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3636. exit;
  3637. End;
  3638. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3639. End;
  3640. {*
  3641. -------------------------------------------------------------------------------
  3642. Returns 1 if the single-precision floating-point value `a' is equal to
  3643. the corresponding value `b', and 0 otherwise. The invalid exception is
  3644. raised if either operand is a NaN. Otherwise, the comparison is performed
  3645. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3646. -------------------------------------------------------------------------------
  3647. *}
  3648. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3649. Begin
  3650. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3651. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3652. ) then
  3653. Begin
  3654. float_raise( float_flag_invalid );
  3655. float32_eq_signaling := 0;
  3656. exit;
  3657. End;
  3658. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3659. End;
  3660. {*
  3661. -------------------------------------------------------------------------------
  3662. Returns 1 if the single-precision floating-point value `a' is less than or
  3663. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3664. cause an exception. Otherwise, the comparison is performed according to the
  3665. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3666. -------------------------------------------------------------------------------
  3667. *}
  3668. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3669. Var
  3670. aSign, bSign: flag;
  3671. aExp, bExp: int16;
  3672. Begin
  3673. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3674. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3675. ) then
  3676. Begin
  3677. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3678. Begin
  3679. float_raise( float_flag_invalid );
  3680. End;
  3681. float32_le_quiet := 0;
  3682. exit;
  3683. End;
  3684. aSign := extractFloat32Sign( a );
  3685. bSign := extractFloat32Sign( b );
  3686. if ( aSign <> bSign ) then
  3687. Begin
  3688. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3689. exit;
  3690. End;
  3691. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3692. End;
  3693. {*
  3694. -------------------------------------------------------------------------------
  3695. Returns 1 if the single-precision floating-point value `a' is less than
  3696. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3697. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3698. Standard for Binary Floating-Point Arithmetic.
  3699. -------------------------------------------------------------------------------
  3700. *}
  3701. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3702. Var
  3703. aSign, bSign: flag;
  3704. Begin
  3705. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3706. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3707. ) then
  3708. Begin
  3709. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3710. Begin
  3711. float_raise( float_flag_invalid );
  3712. End;
  3713. float32_lt_quiet := 0;
  3714. exit;
  3715. End;
  3716. aSign := extractFloat32Sign( a );
  3717. bSign := extractFloat32Sign( b );
  3718. if ( aSign <> bSign ) then
  3719. Begin
  3720. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3721. exit;
  3722. End;
  3723. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3724. End;
  3725. {*
  3726. -------------------------------------------------------------------------------
  3727. Returns the result of converting the double-precision floating-point value
  3728. `a' to the 32-bit two's complement integer format. The conversion is
  3729. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3730. Arithmetic---which means in particular that the conversion is rounded
  3731. according to the current rounding mode. If `a' is a NaN, the largest
  3732. positive integer is returned. Otherwise, if the conversion overflows, the
  3733. largest integer with the same sign as `a' is returned.
  3734. -------------------------------------------------------------------------------
  3735. *}
  3736. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3737. var
  3738. aSign: flag;
  3739. aExp, shiftCount: int16;
  3740. aSig0, aSig1, absZ, aSigExtra: bits32;
  3741. z: int32;
  3742. roundingMode: int8;
  3743. label invalid;
  3744. Begin
  3745. aSig1 := extractFloat64Frac1( a );
  3746. aSig0 := extractFloat64Frac0( a );
  3747. aExp := extractFloat64Exp( a );
  3748. aSign := extractFloat64Sign( a );
  3749. shiftCount := aExp - $413;
  3750. if ( 0 <= shiftCount ) then
  3751. Begin
  3752. if ( $41E < aExp ) then
  3753. Begin
  3754. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3755. aSign := 0;
  3756. goto invalid;
  3757. End;
  3758. shortShift64Left(
  3759. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3760. if ( $80000000 < absZ ) then
  3761. goto invalid;
  3762. End
  3763. else
  3764. Begin
  3765. aSig1 := flag( aSig1 <> 0 );
  3766. if ( aExp < $3FE ) then
  3767. Begin
  3768. aSigExtra := aExp OR aSig0 OR aSig1;
  3769. absZ := 0;
  3770. End
  3771. else
  3772. Begin
  3773. aSig0 := aSig0 OR $00100000;
  3774. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3775. absZ := aSig0 shr ( - shiftCount );
  3776. End;
  3777. End;
  3778. roundingMode := softfloat_rounding_mode;
  3779. if ( roundingMode = float_round_nearest_even ) then
  3780. Begin
  3781. if ( sbits32(aSigExtra) < 0 ) then
  3782. Begin
  3783. Inc(absZ);
  3784. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3785. absZ := absZ and not 1;
  3786. End;
  3787. if aSign <> 0 then
  3788. z := - absZ
  3789. else
  3790. z := absZ;
  3791. End
  3792. else
  3793. Begin
  3794. aSigExtra := bits32( aSigExtra <> 0 );
  3795. if ( aSign <> 0) then
  3796. Begin
  3797. z := - ( absZ
  3798. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3799. End
  3800. else
  3801. Begin
  3802. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3803. End
  3804. End;
  3805. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3806. Begin
  3807. invalid:
  3808. float_raise( float_flag_invalid );
  3809. if (aSign <> 0 ) then
  3810. float64_to_int32 := sbits32 ($80000000)
  3811. else
  3812. float64_to_int32 := $7FFFFFFF;
  3813. exit;
  3814. End;
  3815. if ( aSigExtra <> 0) then
  3816. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3817. float64_to_int32 := z;
  3818. End;
  3819. {*
  3820. -------------------------------------------------------------------------------
  3821. Returns the result of converting the double-precision floating-point value
  3822. `a' to the 32-bit two's complement integer format. The conversion is
  3823. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3824. Arithmetic, except that the conversion is always rounded toward zero.
  3825. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3826. the conversion overflows, the largest integer with the same sign as `a' is
  3827. returned.
  3828. -------------------------------------------------------------------------------
  3829. *}
  3830. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3831. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3832. Var
  3833. aSign: flag;
  3834. aExp, shiftCount: int16;
  3835. aSig0, aSig1, absZ, aSigExtra: bits32;
  3836. z: int32;
  3837. label invalid;
  3838. Begin
  3839. aSig1 := extractFloat64Frac1( a );
  3840. aSig0 := extractFloat64Frac0( a );
  3841. aExp := extractFloat64Exp( a );
  3842. aSign := extractFloat64Sign( a );
  3843. shiftCount := aExp - $413;
  3844. if ( 0 <= shiftCount ) then
  3845. Begin
  3846. if ( $41E < aExp ) then
  3847. Begin
  3848. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3849. aSign := 0;
  3850. goto invalid;
  3851. End;
  3852. shortShift64Left(
  3853. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3854. End
  3855. else
  3856. Begin
  3857. if ( aExp < $3FF ) then
  3858. Begin
  3859. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3860. Begin
  3861. softfloat_exception_flags :=
  3862. softfloat_exception_flags or float_flag_inexact;
  3863. End;
  3864. float64_to_int32_round_to_zero := 0;
  3865. exit;
  3866. End;
  3867. aSig0 := aSig0 or $00100000;
  3868. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3869. absZ := aSig0 shr ( - shiftCount );
  3870. End;
  3871. if aSign <> 0 then
  3872. z := - absZ
  3873. else
  3874. z := absZ;
  3875. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3876. Begin
  3877. invalid:
  3878. float_raise( float_flag_invalid );
  3879. if (aSign <> 0) then
  3880. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3881. else
  3882. float64_to_int32_round_to_zero := $7FFFFFFF;
  3883. exit;
  3884. End;
  3885. if ( aSigExtra <> 0) then
  3886. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3887. float64_to_int32_round_to_zero := z;
  3888. End;
  3889. {*
  3890. -------------------------------------------------------------------------------
  3891. Returns the result of converting the double-precision floating-point value
  3892. `a' to the single-precision floating-point format. The conversion is
  3893. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3894. Arithmetic.
  3895. -------------------------------------------------------------------------------
  3896. *}
  3897. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3898. Var
  3899. aSign: flag;
  3900. aExp: int16;
  3901. aSig0, aSig1, zSig: bits32;
  3902. allZero: bits32;
  3903. tmp : CommonNanT;
  3904. Begin
  3905. aSig1 := extractFloat64Frac1( a );
  3906. aSig0 := extractFloat64Frac0( a );
  3907. aExp := extractFloat64Exp( a );
  3908. aSign := extractFloat64Sign( a );
  3909. if ( aExp = $7FF ) then
  3910. Begin
  3911. if ( aSig0 OR aSig1 ) <> 0 then
  3912. Begin
  3913. float64ToCommonNaN( a, tmp );
  3914. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3915. exit;
  3916. End;
  3917. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3918. exit;
  3919. End;
  3920. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3921. if ( aExp <> 0) then
  3922. zSig := zSig OR $40000000;
  3923. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3924. End;
  3925. {*
  3926. -------------------------------------------------------------------------------
  3927. Rounds the double-precision floating-point value `a' to an integer,
  3928. and returns the result as a double-precision floating-point value. The
  3929. operation is performed according to the IEC/IEEE Standard for Binary
  3930. Floating-Point Arithmetic.
  3931. -------------------------------------------------------------------------------
  3932. *}
  3933. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3934. Var
  3935. aSign: flag;
  3936. aExp: int16;
  3937. lastBitMask, roundBitsMask: bits32;
  3938. roundingMode: int8;
  3939. z: float64;
  3940. Begin
  3941. aExp := extractFloat64Exp( a );
  3942. if ( $413 <= aExp ) then
  3943. Begin
  3944. if ( $433 <= aExp ) then
  3945. Begin
  3946. if ( ( aExp = $7FF )
  3947. AND
  3948. (
  3949. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3950. ) <>0)
  3951. ) then
  3952. Begin
  3953. propagateFloat64NaN( a, a, result );
  3954. exit;
  3955. End;
  3956. result := a;
  3957. exit;
  3958. End;
  3959. lastBitMask := 1;
  3960. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3961. roundBitsMask := lastBitMask - 1;
  3962. z := a;
  3963. roundingMode := softfloat_rounding_mode;
  3964. if ( roundingMode = float_round_nearest_even ) then
  3965. Begin
  3966. if ( lastBitMask <> 0) then
  3967. Begin
  3968. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3969. if ( ( z.low and roundBitsMask ) = 0 ) then
  3970. z.low := z.low and not lastBitMask;
  3971. End
  3972. else
  3973. Begin
  3974. if ( sbits32 (z.low) < 0 ) then
  3975. Begin
  3976. Inc(z.high);
  3977. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3978. z.high := z.high and not 1;
  3979. End;
  3980. End;
  3981. End
  3982. else if ( roundingMode <> float_round_to_zero ) then
  3983. Begin
  3984. if ( extractFloat64Sign( z )
  3985. xor flag( roundingMode = float_round_up ) )<> 0 then
  3986. Begin
  3987. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3988. End;
  3989. End;
  3990. z.low := z.low and not roundBitsMask;
  3991. End
  3992. else
  3993. Begin
  3994. if ( aExp <= $3FE ) then
  3995. Begin
  3996. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  3997. Begin
  3998. result := a;
  3999. exit;
  4000. End;
  4001. softfloat_exception_flags := softfloat_exception_flags or
  4002. float_flag_inexact;
  4003. aSign := extractFloat64Sign( a );
  4004. case ( softfloat_rounding_mode ) of
  4005. float_round_nearest_even:
  4006. Begin
  4007. if ( ( aExp = $3FE )
  4008. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4009. ) then
  4010. Begin
  4011. packFloat64( aSign, $3FF, 0, 0, result );
  4012. exit;
  4013. End;
  4014. End;
  4015. float_round_down:
  4016. Begin
  4017. if aSign<>0 then
  4018. packFloat64( 1, $3FF, 0, 0, result )
  4019. else
  4020. packFloat64( 0, 0, 0, 0, result );
  4021. exit;
  4022. End;
  4023. float_round_up:
  4024. Begin
  4025. if aSign <> 0 then
  4026. packFloat64( 1, 0, 0, 0, result )
  4027. else
  4028. packFloat64( 0, $3FF, 0, 0, result );
  4029. exit;
  4030. End;
  4031. end;
  4032. packFloat64( aSign, 0, 0, 0, result );
  4033. exit;
  4034. End;
  4035. lastBitMask := 1;
  4036. lastBitMask := lastBitMask shl ($413 - aExp);
  4037. roundBitsMask := lastBitMask - 1;
  4038. z.low := 0;
  4039. z.high := a.high;
  4040. roundingMode := softfloat_rounding_mode;
  4041. if ( roundingMode = float_round_nearest_even ) then
  4042. Begin
  4043. z.high := z.high + lastBitMask shr 1;
  4044. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4045. Begin
  4046. z.high := z.high and not lastBitMask;
  4047. End;
  4048. End
  4049. else if ( roundingMode <> float_round_to_zero ) then
  4050. Begin
  4051. if ( extractFloat64Sign( z )
  4052. xor flag( roundingMode = float_round_up ) )<> 0 then
  4053. Begin
  4054. z.high := z.high or bits32( a.low <> 0 );
  4055. z.high := z.high + roundBitsMask;
  4056. End;
  4057. End;
  4058. z.high := z.high and not roundBitsMask;
  4059. End;
  4060. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4061. Begin
  4062. softfloat_exception_flags :=
  4063. softfloat_exception_flags or float_flag_inexact;
  4064. End;
  4065. result := z;
  4066. End;
  4067. {*
  4068. -------------------------------------------------------------------------------
  4069. Returns the result of adding the absolute values of the double-precision
  4070. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4071. before being returned. `zSign' is ignored if the result is a NaN.
  4072. The addition is performed according to the IEC/IEEE Standard for Binary
  4073. Floating-Point Arithmetic.
  4074. -------------------------------------------------------------------------------
  4075. *}
  4076. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4077. Var
  4078. aExp, bExp, zExp: int16;
  4079. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4080. expDiff: int16;
  4081. label shiftRight1;
  4082. label roundAndPack;
  4083. Begin
  4084. aSig1 := extractFloat64Frac1( a );
  4085. aSig0 := extractFloat64Frac0( a );
  4086. aExp := extractFloat64Exp( a );
  4087. bSig1 := extractFloat64Frac1( b );
  4088. bSig0 := extractFloat64Frac0( b );
  4089. bExp := extractFloat64Exp( b );
  4090. expDiff := aExp - bExp;
  4091. if ( 0 < expDiff ) then
  4092. Begin
  4093. if ( aExp = $7FF ) then
  4094. Begin
  4095. if ( aSig0 OR aSig1 ) <> 0 then
  4096. Begin
  4097. propagateFloat64NaN( a, b, out );
  4098. exit;
  4099. end;
  4100. out := a;
  4101. exit;
  4102. End;
  4103. if ( bExp = 0 ) then
  4104. Begin
  4105. Dec(expDiff);
  4106. End
  4107. else
  4108. Begin
  4109. bSig0 := bSig0 or $00100000;
  4110. End;
  4111. shift64ExtraRightJamming(
  4112. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4113. zExp := aExp;
  4114. End
  4115. else if ( expDiff < 0 ) then
  4116. Begin
  4117. if ( bExp = $7FF ) then
  4118. Begin
  4119. if ( bSig0 OR bSig1 ) <> 0 then
  4120. Begin
  4121. propagateFloat64NaN( a, b, out );
  4122. exit;
  4123. End;
  4124. packFloat64( zSign, $7FF, 0, 0, out );
  4125. End;
  4126. if ( aExp = 0 ) then
  4127. Begin
  4128. Inc(expDiff);
  4129. End
  4130. else
  4131. Begin
  4132. aSig0 := aSig0 or $00100000;
  4133. End;
  4134. shift64ExtraRightJamming(
  4135. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4136. zExp := bExp;
  4137. End
  4138. else
  4139. Begin
  4140. if ( aExp = $7FF ) then
  4141. Begin
  4142. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4143. Begin
  4144. propagateFloat64NaN( a, b, out );
  4145. exit;
  4146. End;
  4147. out := a;
  4148. exit;
  4149. End;
  4150. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4151. if ( aExp = 0 ) then
  4152. Begin
  4153. packFloat64( zSign, 0, zSig0, zSig1, out );
  4154. exit;
  4155. End;
  4156. zSig2 := 0;
  4157. zSig0 := zSig0 or $00200000;
  4158. zExp := aExp;
  4159. goto shiftRight1;
  4160. End;
  4161. aSig0 := aSig0 or $00100000;
  4162. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4163. Dec(zExp);
  4164. if ( zSig0 < $00200000 ) then
  4165. goto roundAndPack;
  4166. Inc(zExp);
  4167. shiftRight1:
  4168. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4169. roundAndPack:
  4170. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4171. End;
  4172. {*
  4173. -------------------------------------------------------------------------------
  4174. Returns the result of subtracting the absolute values of the double-
  4175. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4176. difference is negated before being returned. `zSign' is ignored if the
  4177. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4178. Standard for Binary Floating-Point Arithmetic.
  4179. -------------------------------------------------------------------------------
  4180. *}
  4181. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4182. Var
  4183. aExp, bExp, zExp: int16;
  4184. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4185. expDiff: int16;
  4186. z: float64;
  4187. label aExpBigger;
  4188. label bExpBigger;
  4189. label aBigger;
  4190. label bBigger;
  4191. label normalizeRoundAndPack;
  4192. Begin
  4193. aSig1 := extractFloat64Frac1( a );
  4194. aSig0 := extractFloat64Frac0( a );
  4195. aExp := extractFloat64Exp( a );
  4196. bSig1 := extractFloat64Frac1( b );
  4197. bSig0 := extractFloat64Frac0( b );
  4198. bExp := extractFloat64Exp( b );
  4199. expDiff := aExp - bExp;
  4200. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4201. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4202. if ( 0 < expDiff ) then goto aExpBigger;
  4203. if ( expDiff < 0 ) then goto bExpBigger;
  4204. if ( aExp = $7FF ) then
  4205. Begin
  4206. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4207. Begin
  4208. propagateFloat64NaN( a, b, out );
  4209. exit;
  4210. End;
  4211. float_raise( float_flag_invalid );
  4212. z.low := float64_default_nan_low;
  4213. z.high := float64_default_nan_high;
  4214. out := z;
  4215. exit;
  4216. End;
  4217. if ( aExp = 0 ) then
  4218. Begin
  4219. aExp := 1;
  4220. bExp := 1;
  4221. End;
  4222. if ( bSig0 < aSig0 ) then goto aBigger;
  4223. if ( aSig0 < bSig0 ) then goto bBigger;
  4224. if ( bSig1 < aSig1 ) then goto aBigger;
  4225. if ( aSig1 < bSig1 ) then goto bBigger;
  4226. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4227. exit;
  4228. bExpBigger:
  4229. if ( bExp = $7FF ) then
  4230. Begin
  4231. if ( bSig0 OR bSig1 ) <> 0 then
  4232. Begin
  4233. propagateFloat64NaN( a, b, out );
  4234. exit;
  4235. End;
  4236. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4237. exit;
  4238. End;
  4239. if ( aExp = 0 ) then
  4240. Begin
  4241. Inc(expDiff);
  4242. End
  4243. else
  4244. Begin
  4245. aSig0 := aSig0 or $40000000;
  4246. End;
  4247. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4248. bSig0 := bSig0 or $40000000;
  4249. bBigger:
  4250. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4251. zExp := bExp;
  4252. zSign := zSign xor 1;
  4253. goto normalizeRoundAndPack;
  4254. aExpBigger:
  4255. if ( aExp = $7FF ) then
  4256. Begin
  4257. if ( aSig0 OR aSig1 ) <> 0 then
  4258. Begin
  4259. propagateFloat64NaN( a, b, out );
  4260. exit;
  4261. End;
  4262. out := a;
  4263. exit;
  4264. End;
  4265. if ( bExp = 0 ) then
  4266. Begin
  4267. Dec(expDiff);
  4268. End
  4269. else
  4270. Begin
  4271. bSig0 := bSig0 or $40000000;
  4272. End;
  4273. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4274. aSig0 := aSig0 or $40000000;
  4275. aBigger:
  4276. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4277. zExp := aExp;
  4278. normalizeRoundAndPack:
  4279. Dec(zExp);
  4280. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4281. End;
  4282. {*
  4283. -------------------------------------------------------------------------------
  4284. Returns the result of adding the double-precision floating-point values `a'
  4285. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4286. Binary Floating-Point Arithmetic.
  4287. -------------------------------------------------------------------------------
  4288. *}
  4289. Function float64_add( a: float64; b : float64) : Float64;
  4290. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4291. Var
  4292. aSign, bSign: flag;
  4293. Begin
  4294. aSign := extractFloat64Sign( a );
  4295. bSign := extractFloat64Sign( b );
  4296. if ( aSign = bSign ) then
  4297. Begin
  4298. addFloat64Sigs( a, b, aSign, result );
  4299. End
  4300. else
  4301. Begin
  4302. subFloat64Sigs( a, b, aSign, result );
  4303. End;
  4304. End;
  4305. {*
  4306. -------------------------------------------------------------------------------
  4307. Returns the result of subtracting the double-precision floating-point values
  4308. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4309. for Binary Floating-Point Arithmetic.
  4310. -------------------------------------------------------------------------------
  4311. *}
  4312. Function float64_sub(a: float64; b : float64) : Float64;
  4313. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4314. Var
  4315. aSign, bSign: flag;
  4316. Begin
  4317. aSign := extractFloat64Sign( a );
  4318. bSign := extractFloat64Sign( b );
  4319. if ( aSign = bSign ) then
  4320. Begin
  4321. subFloat64Sigs( a, b, aSign, result );
  4322. End
  4323. else
  4324. Begin
  4325. addFloat64Sigs( a, b, aSign, result );
  4326. End;
  4327. End;
  4328. {*
  4329. -------------------------------------------------------------------------------
  4330. Returns the result of multiplying the double-precision floating-point values
  4331. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4332. for Binary Floating-Point Arithmetic.
  4333. -------------------------------------------------------------------------------
  4334. *}
  4335. Function float64_mul( a: float64; b:float64) : Float64;
  4336. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4337. Var
  4338. aSign, bSign, zSign: flag;
  4339. aExp, bExp, zExp: int16;
  4340. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4341. z: float64;
  4342. label invalid;
  4343. Begin
  4344. aSig1 := extractFloat64Frac1( a );
  4345. aSig0 := extractFloat64Frac0( a );
  4346. aExp := extractFloat64Exp( a );
  4347. aSign := extractFloat64Sign( a );
  4348. bSig1 := extractFloat64Frac1( b );
  4349. bSig0 := extractFloat64Frac0( b );
  4350. bExp := extractFloat64Exp( b );
  4351. bSign := extractFloat64Sign( b );
  4352. zSign := aSign xor bSign;
  4353. if ( aExp = $7FF ) then
  4354. Begin
  4355. if ( (( aSig0 OR aSig1 ) <>0)
  4356. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4357. Begin
  4358. propagateFloat64NaN( a, b, result );
  4359. exit;
  4360. End;
  4361. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4362. packFloat64( zSign, $7FF, 0, 0, result );
  4363. exit;
  4364. End;
  4365. if ( bExp = $7FF ) then
  4366. Begin
  4367. if ( bSig0 OR bSig1 )<> 0 then
  4368. Begin
  4369. propagateFloat64NaN( a, b, result );
  4370. exit;
  4371. End;
  4372. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4373. Begin
  4374. invalid:
  4375. float_raise( float_flag_invalid );
  4376. z.low := float64_default_nan_low;
  4377. z.high := float64_default_nan_high;
  4378. result := z;
  4379. exit;
  4380. End;
  4381. packFloat64( zSign, $7FF, 0, 0, result );
  4382. exit;
  4383. End;
  4384. if ( aExp = 0 ) then
  4385. Begin
  4386. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4387. Begin
  4388. packFloat64( zSign, 0, 0, 0, result );
  4389. exit;
  4390. End;
  4391. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4392. End;
  4393. if ( bExp = 0 ) then
  4394. Begin
  4395. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4396. Begin
  4397. packFloat64( zSign, 0, 0, 0, result );
  4398. exit;
  4399. End;
  4400. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4401. End;
  4402. zExp := aExp + bExp - $400;
  4403. aSig0 := aSig0 or $00100000;
  4404. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4405. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4406. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4407. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4408. if ( $00200000 <= zSig0 ) then
  4409. Begin
  4410. shift64ExtraRightJamming(
  4411. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4412. Inc(zExp);
  4413. End;
  4414. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4415. End;
  4416. {*
  4417. -------------------------------------------------------------------------------
  4418. Returns the result of dividing the double-precision floating-point value `a'
  4419. by the corresponding value `b'. The operation is performed according to the
  4420. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4421. -------------------------------------------------------------------------------
  4422. *}
  4423. Function float64_div(a: float64; b : float64) : Float64;
  4424. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4425. Var
  4426. aSign, bSign, zSign: flag;
  4427. aExp, bExp, zExp: int16;
  4428. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4429. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4430. z: float64;
  4431. label invalid;
  4432. Begin
  4433. aSig1 := extractFloat64Frac1( a );
  4434. aSig0 := extractFloat64Frac0( a );
  4435. aExp := extractFloat64Exp( a );
  4436. aSign := extractFloat64Sign( a );
  4437. bSig1 := extractFloat64Frac1( b );
  4438. bSig0 := extractFloat64Frac0( b );
  4439. bExp := extractFloat64Exp( b );
  4440. bSign := extractFloat64Sign( b );
  4441. zSign := aSign xor bSign;
  4442. if ( aExp = $7FF ) then
  4443. Begin
  4444. if ( aSig0 OR aSig1 )<> 0 then
  4445. Begin
  4446. propagateFloat64NaN( a, b, result );
  4447. exit;
  4448. end;
  4449. if ( bExp = $7FF ) then
  4450. Begin
  4451. if ( bSig0 OR bSig1 )<>0 then
  4452. Begin
  4453. propagateFloat64NaN( a, b, result );
  4454. exit;
  4455. End;
  4456. goto invalid;
  4457. End;
  4458. packFloat64( zSign, $7FF, 0, 0, result );
  4459. exit;
  4460. End;
  4461. if ( bExp = $7FF ) then
  4462. Begin
  4463. if ( bSig0 OR bSig1 )<> 0 then
  4464. Begin
  4465. propagateFloat64NaN( a, b, result );
  4466. exit;
  4467. End;
  4468. packFloat64( zSign, 0, 0, 0, result );
  4469. exit;
  4470. End;
  4471. if ( bExp = 0 ) then
  4472. Begin
  4473. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4474. Begin
  4475. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4476. Begin
  4477. invalid:
  4478. float_raise( float_flag_invalid );
  4479. z.low := float64_default_nan_low;
  4480. z.high := float64_default_nan_high;
  4481. result := z;
  4482. exit;
  4483. End;
  4484. float_raise( float_flag_divbyzero );
  4485. packFloat64( zSign, $7FF, 0, 0, result );
  4486. exit;
  4487. End;
  4488. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4489. End;
  4490. if ( aExp = 0 ) then
  4491. Begin
  4492. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4493. Begin
  4494. packFloat64( zSign, 0, 0, 0, result );
  4495. exit;
  4496. End;
  4497. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4498. End;
  4499. zExp := aExp - bExp + $3FD;
  4500. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4501. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4502. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4503. Begin
  4504. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4505. Inc(zExp);
  4506. End;
  4507. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4508. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4509. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4510. while ( sbits32 (rem0) < 0 ) do
  4511. Begin
  4512. Dec(zSig0);
  4513. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4514. End;
  4515. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4516. if ( ( zSig1 and $3FF ) <= 4 ) then
  4517. Begin
  4518. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4519. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4520. while ( sbits32 (rem1) < 0 ) do
  4521. Begin
  4522. Dec(zSig1);
  4523. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4524. End;
  4525. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4526. End;
  4527. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4528. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4529. End;
  4530. {*
  4531. -------------------------------------------------------------------------------
  4532. Returns the remainder of the double-precision floating-point value `a'
  4533. with respect to the corresponding value `b'. The operation is performed
  4534. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4535. -------------------------------------------------------------------------------
  4536. *}
  4537. Function float64_rem(a: float64; b : float64) : float64;
  4538. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4539. Var
  4540. aSign, bSign, zSign: flag;
  4541. aExp, bExp, expDiff: int16;
  4542. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4543. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4544. sigMean0: sbits32;
  4545. z: float64;
  4546. label invalid;
  4547. Begin
  4548. aSig1 := extractFloat64Frac1( a );
  4549. aSig0 := extractFloat64Frac0( a );
  4550. aExp := extractFloat64Exp( a );
  4551. aSign := extractFloat64Sign( a );
  4552. bSig1 := extractFloat64Frac1( b );
  4553. bSig0 := extractFloat64Frac0( b );
  4554. bExp := extractFloat64Exp( b );
  4555. bSign := extractFloat64Sign( b );
  4556. if ( aExp = $7FF ) then
  4557. Begin
  4558. if ((( aSig0 OR aSig1 )<>0)
  4559. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4560. Begin
  4561. propagateFloat64NaN( a, b, result );
  4562. exit;
  4563. End;
  4564. goto invalid;
  4565. End;
  4566. if ( bExp = $7FF ) then
  4567. Begin
  4568. if ( bSig0 OR bSig1 ) <> 0 then
  4569. Begin
  4570. propagateFloat64NaN( a, b, result );
  4571. exit;
  4572. End;
  4573. result := a;
  4574. exit;
  4575. End;
  4576. if ( bExp = 0 ) then
  4577. Begin
  4578. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4579. Begin
  4580. invalid:
  4581. float_raise( float_flag_invalid );
  4582. z.low := float64_default_nan_low;
  4583. z.high := float64_default_nan_high;
  4584. result := z;
  4585. exit;
  4586. End;
  4587. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4588. End;
  4589. if ( aExp = 0 ) then
  4590. Begin
  4591. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4592. Begin
  4593. result := a;
  4594. exit;
  4595. End;
  4596. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4597. End;
  4598. expDiff := aExp - bExp;
  4599. if ( expDiff < -1 ) then
  4600. Begin
  4601. result := a;
  4602. exit;
  4603. End;
  4604. shortShift64Left(
  4605. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4606. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4607. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4608. if ( q )<>0 then
  4609. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4610. expDiff := expDiff - 32;
  4611. while ( 0 < expDiff ) do
  4612. Begin
  4613. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4614. if 4 < q then
  4615. q:= q - 4
  4616. else
  4617. q := 0;
  4618. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4619. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4620. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4621. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4622. expDiff := expDiff - 29;
  4623. End;
  4624. if ( -32 < expDiff ) then
  4625. Begin
  4626. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4627. if 4 < q then
  4628. q := q - 4
  4629. else
  4630. q := 0;
  4631. q := q shr (- expDiff);
  4632. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4633. expDiff := expDiff + 24;
  4634. if ( expDiff < 0 ) then
  4635. Begin
  4636. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4637. End
  4638. else
  4639. Begin
  4640. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4641. End;
  4642. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4643. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4644. End
  4645. else
  4646. Begin
  4647. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4648. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4649. End;
  4650. Repeat
  4651. alternateASig0 := aSig0;
  4652. alternateASig1 := aSig1;
  4653. Inc(q);
  4654. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4655. Until not ( 0 <= sbits32 (aSig0) );
  4656. add64(
  4657. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4658. if ( ( sigMean0 < 0 )
  4659. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4660. Begin
  4661. aSig0 := alternateASig0;
  4662. aSig1 := alternateASig1;
  4663. End;
  4664. zSign := flag( sbits32 (aSig0) < 0 );
  4665. if ( zSign <> 0 ) then
  4666. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4667. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4668. End;
  4669. {*
  4670. -------------------------------------------------------------------------------
  4671. Returns the square root of the double-precision floating-point value `a'.
  4672. The operation is performed according to the IEC/IEEE Standard for Binary
  4673. Floating-Point Arithmetic.
  4674. -------------------------------------------------------------------------------
  4675. *}
  4676. Procedure float64_sqrt( a: float64; var out: float64 );
  4677. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4678. Var
  4679. aSign: flag;
  4680. aExp, zExp: int16;
  4681. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4682. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4683. z: float64;
  4684. label invalid;
  4685. Begin
  4686. aSig1 := extractFloat64Frac1( a );
  4687. aSig0 := extractFloat64Frac0( a );
  4688. aExp := extractFloat64Exp( a );
  4689. aSign := extractFloat64Sign( a );
  4690. if ( aExp = $7FF ) then
  4691. Begin
  4692. if ( aSig0 OR aSig1 ) <> 0 then
  4693. Begin
  4694. propagateFloat64NaN( a, a, out );
  4695. exit;
  4696. End;
  4697. if ( aSign = 0) then
  4698. Begin
  4699. out := a;
  4700. exit;
  4701. End;
  4702. goto invalid;
  4703. End;
  4704. if ( aSign <> 0 ) then
  4705. Begin
  4706. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4707. Begin
  4708. out := a;
  4709. exit;
  4710. End;
  4711. invalid:
  4712. float_raise( float_flag_invalid );
  4713. z.low := float64_default_nan_low;
  4714. z.high := float64_default_nan_high;
  4715. out := z;
  4716. exit;
  4717. End;
  4718. if ( aExp = 0 ) then
  4719. Begin
  4720. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4721. Begin
  4722. packFloat64( 0, 0, 0, 0, out );
  4723. exit;
  4724. End;
  4725. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4726. End;
  4727. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4728. aSig0 := aSig0 or $00100000;
  4729. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4730. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4731. if ( zSig0 = 0 ) then
  4732. zSig0 := $7FFFFFFF;
  4733. doubleZSig0 := zSig0 + zSig0;
  4734. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4735. mul32To64( zSig0, zSig0, term0, term1 );
  4736. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4737. while ( sbits32 (rem0) < 0 ) do
  4738. Begin
  4739. Dec(zSig0);
  4740. doubleZSig0 := doubleZSig0 - 2;
  4741. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4742. End;
  4743. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4744. if ( ( zSig1 and $1FF ) <= 5 ) then
  4745. Begin
  4746. if ( zSig1 = 0 ) then
  4747. zSig1 := 1;
  4748. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4749. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4750. mul32To64( zSig1, zSig1, term2, term3 );
  4751. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4752. while ( sbits32 (rem1) < 0 ) do
  4753. Begin
  4754. Dec(zSig1);
  4755. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4756. term3 := term3 or 1;
  4757. term2 := term2 or doubleZSig0;
  4758. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4759. End;
  4760. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4761. End;
  4762. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4763. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4764. End;
  4765. {*
  4766. -------------------------------------------------------------------------------
  4767. Returns 1 if the double-precision floating-point value `a' is equal to
  4768. the corresponding value `b', and 0 otherwise. The comparison is performed
  4769. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4770. -------------------------------------------------------------------------------
  4771. *}
  4772. Function float64_eq(a: float64; b: float64): flag;
  4773. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4774. Begin
  4775. if
  4776. (
  4777. ( extractFloat64Exp( a ) = $7FF )
  4778. AND
  4779. (
  4780. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4781. )
  4782. )
  4783. OR (
  4784. ( extractFloat64Exp( b ) = $7FF )
  4785. AND (
  4786. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4787. )
  4788. )
  4789. ) then
  4790. Begin
  4791. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4792. float_raise( float_flag_invalid );
  4793. float64_eq := 0;
  4794. exit;
  4795. End;
  4796. float64_eq := flag(
  4797. ( a.low = b.low )
  4798. AND ( ( a.high = b.high )
  4799. OR ( ( a.low = 0 )
  4800. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4801. ));
  4802. End;
  4803. {*
  4804. -------------------------------------------------------------------------------
  4805. Returns 1 if the double-precision floating-point value `a' is less than
  4806. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4807. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4808. Arithmetic.
  4809. -------------------------------------------------------------------------------
  4810. *}
  4811. Function float64_le(a: float64;b: float64): flag;
  4812. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4813. Var
  4814. aSign, bSign: flag;
  4815. Begin
  4816. if
  4817. (
  4818. ( extractFloat64Exp( a ) = $7FF )
  4819. AND
  4820. (
  4821. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4822. )
  4823. )
  4824. OR (
  4825. ( extractFloat64Exp( b ) = $7FF )
  4826. AND (
  4827. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4828. )
  4829. )
  4830. ) then
  4831. Begin
  4832. float_raise( float_flag_invalid );
  4833. float64_le := 0;
  4834. exit;
  4835. End;
  4836. aSign := extractFloat64Sign( a );
  4837. bSign := extractFloat64Sign( b );
  4838. if ( aSign <> bSign ) then
  4839. Begin
  4840. float64_le := flag(
  4841. (aSign <> 0)
  4842. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4843. = 0 ));
  4844. exit;
  4845. End;
  4846. if aSign <> 0 then
  4847. float64_le := le64( b.high, b.low, a.high, a.low )
  4848. else
  4849. float64_le := le64( a.high, a.low, b.high, b.low );
  4850. End;
  4851. {*
  4852. -------------------------------------------------------------------------------
  4853. Returns 1 if the double-precision floating-point value `a' is less than
  4854. the corresponding value `b', and 0 otherwise. The comparison is performed
  4855. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4856. -------------------------------------------------------------------------------
  4857. *}
  4858. Function float64_lt(a: float64;b: float64): flag;
  4859. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4860. Var
  4861. aSign, bSign: flag;
  4862. Begin
  4863. if
  4864. (
  4865. ( extractFloat64Exp( a ) = $7FF )
  4866. AND
  4867. (
  4868. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4869. )
  4870. )
  4871. OR (
  4872. ( extractFloat64Exp( b ) = $7FF )
  4873. AND (
  4874. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4875. )
  4876. )
  4877. ) then
  4878. Begin
  4879. float_raise( float_flag_invalid );
  4880. float64_lt := 0;
  4881. exit;
  4882. End;
  4883. aSign := extractFloat64Sign( a );
  4884. bSign := extractFloat64Sign( b );
  4885. if ( aSign <> bSign ) then
  4886. Begin
  4887. float64_lt := flag(
  4888. (aSign <> 0)
  4889. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4890. <> 0 ));
  4891. exit;
  4892. End;
  4893. if aSign <> 0 then
  4894. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4895. else
  4896. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4897. End;
  4898. {*
  4899. -------------------------------------------------------------------------------
  4900. Returns 1 if the double-precision floating-point value `a' is equal to
  4901. the corresponding value `b', and 0 otherwise. The invalid exception is
  4902. raised if either operand is a NaN. Otherwise, the comparison is performed
  4903. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4904. -------------------------------------------------------------------------------
  4905. *}
  4906. Function float64_eq_signaling( a: float64; b: float64): flag;
  4907. Begin
  4908. if
  4909. (
  4910. ( extractFloat64Exp( a ) = $7FF )
  4911. AND
  4912. (
  4913. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4914. )
  4915. )
  4916. OR (
  4917. ( extractFloat64Exp( b ) = $7FF )
  4918. AND (
  4919. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4920. )
  4921. )
  4922. ) then
  4923. Begin
  4924. float_raise( float_flag_invalid );
  4925. float64_eq_signaling := 0;
  4926. exit;
  4927. End;
  4928. float64_eq_signaling := flag(
  4929. ( a.low = b.low )
  4930. AND ( ( a.high = b.high )
  4931. OR ( ( a.low = 0 )
  4932. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4933. ));
  4934. End;
  4935. {*
  4936. -------------------------------------------------------------------------------
  4937. Returns 1 if the double-precision floating-point value `a' is less than or
  4938. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4939. cause an exception. Otherwise, the comparison is performed according to the
  4940. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4941. -------------------------------------------------------------------------------
  4942. *}
  4943. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4944. Var
  4945. aSign, bSign : flag;
  4946. Begin
  4947. if
  4948. (
  4949. ( extractFloat64Exp( a ) = $7FF )
  4950. AND
  4951. (
  4952. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4953. )
  4954. )
  4955. OR (
  4956. ( extractFloat64Exp( b ) = $7FF )
  4957. AND (
  4958. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4959. )
  4960. )
  4961. ) then
  4962. Begin
  4963. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4964. float_raise( float_flag_invalid );
  4965. float64_le_quiet := 0;
  4966. exit;
  4967. End;
  4968. aSign := extractFloat64Sign( a );
  4969. bSign := extractFloat64Sign( b );
  4970. if ( aSign <> bSign ) then
  4971. Begin
  4972. float64_le_quiet := flag
  4973. ((aSign <> 0)
  4974. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4975. = 0 ));
  4976. exit;
  4977. End;
  4978. if aSign <> 0 then
  4979. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4980. else
  4981. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4982. End;
  4983. {*
  4984. -------------------------------------------------------------------------------
  4985. Returns 1 if the double-precision floating-point value `a' is less than
  4986. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4987. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4988. Standard for Binary Floating-Point Arithmetic.
  4989. -------------------------------------------------------------------------------
  4990. *}
  4991. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  4992. Var
  4993. aSign, bSign: flag;
  4994. Begin
  4995. if
  4996. (
  4997. ( extractFloat64Exp( a ) = $7FF )
  4998. AND
  4999. (
  5000. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5001. )
  5002. )
  5003. OR (
  5004. ( extractFloat64Exp( b ) = $7FF )
  5005. AND (
  5006. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5007. )
  5008. )
  5009. ) then
  5010. Begin
  5011. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5012. float_raise( float_flag_invalid );
  5013. float64_lt_quiet := 0;
  5014. exit;
  5015. End;
  5016. aSign := extractFloat64Sign( a );
  5017. bSign := extractFloat64Sign( b );
  5018. if ( aSign <> bSign ) then
  5019. Begin
  5020. float64_lt_quiet := flag(
  5021. (aSign<>0)
  5022. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5023. <> 0 ));
  5024. exit;
  5025. End;
  5026. If aSign <> 0 then
  5027. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5028. else
  5029. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5030. End;
  5031. {*----------------------------------------------------------------------------
  5032. | Returns the result of converting the 64-bit two's complement integer `a'
  5033. | to the single-precision floating-point format. The conversion is performed
  5034. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5035. *----------------------------------------------------------------------------*}
  5036. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5037. var
  5038. zSign : flag;
  5039. absA : uint64;
  5040. shiftCount: int8;
  5041. zSig : bits32;
  5042. intval : int64rec;
  5043. Begin
  5044. if ( a = 0 ) then
  5045. begin
  5046. int64_to_float32.float32 := 0;
  5047. exit;
  5048. end;
  5049. if a < 0 then
  5050. zSign := flag(TRUE)
  5051. else
  5052. zSign := flag(FALSE);
  5053. if zSign<>0 then
  5054. absA := -a
  5055. else
  5056. absA := a;
  5057. shiftCount := countLeadingZeros64( absA ) - 40;
  5058. if ( 0 <= shiftCount ) then
  5059. begin
  5060. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5061. end
  5062. else
  5063. begin
  5064. shiftCount := shiftCount + 7;
  5065. if ( shiftCount < 0 ) then
  5066. begin
  5067. intval.low := int64rec(AbsA).low;
  5068. intval.high := int64rec(AbsA).high;
  5069. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5070. intval.low, intval.high);
  5071. int64rec(absA).low := intval.low;
  5072. int64rec(absA).high := intval.high;
  5073. end
  5074. else
  5075. absA := absA shl shiftCount;
  5076. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5077. end;
  5078. End;
  5079. {*----------------------------------------------------------------------------
  5080. | Returns the result of converting the 64-bit two's complement integer `a'
  5081. | to the double-precision floating-point format. The conversion is performed
  5082. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5083. *----------------------------------------------------------------------------*}
  5084. function int64_to_float64( a: int64 ): float64;
  5085. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5086. var
  5087. zSign : flag;
  5088. float_result : float64;
  5089. intval : int64rec;
  5090. AbsA : bits64;
  5091. shiftcount : int8;
  5092. zSig0, zSig1 : bits32;
  5093. Begin
  5094. if ( a = 0 ) then
  5095. Begin
  5096. packFloat64( 0, 0, 0, 0, result );
  5097. exit;
  5098. end;
  5099. zSign := flag( a < 0 );
  5100. if ZSign<>0 then
  5101. AbsA := -a
  5102. else
  5103. AbsA := a;
  5104. shiftCount := countLeadingZeros64( absA ) - 11;
  5105. if ( 0 <= shiftCount ) then
  5106. Begin
  5107. absA := absA shl shiftcount;
  5108. zSig0:=int64rec(absA).high;
  5109. zSig1:=int64rec(absA).low;
  5110. End
  5111. else
  5112. Begin
  5113. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  5114. End;
  5115. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5116. int64_to_float64:= float_result;
  5117. End;
  5118. {*----------------------------------------------------------------------------
  5119. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5120. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5121. | Otherwise, returns 0.
  5122. *----------------------------------------------------------------------------*}
  5123. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5124. begin
  5125. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5126. end;
  5127. {*----------------------------------------------------------------------------
  5128. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5129. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5130. | Otherwise, returns 0.
  5131. *----------------------------------------------------------------------------*}
  5132. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5133. begin
  5134. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5135. end;
  5136. {*----------------------------------------------------------------------------
  5137. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5138. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5139. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5140. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5141. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5142. | the most-significant bit of the extra result, and the other 63 bits of the
  5143. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5144. | were all zero. This extra result is stored in the location pointed to by
  5145. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5146. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5147. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5148. | fixed-point value is shifted right by the number of bits given in `count',
  5149. | and the integer part of the result is returned at the locations pointed to
  5150. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5151. | corrupted as described above, and is returned at the location pointed to by
  5152. | `z2Ptr'.)
  5153. *----------------------------------------------------------------------------*}
  5154. procedure shift128ExtraRightJamming(
  5155. a0: bits64;
  5156. a1: bits64;
  5157. a2: bits64;
  5158. count: int16;
  5159. var z0Ptr: bits64;
  5160. var z1Ptr: bits64;
  5161. var z2Ptr: bits64);
  5162. var
  5163. z0, z1, z2: bits64;
  5164. negCount: int8;
  5165. begin
  5166. negCount := ( - count ) and 63;
  5167. if ( count = 0 ) then
  5168. begin
  5169. z2 := a2;
  5170. z1 := a1;
  5171. z0 := a0;
  5172. end
  5173. else begin
  5174. if ( count < 64 ) then
  5175. begin
  5176. z2 := a1 shr negCount;
  5177. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5178. z0 := a0 shr count;
  5179. end
  5180. else begin
  5181. if ( count = 64 ) then
  5182. begin
  5183. z2 := a1;
  5184. z1 := a0;
  5185. end
  5186. else begin
  5187. a2 := a2 or a1;
  5188. if ( count < 128 ) then
  5189. begin
  5190. z2 := a0 shl negCount;
  5191. z1 := a0 shr ( count and 63 );
  5192. end
  5193. else begin
  5194. if ( count = 128 ) then
  5195. z2 := a0
  5196. else
  5197. z2 := ord( a0 <> 0 );
  5198. z1 := 0;
  5199. end;
  5200. end;
  5201. z0 := 0;
  5202. end;
  5203. z2 := z2 or ord( a2 <> 0 );
  5204. end;
  5205. z2Ptr := z2;
  5206. z1Ptr := z1;
  5207. z0Ptr := z0;
  5208. end;
  5209. {*----------------------------------------------------------------------------
  5210. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5211. | _plus_ the number of bits given in `count'. The shifted result is at most
  5212. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5213. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5214. | shifted off is the most-significant bit of the extra result, and the other
  5215. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5216. | bits shifted off were all zero. This extra result is stored in the location
  5217. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5218. | (This routine makes more sense if `a0' and `a1' are considered to form
  5219. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5220. | point value is shifted right by the number of bits given in `count', and
  5221. | the integer part of the result is returned at the location pointed to by
  5222. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5223. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5224. *----------------------------------------------------------------------------*}
  5225. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5226. var
  5227. z0, z1: bits64;
  5228. negCount: int8;
  5229. begin
  5230. negCount := ( - count ) and 63;
  5231. if ( count = 0 ) then
  5232. begin
  5233. z1 := a1;
  5234. z0 := a0;
  5235. end
  5236. else if ( count < 64 ) then
  5237. begin
  5238. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5239. z0 := a0 shr count;
  5240. end
  5241. else begin
  5242. if ( count = 64 ) then
  5243. begin
  5244. z1 := a0 or ord( a1 <> 0 );
  5245. end
  5246. else begin
  5247. z1 := ord( ( a0 or a1 ) <> 0 );
  5248. end;
  5249. z0 := 0;
  5250. end;
  5251. z1Ptr := z1;
  5252. z0Ptr := z0;
  5253. end;
  5254. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5255. {*----------------------------------------------------------------------------
  5256. | Returns the fraction bits of the extended double-precision floating-point
  5257. | value `a'.
  5258. *----------------------------------------------------------------------------*}
  5259. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5260. begin
  5261. result:=a.low;
  5262. end;
  5263. {*----------------------------------------------------------------------------
  5264. | Returns the exponent bits of the extended double-precision floating-point
  5265. | value `a'.
  5266. *----------------------------------------------------------------------------*}
  5267. function extractFloatx80Exp(a : floatx80): int32;inline;
  5268. begin
  5269. result:=a.high and $7FFF;
  5270. end;
  5271. {*----------------------------------------------------------------------------
  5272. | Returns the sign bit of the extended double-precision floating-point value
  5273. | `a'.
  5274. *----------------------------------------------------------------------------*}
  5275. function extractFloatx80Sign(a : floatx80): flag;inline;
  5276. begin
  5277. result:=a.high shr 15;
  5278. end;
  5279. {*----------------------------------------------------------------------------
  5280. | Normalizes the subnormal extended double-precision floating-point value
  5281. | represented by the denormalized significand `aSig'. The normalized exponent
  5282. | and significand are stored at the locations pointed to by `zExpPtr' and
  5283. | `zSigPtr', respectively.
  5284. *----------------------------------------------------------------------------*}
  5285. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5286. var
  5287. shiftCount: int8;
  5288. begin
  5289. shiftCount := countLeadingZeros64( aSig );
  5290. zSigPtr := aSig shl shiftCount;
  5291. zExpPtr := 1 - shiftCount;
  5292. end;
  5293. {*----------------------------------------------------------------------------
  5294. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5295. | extended double-precision floating-point value, returning the result.
  5296. *----------------------------------------------------------------------------*}
  5297. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5298. var
  5299. z: floatx80;
  5300. begin
  5301. z.low := zSig;
  5302. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5303. result:=z;
  5304. end;
  5305. {*----------------------------------------------------------------------------
  5306. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5307. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5308. | and returns the proper extended double-precision floating-point value
  5309. | corresponding to the abstract input. Ordinarily, the abstract value is
  5310. | rounded and packed into the extended double-precision format, with the
  5311. | inexact exception raised if the abstract input cannot be represented
  5312. | exactly. However, if the abstract value is too large, the overflow and
  5313. | inexact exceptions are raised and an infinity or maximal finite value is
  5314. | returned. If the abstract value is too small, the input value is rounded to
  5315. | a subnormal number, and the underflow and inexact exceptions are raised if
  5316. | the abstract input cannot be represented exactly as a subnormal extended
  5317. | double-precision floating-point number.
  5318. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5319. | number of bits as single or double precision, respectively. Otherwise, the
  5320. | result is rounded to the full precision of the extended double-precision
  5321. | format.
  5322. | The input significand must be normalized or smaller. If the input
  5323. | significand is not normalized, `zExp' must be 0; in that case, the result
  5324. | returned is a subnormal number, and it must not require rounding. The
  5325. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5326. | Floating-Point Arithmetic.
  5327. *----------------------------------------------------------------------------*}
  5328. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5329. var
  5330. roundingMode: int8;
  5331. roundNearestEven, increment, isTiny: flag;
  5332. roundIncrement, roundMask, roundBits: int64;
  5333. label
  5334. precision80;
  5335. begin
  5336. roundingMode := softfloat_rounding_mode;
  5337. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5338. if ( roundingPrecision = 80 ) then
  5339. goto precision80;
  5340. if ( roundingPrecision = 64 ) then
  5341. begin
  5342. roundIncrement := int64( $0000000000000400 );
  5343. roundMask := int64( $00000000000007FF );
  5344. end
  5345. else if ( roundingPrecision = 32 ) then
  5346. begin
  5347. roundIncrement := int64( $0000008000000000 );
  5348. roundMask := int64( $000000FFFFFFFFFF );
  5349. end
  5350. else begin
  5351. goto precision80;
  5352. end;
  5353. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5354. if ( not (roundNearestEven<>0) ) then
  5355. begin
  5356. if ( roundingMode = float_round_to_zero ) then
  5357. begin
  5358. roundIncrement := 0;
  5359. end
  5360. else begin
  5361. roundIncrement := roundMask;
  5362. if ( zSign<>0 ) then
  5363. begin
  5364. if ( roundingMode = float_round_up ) then
  5365. roundIncrement := 0;
  5366. end
  5367. else begin
  5368. if ( roundingMode = float_round_down ) then
  5369. roundIncrement := 0;
  5370. end;
  5371. end;
  5372. end;
  5373. roundBits := zSig0 and roundMask;
  5374. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5375. if ( ( $7FFE < zExp )
  5376. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5377. ) begin
  5378. goto overflow;
  5379. end;
  5380. if ( zExp <= 0 ) begin
  5381. isTiny =
  5382. ( float_detect_tininess = float_tininess_before_rounding )
  5383. or ( zExp < 0 )
  5384. or ( zSig0 <= zSig0 + roundIncrement );
  5385. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5386. zExp := 0;
  5387. roundBits := zSig0 and roundMask;
  5388. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5389. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5390. zSig0 += roundIncrement;
  5391. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5392. roundIncrement := roundMask + 1;
  5393. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5394. roundMask |= roundIncrement;
  5395. end;
  5396. zSig0 = ~ roundMask;
  5397. result:=packFloatx80( zSign, zExp, zSig0 );
  5398. end;
  5399. end;
  5400. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5401. zSig0 += roundIncrement;
  5402. if ( zSig0 < roundIncrement ) begin
  5403. ++zExp;
  5404. zSig0 := LIT64( $8000000000000000 );
  5405. end;
  5406. roundIncrement := roundMask + 1;
  5407. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5408. roundMask |= roundIncrement;
  5409. end;
  5410. zSig0 = ~ roundMask;
  5411. if ( zSig0 = 0 ) zExp := 0;
  5412. result:=packFloatx80( zSign, zExp, zSig0 );
  5413. precision80:
  5414. increment := ( (sbits64) zSig1 < 0 );
  5415. if ( ! roundNearestEven ) begin
  5416. if ( roundingMode = float_round_to_zero ) begin
  5417. increment := 0;
  5418. end;
  5419. else begin
  5420. if ( zSign ) begin
  5421. increment := ( roundingMode = float_round_down ) and zSig1;
  5422. end;
  5423. else begin
  5424. increment := ( roundingMode = float_round_up ) and zSig1;
  5425. end;
  5426. end;
  5427. end;
  5428. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5429. if ( ( $7FFE < zExp )
  5430. or ( ( zExp = $7FFE )
  5431. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5432. and increment
  5433. )
  5434. ) begin
  5435. roundMask := 0;
  5436. overflow:
  5437. float_raise( float_flag_overflow or float_flag_inexact );
  5438. if ( ( roundingMode = float_round_to_zero )
  5439. or ( zSign and ( roundingMode = float_round_up ) )
  5440. or ( ! zSign and ( roundingMode = float_round_down ) )
  5441. ) begin
  5442. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5443. end;
  5444. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5445. end;
  5446. if ( zExp <= 0 ) begin
  5447. isTiny =
  5448. ( float_detect_tininess = float_tininess_before_rounding )
  5449. or ( zExp < 0 )
  5450. or ! increment
  5451. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5452. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5453. zExp := 0;
  5454. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5455. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5456. if ( roundNearestEven ) begin
  5457. increment := ( (sbits64) zSig1 < 0 );
  5458. end;
  5459. else begin
  5460. if ( zSign ) begin
  5461. increment := ( roundingMode = float_round_down ) and zSig1;
  5462. end;
  5463. else begin
  5464. increment := ( roundingMode = float_round_up ) and zSig1;
  5465. end;
  5466. end;
  5467. if ( increment ) begin
  5468. ++zSig0;
  5469. zSig0 =
  5470. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5471. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5472. end;
  5473. result:=packFloatx80( zSign, zExp, zSig0 );
  5474. end;
  5475. end;
  5476. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5477. if ( increment ) begin
  5478. ++zSig0;
  5479. if ( zSig0 = 0 ) begin
  5480. ++zExp;
  5481. zSig0 := LIT64( $8000000000000000 );
  5482. end;
  5483. else begin
  5484. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5485. end;
  5486. end;
  5487. else begin
  5488. if ( zSig0 = 0 ) zExp := 0;
  5489. end;
  5490. result:=packFloatx80( zSign, zExp, zSig0 );
  5491. end;
  5492. {*----------------------------------------------------------------------------
  5493. | Takes an abstract floating-point value having sign `zSign', exponent
  5494. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5495. | and returns the proper extended double-precision floating-point value
  5496. | corresponding to the abstract input. This routine is just like
  5497. | `roundAndPackFloatx80' except that the input significand does not have to be
  5498. | normalized.
  5499. *----------------------------------------------------------------------------*}
  5500. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5501. var
  5502. shiftCount: int8;
  5503. begin
  5504. if ( zSig0 = 0 ) begin
  5505. zSig0 := zSig1;
  5506. zSig1 := 0;
  5507. zExp -= 64;
  5508. end;
  5509. shiftCount := countLeadingZeros64( zSig0 );
  5510. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5511. zExp := eExp - shiftCount;
  5512. return
  5513. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5514. end;
  5515. {*----------------------------------------------------------------------------
  5516. | Returns the result of converting the extended double-precision floating-
  5517. | point value `a' to the 32-bit two's complement integer format. The
  5518. | conversion is performed according to the IEC/IEEE Standard for Binary
  5519. | Floating-Point Arithmetic---which means in particular that the conversion
  5520. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5521. | largest positive integer is returned. Otherwise, if the conversion
  5522. | overflows, the largest integer with the same sign as `a' is returned.
  5523. *----------------------------------------------------------------------------*}
  5524. function floatx80_to_int32(a: floatx80): int32;
  5525. var
  5526. aSign: flag;
  5527. aExp, shiftCount: int32;
  5528. aSig: bits64;
  5529. begin
  5530. aSig := extractFloatx80Frac( a );
  5531. aExp := extractFloatx80Exp( a );
  5532. aSign := extractFloatx80Sign( a );
  5533. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5534. shiftCount := $4037 - aExp;
  5535. if ( shiftCount <= 0 ) shiftCount := 1;
  5536. shift64RightJamming( aSig, shiftCount, aSig );
  5537. result := roundAndPackInt32( aSign, aSig );
  5538. end;
  5539. {*----------------------------------------------------------------------------
  5540. | Returns the result of converting the extended double-precision floating-
  5541. | point value `a' to the 32-bit two's complement integer format. The
  5542. | conversion is performed according to the IEC/IEEE Standard for Binary
  5543. | Floating-Point Arithmetic, except that the conversion is always rounded
  5544. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5545. | Otherwise, if the conversion overflows, the largest integer with the same
  5546. | sign as `a' is returned.
  5547. *----------------------------------------------------------------------------*}
  5548. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5549. var
  5550. aSign: flag;
  5551. aExp, shiftCount: int32;
  5552. aSig, savedASig: bits64;
  5553. z: int32;
  5554. begin
  5555. aSig := extractFloatx80Frac( a );
  5556. aExp := extractFloatx80Exp( a );
  5557. aSign := extractFloatx80Sign( a );
  5558. if ( $401E < aExp ) begin
  5559. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5560. goto invalid;
  5561. end;
  5562. else if ( aExp < $3FFF ) begin
  5563. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5564. result := 0;
  5565. end;
  5566. shiftCount := $403E - aExp;
  5567. savedASig := aSig;
  5568. aSig >>= shiftCount;
  5569. z := aSig;
  5570. if ( aSign ) z := - z;
  5571. if ( ( z < 0 ) xor aSign ) begin
  5572. invalid:
  5573. float_raise( float_flag_invalid );
  5574. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5575. end;
  5576. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5577. softfloat_exception_flags or= float_flag_inexact;
  5578. end;
  5579. result := z;
  5580. end;
  5581. {*----------------------------------------------------------------------------
  5582. | Returns the result of converting the extended double-precision floating-
  5583. | point value `a' to the 64-bit two's complement integer format. The
  5584. | conversion is performed according to the IEC/IEEE Standard for Binary
  5585. | Floating-Point Arithmetic---which means in particular that the conversion
  5586. | is rounded according to the current rounding mode. If `a' is a NaN,
  5587. | the largest positive integer is returned. Otherwise, if the conversion
  5588. | overflows, the largest integer with the same sign as `a' is returned.
  5589. *----------------------------------------------------------------------------*}
  5590. function floatx80_to_int64(a: floatx80): int64;
  5591. var
  5592. aSign: flag;
  5593. aExp, shiftCount: int32;
  5594. aSig, aSigExtra: bits64;
  5595. begin
  5596. aSig := extractFloatx80Frac( a );
  5597. aExp := extractFloatx80Exp( a );
  5598. aSign := extractFloatx80Sign( a );
  5599. shiftCount := $403E - aExp;
  5600. if ( shiftCount <= 0 ) begin
  5601. if ( shiftCount ) begin
  5602. float_raise( float_flag_invalid );
  5603. if ( ! aSign
  5604. or ( ( aExp = $7FFF )
  5605. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5606. ) begin
  5607. result := LIT64( $7FFFFFFFFFFFFFFF );
  5608. end;
  5609. result := (sbits64) LIT64( $8000000000000000 );
  5610. end;
  5611. aSigExtra := 0;
  5612. end;
  5613. else begin
  5614. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5615. end;
  5616. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5617. end;
  5618. {*----------------------------------------------------------------------------
  5619. | Returns the result of converting the extended double-precision floating-
  5620. | point value `a' to the 64-bit two's complement integer format. The
  5621. | conversion is performed according to the IEC/IEEE Standard for Binary
  5622. | Floating-Point Arithmetic, except that the conversion is always rounded
  5623. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5624. | Otherwise, if the conversion overflows, the largest integer with the same
  5625. | sign as `a' is returned.
  5626. *----------------------------------------------------------------------------*}
  5627. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5628. var
  5629. aSign: flag;
  5630. aExp, shiftCount: int32;
  5631. aSig: bits64;
  5632. z: int64;
  5633. begin
  5634. aSig := extractFloatx80Frac( a );
  5635. aExp := extractFloatx80Exp( a );
  5636. aSign := extractFloatx80Sign( a );
  5637. shiftCount := aExp - $403E;
  5638. if ( 0 <= shiftCount ) begin
  5639. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5640. if ( ( a.high <> $C03E ) or aSig ) begin
  5641. float_raise( float_flag_invalid );
  5642. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5643. result := LIT64( $7FFFFFFFFFFFFFFF );
  5644. end;
  5645. end;
  5646. result := (sbits64) LIT64( $8000000000000000 );
  5647. end;
  5648. else if ( aExp < $3FFF ) begin
  5649. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5650. result := 0;
  5651. end;
  5652. z := aSig>>( - shiftCount );
  5653. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5654. softfloat_exception_flags or= float_flag_inexact;
  5655. end;
  5656. if ( aSign ) z := - z;
  5657. result := z;
  5658. end;
  5659. {*----------------------------------------------------------------------------
  5660. | Returns the result of converting the extended double-precision floating-
  5661. | point value `a' to the single-precision floating-point format. The
  5662. | conversion is performed according to the IEC/IEEE Standard for Binary
  5663. | Floating-Point Arithmetic.
  5664. *----------------------------------------------------------------------------*}
  5665. function floatx80_to_float32(a: floatx80): float32;
  5666. var
  5667. aSign: flag;
  5668. aExp: int32;
  5669. aSig: bits64;
  5670. begin
  5671. aSig := extractFloatx80Frac( a );
  5672. aExp := extractFloatx80Exp( a );
  5673. aSign := extractFloatx80Sign( a );
  5674. if ( aExp = $7FFF ) begin
  5675. if ( (bits64) ( aSig shl 1 ) ) begin
  5676. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5677. end;
  5678. result := packFloat32( aSign, $FF, 0 );
  5679. end;
  5680. shift64RightJamming( aSig, 33, aSig );
  5681. if ( aExp or aSig ) aExp -= $3F81;
  5682. result := roundAndPackFloat32( aSign, aExp, aSig );
  5683. end;
  5684. {*----------------------------------------------------------------------------
  5685. | Returns the result of converting the extended double-precision floating-
  5686. | point value `a' to the double-precision floating-point format. The
  5687. | conversion is performed according to the IEC/IEEE Standard for Binary
  5688. | Floating-Point Arithmetic.
  5689. *----------------------------------------------------------------------------*}
  5690. function floatx80_to_float64(a: floatx80): float64;
  5691. var
  5692. aSign: flag;
  5693. aExp: int32;
  5694. aSig, zSig: bits64;
  5695. begin
  5696. aSig := extractFloatx80Frac( a );
  5697. aExp := extractFloatx80Exp( a );
  5698. aSign := extractFloatx80Sign( a );
  5699. if ( aExp = $7FFF ) begin
  5700. if ( (bits64) ( aSig shl 1 ) ) begin
  5701. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5702. end;
  5703. result := packFloat64( aSign, $7FF, 0 );
  5704. end;
  5705. shift64RightJamming( aSig, 1, zSig );
  5706. if ( aExp or aSig ) aExp -= $3C01;
  5707. result := roundAndPackFloat64( aSign, aExp, zSig );
  5708. end;
  5709. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5710. {*----------------------------------------------------------------------------
  5711. | Returns the result of converting the extended double-precision floating-
  5712. | point value `a' to the quadruple-precision floating-point format. The
  5713. | conversion is performed according to the IEC/IEEE Standard for Binary
  5714. | Floating-Point Arithmetic.
  5715. *----------------------------------------------------------------------------*}
  5716. function floatx80_to_float128(a: floatx80): float128;
  5717. var
  5718. aSign: flag;
  5719. aExp: int16;
  5720. aSig, zSig0, zSig1: bits64;
  5721. begin
  5722. aSig := extractFloatx80Frac( a );
  5723. aExp := extractFloatx80Exp( a );
  5724. aSign := extractFloatx80Sign( a );
  5725. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5726. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5727. end;
  5728. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5729. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5730. end;
  5731. {$endif FPC_SOFTFLOAT_FLOAT128}
  5732. {*----------------------------------------------------------------------------
  5733. | Rounds the extended double-precision floating-point value `a' to an integer,
  5734. | and Returns the result as an extended quadruple-precision floating-point
  5735. | value. The operation is performed according to the IEC/IEEE Standard for
  5736. | Binary Floating-Point Arithmetic.
  5737. *----------------------------------------------------------------------------*}
  5738. function floatx80_round_to_int(a: floatx80): floatx80;
  5739. var
  5740. aSign: flag;
  5741. aExp: int32;
  5742. lastBitMask, roundBitsMask: bits64;
  5743. roundingMode: int8;
  5744. z: floatx80;
  5745. begin
  5746. aExp := extractFloatx80Exp( a );
  5747. if ( $403E <= aExp ) begin
  5748. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5749. result := propagateFloatx80NaN( a, a );
  5750. end;
  5751. result := a;
  5752. end;
  5753. if ( aExp < $3FFF ) begin
  5754. if ( ( aExp = 0 )
  5755. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5756. result := a;
  5757. end;
  5758. softfloat_exception_flags or= float_flag_inexact;
  5759. aSign := extractFloatx80Sign( a );
  5760. switch ( softfloat_rounding_mode ) begin
  5761. case float_round_nearest_even:
  5762. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5763. ) begin
  5764. result :=
  5765. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5766. end;
  5767. break;
  5768. case float_round_down:
  5769. result :=
  5770. aSign ?
  5771. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5772. : packFloatx80( 0, 0, 0 );
  5773. case float_round_up:
  5774. result :=
  5775. aSign ? packFloatx80( 1, 0, 0 )
  5776. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5777. end;
  5778. result := packFloatx80( aSign, 0, 0 );
  5779. end;
  5780. lastBitMask := 1;
  5781. lastBitMask shl = $403E - aExp;
  5782. roundBitsMask := lastBitMask - 1;
  5783. z := a;
  5784. roundingMode := softfloat_rounding_mode;
  5785. if ( roundingMode = float_round_nearest_even ) begin
  5786. z.low += lastBitMask>>1;
  5787. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5788. end;
  5789. else if ( roundingMode <> float_round_to_zero ) begin
  5790. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5791. z.low += roundBitsMask;
  5792. end;
  5793. end;
  5794. z.low = ~ roundBitsMask;
  5795. if ( z.low = 0 ) begin
  5796. ++z.high;
  5797. z.low := LIT64( $8000000000000000 );
  5798. end;
  5799. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5800. result := z;
  5801. end;
  5802. {*----------------------------------------------------------------------------
  5803. | Returns the result of adding the absolute values of the extended double-
  5804. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5805. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5806. | The addition is performed according to the IEC/IEEE Standard for Binary
  5807. | Floating-Point Arithmetic.
  5808. *----------------------------------------------------------------------------*}
  5809. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5810. var
  5811. aExp, bExp, zExp: int32;
  5812. aSig, bSig, zSig0, zSig1: bits64;
  5813. expDiff: int32;
  5814. begin
  5815. aSig := extractFloatx80Frac( a );
  5816. aExp := extractFloatx80Exp( a );
  5817. bSig := extractFloatx80Frac( b );
  5818. bExp := extractFloatx80Exp( b );
  5819. expDiff := aExp - bExp;
  5820. if ( 0 < expDiff ) begin
  5821. if ( aExp = $7FFF ) begin
  5822. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5823. result := a;
  5824. end;
  5825. if ( bExp = 0 ) --expDiff;
  5826. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5827. zExp := aExp;
  5828. end;
  5829. else if ( expDiff < 0 ) begin
  5830. if ( bExp = $7FFF ) begin
  5831. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5832. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5833. end;
  5834. if ( aExp = 0 ) ++expDiff;
  5835. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5836. zExp := bExp;
  5837. end;
  5838. else begin
  5839. if ( aExp = $7FFF ) begin
  5840. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5841. result := propagateFloatx80NaN( a, b );
  5842. end;
  5843. result := a;
  5844. end;
  5845. zSig1 := 0;
  5846. zSig0 := aSig + bSig;
  5847. if ( aExp = 0 ) begin
  5848. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5849. goto roundAndPack;
  5850. end;
  5851. zExp := aExp;
  5852. goto shiftRight1;
  5853. end;
  5854. zSig0 := aSig + bSig;
  5855. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5856. shiftRight1:
  5857. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5858. zSig0 or= LIT64( $8000000000000000 );
  5859. ++zExp;
  5860. roundAndPack:
  5861. result :=
  5862. roundAndPackFloatx80(
  5863. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5864. end;
  5865. {*----------------------------------------------------------------------------
  5866. | Returns the result of subtracting the absolute values of the extended
  5867. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5868. | difference is negated before being returned. `zSign' is ignored if the
  5869. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5870. | Standard for Binary Floating-Point Arithmetic.
  5871. *----------------------------------------------------------------------------*}
  5872. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5873. var
  5874. aExp, bExp, zExp: int32;
  5875. aSig, bSig, zSig0, zSig1: bits64;
  5876. expDiff: int32;
  5877. z: floatx80;
  5878. begin
  5879. aSig := extractFloatx80Frac( a );
  5880. aExp := extractFloatx80Exp( a );
  5881. bSig := extractFloatx80Frac( b );
  5882. bExp := extractFloatx80Exp( b );
  5883. expDiff := aExp - bExp;
  5884. if ( 0 < expDiff ) goto aExpBigger;
  5885. if ( expDiff < 0 ) goto bExpBigger;
  5886. if ( aExp = $7FFF ) begin
  5887. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5888. result := propagateFloatx80NaN( a, b );
  5889. end;
  5890. float_raise( float_flag_invalid );
  5891. z.low := floatx80_default_nan_low;
  5892. z.high := floatx80_default_nan_high;
  5893. result := z;
  5894. end;
  5895. if ( aExp = 0 ) begin
  5896. aExp := 1;
  5897. bExp := 1;
  5898. end;
  5899. zSig1 := 0;
  5900. if ( bSig < aSig ) goto aBigger;
  5901. if ( aSig < bSig ) goto bBigger;
  5902. result := packFloatx80( softfloat_rounding_mode = float_round_down, 0, 0 );
  5903. bExpBigger:
  5904. if ( bExp = $7FFF ) begin
  5905. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5906. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5907. end;
  5908. if ( aExp = 0 ) ++expDiff;
  5909. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5910. bBigger:
  5911. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  5912. zExp := bExp;
  5913. zSign xor = 1;
  5914. goto normalizeRoundAndPack;
  5915. aExpBigger:
  5916. if ( aExp = $7FFF ) begin
  5917. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5918. result := a;
  5919. end;
  5920. if ( bExp = 0 ) --expDiff;
  5921. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5922. aBigger:
  5923. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  5924. zExp := aExp;
  5925. normalizeRoundAndPack:
  5926. result :=
  5927. normalizeRoundAndPackFloatx80(
  5928. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5929. end;
  5930. {*----------------------------------------------------------------------------
  5931. | Returns the result of adding the extended double-precision floating-point
  5932. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  5933. | Standard for Binary Floating-Point Arithmetic.
  5934. *----------------------------------------------------------------------------*}
  5935. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  5936. var
  5937. aSign, bSign: flag;
  5938. begin
  5939. aSign := extractFloatx80Sign( a );
  5940. bSign := extractFloatx80Sign( b );
  5941. if ( aSign = bSign ) begin
  5942. result := addFloatx80Sigs( a, b, aSign );
  5943. end;
  5944. else begin
  5945. result := subFloatx80Sigs( a, b, aSign );
  5946. end;
  5947. end;
  5948. {*----------------------------------------------------------------------------
  5949. | Returns the result of subtracting the extended double-precision floating-
  5950. | point values `a' and `b'. The operation is performed according to the
  5951. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5952. *----------------------------------------------------------------------------*}
  5953. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  5954. var
  5955. aSign, bSign: flag;
  5956. begin
  5957. aSign := extractFloatx80Sign( a );
  5958. bSign := extractFloatx80Sign( b );
  5959. if ( aSign = bSign ) begin
  5960. result := subFloatx80Sigs( a, b, aSign );
  5961. end;
  5962. else begin
  5963. result := addFloatx80Sigs( a, b, aSign );
  5964. end;
  5965. end;
  5966. {*----------------------------------------------------------------------------
  5967. | Returns the result of multiplying the extended double-precision floating-
  5968. | point values `a' and `b'. The operation is performed according to the
  5969. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5970. *----------------------------------------------------------------------------*}
  5971. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  5972. var
  5973. aSign, bSign, zSign: flag;
  5974. aExp, bExp, zExp: int32;
  5975. aSig, bSig, zSig0, zSig1: bits64;
  5976. z: floatx80;
  5977. begin
  5978. aSig := extractFloatx80Frac( a );
  5979. aExp := extractFloatx80Exp( a );
  5980. aSign := extractFloatx80Sign( a );
  5981. bSig := extractFloatx80Frac( b );
  5982. bExp := extractFloatx80Exp( b );
  5983. bSign := extractFloatx80Sign( b );
  5984. zSign := aSign xor bSign;
  5985. if ( aExp = $7FFF ) begin
  5986. if ( (bits64) ( aSig shl 1 )
  5987. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5988. result := propagateFloatx80NaN( a, b );
  5989. end;
  5990. if ( ( bExp or bSig ) = 0 ) goto invalid;
  5991. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5992. end;
  5993. if ( bExp = $7FFF ) begin
  5994. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5995. if ( ( aExp or aSig ) = 0 ) begin
  5996. invalid:
  5997. float_raise( float_flag_invalid );
  5998. z.low := floatx80_default_nan_low;
  5999. z.high := floatx80_default_nan_high;
  6000. result := z;
  6001. end;
  6002. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6003. end;
  6004. if ( aExp = 0 ) begin
  6005. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6006. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6007. end;
  6008. if ( bExp = 0 ) begin
  6009. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6010. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6011. end;
  6012. zExp := aExp + bExp - $3FFE;
  6013. mul64To128( aSig, bSig, zSig0, zSig1 );
  6014. if ( 0 < (sbits64) zSig0 ) begin
  6015. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6016. --zExp;
  6017. end;
  6018. result :=
  6019. roundAndPackFloatx80(
  6020. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6021. end;
  6022. {*----------------------------------------------------------------------------
  6023. | Returns the result of dividing the extended double-precision floating-point
  6024. | value `a' by the corresponding value `b'. The operation is performed
  6025. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6026. *----------------------------------------------------------------------------*}
  6027. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6028. var
  6029. aSign, bSign, zSign: flag;
  6030. aExp, bExp, zExp: int32;
  6031. aSig, bSig, zSig0, zSig1: bits64;
  6032. rem0, rem1, rem2, term0, term1, term2: bits64;
  6033. z: floatx80;
  6034. begin
  6035. aSig := extractFloatx80Frac( a );
  6036. aExp := extractFloatx80Exp( a );
  6037. aSign := extractFloatx80Sign( a );
  6038. bSig := extractFloatx80Frac( b );
  6039. bExp := extractFloatx80Exp( b );
  6040. bSign := extractFloatx80Sign( b );
  6041. zSign := aSign xor bSign;
  6042. if ( aExp = $7FFF ) begin
  6043. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6044. if ( bExp = $7FFF ) begin
  6045. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6046. goto invalid;
  6047. end;
  6048. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6049. end;
  6050. if ( bExp = $7FFF ) begin
  6051. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6052. result := packFloatx80( zSign, 0, 0 );
  6053. end;
  6054. if ( bExp = 0 ) begin
  6055. if ( bSig = 0 ) begin
  6056. if ( ( aExp or aSig ) = 0 ) begin
  6057. invalid:
  6058. float_raise( float_flag_invalid );
  6059. z.low := floatx80_default_nan_low;
  6060. z.high := floatx80_default_nan_high;
  6061. result := z;
  6062. end;
  6063. float_raise( float_flag_divbyzero );
  6064. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6065. end;
  6066. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6067. end;
  6068. if ( aExp = 0 ) begin
  6069. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6070. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6071. end;
  6072. zExp := aExp - bExp + $3FFE;
  6073. rem1 := 0;
  6074. if ( bSig <= aSig ) begin
  6075. shift128Right( aSig, 0, 1, aSig, rem1 );
  6076. ++zExp;
  6077. end;
  6078. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6079. mul64To128( bSig, zSig0, term0, term1 );
  6080. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6081. while ( (sbits64) rem0 < 0 ) begin
  6082. --zSig0;
  6083. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6084. end;
  6085. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6086. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6087. mul64To128( bSig, zSig1, term1, term2 );
  6088. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6089. while ( (sbits64) rem1 < 0 ) begin
  6090. --zSig1;
  6091. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6092. end;
  6093. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6094. end;
  6095. result :=
  6096. roundAndPackFloatx80(
  6097. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6098. end;
  6099. {*----------------------------------------------------------------------------
  6100. | Returns the remainder of the extended double-precision floating-point value
  6101. | `a' with respect to the corresponding value `b'. The operation is performed
  6102. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6103. *----------------------------------------------------------------------------*}
  6104. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6105. var
  6106. aSign, bSign, zSign: flag;
  6107. aExp, bExp, expDiff: int32;
  6108. aSig0, aSig1, bSig: bits64;
  6109. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6110. z: floatx80;
  6111. begin
  6112. aSig0 := extractFloatx80Frac( a );
  6113. aExp := extractFloatx80Exp( a );
  6114. aSign := extractFloatx80Sign( a );
  6115. bSig := extractFloatx80Frac( b );
  6116. bExp := extractFloatx80Exp( b );
  6117. bSign := extractFloatx80Sign( b );
  6118. if ( aExp = $7FFF ) begin
  6119. if ( (bits64) ( aSig0 shl 1 )
  6120. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6121. result := propagateFloatx80NaN( a, b );
  6122. end;
  6123. goto invalid;
  6124. end;
  6125. if ( bExp = $7FFF ) begin
  6126. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6127. result := a;
  6128. end;
  6129. if ( bExp = 0 ) begin
  6130. if ( bSig = 0 ) begin
  6131. invalid:
  6132. float_raise( float_flag_invalid );
  6133. z.low := floatx80_default_nan_low;
  6134. z.high := floatx80_default_nan_high;
  6135. result := z;
  6136. end;
  6137. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6138. end;
  6139. if ( aExp = 0 ) begin
  6140. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6141. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6142. end;
  6143. bSig or= LIT64( $8000000000000000 );
  6144. zSign := aSign;
  6145. expDiff := aExp - bExp;
  6146. aSig1 := 0;
  6147. if ( expDiff < 0 ) begin
  6148. if ( expDiff < -1 ) result := a;
  6149. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6150. expDiff := 0;
  6151. end;
  6152. q := ( bSig <= aSig0 );
  6153. if ( q ) aSig0 -= bSig;
  6154. expDiff -= 64;
  6155. while ( 0 < expDiff ) begin
  6156. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6157. q := ( 2 < q ) ? q - 2 : 0;
  6158. mul64To128( bSig, q, term0, term1 );
  6159. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6160. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6161. expDiff -= 62;
  6162. end;
  6163. expDiff += 64;
  6164. if ( 0 < expDiff ) begin
  6165. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6166. q := ( 2 < q ) ? q - 2 : 0;
  6167. q >>= 64 - expDiff;
  6168. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6169. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6170. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6171. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6172. ++q;
  6173. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6174. end;
  6175. end;
  6176. else begin
  6177. term1 := 0;
  6178. term0 := bSig;
  6179. end;
  6180. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6181. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6182. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6183. and ( q and 1 ) )
  6184. ) begin
  6185. aSig0 := alternateASig0;
  6186. aSig1 := alternateASig1;
  6187. zSign := ! zSign;
  6188. end;
  6189. result :=
  6190. normalizeRoundAndPackFloatx80(
  6191. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6192. end;
  6193. {*----------------------------------------------------------------------------
  6194. | Returns the square root of the extended double-precision floating-point
  6195. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6196. | for Binary Floating-Point Arithmetic.
  6197. *----------------------------------------------------------------------------*}
  6198. function floatx80_sqrt(a: floatx80): floatx80;
  6199. var
  6200. aSign: flag;
  6201. aExp, zExp: int32;
  6202. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6203. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6204. z: floatx80;
  6205. label
  6206. invalid;
  6207. begin
  6208. aSig0 := extractFloatx80Frac( a );
  6209. aExp := extractFloatx80Exp( a );
  6210. aSign := extractFloatx80Sign( a );
  6211. if ( aExp = $7FFF ) begin
  6212. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6213. if ( ! aSign ) result := a;
  6214. goto invalid;
  6215. end;
  6216. if ( aSign ) begin
  6217. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6218. invalid:
  6219. float_raise( float_flag_invalid );
  6220. z.low := floatx80_default_nan_low;
  6221. z.high := floatx80_default_nan_high;
  6222. result := z;
  6223. end;
  6224. if ( aExp = 0 ) begin
  6225. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6226. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6227. end;
  6228. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6229. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6230. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6231. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6232. doubleZSig0 := zSig0 shl 1;
  6233. mul64To128( zSig0, zSig0, term0, term1 );
  6234. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6235. while ( (sbits64) rem0 < 0 ) begin
  6236. --zSig0;
  6237. doubleZSig0 -= 2;
  6238. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6239. end;
  6240. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6241. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6242. if ( zSig1 = 0 ) zSig1 := 1;
  6243. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6244. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6245. mul64To128( zSig1, zSig1, term2, term3 );
  6246. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6247. while ( (sbits64) rem1 < 0 ) begin
  6248. --zSig1;
  6249. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6250. term3 or= 1;
  6251. term2 or= doubleZSig0;
  6252. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6253. end;
  6254. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6255. end;
  6256. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6257. zSig0 or= doubleZSig0;
  6258. result :=
  6259. roundAndPackFloatx80(
  6260. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6261. end;
  6262. {*----------------------------------------------------------------------------
  6263. | Returns 1 if the extended double-precision floating-point value `a' is
  6264. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6265. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6266. | Arithmetic.
  6267. *----------------------------------------------------------------------------*}
  6268. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6269. begin
  6270. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6271. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6272. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6273. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6274. ) begin
  6275. if ( floatx80_is_signaling_nan( a )
  6276. or floatx80_is_signaling_nan( b ) ) begin
  6277. float_raise( float_flag_invalid );
  6278. end;
  6279. result := 0;
  6280. end;
  6281. result :=
  6282. ( a.low = b.low )
  6283. and ( ( a.high = b.high )
  6284. or ( ( a.low = 0 )
  6285. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6286. );
  6287. end;
  6288. {*----------------------------------------------------------------------------
  6289. | Returns 1 if the extended double-precision floating-point value `a' is
  6290. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6291. | comparison is performed according to the IEC/IEEE Standard for Binary
  6292. | Floating-Point Arithmetic.
  6293. *----------------------------------------------------------------------------*}
  6294. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6295. var
  6296. aSign, bSign: flag;
  6297. begin
  6298. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6299. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6300. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6301. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6302. ) begin
  6303. float_raise( float_flag_invalid );
  6304. result := 0;
  6305. end;
  6306. aSign := extractFloatx80Sign( a );
  6307. bSign := extractFloatx80Sign( b );
  6308. if ( aSign <> bSign ) begin
  6309. result :=
  6310. aSign
  6311. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6312. = 0 );
  6313. end;
  6314. result :=
  6315. aSign ? le128( b.high, b.low, a.high, a.low )
  6316. : le128( a.high, a.low, b.high, b.low );
  6317. end;
  6318. {*----------------------------------------------------------------------------
  6319. | Returns 1 if the extended double-precision floating-point value `a' is
  6320. | less than the corresponding value `b', and 0 otherwise. The comparison
  6321. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6322. | Arithmetic.
  6323. *----------------------------------------------------------------------------*}
  6324. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6325. var
  6326. aSign, bSign: flag;
  6327. begin
  6328. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6329. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6330. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6331. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6332. ) begin
  6333. float_raise( float_flag_invalid );
  6334. result := 0;
  6335. end;
  6336. aSign := extractFloatx80Sign( a );
  6337. bSign := extractFloatx80Sign( b );
  6338. if ( aSign <> bSign ) begin
  6339. result :=
  6340. aSign
  6341. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6342. <> 0 );
  6343. end;
  6344. result :=
  6345. aSign ? lt128( b.high, b.low, a.high, a.low )
  6346. : lt128( a.high, a.low, b.high, b.low );
  6347. end;
  6348. {*----------------------------------------------------------------------------
  6349. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6350. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6351. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6352. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6353. *----------------------------------------------------------------------------*}
  6354. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6355. begin
  6356. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6357. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6358. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6359. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6360. ) begin
  6361. float_raise( float_flag_invalid );
  6362. result := 0;
  6363. end;
  6364. result :=
  6365. ( a.low = b.low )
  6366. and ( ( a.high = b.high )
  6367. or ( ( a.low = 0 )
  6368. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6369. );
  6370. end;
  6371. {*----------------------------------------------------------------------------
  6372. | Returns 1 if the extended double-precision floating-point value `a' is less
  6373. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6374. | do not cause an exception. Otherwise, the comparison is performed according
  6375. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6376. *----------------------------------------------------------------------------*}
  6377. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6378. var
  6379. aSign, bSign: flag;
  6380. begin
  6381. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6382. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6383. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6384. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6385. ) begin
  6386. if ( floatx80_is_signaling_nan( a )
  6387. or floatx80_is_signaling_nan( b ) ) begin
  6388. float_raise( float_flag_invalid );
  6389. end;
  6390. result := 0;
  6391. end;
  6392. aSign := extractFloatx80Sign( a );
  6393. bSign := extractFloatx80Sign( b );
  6394. if ( aSign <> bSign ) begin
  6395. result :=
  6396. aSign
  6397. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6398. = 0 );
  6399. end;
  6400. result :=
  6401. aSign ? le128( b.high, b.low, a.high, a.low )
  6402. : le128( a.high, a.low, b.high, b.low );
  6403. end;
  6404. {*----------------------------------------------------------------------------
  6405. | Returns 1 if the extended double-precision floating-point value `a' is less
  6406. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6407. | an exception. Otherwise, the comparison is performed according to the
  6408. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6409. *----------------------------------------------------------------------------*}
  6410. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6411. var
  6412. aSign, bSign: flag;
  6413. begin
  6414. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6415. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6416. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6417. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6418. ) begin
  6419. if ( floatx80_is_signaling_nan( a )
  6420. or floatx80_is_signaling_nan( b ) ) begin
  6421. float_raise( float_flag_invalid );
  6422. end;
  6423. result := 0;
  6424. end;
  6425. aSign := extractFloatx80Sign( a );
  6426. bSign := extractFloatx80Sign( b );
  6427. if ( aSign <> bSign ) begin
  6428. result :=
  6429. aSign
  6430. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6431. <> 0 );
  6432. end;
  6433. result :=
  6434. aSign ? lt128( b.high, b.low, a.high, a.low )
  6435. : lt128( a.high, a.low, b.high, b.low );
  6436. end;
  6437. {$endif FPC_SOFTFLOAT_FLOATX80}
  6438. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6439. {*----------------------------------------------------------------------------
  6440. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6441. | floating-point value `a'.
  6442. *----------------------------------------------------------------------------*}
  6443. function extractFloat128Frac1(a : float128): bits64;
  6444. begin
  6445. result:=a.low;
  6446. end;
  6447. {*----------------------------------------------------------------------------
  6448. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6449. | floating-point value `a'.
  6450. *----------------------------------------------------------------------------*}
  6451. function extractFloat128Frac0(a : float128): bits64;
  6452. begin
  6453. result:=a.high and int64($0000FFFFFFFFFFFF);
  6454. end;
  6455. {*----------------------------------------------------------------------------
  6456. | Returns the exponent bits of the quadruple-precision floating-point value
  6457. | `a'.
  6458. *----------------------------------------------------------------------------*}
  6459. function extractFloat128Exp(a : float128): int32;
  6460. begin
  6461. result:=( a.high shr 48 ) and $7FFF;
  6462. end;
  6463. {*----------------------------------------------------------------------------
  6464. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6465. *----------------------------------------------------------------------------*}
  6466. function extractFloat128Sign(a : float128): flag;
  6467. begin
  6468. result:=a.high shr 63;
  6469. end;
  6470. {*----------------------------------------------------------------------------
  6471. | Normalizes the subnormal quadruple-precision floating-point value
  6472. | represented by the denormalized significand formed by the concatenation of
  6473. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6474. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6475. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6476. | least significant 64 bits of the normalized significand are stored at the
  6477. | location pointed to by `zSig1Ptr'.
  6478. *----------------------------------------------------------------------------*}
  6479. procedure normalizeFloat128Subnormal(
  6480. aSig0: bits64;
  6481. aSig1: bits64;
  6482. var zExpPtr: int32;
  6483. var zSig0Ptr: bits64;
  6484. var zSig1Ptr: bits64);
  6485. var
  6486. shiftCount: int8;
  6487. begin
  6488. if ( aSig0 = 0 ) then
  6489. begin
  6490. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6491. if ( shiftCount < 0 ) then
  6492. begin
  6493. zSig0Ptr := aSig1 shr ( - shiftCount );
  6494. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6495. end
  6496. else begin
  6497. zSig0Ptr := aSig1 shl shiftCount;
  6498. zSig1Ptr := 0;
  6499. end;
  6500. zExpPtr := - shiftCount - 63;
  6501. end
  6502. else begin
  6503. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6504. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6505. zExpPtr := 1 - shiftCount;
  6506. end;
  6507. end;
  6508. {*----------------------------------------------------------------------------
  6509. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6510. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6511. | floating-point value, returning the result. After being shifted into the
  6512. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6513. | added together to form the most significant 32 bits of the result. This
  6514. | means that any integer portion of `zSig0' will be added into the exponent.
  6515. | Since a properly normalized significand will have an integer portion equal
  6516. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6517. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6518. | significand.
  6519. *----------------------------------------------------------------------------*}
  6520. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6521. var
  6522. z: float128;
  6523. begin
  6524. z.low := zSig1;
  6525. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6526. result:=z;
  6527. end;
  6528. {*----------------------------------------------------------------------------
  6529. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6530. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6531. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6532. | corresponding to the abstract input. Ordinarily, the abstract value is
  6533. | simply rounded and packed into the quadruple-precision format, with the
  6534. | inexact exception raised if the abstract input cannot be represented
  6535. | exactly. However, if the abstract value is too large, the overflow and
  6536. | inexact exceptions are raised and an infinity or maximal finite value is
  6537. | returned. If the abstract value is too small, the input value is rounded to
  6538. | a subnormal number, and the underflow and inexact exceptions are raised if
  6539. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6540. | precision floating-point number.
  6541. | The input significand must be normalized or smaller. If the input
  6542. | significand is not normalized, `zExp' must be 0; in that case, the result
  6543. | returned is a subnormal number, and it must not require rounding. In the
  6544. | usual case that the input significand is normalized, `zExp' must be 1 less
  6545. | than the ``true'' floating-point exponent. The handling of underflow and
  6546. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6547. *----------------------------------------------------------------------------*}
  6548. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6549. var
  6550. roundingMode: int8;
  6551. roundNearestEven, increment, isTiny: flag;
  6552. begin
  6553. roundingMode := softfloat_rounding_mode;
  6554. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6555. increment := ord( sbits64(zSig2) < 0 );
  6556. if ( roundNearestEven=0 ) then
  6557. begin
  6558. if ( roundingMode = float_round_to_zero ) then
  6559. begin
  6560. increment := 0;
  6561. end
  6562. else begin
  6563. if ( zSign<>0 ) then
  6564. begin
  6565. increment := ord( roundingMode = float_round_down ) and zSig2;
  6566. end
  6567. else begin
  6568. increment := ord( roundingMode = float_round_up ) and zSig2;
  6569. end;
  6570. end;
  6571. end;
  6572. if ( $7FFD <= bits32(zExp) ) then
  6573. begin
  6574. if ( ord( $7FFD < zExp )
  6575. or ( ord( zExp = $7FFD )
  6576. and eq128(
  6577. int64( $0001FFFFFFFFFFFF ),
  6578. int64( $FFFFFFFFFFFFFFFF ),
  6579. zSig0,
  6580. zSig1
  6581. )
  6582. and increment
  6583. )
  6584. )<>0 then
  6585. begin
  6586. float_raise( float_flag_overflow or float_flag_inexact );
  6587. if ( ord( roundingMode = float_round_to_zero )
  6588. or ( zSign and ord( roundingMode = float_round_up ) )
  6589. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6590. )<>0 then
  6591. begin
  6592. result :=
  6593. packFloat128(
  6594. zSign,
  6595. $7FFE,
  6596. int64( $0000FFFFFFFFFFFF ),
  6597. int64( $FFFFFFFFFFFFFFFF )
  6598. );
  6599. end;
  6600. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6601. end;
  6602. if ( zExp < 0 ) then
  6603. begin
  6604. isTiny :=
  6605. ord(( float_detect_tininess = float_tininess_before_rounding )
  6606. or ( zExp < -1 )
  6607. or not( increment<>0 )
  6608. or boolean(lt128(
  6609. zSig0,
  6610. zSig1,
  6611. int64( $0001FFFFFFFFFFFF ),
  6612. int64( $FFFFFFFFFFFFFFFF )
  6613. )));
  6614. shift128ExtraRightJamming(
  6615. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6616. zExp := 0;
  6617. if ( isTiny and zSig2 )<>0 then
  6618. float_raise( float_flag_underflow );
  6619. if ( roundNearestEven<>0 ) then
  6620. begin
  6621. increment := ord( sbits64(zSig2) < 0 );
  6622. end
  6623. else begin
  6624. if ( zSign<>0 ) then
  6625. begin
  6626. increment := ord( roundingMode = float_round_down ) and zSig2;
  6627. end
  6628. else begin
  6629. increment := ord( roundingMode = float_round_up ) and zSig2;
  6630. end;
  6631. end;
  6632. end;
  6633. end;
  6634. if ( zSig2<>0 ) then
  6635. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6636. if ( increment<>0 ) then
  6637. begin
  6638. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6639. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6640. end
  6641. else begin
  6642. if ( ( zSig0 or zSig1 ) = 0 ) then
  6643. zExp := 0;
  6644. end;
  6645. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6646. end;
  6647. {*----------------------------------------------------------------------------
  6648. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6649. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6650. | returns the proper quadruple-precision floating-point value corresponding
  6651. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6652. | except that the input significand has fewer bits and does not have to be
  6653. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6654. | point exponent.
  6655. *----------------------------------------------------------------------------*}
  6656. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6657. var
  6658. shiftCount: int8;
  6659. zSig2: bits64;
  6660. begin
  6661. if ( zSig0 = 0 ) then
  6662. begin
  6663. zSig0 := zSig1;
  6664. zSig1 := 0;
  6665. dec(zExp, 64);
  6666. end;
  6667. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6668. if ( 0 <= shiftCount ) then
  6669. begin
  6670. zSig2 := 0;
  6671. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6672. end
  6673. else begin
  6674. shift128ExtraRightJamming(
  6675. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6676. end;
  6677. dec(zExp, shiftCount);
  6678. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6679. end;
  6680. {*----------------------------------------------------------------------------
  6681. | Returns the result of converting the quadruple-precision floating-point
  6682. | value `a' to the 32-bit two's complement integer format. The conversion
  6683. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6684. | Arithmetic---which means in particular that the conversion is rounded
  6685. | according to the current rounding mode. If `a' is a NaN, the largest
  6686. | positive integer is returned. Otherwise, if the conversion overflows, the
  6687. | largest integer with the same sign as `a' is returned.
  6688. *----------------------------------------------------------------------------*}
  6689. function float128_to_int32(a: float128): int32;
  6690. var
  6691. aSign: flag;
  6692. aExp, shiftCount: int32;
  6693. aSig0, aSig1: bits64;
  6694. begin
  6695. aSig1 := extractFloat128Frac1( a );
  6696. aSig0 := extractFloat128Frac0( a );
  6697. aExp := extractFloat128Exp( a );
  6698. aSign := extractFloat128Sign( a );
  6699. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6700. aSign := 0;
  6701. if ( aExp<>0 ) then
  6702. aSig0 := aSig0 or int64( $0001000000000000 );
  6703. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6704. shiftCount := $4028 - aExp;
  6705. if ( 0 < shiftCount ) then
  6706. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6707. result := roundAndPackInt32( aSign, aSig0 );
  6708. end;
  6709. {*----------------------------------------------------------------------------
  6710. | Returns the result of converting the quadruple-precision floating-point
  6711. | value `a' to the 32-bit two's complement integer format. The conversion
  6712. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6713. | Arithmetic, except that the conversion is always rounded toward zero. If
  6714. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6715. | conversion overflows, the largest integer with the same sign as `a' is
  6716. | returned.
  6717. *----------------------------------------------------------------------------*}
  6718. function float128_to_int32_round_to_zero(a: float128): int32;
  6719. var
  6720. aSign: flag;
  6721. aExp, shiftCount: int32;
  6722. aSig0, aSig1, savedASig: bits64;
  6723. z: int32;
  6724. label
  6725. invalid;
  6726. begin
  6727. aSig1 := extractFloat128Frac1( a );
  6728. aSig0 := extractFloat128Frac0( a );
  6729. aExp := extractFloat128Exp( a );
  6730. aSign := extractFloat128Sign( a );
  6731. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6732. if ( $401E < aExp ) then
  6733. begin
  6734. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6735. aSign := 0;
  6736. goto invalid;
  6737. end
  6738. else if ( aExp < $3FFF ) then
  6739. begin
  6740. if ( aExp or aSig0 )<>0 then
  6741. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6742. result := 0;
  6743. exit;
  6744. end;
  6745. aSig0 := aSig0 or int64( $0001000000000000 );
  6746. shiftCount := $402F - aExp;
  6747. savedASig := aSig0;
  6748. aSig0 := aSig0 shr shiftCount;
  6749. z := aSig0;
  6750. if ( aSign )<>0 then
  6751. z := - z;
  6752. if ( ord( z < 0 ) xor aSign )<>0 then
  6753. begin
  6754. invalid:
  6755. float_raise( float_flag_invalid );
  6756. if aSign<>0 then
  6757. result:=$80000000
  6758. else
  6759. result:=$7FFFFFFF;
  6760. exit;
  6761. end;
  6762. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6763. begin
  6764. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6765. end;
  6766. result := z;
  6767. end;
  6768. {*----------------------------------------------------------------------------
  6769. | Returns the result of converting the quadruple-precision floating-point
  6770. | value `a' to the 64-bit two's complement integer format. The conversion
  6771. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6772. | Arithmetic---which means in particular that the conversion is rounded
  6773. | according to the current rounding mode. If `a' is a NaN, the largest
  6774. | positive integer is returned. Otherwise, if the conversion overflows, the
  6775. | largest integer with the same sign as `a' is returned.
  6776. *----------------------------------------------------------------------------*}
  6777. function float128_to_int64(a: float128): int64;
  6778. var
  6779. aSign: flag;
  6780. aExp, shiftCount: int32;
  6781. aSig0, aSig1: bits64;
  6782. begin
  6783. aSig1 := extractFloat128Frac1( a );
  6784. aSig0 := extractFloat128Frac0( a );
  6785. aExp := extractFloat128Exp( a );
  6786. aSign := extractFloat128Sign( a );
  6787. if ( aExp<>0 ) then
  6788. aSig0 := aSig0 or int64( $0001000000000000 );
  6789. shiftCount := $402F - aExp;
  6790. if ( shiftCount <= 0 ) then
  6791. begin
  6792. if ( $403E < aExp ) then
  6793. begin
  6794. float_raise( float_flag_invalid );
  6795. if ( (aSign=0)
  6796. or ( ( aExp = $7FFF )
  6797. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6798. )
  6799. ) then
  6800. begin
  6801. result := int64( $7FFFFFFFFFFFFFFF );
  6802. end;
  6803. result := int64( $8000000000000000 );
  6804. end;
  6805. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6806. end
  6807. else begin
  6808. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6809. end;
  6810. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6811. end;
  6812. {*----------------------------------------------------------------------------
  6813. | Returns the result of converting the quadruple-precision floating-point
  6814. | value `a' to the 64-bit two's complement integer format. The conversion
  6815. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6816. | Arithmetic, except that the conversion is always rounded toward zero.
  6817. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6818. | the conversion overflows, the largest integer with the same sign as `a' is
  6819. | returned.
  6820. *----------------------------------------------------------------------------*}
  6821. function float128_to_int64_round_to_zero(a: float128): int64;
  6822. var
  6823. aSign: flag;
  6824. aExp, shiftCount: int32;
  6825. aSig0, aSig1: bits64;
  6826. z: int64;
  6827. begin
  6828. aSig1 := extractFloat128Frac1( a );
  6829. aSig0 := extractFloat128Frac0( a );
  6830. aExp := extractFloat128Exp( a );
  6831. aSign := extractFloat128Sign( a );
  6832. if ( aExp<>0 ) then
  6833. aSig0 := aSig0 or int64( $0001000000000000 );
  6834. shiftCount := aExp - $402F;
  6835. if ( 0 < shiftCount ) then
  6836. begin
  6837. if ( $403E <= aExp ) then
  6838. begin
  6839. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6840. if ( ( a.high = int64( $C03E000000000000 ) )
  6841. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6842. begin
  6843. if ( aSig1<>0 ) then
  6844. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6845. end
  6846. else begin
  6847. float_raise( float_flag_invalid );
  6848. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6849. begin
  6850. result := int64( $7FFFFFFFFFFFFFFF );
  6851. exit;
  6852. end;
  6853. end;
  6854. result := int64( $8000000000000000 );
  6855. exit;
  6856. end;
  6857. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6858. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6859. begin
  6860. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6861. end;
  6862. end
  6863. else begin
  6864. if ( aExp < $3FFF ) then
  6865. begin
  6866. if ( aExp or aSig0 or aSig1 )<>0 then
  6867. begin
  6868. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6869. end;
  6870. result := 0;
  6871. exit;
  6872. end;
  6873. z := aSig0 shr ( - shiftCount );
  6874. if ( (aSig1<>0)
  6875. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6876. begin
  6877. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6878. end;
  6879. end;
  6880. if ( aSign<>0 ) then
  6881. z := - z;
  6882. result := z;
  6883. end;
  6884. {*----------------------------------------------------------------------------
  6885. | Returns the result of converting the quadruple-precision floating-point
  6886. | value `a' to the single-precision floating-point format. The conversion
  6887. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6888. | Arithmetic.
  6889. *----------------------------------------------------------------------------*}
  6890. function float128_to_float32(a: float128): float32;
  6891. var
  6892. aSign: flag;
  6893. aExp: int32;
  6894. aSig0, aSig1: bits64;
  6895. zSig: bits32;
  6896. begin
  6897. aSig1 := extractFloat128Frac1( a );
  6898. aSig0 := extractFloat128Frac0( a );
  6899. aExp := extractFloat128Exp( a );
  6900. aSign := extractFloat128Sign( a );
  6901. if ( aExp = $7FFF ) then
  6902. begin
  6903. if ( aSig0 or aSig1 )<>0 then
  6904. begin
  6905. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6906. exit;
  6907. end;
  6908. result := packFloat32( aSign, $FF, 0 );
  6909. exit;
  6910. end;
  6911. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6912. shift64RightJamming( aSig0, 18, aSig0 );
  6913. zSig := aSig0;
  6914. if ( aExp or zSig )<>0 then
  6915. begin
  6916. zSig := zSig or $40000000;
  6917. dec(aExp,$3F81);
  6918. end;
  6919. result := roundAndPackFloat32( aSign, aExp, zSig );
  6920. end;
  6921. {*----------------------------------------------------------------------------
  6922. | Returns the result of converting the quadruple-precision floating-point
  6923. | value `a' to the double-precision floating-point format. The conversion
  6924. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6925. | Arithmetic.
  6926. *----------------------------------------------------------------------------*}
  6927. function float128_to_float64(a: float128): float64;
  6928. var
  6929. aSign: flag;
  6930. aExp: int32;
  6931. aSig0, aSig1: bits64;
  6932. begin
  6933. aSig1 := extractFloat128Frac1( a );
  6934. aSig0 := extractFloat128Frac0( a );
  6935. aExp := extractFloat128Exp( a );
  6936. aSign := extractFloat128Sign( a );
  6937. if ( aExp = $7FFF ) then
  6938. begin
  6939. if ( aSig0 or aSig1 )<>0 then
  6940. begin
  6941. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  6942. exit;
  6943. end;
  6944. result:=packFloat64( aSign, $7FF, 0);
  6945. exit;
  6946. end;
  6947. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  6948. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6949. if ( aExp or aSig0 )<>0 then
  6950. begin
  6951. aSig0 := aSig0 or int64( $4000000000000000 );
  6952. dec(aExp,$3C01);
  6953. end;
  6954. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  6955. end;
  6956. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  6957. {*----------------------------------------------------------------------------
  6958. | Returns the result of converting the quadruple-precision floating-point
  6959. | value `a' to the extended double-precision floating-point format. The
  6960. | conversion is performed according to the IEC/IEEE Standard for Binary
  6961. | Floating-Point Arithmetic.
  6962. *----------------------------------------------------------------------------*}
  6963. function float128_to_floatx80(a: float128): floatx80;
  6964. var
  6965. aSign: flag;
  6966. aExp: int32;
  6967. aSig0, aSig1: bits64;
  6968. begin
  6969. aSig1 := extractFloat128Frac1( a );
  6970. aSig0 := extractFloat128Frac0( a );
  6971. aExp := extractFloat128Exp( a );
  6972. aSign := extractFloat128Sign( a );
  6973. if ( aExp = $7FFF ) begin
  6974. if ( aSig0 or aSig1 ) begin
  6975. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  6976. exit;
  6977. end;
  6978. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  6979. exit;
  6980. end;
  6981. if ( aExp = 0 ) begin
  6982. if ( ( aSig0 or aSig1 ) = 0 ) then
  6983. begin
  6984. result := packFloatx80( aSign, 0, 0 );
  6985. exit;
  6986. end;
  6987. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  6988. end;
  6989. else begin
  6990. aSig0 or= int64( $0001000000000000 );
  6991. end;
  6992. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  6993. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  6994. end;
  6995. {$endif FPC_SOFTFLOAT_FLOATX80}
  6996. {*----------------------------------------------------------------------------
  6997. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  6998. | Returns the result as a quadruple-precision floating-point value. The
  6999. | operation is performed according to the IEC/IEEE Standard for Binary
  7000. | Floating-Point Arithmetic.
  7001. *----------------------------------------------------------------------------*}
  7002. function float128_round_to_int(a: float128): float128;
  7003. var
  7004. aSign: flag;
  7005. aExp: int32;
  7006. lastBitMask, roundBitsMask: bits64;
  7007. roundingMode: int8;
  7008. z: float128;
  7009. begin
  7010. aExp := extractFloat128Exp( a );
  7011. if ( $402F <= aExp ) then
  7012. begin
  7013. if ( $406F <= aExp ) then
  7014. begin
  7015. if ( ( aExp = $7FFF )
  7016. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7017. ) then
  7018. begin
  7019. result := propagateFloat128NaN( a, a );
  7020. exit;
  7021. end;
  7022. result := a;
  7023. exit;
  7024. end;
  7025. lastBitMask := 1;
  7026. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7027. roundBitsMask := lastBitMask - 1;
  7028. z := a;
  7029. roundingMode := softfloat_rounding_mode;
  7030. if ( roundingMode = float_round_nearest_even ) then
  7031. begin
  7032. if ( lastBitMask )<>0 then
  7033. begin
  7034. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7035. if ( ( z.low and roundBitsMask ) = 0 ) then
  7036. z.low := z.low and not(lastBitMask);
  7037. end
  7038. else begin
  7039. if ( sbits64(z.low) < 0 ) then
  7040. begin
  7041. inc(z.high);
  7042. if ( bits64( z.low shl 1 ) = 0 ) then
  7043. z.high := z.high and not(1);
  7044. end;
  7045. end;
  7046. end
  7047. else if ( roundingMode <> float_round_to_zero ) then
  7048. begin
  7049. if ( extractFloat128Sign( z )
  7050. xor ord( roundingMode = float_round_up ) )<>0 then
  7051. begin
  7052. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7053. end;
  7054. end;
  7055. z.low := z.low and not(roundBitsMask);
  7056. end
  7057. else begin
  7058. if ( aExp < $3FFF ) then
  7059. begin
  7060. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7061. begin
  7062. result := a;
  7063. exit;
  7064. end;
  7065. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7066. aSign := extractFloat128Sign( a );
  7067. case softfloat_rounding_mode of
  7068. float_round_nearest_even:
  7069. if ( ( aExp = $3FFE )
  7070. and ( (extractFloat128Frac0( a )<>0)
  7071. or (extractFloat128Frac1( a )<>0) )
  7072. ) then begin
  7073. begin
  7074. result := packFloat128( aSign, $3FFF, 0, 0 );
  7075. exit;
  7076. end;
  7077. end;
  7078. float_round_down:
  7079. begin
  7080. if aSign<>0 then
  7081. result:=packFloat128( 1, $3FFF, 0, 0 )
  7082. else
  7083. result:=packFloat128( 0, 0, 0, 0 );
  7084. exit;
  7085. end;
  7086. float_round_up:
  7087. begin
  7088. if aSign<>0 then
  7089. result := packFloat128( 1, 0, 0, 0 )
  7090. else
  7091. result:=packFloat128( 0, $3FFF, 0, 0 );
  7092. exit;
  7093. end;
  7094. end;
  7095. result := packFloat128( aSign, 0, 0, 0 );
  7096. exit;
  7097. end;
  7098. lastBitMask := 1;
  7099. lastBitMask := lastBitMask shl ($402F - aExp);
  7100. roundBitsMask := lastBitMask - 1;
  7101. z.low := 0;
  7102. z.high := a.high;
  7103. roundingMode := softfloat_rounding_mode;
  7104. if ( roundingMode = float_round_nearest_even ) then begin
  7105. inc(z.high,lastBitMask shr 1);
  7106. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7107. z.high := z.high and not(lastBitMask);
  7108. end;
  7109. end
  7110. else if ( roundingMode <> float_round_to_zero ) then begin
  7111. if ( (extractFloat128Sign( z )<>0)
  7112. xor ( roundingMode = float_round_up ) ) then begin
  7113. z.high := z.high or ord( a.low <> 0 );
  7114. z.high := z.high+roundBitsMask;
  7115. end;
  7116. end;
  7117. z.high := z.high and not(roundBitsMask);
  7118. end;
  7119. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7120. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7121. end;
  7122. result := z;
  7123. end;
  7124. {*----------------------------------------------------------------------------
  7125. | Returns the result of adding the absolute values of the quadruple-precision
  7126. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7127. | before being returned. `zSign' is ignored if the result is a NaN.
  7128. | The addition is performed according to the IEC/IEEE Standard for Binary
  7129. | Floating-Point Arithmetic.
  7130. *----------------------------------------------------------------------------*}
  7131. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7132. var
  7133. aExp, bExp, zExp: int32;
  7134. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7135. expDiff: int32;
  7136. label
  7137. shiftRight1,roundAndPack;
  7138. begin
  7139. aSig1 := extractFloat128Frac1( a );
  7140. aSig0 := extractFloat128Frac0( a );
  7141. aExp := extractFloat128Exp( a );
  7142. bSig1 := extractFloat128Frac1( b );
  7143. bSig0 := extractFloat128Frac0( b );
  7144. bExp := extractFloat128Exp( b );
  7145. expDiff := aExp - bExp;
  7146. if ( 0 < expDiff ) then begin
  7147. if ( aExp = $7FFF ) then begin
  7148. if ( aSig0 or aSig1 )<>0 then
  7149. begin
  7150. result := propagateFloat128NaN( a, b );
  7151. exit;
  7152. end;
  7153. result := a;
  7154. exit;
  7155. end;
  7156. if ( bExp = 0 ) then begin
  7157. dec(expDiff);
  7158. end
  7159. else begin
  7160. bSig0 := bSig0 or int64( $0001000000000000 );
  7161. end;
  7162. shift128ExtraRightJamming(
  7163. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7164. zExp := aExp;
  7165. end
  7166. else if ( expDiff < 0 ) then begin
  7167. if ( bExp = $7FFF ) then begin
  7168. if ( bSig0 or bSig1 )<>0 then
  7169. begin
  7170. result := propagateFloat128NaN( a, b );
  7171. exit;
  7172. end;
  7173. result := packFloat128( zSign, $7FFF, 0, 0 );
  7174. exit;
  7175. end;
  7176. if ( aExp = 0 ) then begin
  7177. inc(expDiff);
  7178. end
  7179. else begin
  7180. aSig0 := aSig0 or int64( $0001000000000000 );
  7181. end;
  7182. shift128ExtraRightJamming(
  7183. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7184. zExp := bExp;
  7185. end
  7186. else begin
  7187. if ( aExp = $7FFF ) then begin
  7188. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7189. result := propagateFloat128NaN( a, b );
  7190. exit;
  7191. end;
  7192. result := a;
  7193. exit;
  7194. end;
  7195. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7196. if ( aExp = 0 ) then
  7197. begin
  7198. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7199. exit;
  7200. end;
  7201. zSig2 := 0;
  7202. zSig0 := zSig0 or int64( $0002000000000000 );
  7203. zExp := aExp;
  7204. goto shiftRight1;
  7205. end;
  7206. aSig0 := aSig0 or int64( $0001000000000000 );
  7207. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7208. dec(zExp);
  7209. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7210. inc(zExp);
  7211. shiftRight1:
  7212. shift128ExtraRightJamming(
  7213. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7214. roundAndPack:
  7215. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7216. end;
  7217. {*----------------------------------------------------------------------------
  7218. | Returns the result of subtracting the absolute values of the quadruple-
  7219. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7220. | difference is negated before being returned. `zSign' is ignored if the
  7221. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7222. | Standard for Binary Floating-Point Arithmetic.
  7223. *----------------------------------------------------------------------------*}
  7224. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7225. var
  7226. aExp, bExp, zExp: int32;
  7227. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7228. expDiff: int32;
  7229. z: float128;
  7230. label
  7231. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7232. begin
  7233. aSig1 := extractFloat128Frac1( a );
  7234. aSig0 := extractFloat128Frac0( a );
  7235. aExp := extractFloat128Exp( a );
  7236. bSig1 := extractFloat128Frac1( b );
  7237. bSig0 := extractFloat128Frac0( b );
  7238. bExp := extractFloat128Exp( b );
  7239. expDiff := aExp - bExp;
  7240. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7241. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7242. if ( 0 < expDiff ) then goto aExpBigger;
  7243. if ( expDiff < 0 ) then goto bExpBigger;
  7244. if ( aExp = $7FFF ) then begin
  7245. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7246. result := propagateFloat128NaN( a, b );
  7247. exit;
  7248. end;
  7249. float_raise( float_flag_invalid );
  7250. z.low := float128_default_nan_low;
  7251. z.high := float128_default_nan_high;
  7252. result := z;
  7253. exit;
  7254. end;
  7255. if ( aExp = 0 ) then begin
  7256. aExp := 1;
  7257. bExp := 1;
  7258. end;
  7259. if ( bSig0 < aSig0 ) then goto aBigger;
  7260. if ( aSig0 < bSig0 ) then goto bBigger;
  7261. if ( bSig1 < aSig1 ) then goto aBigger;
  7262. if ( aSig1 < bSig1 ) then goto bBigger;
  7263. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  7264. exit;
  7265. bExpBigger:
  7266. if ( bExp = $7FFF ) then begin
  7267. if ( bSig0 or bSig1 )<>0 then
  7268. begin
  7269. result := propagateFloat128NaN( a, b );
  7270. exit;
  7271. end;
  7272. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7273. exit;
  7274. end;
  7275. if ( aExp = 0 ) then begin
  7276. inc(expDiff);
  7277. end
  7278. else begin
  7279. aSig0 := aSig0 or int64( $4000000000000000 );
  7280. end;
  7281. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7282. bSig0 := bSig0 or int64( $4000000000000000 );
  7283. bBigger:
  7284. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7285. zExp := bExp;
  7286. zSign := zSign xor 1;
  7287. goto normalizeRoundAndPack;
  7288. aExpBigger:
  7289. if ( aExp = $7FFF ) then begin
  7290. if ( aSig0 or aSig1 )<>0 then
  7291. begin
  7292. result := propagateFloat128NaN( a, b );
  7293. exit;
  7294. end;
  7295. result := a;
  7296. exit;
  7297. end;
  7298. if ( bExp = 0 ) then begin
  7299. dec(expDiff);
  7300. end
  7301. else begin
  7302. bSig0 := bSig0 or int64( $4000000000000000 );
  7303. end;
  7304. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7305. aSig0 := aSig0 or int64( $4000000000000000 );
  7306. aBigger:
  7307. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7308. zExp := aExp;
  7309. normalizeRoundAndPack:
  7310. dec(zExp);
  7311. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7312. end;
  7313. {*----------------------------------------------------------------------------
  7314. | Returns the result of adding the quadruple-precision floating-point values
  7315. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7316. | for Binary Floating-Point Arithmetic.
  7317. *----------------------------------------------------------------------------*}
  7318. function float128_add(a: float128; b: float128): float128;
  7319. var
  7320. aSign, bSign: flag;
  7321. begin
  7322. aSign := extractFloat128Sign( a );
  7323. bSign := extractFloat128Sign( b );
  7324. if ( aSign = bSign ) then begin
  7325. result := addFloat128Sigs( a, b, aSign );
  7326. end
  7327. else begin
  7328. result := subFloat128Sigs( a, b, aSign );
  7329. end;
  7330. end;
  7331. {*----------------------------------------------------------------------------
  7332. | Returns the result of subtracting the quadruple-precision floating-point
  7333. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7334. | Standard for Binary Floating-Point Arithmetic.
  7335. *----------------------------------------------------------------------------*}
  7336. function float128_sub(a: float128; b: float128): float128;
  7337. var
  7338. aSign, bSign: flag;
  7339. begin
  7340. aSign := extractFloat128Sign( a );
  7341. bSign := extractFloat128Sign( b );
  7342. if ( aSign = bSign ) then begin
  7343. result := subFloat128Sigs( a, b, aSign );
  7344. end
  7345. else begin
  7346. result := addFloat128Sigs( a, b, aSign );
  7347. end;
  7348. end;
  7349. {*----------------------------------------------------------------------------
  7350. | Returns the result of multiplying the quadruple-precision floating-point
  7351. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7352. | Standard for Binary Floating-Point Arithmetic.
  7353. *----------------------------------------------------------------------------*}
  7354. function float128_mul(a: float128; b: float128): float128;
  7355. var
  7356. aSign, bSign, zSign: flag;
  7357. aExp, bExp, zExp: int32;
  7358. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7359. z: float128;
  7360. label
  7361. invalid;
  7362. begin
  7363. aSig1 := extractFloat128Frac1( a );
  7364. aSig0 := extractFloat128Frac0( a );
  7365. aExp := extractFloat128Exp( a );
  7366. aSign := extractFloat128Sign( a );
  7367. bSig1 := extractFloat128Frac1( b );
  7368. bSig0 := extractFloat128Frac0( b );
  7369. bExp := extractFloat128Exp( b );
  7370. bSign := extractFloat128Sign( b );
  7371. zSign := aSign xor bSign;
  7372. if ( aExp = $7FFF ) then begin
  7373. if ( (( aSig0 or aSig1 )<>0)
  7374. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7375. result := propagateFloat128NaN( a, b );
  7376. exit;
  7377. end;
  7378. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7379. result := packFloat128( zSign, $7FFF, 0, 0 );
  7380. exit;
  7381. end;
  7382. if ( bExp = $7FFF ) then begin
  7383. if ( bSig0 or bSig1 )<>0 then
  7384. begin
  7385. result := propagateFloat128NaN( a, b );
  7386. exit;
  7387. end;
  7388. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7389. invalid:
  7390. float_raise( float_flag_invalid );
  7391. z.low := float128_default_nan_low;
  7392. z.high := float128_default_nan_high;
  7393. result := z;
  7394. exit;
  7395. end;
  7396. result := packFloat128( zSign, $7FFF, 0, 0 );
  7397. exit;
  7398. end;
  7399. if ( aExp = 0 ) then begin
  7400. if ( ( aSig0 or aSig1 ) = 0 ) then
  7401. begin
  7402. result := packFloat128( zSign, 0, 0, 0 );
  7403. exit;
  7404. end;
  7405. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7406. end;
  7407. if ( bExp = 0 ) then begin
  7408. if ( ( bSig0 or bSig1 ) = 0 ) then
  7409. begin
  7410. result := packFloat128( zSign, 0, 0, 0 );
  7411. exit;
  7412. end;
  7413. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7414. end;
  7415. zExp := aExp + bExp - $4000;
  7416. aSig0 := aSig0 or int64( $0001000000000000 );
  7417. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7418. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7419. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7420. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7421. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7422. shift128ExtraRightJamming(
  7423. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7424. inc(zExp);
  7425. end;
  7426. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7427. end;
  7428. {*----------------------------------------------------------------------------
  7429. | Returns the result of dividing the quadruple-precision floating-point value
  7430. | `a' by the corresponding value `b'. The operation is performed according to
  7431. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7432. *----------------------------------------------------------------------------*}
  7433. function float128_div(a: float128; b: float128): float128;
  7434. var
  7435. aSign, bSign, zSign: flag;
  7436. aExp, bExp, zExp: int32;
  7437. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7438. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7439. z: float128;
  7440. label
  7441. invalid;
  7442. begin
  7443. aSig1 := extractFloat128Frac1( a );
  7444. aSig0 := extractFloat128Frac0( a );
  7445. aExp := extractFloat128Exp( a );
  7446. aSign := extractFloat128Sign( a );
  7447. bSig1 := extractFloat128Frac1( b );
  7448. bSig0 := extractFloat128Frac0( b );
  7449. bExp := extractFloat128Exp( b );
  7450. bSign := extractFloat128Sign( b );
  7451. zSign := aSign xor bSign;
  7452. if ( aExp = $7FFF ) then begin
  7453. if ( aSig0 or aSig1 )<>0 then
  7454. begin
  7455. result := propagateFloat128NaN( a, b );
  7456. exit;
  7457. end;
  7458. if ( bExp = $7FFF ) then begin
  7459. if ( bSig0 or bSig1 )<>0 then
  7460. begin
  7461. result := propagateFloat128NaN( a, b );
  7462. exit;
  7463. end;
  7464. goto invalid;
  7465. end;
  7466. result := packFloat128( zSign, $7FFF, 0, 0 );
  7467. exit;
  7468. end;
  7469. if ( bExp = $7FFF ) then begin
  7470. if ( bSig0 or bSig1 )<>0 then
  7471. begin
  7472. result := propagateFloat128NaN( a, b );
  7473. exit;
  7474. end;
  7475. result := packFloat128( zSign, 0, 0, 0 );
  7476. exit;
  7477. end;
  7478. if ( bExp = 0 ) then begin
  7479. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7480. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7481. invalid:
  7482. float_raise( float_flag_invalid );
  7483. z.low := float128_default_nan_low;
  7484. z.high := float128_default_nan_high;
  7485. result := z;
  7486. exit;
  7487. end;
  7488. float_raise( float_flag_divbyzero );
  7489. result := packFloat128( zSign, $7FFF, 0, 0 );
  7490. exit;
  7491. end;
  7492. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7493. end;
  7494. if ( aExp = 0 ) then begin
  7495. if ( ( aSig0 or aSig1 ) = 0 ) then
  7496. begin
  7497. result := packFloat128( zSign, 0, 0, 0 );
  7498. exit;
  7499. end;
  7500. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7501. end;
  7502. zExp := aExp - bExp + $3FFD;
  7503. shortShift128Left(
  7504. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7505. shortShift128Left(
  7506. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7507. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7508. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7509. inc(zExp);
  7510. end;
  7511. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7512. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7513. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7514. while ( sbits64(rem0) < 0 ) do begin
  7515. dec(zSig0);
  7516. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7517. end;
  7518. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7519. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7520. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7521. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7522. while ( sbits64(rem1) < 0 ) do begin
  7523. dec(zSig1);
  7524. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7525. end;
  7526. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7527. end;
  7528. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7529. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7530. end;
  7531. {*----------------------------------------------------------------------------
  7532. | Returns the remainder of the quadruple-precision floating-point value `a'
  7533. | with respect to the corresponding value `b'. The operation is performed
  7534. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7535. *----------------------------------------------------------------------------*}
  7536. function float128_rem(a: float128; b: float128): float128;
  7537. var
  7538. aSign, bSign, zSign: flag;
  7539. aExp, bExp, expDiff: int32;
  7540. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7541. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7542. sigMean0: sbits64;
  7543. z: float128;
  7544. label
  7545. invalid;
  7546. begin
  7547. aSig1 := extractFloat128Frac1( a );
  7548. aSig0 := extractFloat128Frac0( a );
  7549. aExp := extractFloat128Exp( a );
  7550. aSign := extractFloat128Sign( a );
  7551. bSig1 := extractFloat128Frac1( b );
  7552. bSig0 := extractFloat128Frac0( b );
  7553. bExp := extractFloat128Exp( b );
  7554. bSign := extractFloat128Sign( b );
  7555. if ( aExp = $7FFF ) then begin
  7556. if ( (( aSig0 or aSig1 )<>0)
  7557. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7558. result := propagateFloat128NaN( a, b );
  7559. exit;
  7560. end;
  7561. goto invalid;
  7562. end;
  7563. if ( bExp = $7FFF ) then begin
  7564. if ( bSig0 or bSig1 )<>0 then
  7565. begin
  7566. result := propagateFloat128NaN( a, b );
  7567. exit;
  7568. end;
  7569. result := a;
  7570. exit;
  7571. end;
  7572. if ( bExp = 0 ) then begin
  7573. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7574. invalid:
  7575. float_raise( float_flag_invalid );
  7576. z.low := float128_default_nan_low;
  7577. z.high := float128_default_nan_high;
  7578. result := z;
  7579. exit;
  7580. end;
  7581. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7582. end;
  7583. if ( aExp = 0 ) then begin
  7584. if ( ( aSig0 or aSig1 ) = 0 ) then
  7585. begin
  7586. result := a;
  7587. exit;
  7588. end;
  7589. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7590. end;
  7591. expDiff := aExp - bExp;
  7592. if ( expDiff < -1 ) then
  7593. begin
  7594. result := a;
  7595. exit;
  7596. end;
  7597. shortShift128Left(
  7598. aSig0 or int64( $0001000000000000 ),
  7599. aSig1,
  7600. 15 - ord( expDiff < 0 ),
  7601. aSig0,
  7602. aSig1
  7603. );
  7604. shortShift128Left(
  7605. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7606. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7607. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7608. dec(expDiff,64);
  7609. while ( 0 < expDiff ) do begin
  7610. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7611. if ( 4 < q ) then
  7612. q := q - 4
  7613. else
  7614. q := 0;
  7615. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7616. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7617. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7618. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7619. dec(expDiff,61);
  7620. end;
  7621. if ( -64 < expDiff ) then begin
  7622. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7623. if ( 4 < q ) then
  7624. q := q - 4
  7625. else
  7626. q := 0;
  7627. q := q shr (- expDiff);
  7628. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7629. inc(expDiff,52);
  7630. if ( expDiff < 0 ) then begin
  7631. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7632. end
  7633. else begin
  7634. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7635. end;
  7636. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7637. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7638. end
  7639. else begin
  7640. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7641. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7642. end;
  7643. repeat
  7644. alternateASig0 := aSig0;
  7645. alternateASig1 := aSig1;
  7646. inc(q);
  7647. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7648. until not( 0 <= sbits64(aSig0) );
  7649. add128(
  7650. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7651. if ( ( sigMean0 < 0 )
  7652. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7653. aSig0 := alternateASig0;
  7654. aSig1 := alternateASig1;
  7655. end;
  7656. zSign := ord( sbits64(aSig0) < 0 );
  7657. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7658. result :=
  7659. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7660. end;
  7661. {*----------------------------------------------------------------------------
  7662. | Returns the square root of the quadruple-precision floating-point value `a'.
  7663. | The operation is performed according to the IEC/IEEE Standard for Binary
  7664. | Floating-Point Arithmetic.
  7665. *----------------------------------------------------------------------------*}
  7666. function float128_sqrt(a: float128): float128;
  7667. var
  7668. aSign: flag;
  7669. aExp, zExp: int32;
  7670. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7671. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7672. z: float128;
  7673. label
  7674. invalid;
  7675. begin
  7676. aSig1 := extractFloat128Frac1( a );
  7677. aSig0 := extractFloat128Frac0( a );
  7678. aExp := extractFloat128Exp( a );
  7679. aSign := extractFloat128Sign( a );
  7680. if ( aExp = $7FFF ) then begin
  7681. if ( aSig0 or aSig1 )<>0 then
  7682. begin
  7683. result := propagateFloat128NaN( a, a );
  7684. exit;
  7685. end;
  7686. if ( aSign=0 ) then
  7687. begin
  7688. result := a;
  7689. exit;
  7690. end;
  7691. goto invalid;
  7692. end;
  7693. if ( aSign<>0 ) then begin
  7694. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7695. begin
  7696. result := a;
  7697. exit;
  7698. end;
  7699. invalid:
  7700. float_raise( float_flag_invalid );
  7701. z.low := float128_default_nan_low;
  7702. z.high := float128_default_nan_high;
  7703. result := z;
  7704. exit;
  7705. end;
  7706. if ( aExp = 0 ) then begin
  7707. if ( ( aSig0 or aSig1 ) = 0 ) then
  7708. begin
  7709. result := packFloat128( 0, 0, 0, 0 );
  7710. exit;
  7711. end;
  7712. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7713. end;
  7714. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7715. aSig0 := aSig0 or int64( $0001000000000000 );
  7716. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7717. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7718. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7719. doubleZSig0 := zSig0 shl 1;
  7720. mul64To128( zSig0, zSig0, term0, term1 );
  7721. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7722. while ( sbits64(rem0) < 0 ) do begin
  7723. dec(zSig0);
  7724. dec(doubleZSig0,2);
  7725. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7726. end;
  7727. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7728. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7729. if ( zSig1 = 0 ) then zSig1 := 1;
  7730. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7731. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7732. mul64To128( zSig1, zSig1, term2, term3 );
  7733. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7734. while ( sbits64(rem1) < 0 ) do begin
  7735. dec(zSig1);
  7736. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7737. term3 := term3 or 1;
  7738. term2 := term2 or doubleZSig0;
  7739. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7740. end;
  7741. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7742. end;
  7743. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7744. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7745. end;
  7746. {*----------------------------------------------------------------------------
  7747. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7748. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7749. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7750. *----------------------------------------------------------------------------*}
  7751. function float128_eq(a: float128; b: float128): flag;
  7752. begin
  7753. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7754. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7755. or ( ( extractFloat128Exp( b ) = $7FFF )
  7756. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7757. ) then begin
  7758. if ( (float128_is_signaling_nan( a )<>0)
  7759. or (float128_is_signaling_nan( b )<>0) ) then begin
  7760. float_raise( float_flag_invalid );
  7761. end;
  7762. result := 0;
  7763. exit;
  7764. end;
  7765. result := ord(
  7766. ( a.low = b.low )
  7767. and ( ( a.high = b.high )
  7768. or ( ( a.low = 0 )
  7769. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7770. ));
  7771. end;
  7772. {*----------------------------------------------------------------------------
  7773. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7774. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7775. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7776. | Arithmetic.
  7777. *----------------------------------------------------------------------------*}
  7778. function float128_le(a: float128; b: float128): flag;
  7779. var
  7780. aSign, bSign: flag;
  7781. begin
  7782. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7783. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7784. or ( ( extractFloat128Exp( b ) = $7FFF )
  7785. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7786. ) then begin
  7787. float_raise( float_flag_invalid );
  7788. result := 0;
  7789. exit;
  7790. end;
  7791. aSign := extractFloat128Sign( a );
  7792. bSign := extractFloat128Sign( b );
  7793. if ( aSign <> bSign ) then begin
  7794. result := ord(
  7795. (aSign<>0)
  7796. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7797. = 0 ));
  7798. exit;
  7799. end;
  7800. if aSign<>0 then
  7801. result := le128( b.high, b.low, a.high, a.low )
  7802. else
  7803. result := le128( a.high, a.low, b.high, b.low );
  7804. end;
  7805. {*----------------------------------------------------------------------------
  7806. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7807. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7808. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7809. *----------------------------------------------------------------------------*}
  7810. function float128_lt(a: float128; b: float128): flag;
  7811. var
  7812. aSign, bSign: flag;
  7813. begin
  7814. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7815. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7816. or ( ( extractFloat128Exp( b ) = $7FFF )
  7817. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7818. ) then begin
  7819. float_raise( float_flag_invalid );
  7820. result := 0;
  7821. exit;
  7822. end;
  7823. aSign := extractFloat128Sign( a );
  7824. bSign := extractFloat128Sign( b );
  7825. if ( aSign <> bSign ) then begin
  7826. result := ord(
  7827. (aSign<>0)
  7828. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7829. <> 0 ));
  7830. exit;
  7831. end;
  7832. if aSign<>0 then
  7833. result := lt128( b.high, b.low, a.high, a.low )
  7834. else
  7835. result := lt128( a.high, a.low, b.high, b.low );
  7836. end;
  7837. {*----------------------------------------------------------------------------
  7838. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7839. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7840. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7841. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7842. *----------------------------------------------------------------------------*}
  7843. function float128_eq_signaling(a: float128; b: float128): flag;
  7844. begin
  7845. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7846. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7847. or ( ( extractFloat128Exp( b ) = $7FFF )
  7848. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7849. ) then begin
  7850. float_raise( float_flag_invalid );
  7851. result := 0;
  7852. exit;
  7853. end;
  7854. result := ord(
  7855. ( a.low = b.low )
  7856. and ( ( a.high = b.high )
  7857. or ( ( a.low = 0 )
  7858. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7859. ));
  7860. end;
  7861. {*----------------------------------------------------------------------------
  7862. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7863. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7864. | cause an exception. Otherwise, the comparison is performed according to the
  7865. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7866. *----------------------------------------------------------------------------*}
  7867. function float128_le_quiet(a: float128; b: float128): flag;
  7868. var
  7869. aSign, bSign: flag;
  7870. begin
  7871. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7872. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7873. or ( ( extractFloat128Exp( b ) = $7FFF )
  7874. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7875. ) then begin
  7876. if ( (float128_is_signaling_nan( a )<>0)
  7877. or (float128_is_signaling_nan( b )<>0) ) then begin
  7878. float_raise( float_flag_invalid );
  7879. end;
  7880. result := 0;
  7881. exit;
  7882. end;
  7883. aSign := extractFloat128Sign( a );
  7884. bSign := extractFloat128Sign( b );
  7885. if ( aSign <> bSign ) then begin
  7886. result := ord(
  7887. (aSign<>0)
  7888. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7889. = 0 ));
  7890. exit;
  7891. end;
  7892. if aSign<>0 then
  7893. result := le128( b.high, b.low, a.high, a.low )
  7894. else
  7895. result := le128( a.high, a.low, b.high, b.low );
  7896. end;
  7897. {*----------------------------------------------------------------------------
  7898. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7899. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7900. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7901. | Standard for Binary Floating-Point Arithmetic.
  7902. *----------------------------------------------------------------------------*}
  7903. function float128_lt_quiet(a: float128; b: float128): flag;
  7904. var
  7905. aSign, bSign: flag;
  7906. begin
  7907. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7908. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7909. or ( ( extractFloat128Exp( b ) = $7FFF )
  7910. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7911. ) then begin
  7912. if ( (float128_is_signaling_nan( a )<>0)
  7913. or (float128_is_signaling_nan( b )<>0) ) then begin
  7914. float_raise( float_flag_invalid );
  7915. end;
  7916. result := 0;
  7917. exit;
  7918. end;
  7919. aSign := extractFloat128Sign( a );
  7920. bSign := extractFloat128Sign( b );
  7921. if ( aSign <> bSign ) then begin
  7922. result := ord(
  7923. (aSign<>0)
  7924. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7925. <> 0 ));
  7926. exit;
  7927. end;
  7928. if aSign<>0 then
  7929. result:=lt128( b.high, b.low, a.high, a.low )
  7930. else
  7931. result:=lt128( a.high, a.low, b.high, b.low );
  7932. end;
  7933. {----------------------------------------------------------------------------
  7934. | Returns the result of converting the double-precision floating-point value
  7935. | `a' to the quadruple-precision floating-point format. The conversion is
  7936. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7937. | Arithmetic.
  7938. *----------------------------------------------------------------------------}
  7939. function float64_to_float128( a : float64) : float128;
  7940. var
  7941. aSign : flag;
  7942. aExp : int16;
  7943. aSig, zSig0, zSig1 : bits64;
  7944. begin
  7945. aSig := extractFloat64Frac( a );
  7946. aExp := extractFloat64Exp( a );
  7947. aSign := extractFloat64Sign( a );
  7948. if ( aExp = $7FF ) then begin
  7949. if ( aSig<>0 ) then
  7950. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  7951. result:=packFloat128( aSign, $7FFF, 0, 0 );
  7952. exit;
  7953. end;
  7954. if ( aExp = 0 ) then begin
  7955. if ( aSig = 0 ) then
  7956. begin
  7957. result:=packFloat128( aSign, 0, 0, 0 );
  7958. exit;
  7959. end;
  7960. normalizeFloat64Subnormal( aSig, aExp, aSig );
  7961. dec(aExp);
  7962. end;
  7963. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  7964. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  7965. end;
  7966. {$endif FPC_SOFTFLOAT_FLOAT128}
  7967. {$endif not(defined(fpc_softfpu_interface))}
  7968. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  7969. end.
  7970. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}