softfpu.pp 292 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. { we use here a record in the function header because
  79. the record allows bitwise conversion to single }
  80. float32rec = record
  81. float32 : float32;
  82. end;
  83. flag = byte;
  84. uint8 = byte;
  85. int8 = shortint;
  86. uint16 = word;
  87. int16 = smallint;
  88. uint32 = longword;
  89. int32 = longint;
  90. bits8 = byte;
  91. sbits8 = shortint;
  92. bits16 = word;
  93. sbits16 = smallint;
  94. sbits32 = longint;
  95. bits32 = longword;
  96. {$ifndef fpc}
  97. qword = int64;
  98. {$endif}
  99. { now part of the system unit
  100. uint64 = qword;
  101. }
  102. bits64 = qword;
  103. sbits64 = int64;
  104. {$ifdef ENDIAN_LITTLE}
  105. float64 = packed record
  106. low: bits32;
  107. high: bits32;
  108. end;
  109. int64rec = packed record
  110. low: bits32;
  111. high: bits32;
  112. end;
  113. floatx80 = packed record
  114. low : qword;
  115. high : word;
  116. end;
  117. float128 = packed record
  118. low : qword;
  119. high : qword;
  120. end;
  121. {$else}
  122. float64 = packed record
  123. high,low : bits32;
  124. end;
  125. int64rec = packed record
  126. high,low : bits32;
  127. end;
  128. floatx80 = packed record
  129. high : word;
  130. low : qword;
  131. end;
  132. float128 = packed record
  133. high : qword;
  134. low : qword;
  135. end;
  136. {$endif}
  137. {*
  138. -------------------------------------------------------------------------------
  139. Returns 1 if the double-precision floating-point value `a' is less than
  140. the corresponding value `b', and 0 otherwise. The comparison is performed
  141. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  142. -------------------------------------------------------------------------------
  143. *}
  144. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  145. {*
  146. -------------------------------------------------------------------------------
  147. Returns 1 if the double-precision floating-point value `a' is less than
  148. or equal to the corresponding value `b', and 0 otherwise. The comparison
  149. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  150. Arithmetic.
  151. -------------------------------------------------------------------------------
  152. *}
  153. Function float64_le(a: float64;b: float64): flag; compilerproc;
  154. {*
  155. -------------------------------------------------------------------------------
  156. Returns 1 if the double-precision floating-point value `a' is equal to
  157. the corresponding value `b', and 0 otherwise. The comparison is performed
  158. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  159. -------------------------------------------------------------------------------
  160. *}
  161. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  162. {*
  163. -------------------------------------------------------------------------------
  164. Returns the square root of the double-precision floating-point value `a'.
  165. The operation is performed according to the IEC/IEEE Standard for Binary
  166. Floating-Point Arithmetic.
  167. -------------------------------------------------------------------------------
  168. *}
  169. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  170. {*
  171. -------------------------------------------------------------------------------
  172. Returns the remainder of the double-precision floating-point value `a'
  173. with respect to the corresponding value `b'. The operation is performed
  174. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  175. -------------------------------------------------------------------------------
  176. *}
  177. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  178. {*
  179. -------------------------------------------------------------------------------
  180. Returns the result of dividing the double-precision floating-point value `a'
  181. by the corresponding value `b'. The operation is performed according to the
  182. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of multiplying the double-precision floating-point values
  189. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  190. for Binary Floating-Point Arithmetic.
  191. -------------------------------------------------------------------------------
  192. *}
  193. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  194. {*
  195. -------------------------------------------------------------------------------
  196. Returns the result of subtracting the double-precision floating-point values
  197. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  198. for Binary Floating-Point Arithmetic.
  199. -------------------------------------------------------------------------------
  200. *}
  201. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  202. {*
  203. -------------------------------------------------------------------------------
  204. Returns the result of adding the double-precision floating-point values `a'
  205. and `b'. The operation is performed according to the IEC/IEEE Standard for
  206. Binary Floating-Point Arithmetic.
  207. -------------------------------------------------------------------------------
  208. *}
  209. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  210. {*
  211. -------------------------------------------------------------------------------
  212. Rounds the double-precision floating-point value `a' to an integer,
  213. and returns the result as a double-precision floating-point value. The
  214. operation is performed according to the IEC/IEEE Standard for Binary
  215. Floating-Point Arithmetic.
  216. -------------------------------------------------------------------------------
  217. *}
  218. Function float64_round_to_int(a: float64) : float64; compilerproc;
  219. {*
  220. -------------------------------------------------------------------------------
  221. Returns the result of converting the double-precision floating-point value
  222. `a' to the single-precision floating-point format. The conversion is
  223. performed according to the IEC/IEEE Standard for Binary Floating-Point
  224. Arithmetic.
  225. -------------------------------------------------------------------------------
  226. *}
  227. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  228. {*
  229. -------------------------------------------------------------------------------
  230. Returns the result of converting the double-precision floating-point value
  231. `a' to the 32-bit two's complement integer format. The conversion is
  232. performed according to the IEC/IEEE Standard for Binary Floating-Point
  233. Arithmetic, except that the conversion is always rounded toward zero.
  234. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  235. the conversion overflows, the largest integer with the same sign as `a' is
  236. returned.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the 32-bit two's complement integer format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic---which means in particular that the conversion is rounded
  246. according to the current rounding mode. If `a' is a NaN, the largest
  247. positive integer is returned. Otherwise, if the conversion overflows, the
  248. largest integer with the same sign as `a' is returned.
  249. -------------------------------------------------------------------------------
  250. *}
  251. Function float64_to_int32(a: float64): int32; compilerproc;
  252. {*
  253. -------------------------------------------------------------------------------
  254. Returns 1 if the single-precision floating-point value `a' is less than
  255. the corresponding value `b', and 0 otherwise. The comparison is performed
  256. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  257. -------------------------------------------------------------------------------
  258. *}
  259. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  260. {*
  261. -------------------------------------------------------------------------------
  262. Returns 1 if the single-precision floating-point value `a' is less than
  263. or equal to the corresponding value `b', and 0 otherwise. The comparison
  264. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  265. Arithmetic.
  266. -------------------------------------------------------------------------------
  267. *}
  268. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  269. {*
  270. -------------------------------------------------------------------------------
  271. Returns 1 if the single-precision floating-point value `a' is equal to
  272. the corresponding value `b', and 0 otherwise. The comparison is performed
  273. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns the square root of the single-precision floating-point value `a'.
  280. The operation is performed according to the IEC/IEEE Standard for Binary
  281. Floating-Point Arithmetic.
  282. -------------------------------------------------------------------------------
  283. *}
  284. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  285. {*
  286. -------------------------------------------------------------------------------
  287. Returns the remainder of the single-precision floating-point value `a'
  288. with respect to the corresponding value `b'. The operation is performed
  289. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  290. -------------------------------------------------------------------------------
  291. *}
  292. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  293. {*
  294. -------------------------------------------------------------------------------
  295. Returns the result of dividing the single-precision floating-point value `a'
  296. by the corresponding value `b'. The operation is performed according to the
  297. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of multiplying the single-precision floating-point values
  304. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  305. for Binary Floating-Point Arithmetic.
  306. -------------------------------------------------------------------------------
  307. *}
  308. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  309. {*
  310. -------------------------------------------------------------------------------
  311. Returns the result of subtracting the single-precision floating-point values
  312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  313. for Binary Floating-Point Arithmetic.
  314. -------------------------------------------------------------------------------
  315. *}
  316. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  317. {*
  318. -------------------------------------------------------------------------------
  319. Returns the result of adding the single-precision floating-point values `a'
  320. and `b'. The operation is performed according to the IEC/IEEE Standard for
  321. Binary Floating-Point Arithmetic.
  322. -------------------------------------------------------------------------------
  323. *}
  324. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  325. {*
  326. -------------------------------------------------------------------------------
  327. Rounds the single-precision floating-point value `a' to an integer,
  328. and returns the result as a single-precision floating-point value. The
  329. operation is performed according to the IEC/IEEE Standard for Binary
  330. Floating-Point Arithmetic.
  331. -------------------------------------------------------------------------------
  332. *}
  333. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  334. {*
  335. -------------------------------------------------------------------------------
  336. Returns the result of converting the single-precision floating-point value
  337. `a' to the double-precision floating-point format. The conversion is
  338. performed according to the IEC/IEEE Standard for Binary Floating-Point
  339. Arithmetic.
  340. -------------------------------------------------------------------------------
  341. *}
  342. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  343. {*
  344. -------------------------------------------------------------------------------
  345. Returns the result of converting the single-precision floating-point value
  346. `a' to the 32-bit two's complement integer format. The conversion is
  347. performed according to the IEC/IEEE Standard for Binary Floating-Point
  348. Arithmetic, except that the conversion is always rounded toward zero.
  349. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  350. the conversion overflows, the largest integer with the same sign as `a' is
  351. returned.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the 32-bit two's complement integer format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic---which means in particular that the conversion is rounded
  361. according to the current rounding mode. If `a' is a NaN, the largest
  362. positive integer is returned. Otherwise, if the conversion overflows, the
  363. largest integer with the same sign as `a' is returned.
  364. -------------------------------------------------------------------------------
  365. *}
  366. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  367. {*
  368. -------------------------------------------------------------------------------
  369. Returns the result of converting the 32-bit two's complement integer `a' to
  370. the double-precision floating-point format. The conversion is performed
  371. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  372. -------------------------------------------------------------------------------
  373. *}
  374. Function int32_to_float64( a: int32) : float64; compilerproc;
  375. {*
  376. -------------------------------------------------------------------------------
  377. Returns the result of converting the 32-bit two's complement integer `a' to
  378. the single-precision floating-point format. The conversion is performed
  379. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  380. -------------------------------------------------------------------------------
  381. *}
  382. Function int32_to_float32( a: int32): float32rec; compilerproc;
  383. {*----------------------------------------------------------------------------
  384. | Returns the result of converting the 64-bit two's complement integer `a'
  385. | to the double-precision floating-point format. The conversion is performed
  386. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. *----------------------------------------------------------------------------*}
  388. Function int64_to_float64( a: int64 ): float64; compilerproc;
  389. {*----------------------------------------------------------------------------
  390. | Returns the result of converting the 64-bit two's complement integer `a'
  391. | to the single-precision floating-point format. The conversion is performed
  392. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. *----------------------------------------------------------------------------*}
  394. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  395. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  396. function float128_is_nan( a : float128): flag;
  397. function float128_is_signaling_nan( a : float128): flag;
  398. function float128_to_int32(a: float128): int32;
  399. function float128_to_int32_round_to_zero(a: float128): int32;
  400. function float128_to_int64(a: float128): int64;
  401. function float128_to_int64_round_to_zero(a: float128): int64;
  402. function float128_to_float32(a: float128): float32;
  403. function float128_to_float64(a: float128): float64;
  404. {$ifdef FPC_SOFTFLOAT_FLOAT80}
  405. function float128_to_floatx80(a: float128): floatx80;
  406. {$endif FPC_SOFTFLOAT_FLOAT80}
  407. function float128_round_to_int(a: float128): float128;
  408. function float128_add(a: float128; b: float128): float128;
  409. function float128_sub(a: float128; b: float128): float128;
  410. function float128_mul(a: float128; b: float128): float128;
  411. function float128_div(a: float128; b: float128): float128;
  412. function float128_rem(a: float128; b: float128): float128;
  413. function float128_sqrt(a: float128): float128;
  414. function float128_eq(a: float128; b: float128): flag;
  415. function float128_le(a: float128; b: float128): flag;
  416. function float128_lt(a: float128; b: float128): flag;
  417. function float128_eq_signaling(a: float128; b: float128): flag;
  418. function float128_le_quiet(a: float128; b: float128): flag;
  419. function float128_lt_quiet(a: float128; b: float128): flag;
  420. {$endif FPC_SOFTFLOAT_FLOAT128}
  421. CONST
  422. {-------------------------------------------------------------------------------
  423. Software IEC/IEEE floating-point underflow tininess-detection mode.
  424. -------------------------------------------------------------------------------
  425. *}
  426. float_tininess_after_rounding = 0;
  427. float_tininess_before_rounding = 1;
  428. {*
  429. -------------------------------------------------------------------------------
  430. Software IEC/IEEE floating-point rounding mode.
  431. -------------------------------------------------------------------------------
  432. *}
  433. {
  434. Round to nearest.
  435. This is the default mode. It should be used unless there is a specific
  436. need for one of the others. In this mode results are rounded to the
  437. nearest representable value. If the result is midway between two
  438. representable values, the even representable is chosen. Even here
  439. means the lowest-order bit is zero. This rounding mode prevents
  440. statistical bias and guarantees numeric stability: round-off errors
  441. in a lengthy calculation will remain smaller than half of FLT_EPSILON.
  442. Round toward plus Infinity.
  443. All results are rounded to the smallest representable value which is
  444. greater than the result.
  445. Round toward minus Infinity.
  446. All results are rounded to the largest representable value which is
  447. less than the result.
  448. Round toward zero.
  449. All results are rounded to the largest representable value whose
  450. magnitude is less than that of the result. In other words, if the
  451. result is negative it is rounded up; if it is positive, it is
  452. rounded down.
  453. }
  454. float_round_nearest_even = 0;
  455. float_round_down = 1;
  456. float_round_up = 2;
  457. float_round_to_zero = 3;
  458. {*
  459. -------------------------------------------------------------------------------
  460. Floating-point rounding mode and exception flags.
  461. -------------------------------------------------------------------------------
  462. *}
  463. const
  464. float_rounding_mode : Byte = float_round_nearest_even;
  465. {*
  466. -------------------------------------------------------------------------------
  467. Underflow tininess-detection mode, statically initialized to default value.
  468. (The declaration in `softfloat.h' must match the `int8' type here.)
  469. -------------------------------------------------------------------------------
  470. *}
  471. const float_detect_tininess: int8 = float_tininess_after_rounding;
  472. {$endif not(defined(fpc_softfpu_implementation))}
  473. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  474. implementation
  475. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  476. {$if not(defined(fpc_softfpu_interface))}
  477. (*****************************************************************************)
  478. (*----------------------------------------------------------------------------*)
  479. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  480. (* division and square root approximations. (Can be specialized to target if *)
  481. (* desired.) *)
  482. (* ---------------------------------------------------------------------------*)
  483. (*****************************************************************************)
  484. {*----------------------------------------------------------------------------
  485. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  486. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  487. | input. If `zSign' is 1, the input is negated before being converted to an
  488. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  489. | is simply rounded to an integer, with the inexact exception raised if the
  490. | input cannot be represented exactly as an integer. However, if the fixed-
  491. | point input is too large, the invalid exception is raised and the largest
  492. | positive or negative integer is returned.
  493. *----------------------------------------------------------------------------*}
  494. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  495. var
  496. roundingMode: int8;
  497. roundNearestEven: flag;
  498. roundIncrement, roundBits: int8;
  499. z: int32;
  500. begin
  501. roundingMode := float_rounding_mode;
  502. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  503. roundIncrement := $40;
  504. if ( roundNearestEven=0 ) then
  505. begin
  506. if ( roundingMode = float_round_to_zero ) then
  507. begin
  508. roundIncrement := 0;
  509. end
  510. else begin
  511. roundIncrement := $7F;
  512. if ( zSign<>0 ) then
  513. begin
  514. if ( roundingMode = float_round_up ) then
  515. roundIncrement := 0;
  516. end
  517. else begin
  518. if ( roundingMode = float_round_down ) then
  519. roundIncrement := 0;
  520. end;
  521. end;
  522. end;
  523. roundBits := absZ and $7F;
  524. absZ := ( absZ + roundIncrement ) shr 7;
  525. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  526. z := absZ;
  527. if ( zSign<>0 ) then
  528. z := - z;
  529. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  530. begin
  531. float_raise( float_flag_invalid );
  532. if zSign<>0 then
  533. result:=sbits32($80000000)
  534. else
  535. result:=$7FFFFFFF;
  536. exit;
  537. end;
  538. if ( roundBits<>0 ) then
  539. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  540. result:=z;
  541. end;
  542. {*----------------------------------------------------------------------------
  543. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  544. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  545. | and returns the properly rounded 64-bit integer corresponding to the input.
  546. | If `zSign' is 1, the input is negated before being converted to an integer.
  547. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  548. | the inexact exception raised if the input cannot be represented exactly as
  549. | an integer. However, if the fixed-point input is too large, the invalid
  550. | exception is raised and the largest positive or negative integer is
  551. | returned.
  552. *----------------------------------------------------------------------------*}
  553. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  554. var
  555. roundingMode: int8;
  556. roundNearestEven, increment: flag;
  557. z: int64;
  558. label
  559. overflow;
  560. begin
  561. roundingMode := float_rounding_mode;
  562. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  563. increment := ord( sbits64(absZ1) < 0 );
  564. if ( roundNearestEven=0 ) then
  565. begin
  566. if ( roundingMode = float_round_to_zero ) then
  567. begin
  568. increment := 0;
  569. end
  570. else begin
  571. if ( zSign<>0 ) then
  572. begin
  573. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  574. end
  575. else begin
  576. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  577. end;
  578. end;
  579. end;
  580. if ( increment<>0 ) then
  581. begin
  582. inc(absZ0);
  583. if ( absZ0 = 0 ) then
  584. goto overflow;
  585. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  586. end;
  587. z := absZ0;
  588. if ( zSign<>0 ) then
  589. z := - z;
  590. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  591. begin
  592. overflow:
  593. float_raise( float_flag_invalid );
  594. if zSign<>0 then
  595. result:=int64($8000000000000000)
  596. else
  597. result:=int64($7FFFFFFFFFFFFFFF);
  598. end;
  599. if ( absZ1<>0 ) then
  600. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  601. result:=z;
  602. end;
  603. {*
  604. -------------------------------------------------------------------------------
  605. Shifts `a' right by the number of bits given in `count'. If any nonzero
  606. bits are shifted off, they are ``jammed'' into the least significant bit of
  607. the result by setting the least significant bit to 1. The value of `count'
  608. can be arbitrarily large; in particular, if `count' is greater than 32, the
  609. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  610. The result is stored in the location pointed to by `zPtr'.
  611. -------------------------------------------------------------------------------
  612. *}
  613. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  614. var
  615. z: Bits32;
  616. Begin
  617. if ( count = 0 ) then
  618. z := a
  619. else
  620. if ( count < 32 ) then
  621. Begin
  622. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  623. End
  624. else
  625. Begin
  626. z := bits32( a <> 0 );
  627. End;
  628. zPtr := z;
  629. End;
  630. {*----------------------------------------------------------------------------
  631. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  632. | number of bits given in `count'. Any bits shifted off are lost. The value
  633. | of `count' can be arbitrarily large; in particular, if `count' is greater
  634. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  635. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  636. *----------------------------------------------------------------------------*}
  637. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  638. var
  639. z0, z1: bits64;
  640. negCount: int8;
  641. begin
  642. negCount := ( - count ) and 63;
  643. if ( count = 0 ) then
  644. begin
  645. z1 := a1;
  646. z0 := a0;
  647. end
  648. else if ( count < 64 ) then
  649. begin
  650. z1 := ( a0 shl negCount ) or ( a1 shr count );
  651. z0 := a0 shr count;
  652. end
  653. else
  654. begin
  655. if ( count shl 64 )<>0 then
  656. z1 := a0 shr ( count and 63 )
  657. else
  658. z1 := 0;
  659. z0 := 0;
  660. end;
  661. z1Ptr := z1;
  662. z0Ptr := z0;
  663. end;
  664. {*----------------------------------------------------------------------------
  665. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  666. | number of bits given in `count'. If any nonzero bits are shifted off, they
  667. | are ``jammed'' into the least significant bit of the result by setting the
  668. | least significant bit to 1. The value of `count' can be arbitrarily large;
  669. | in particular, if `count' is greater than 128, the result will be either
  670. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  671. | nonzero. The result is broken into two 64-bit pieces which are stored at
  672. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  673. *----------------------------------------------------------------------------*}
  674. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  675. var
  676. z0,z1 : bits64;
  677. negCount : int8;
  678. begin
  679. negCount := ( - count ) and 63;
  680. if ( count = 0 ) then begin
  681. z1 := a1;
  682. z0 := a0;
  683. end
  684. else if ( count < 64 ) then begin
  685. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  686. z0 := a0>>count;
  687. end
  688. else begin
  689. if ( count = 64 ) then begin
  690. z1 := a0 or ord( a1 <> 0 );
  691. end
  692. else if ( count < 128 ) then begin
  693. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  694. end
  695. else begin
  696. z1 := ord( ( a0 or a1 ) <> 0 );
  697. end;
  698. z0 := 0;
  699. end;
  700. z1Ptr := z1;
  701. z0Ptr := z0;
  702. end;
  703. {*
  704. -------------------------------------------------------------------------------
  705. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  706. number of bits given in `count'. Any bits shifted off are lost. The value
  707. of `count' can be arbitrarily large; in particular, if `count' is greater
  708. than 64, the result will be 0. The result is broken into two 32-bit pieces
  709. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  710. -------------------------------------------------------------------------------
  711. *}
  712. Procedure
  713. shift64Right(
  714. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  715. Var
  716. z0, z1: bits32;
  717. negCount : int8;
  718. Begin
  719. negCount := ( - count ) AND 31;
  720. if ( count = 0 ) then
  721. Begin
  722. z1 := a1;
  723. z0 := a0;
  724. End
  725. else if ( count < 32 ) then
  726. Begin
  727. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  728. z0 := a0 shr count;
  729. End
  730. else
  731. Begin
  732. if (count < 64) then
  733. z1 := ( a0 shr ( count AND 31 ) )
  734. else
  735. z1 := 0;
  736. z0 := 0;
  737. End;
  738. z1Ptr := z1;
  739. z0Ptr := z0;
  740. End;
  741. {*
  742. -------------------------------------------------------------------------------
  743. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  744. number of bits given in `count'. If any nonzero bits are shifted off, they
  745. are ``jammed'' into the least significant bit of the result by setting the
  746. least significant bit to 1. The value of `count' can be arbitrarily large;
  747. in particular, if `count' is greater than 64, the result will be either 0
  748. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  749. nonzero. The result is broken into two 32-bit pieces which are stored at
  750. the locations pointed to by `z0Ptr' and `z1Ptr'.
  751. -------------------------------------------------------------------------------
  752. *}
  753. Procedure
  754. shift64RightJamming(
  755. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  756. VAR
  757. z0, z1 : bits32;
  758. negCount : int8;
  759. Begin
  760. negCount := ( - count ) AND 31;
  761. if ( count = 0 ) then
  762. Begin
  763. z1 := a1;
  764. z0 := a0;
  765. End
  766. else
  767. if ( count < 32 ) then
  768. Begin
  769. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  770. z0 := a0 shr count;
  771. End
  772. else
  773. Begin
  774. if ( count = 32 ) then
  775. Begin
  776. z1 := a0 OR bits32( a1 <> 0 );
  777. End
  778. else
  779. if ( count < 64 ) Then
  780. Begin
  781. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  782. End
  783. else
  784. Begin
  785. z1 := bits32( ( a0 OR a1 ) <> 0 );
  786. End;
  787. z0 := 0;
  788. End;
  789. z1Ptr := z1;
  790. z0Ptr := z0;
  791. End;
  792. {*----------------------------------------------------------------------------
  793. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  794. | bits are shifted off, they are ``jammed'' into the least significant bit of
  795. | the result by setting the least significant bit to 1. The value of `count'
  796. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  797. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  798. | The result is stored in the location pointed to by `zPtr'.
  799. *----------------------------------------------------------------------------*}
  800. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  801. var
  802. z: bits64;
  803. begin
  804. if ( count = 0 ) then
  805. begin
  806. z := a;
  807. end
  808. else if ( count < 64 ) then
  809. begin
  810. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  811. end
  812. else
  813. begin
  814. z := ord( a <> 0 );
  815. end;
  816. zPtr := z;
  817. end;
  818. {*
  819. -------------------------------------------------------------------------------
  820. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  821. by 32 _plus_ the number of bits given in `count'. The shifted result is
  822. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  823. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  824. off form a third 32-bit result as follows: The _last_ bit shifted off is
  825. the most-significant bit of the extra result, and the other 31 bits of the
  826. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  827. were all zero. This extra result is stored in the location pointed to by
  828. `z2Ptr'. The value of `count' can be arbitrarily large.
  829. (This routine makes more sense if `a0', `a1', and `a2' are considered
  830. to form a fixed-point value with binary point between `a1' and `a2'. This
  831. fixed-point value is shifted right by the number of bits given in `count',
  832. and the integer part of the result is returned at the locations pointed to
  833. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  834. corrupted as described above, and is returned at the location pointed to by
  835. `z2Ptr'.)
  836. -------------------------------------------------------------------------------
  837. }
  838. Procedure
  839. shift64ExtraRightJamming(
  840. a0: bits32;
  841. a1: bits32;
  842. a2: bits32;
  843. count: int16;
  844. VAR z0Ptr: bits32;
  845. VAR z1Ptr: bits32;
  846. VAR z2Ptr: bits32
  847. );
  848. Var
  849. z0, z1, z2: bits32;
  850. negCount : int8;
  851. Begin
  852. negCount := ( - count ) AND 31;
  853. if ( count = 0 ) then
  854. Begin
  855. z2 := a2;
  856. z1 := a1;
  857. z0 := a0;
  858. End
  859. else
  860. Begin
  861. if ( count < 32 ) Then
  862. Begin
  863. z2 := a1 shl negCount;
  864. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  865. z0 := a0 shr count;
  866. End
  867. else
  868. Begin
  869. if ( count = 32 ) then
  870. Begin
  871. z2 := a1;
  872. z1 := a0;
  873. End
  874. else
  875. Begin
  876. a2 := a2 or a1;
  877. if ( count < 64 ) then
  878. Begin
  879. z2 := a0 shl negCount;
  880. z1 := a0 shr ( count AND 31 );
  881. End
  882. else
  883. Begin
  884. if count = 64 then
  885. z2 := a0
  886. else
  887. z2 := bits32(a0 <> 0);
  888. z1 := 0;
  889. End;
  890. End;
  891. z0 := 0;
  892. End;
  893. z2 := z2 or bits32( a2 <> 0 );
  894. End;
  895. z2Ptr := z2;
  896. z1Ptr := z1;
  897. z0Ptr := z0;
  898. End;
  899. {*
  900. -------------------------------------------------------------------------------
  901. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  902. number of bits given in `count'. Any bits shifted off are lost. The value
  903. of `count' must be less than 32. The result is broken into two 32-bit
  904. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  905. -------------------------------------------------------------------------------
  906. *}
  907. Procedure
  908. shortShift64Left(
  909. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  910. Begin
  911. z1Ptr := a1 shl count;
  912. if count = 0 then
  913. z0Ptr := a0
  914. else
  915. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  916. End;
  917. {*
  918. -------------------------------------------------------------------------------
  919. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  920. by the number of bits given in `count'. Any bits shifted off are lost.
  921. The value of `count' must be less than 32. The result is broken into three
  922. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  923. `z1Ptr', and `z2Ptr'.
  924. -------------------------------------------------------------------------------
  925. *}
  926. Procedure
  927. shortShift96Left(
  928. a0: bits32;
  929. a1: bits32;
  930. a2: bits32;
  931. count: int16;
  932. VAR z0Ptr: bits32;
  933. VAR z1Ptr: bits32;
  934. VAR z2Ptr: bits32
  935. );
  936. Var
  937. z0, z1, z2: bits32;
  938. negCount: int8;
  939. Begin
  940. z2 := a2 shl count;
  941. z1 := a1 shl count;
  942. z0 := a0 shl count;
  943. if ( 0 < count ) then
  944. Begin
  945. negCount := ( ( - count ) AND 31 );
  946. z1 := z1 or (a2 shr negCount);
  947. z0 := z0 or (a1 shr negCount);
  948. End;
  949. z2Ptr := z2;
  950. z1Ptr := z1;
  951. z0Ptr := z0;
  952. End;
  953. {*----------------------------------------------------------------------------
  954. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  955. | number of bits given in `count'. Any bits shifted off are lost. The value
  956. | of `count' must be less than 64. The result is broken into two 64-bit
  957. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  958. *----------------------------------------------------------------------------*}
  959. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  960. begin
  961. z1Ptr := a1 shl count;
  962. if count=0 then
  963. z0Ptr:=a0
  964. else
  965. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  966. end;
  967. {*
  968. -------------------------------------------------------------------------------
  969. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  970. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  971. any carry out is lost. The result is broken into two 32-bit pieces which
  972. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  973. -------------------------------------------------------------------------------
  974. *}
  975. Procedure
  976. add64(
  977. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  978. Var
  979. z1: bits32;
  980. Begin
  981. z1 := a1 + b1;
  982. z1Ptr := z1;
  983. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  984. End;
  985. {*
  986. -------------------------------------------------------------------------------
  987. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  988. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  989. modulo 2^96, so any carry out is lost. The result is broken into three
  990. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  991. `z1Ptr', and `z2Ptr'.
  992. -------------------------------------------------------------------------------
  993. *}
  994. Procedure
  995. add96(
  996. a0: bits32;
  997. a1: bits32;
  998. a2: bits32;
  999. b0: bits32;
  1000. b1: bits32;
  1001. b2: bits32;
  1002. VAR z0Ptr: bits32;
  1003. VAR z1Ptr: bits32;
  1004. VAR z2Ptr: bits32
  1005. );
  1006. var
  1007. z0, z1, z2: bits32;
  1008. carry0, carry1: int8;
  1009. Begin
  1010. z2 := a2 + b2;
  1011. carry1 := int8( z2 < a2 );
  1012. z1 := a1 + b1;
  1013. carry0 := int8( z1 < a1 );
  1014. z0 := a0 + b0;
  1015. z1 := z1 + carry1;
  1016. z0 := z0 + bits32( z1 < carry1 );
  1017. z0 := z0 + carry0;
  1018. z2Ptr := z2;
  1019. z1Ptr := z1;
  1020. z0Ptr := z0;
  1021. End;
  1022. {*----------------------------------------------------------------------------
  1023. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1024. | by the number of bits given in `count'. Any bits shifted off are lost.
  1025. | The value of `count' must be less than 64. The result is broken into three
  1026. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1027. | `z1Ptr', and `z2Ptr'.
  1028. *----------------------------------------------------------------------------*}
  1029. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1030. var
  1031. z0, z1, z2 : bits64;
  1032. negCount : int8;
  1033. begin
  1034. z2 := a2 shl count;
  1035. z1 := a1 shl count;
  1036. z0 := a0 shl count;
  1037. if ( 0 < count ) then
  1038. begin
  1039. negCount := ( ( - count ) and 63 );
  1040. z1 := z1 or (a2 shr negCount);
  1041. z0 := z0 or (a1 shr negCount);
  1042. end;
  1043. z2Ptr := z2;
  1044. z1Ptr := z1;
  1045. z0Ptr := z0;
  1046. end;
  1047. {*----------------------------------------------------------------------------
  1048. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1049. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1050. | any carry out is lost. The result is broken into two 64-bit pieces which
  1051. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1052. *----------------------------------------------------------------------------*}
  1053. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1054. var
  1055. z1 : bits64;
  1056. begin
  1057. z1 := a1 + b1;
  1058. z1Ptr := z1;
  1059. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1060. end;
  1061. {*----------------------------------------------------------------------------
  1062. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1063. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1064. | modulo 2^192, so any carry out is lost. The result is broken into three
  1065. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1066. | `z1Ptr', and `z2Ptr'.
  1067. *----------------------------------------------------------------------------*}
  1068. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1069. var
  1070. z0, z1, z2 : bits64;
  1071. carry0, carry1 : int8;
  1072. begin
  1073. z2 := a2 + b2;
  1074. carry1 := ord( z2 < a2 );
  1075. z1 := a1 + b1;
  1076. carry0 := ord( z1 < a1 );
  1077. z0 := a0 + b0;
  1078. inc(z1, carry1);
  1079. inc(z0, ord( z1 < carry1 ));
  1080. inc(z0, carry0);
  1081. z2Ptr := z2;
  1082. z1Ptr := z1;
  1083. z0Ptr := z0;
  1084. end;
  1085. {*
  1086. -------------------------------------------------------------------------------
  1087. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1088. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1089. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1090. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1091. `z1Ptr'.
  1092. -------------------------------------------------------------------------------
  1093. *}
  1094. Procedure
  1095. sub64(
  1096. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1097. Begin
  1098. z1Ptr := a1 - b1;
  1099. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1100. End;
  1101. {*
  1102. -------------------------------------------------------------------------------
  1103. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1104. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1105. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1106. into three 32-bit pieces which are stored at the locations pointed to by
  1107. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1108. -------------------------------------------------------------------------------
  1109. *}
  1110. Procedure
  1111. sub96(
  1112. a0:bits32;
  1113. a1:bits32;
  1114. a2:bits32;
  1115. b0:bits32;
  1116. b1:bits32;
  1117. b2:bits32;
  1118. VAR z0Ptr:bits32;
  1119. VAR z1Ptr:bits32;
  1120. VAR z2Ptr:bits32
  1121. );
  1122. Var
  1123. z0, z1, z2: bits32;
  1124. borrow0, borrow1: int8;
  1125. Begin
  1126. z2 := a2 - b2;
  1127. borrow1 := int8( a2 < b2 );
  1128. z1 := a1 - b1;
  1129. borrow0 := int8( a1 < b1 );
  1130. z0 := a0 - b0;
  1131. z0 := z0 - bits32( z1 < borrow1 );
  1132. z1 := z1 - borrow1;
  1133. z0 := z0 -borrow0;
  1134. z2Ptr := z2;
  1135. z1Ptr := z1;
  1136. z0Ptr := z0;
  1137. End;
  1138. {*----------------------------------------------------------------------------
  1139. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1140. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1141. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1142. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1143. | `z1Ptr'.
  1144. *----------------------------------------------------------------------------*}
  1145. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1146. begin
  1147. z1Ptr := a1 - b1;
  1148. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1149. end;
  1150. {*----------------------------------------------------------------------------
  1151. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1152. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1153. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1154. | result is broken into three 64-bit pieces which are stored at the locations
  1155. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1156. *----------------------------------------------------------------------------*}
  1157. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1158. var
  1159. z0, z1, z2 : bits64;
  1160. borrow0, borrow1 : int8;
  1161. begin
  1162. z2 := a2 - b2;
  1163. borrow1 := ord( a2 < b2 );
  1164. z1 := a1 - b1;
  1165. borrow0 := ord( a1 < b1 );
  1166. z0 := a0 - b0;
  1167. dec(z0, ord( z1 < borrow1 ));
  1168. dec(z1, borrow1);
  1169. dec(z0, borrow0);
  1170. z2Ptr := z2;
  1171. z1Ptr := z1;
  1172. z0Ptr := z0;
  1173. end;
  1174. {*
  1175. -------------------------------------------------------------------------------
  1176. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1177. into two 32-bit pieces which are stored at the locations pointed to by
  1178. `z0Ptr' and `z1Ptr'.
  1179. -------------------------------------------------------------------------------
  1180. *}
  1181. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1182. :bits32 );
  1183. Var
  1184. aHigh, aLow, bHigh, bLow: bits16;
  1185. z0, zMiddleA, zMiddleB, z1: bits32;
  1186. Begin
  1187. aLow := a and $ffff;
  1188. aHigh := a shr 16;
  1189. bLow := b and $ffff;
  1190. bHigh := b shr 16;
  1191. z1 := ( bits32( aLow) ) * bLow;
  1192. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1193. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1194. z0 := ( bits32 (aHigh) ) * bHigh;
  1195. zMiddleA := zMiddleA + zMiddleB;
  1196. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1197. zMiddleA := zmiddleA shl 16;
  1198. z1 := z1 + zMiddleA;
  1199. z0 := z0 + bits32( z1 < zMiddleA );
  1200. z1Ptr := z1;
  1201. z0Ptr := z0;
  1202. End;
  1203. {*
  1204. -------------------------------------------------------------------------------
  1205. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1206. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1207. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1208. `z2Ptr'.
  1209. -------------------------------------------------------------------------------
  1210. *}
  1211. Procedure
  1212. mul64By32To96(
  1213. a0:bits32;
  1214. a1:bits32;
  1215. b:bits32;
  1216. VAR z0Ptr:bits32;
  1217. VAR z1Ptr:bits32;
  1218. VAR z2Ptr:bits32
  1219. );
  1220. Var
  1221. z0, z1, z2, more1: bits32;
  1222. Begin
  1223. mul32To64( a1, b, z1, z2 );
  1224. mul32To64( a0, b, z0, more1 );
  1225. add64( z0, more1, 0, z1, z0, z1 );
  1226. z2Ptr := z2;
  1227. z1Ptr := z1;
  1228. z0Ptr := z0;
  1229. End;
  1230. {*
  1231. -------------------------------------------------------------------------------
  1232. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1233. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1234. product. The product is broken into four 32-bit pieces which are stored at
  1235. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1236. -------------------------------------------------------------------------------
  1237. *}
  1238. Procedure
  1239. mul64To128(
  1240. a0:bits32;
  1241. a1:bits32;
  1242. b0:bits32;
  1243. b1:bits32;
  1244. VAR z0Ptr:bits32;
  1245. VAR z1Ptr:bits32;
  1246. VAR z2Ptr:bits32;
  1247. VAR z3Ptr:bits32
  1248. );
  1249. Var
  1250. z0, z1, z2, z3: bits32;
  1251. more1, more2: bits32;
  1252. Begin
  1253. mul32To64( a1, b1, z2, z3 );
  1254. mul32To64( a1, b0, z1, more2 );
  1255. add64( z1, more2, 0, z2, z1, z2 );
  1256. mul32To64( a0, b0, z0, more1 );
  1257. add64( z0, more1, 0, z1, z0, z1 );
  1258. mul32To64( a0, b1, more1, more2 );
  1259. add64( more1, more2, 0, z2, more1, z2 );
  1260. add64( z0, z1, 0, more1, z0, z1 );
  1261. z3Ptr := z3;
  1262. z2Ptr := z2;
  1263. z1Ptr := z1;
  1264. z0Ptr := z0;
  1265. End;
  1266. {*----------------------------------------------------------------------------
  1267. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1268. | into two 64-bit pieces which are stored at the locations pointed to by
  1269. | `z0Ptr' and `z1Ptr'.
  1270. *----------------------------------------------------------------------------*}
  1271. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1272. var
  1273. aHigh, aLow, bHigh, bLow : bits32;
  1274. z0, zMiddleA, zMiddleB, z1 : bits64;
  1275. begin
  1276. aLow := a;
  1277. aHigh := a shr 32;
  1278. bLow := b;
  1279. bHigh := b shr 32;
  1280. z1 := ( bits64(aLow) ) * bLow;
  1281. zMiddleA := ( bits64( aLow )) * bHigh;
  1282. zMiddleB := ( bits64( aHigh )) * bLow;
  1283. z0 := ( bits64(aHigh) ) * bHigh;
  1284. inc(zMiddleA, zMiddleB);
  1285. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1286. zMiddleA := zMiddleA shl 32;
  1287. inc(z1, zMiddleA);
  1288. inc(z0, ord( z1 < zMiddleA ));
  1289. z1Ptr := z1;
  1290. z0Ptr := z0;
  1291. end;
  1292. {*----------------------------------------------------------------------------
  1293. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1294. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1295. | product. The product is broken into four 64-bit pieces which are stored at
  1296. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1297. *----------------------------------------------------------------------------*}
  1298. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1299. var
  1300. z0,z1,z2,z3,more1,more2 : bits64;
  1301. begin
  1302. mul64To128( a1, b1, z2, z3 );
  1303. mul64To128( a1, b0, z1, more2 );
  1304. add128( z1, more2, 0, z2, z1, z2 );
  1305. mul64To128( a0, b0, z0, more1 );
  1306. add128( z0, more1, 0, z1, z0, z1 );
  1307. mul64To128( a0, b1, more1, more2 );
  1308. add128( more1, more2, 0, z2, more1, z2 );
  1309. add128( z0, z1, 0, more1, z0, z1 );
  1310. z3Ptr := z3;
  1311. z2Ptr := z2;
  1312. z1Ptr := z1;
  1313. z0Ptr := z0;
  1314. end;
  1315. {*----------------------------------------------------------------------------
  1316. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1317. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1318. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1319. | `z2Ptr'.
  1320. *----------------------------------------------------------------------------*}
  1321. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1322. var
  1323. z0, z1, z2, more1 : bits64;
  1324. begin
  1325. mul64To128( a1, b, z1, z2 );
  1326. mul64To128( a0, b, z0, more1 );
  1327. add128( z0, more1, 0, z1, z0, z1 );
  1328. z2Ptr := z2;
  1329. z1Ptr := z1;
  1330. z0Ptr := z0;
  1331. end;
  1332. {*----------------------------------------------------------------------------
  1333. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1334. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1335. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1336. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1337. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1338. | unsigned integer is returned.
  1339. *----------------------------------------------------------------------------*}
  1340. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1341. var
  1342. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1343. begin
  1344. if ( b <= a0 ) then
  1345. begin
  1346. result:=qword( $FFFFFFFFFFFFFFFF );
  1347. exit;
  1348. end;
  1349. b0 := b shr 32;
  1350. if ( b0 shl 32 <= a0 ) then
  1351. z:=qword( $FFFFFFFF00000000 )
  1352. else
  1353. z:=( a0 div b0 ) shl 32;
  1354. mul64To128( b, z, term0, term1 );
  1355. sub128( a0, a1, term0, term1, rem0, rem1 );
  1356. while ( ( sbits64(rem0) ) < 0 ) do begin
  1357. dec(z,qword( $100000000 ));
  1358. b1 := b shl 32;
  1359. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1360. end;
  1361. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1362. if ( b0 shl 32 <= rem0 ) then
  1363. z:=z or $FFFFFFFF
  1364. else
  1365. z:=z or rem0 div b0;
  1366. result:=z;
  1367. end;
  1368. {*
  1369. -------------------------------------------------------------------------------
  1370. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1371. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1372. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1373. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1374. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1375. unsigned integer is returned.
  1376. -------------------------------------------------------------------------------
  1377. *}
  1378. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1379. Var
  1380. b0, b1: bits32;
  1381. rem0, rem1, term0, term1: bits32;
  1382. z: bits32;
  1383. Begin
  1384. if ( b <= a0 ) then
  1385. Begin
  1386. estimateDiv64To32 := $FFFFFFFF;
  1387. exit;
  1388. End;
  1389. b0 := b shr 16;
  1390. if ( b0 shl 16 <= a0 ) then
  1391. z:= $FFFF0000
  1392. else
  1393. z:= ( a0 div b0 ) shl 16;
  1394. mul32To64( b, z, term0, term1 );
  1395. sub64( a0, a1, term0, term1, rem0, rem1 );
  1396. while ( ( sbits32 (rem0) ) < 0 ) do
  1397. Begin
  1398. z := z - $10000;
  1399. b1 := b shl 16;
  1400. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1401. End;
  1402. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1403. if ( b0 shl 16 <= rem0 ) then
  1404. z := z or $FFFF
  1405. else
  1406. z := z or (rem0 div b0);
  1407. estimateDiv64To32 := z;
  1408. End;
  1409. {*
  1410. -------------------------------------------------------------------------------
  1411. Returns an approximation to the square root of the 32-bit significand given
  1412. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1413. `aExp' (the least significant bit) is 1, the integer returned approximates
  1414. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1415. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1416. case, the approximation returned lies strictly within +/-2 of the exact
  1417. value.
  1418. -------------------------------------------------------------------------------
  1419. *}
  1420. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1421. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1422. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1423. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1424. );
  1425. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1426. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1427. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1428. );
  1429. Var
  1430. index: int8;
  1431. z: bits32;
  1432. Begin
  1433. index := ( a shr 27 ) AND 15;
  1434. if ( aExp AND 1 ) <> 0 then
  1435. Begin
  1436. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1437. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1438. a := a shr 1;
  1439. End
  1440. else
  1441. Begin
  1442. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1443. z := a div z + z;
  1444. if ( $20000 <= z ) then
  1445. z := $FFFF8000
  1446. else
  1447. z := ( z shl 15 );
  1448. if ( z <= a ) then
  1449. Begin
  1450. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1451. exit;
  1452. End;
  1453. End;
  1454. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1455. End;
  1456. {*
  1457. -------------------------------------------------------------------------------
  1458. Returns the number of leading 0 bits before the most-significant 1 bit of
  1459. `a'. If `a' is zero, 32 is returned.
  1460. -------------------------------------------------------------------------------
  1461. *}
  1462. Function countLeadingZeros32( a:bits32 ): int8;
  1463. const countLeadingZerosHigh:array[0..255] of int8 = (
  1464. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1465. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1466. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1467. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1468. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1469. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1470. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1471. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1472. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1473. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1474. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1475. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1476. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1477. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1478. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1479. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1480. );
  1481. Var
  1482. shiftCount: int8;
  1483. Begin
  1484. shiftCount := 0;
  1485. if ( a < $10000 ) then
  1486. Begin
  1487. shiftCount := shiftcount + 16;
  1488. a := a shl 16;
  1489. End;
  1490. if ( a < $1000000 ) then
  1491. Begin
  1492. shiftCount := shiftcount + 8;
  1493. a := a shl 8;
  1494. end;
  1495. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1496. countLeadingZeros32:= shiftCount;
  1497. End;
  1498. {*----------------------------------------------------------------------------
  1499. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1500. | `a'. If `a' is zero, 64 is returned.
  1501. *----------------------------------------------------------------------------*}
  1502. function countLeadingZeros64( a : bits64): int8;
  1503. var
  1504. shiftcount : int8;
  1505. Begin
  1506. shiftCount := 0;
  1507. if ( a < (bits64(1) shl 32 )) then
  1508. shiftCount := shiftcount + 32
  1509. else
  1510. a := a shr 32;
  1511. shiftCount := shiftCount + countLeadingZeros32( a );
  1512. countLeadingZeros64:= shiftCount;
  1513. End;
  1514. {*
  1515. -------------------------------------------------------------------------------
  1516. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1517. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1518. returns 0.
  1519. -------------------------------------------------------------------------------
  1520. *}
  1521. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1522. Begin
  1523. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1524. End;
  1525. {*
  1526. -------------------------------------------------------------------------------
  1527. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1528. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1529. Otherwise, returns 0.
  1530. -------------------------------------------------------------------------------
  1531. *}
  1532. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1533. Begin
  1534. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1535. End;
  1536. {*
  1537. -------------------------------------------------------------------------------
  1538. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1539. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1540. returns 0.
  1541. -------------------------------------------------------------------------------
  1542. *}
  1543. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1544. Begin
  1545. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1546. End;
  1547. {*
  1548. -------------------------------------------------------------------------------
  1549. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1550. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1551. returns 0.
  1552. -------------------------------------------------------------------------------
  1553. *}
  1554. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1555. Begin
  1556. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1557. End;
  1558. const
  1559. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1560. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1561. (*****************************************************************************)
  1562. (* End Low-Level arithmetic *)
  1563. (*****************************************************************************)
  1564. {*
  1565. -------------------------------------------------------------------------------
  1566. Functions and definitions to determine: (1) whether tininess for underflow
  1567. is detected before or after rounding by default, (2) what (if anything)
  1568. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1569. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1570. are propagated from function inputs to output. These details are ENDIAN
  1571. specific
  1572. -------------------------------------------------------------------------------
  1573. *}
  1574. {$IFDEF ENDIAN_LITTLE}
  1575. {*
  1576. -------------------------------------------------------------------------------
  1577. Internal canonical NaN format.
  1578. -------------------------------------------------------------------------------
  1579. *}
  1580. TYPE
  1581. commonNaNT = packed record
  1582. sign: flag;
  1583. high, low : bits32;
  1584. end;
  1585. {*
  1586. -------------------------------------------------------------------------------
  1587. The pattern for a default generated single-precision NaN.
  1588. -------------------------------------------------------------------------------
  1589. *}
  1590. const float32_default_nan = $FFC00000;
  1591. {*
  1592. -------------------------------------------------------------------------------
  1593. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1594. otherwise returns 0.
  1595. -------------------------------------------------------------------------------
  1596. *}
  1597. Function float32_is_nan( a : float32 ): flag;
  1598. Begin
  1599. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1600. End;
  1601. {*
  1602. -------------------------------------------------------------------------------
  1603. Returns 1 if the single-precision floating-point value `a' is a signaling
  1604. NaN; otherwise returns 0.
  1605. -------------------------------------------------------------------------------
  1606. *}
  1607. Function float32_is_signaling_nan( a : float32 ): flag;
  1608. Begin
  1609. float32_is_signaling_nan := flag
  1610. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1611. End;
  1612. {*
  1613. -------------------------------------------------------------------------------
  1614. Returns the result of converting the single-precision floating-point NaN
  1615. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1616. exception is raised.
  1617. -------------------------------------------------------------------------------
  1618. *}
  1619. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1620. var
  1621. z : commonNaNT ;
  1622. Begin
  1623. if ( float32_is_signaling_nan( a ) <> 0) then
  1624. float_raise( float_flag_invalid );
  1625. z.sign := a shr 31;
  1626. z.low := 0;
  1627. z.high := a shl 9;
  1628. c := z;
  1629. End;
  1630. {*
  1631. -------------------------------------------------------------------------------
  1632. Returns the result of converting the canonical NaN `a' to the single-
  1633. precision floating-point format.
  1634. -------------------------------------------------------------------------------
  1635. *}
  1636. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1637. Begin
  1638. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1639. End;
  1640. {*
  1641. -------------------------------------------------------------------------------
  1642. Takes two single-precision floating-point values `a' and `b', one of which
  1643. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1644. signaling NaN, the invalid exception is raised.
  1645. -------------------------------------------------------------------------------
  1646. *}
  1647. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1648. Var
  1649. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1650. label returnLargerSignificand;
  1651. Begin
  1652. aIsNaN := float32_is_nan( a );
  1653. aIsSignalingNaN := float32_is_signaling_nan( a );
  1654. bIsNaN := float32_is_nan( b );
  1655. bIsSignalingNaN := float32_is_signaling_nan( b );
  1656. a := a or $00400000;
  1657. b := b or $00400000;
  1658. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1659. float_raise( float_flag_invalid );
  1660. if ( aIsSignalingNaN )<> 0 then
  1661. Begin
  1662. if ( bIsSignalingNaN ) <> 0 then
  1663. goto returnLargerSignificand;
  1664. if bIsNan <> 0 then
  1665. propagateFloat32NaN := b
  1666. else
  1667. propagateFloat32NaN := a;
  1668. exit;
  1669. End
  1670. else if ( aIsNaN <> 0) then
  1671. Begin
  1672. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1673. Begin
  1674. propagateFloat32NaN := a;
  1675. exit;
  1676. End;
  1677. returnLargerSignificand:
  1678. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1679. Begin
  1680. propagateFloat32NaN := b;
  1681. exit;
  1682. End;
  1683. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1684. Begin
  1685. propagateFloat32NaN := a;
  1686. End;
  1687. if a < b then
  1688. propagateFloat32NaN := a
  1689. else
  1690. propagateFloat32NaN := b;
  1691. exit;
  1692. End
  1693. else
  1694. Begin
  1695. propagateFloat32NaN := b;
  1696. exit;
  1697. End;
  1698. End;
  1699. {*
  1700. -------------------------------------------------------------------------------
  1701. The pattern for a default generated double-precision NaN. The `high' and
  1702. `low' values hold the most- and least-significant bits, respectively.
  1703. -------------------------------------------------------------------------------
  1704. *}
  1705. const
  1706. float64_default_nan_high = $FFF80000;
  1707. float64_default_nan_low = $00000000;
  1708. {*
  1709. -------------------------------------------------------------------------------
  1710. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1711. otherwise returns 0.
  1712. -------------------------------------------------------------------------------
  1713. *}
  1714. Function float64_is_nan( a : float64 ) : flag;
  1715. Begin
  1716. float64_is_nan :=
  1717. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1718. and ( a.low or ( a.high and $000FFFFF ) );
  1719. End;
  1720. {*
  1721. -------------------------------------------------------------------------------
  1722. Returns 1 if the double-precision floating-point value `a' is a signaling
  1723. NaN; otherwise returns 0.
  1724. -------------------------------------------------------------------------------
  1725. *}
  1726. Function float64_is_signaling_nan( a : float64 ): flag;
  1727. Begin
  1728. float64_is_signaling_nan :=
  1729. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1730. and ( a.low or ( a.high and $0007FFFF ) );
  1731. End;
  1732. {*
  1733. -------------------------------------------------------------------------------
  1734. Returns the result of converting the double-precision floating-point NaN
  1735. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1736. exception is raised.
  1737. -------------------------------------------------------------------------------
  1738. *}
  1739. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1740. Var
  1741. z : commonNaNT;
  1742. Begin
  1743. if ( float64_is_signaling_nan( a )<>0 ) then
  1744. float_raise( float_flag_invalid );
  1745. z.sign := a.high shr 31;
  1746. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1747. c := z;
  1748. End;
  1749. {*
  1750. -------------------------------------------------------------------------------
  1751. Returns the result of converting the canonical NaN `a' to the double-
  1752. precision floating-point format.
  1753. -------------------------------------------------------------------------------
  1754. *}
  1755. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1756. Var
  1757. z: float64;
  1758. Begin
  1759. shift64Right( a.high, a.low, 12, z.high, z.low );
  1760. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1761. c := z;
  1762. End;
  1763. {*
  1764. -------------------------------------------------------------------------------
  1765. Takes two double-precision floating-point values `a' and `b', one of which
  1766. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1767. signaling NaN, the invalid exception is raised.
  1768. -------------------------------------------------------------------------------
  1769. *}
  1770. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1771. Var
  1772. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1773. label returnLargerSignificand;
  1774. Begin
  1775. aIsNaN := float64_is_nan( a );
  1776. aIsSignalingNaN := float64_is_signaling_nan( a );
  1777. bIsNaN := float64_is_nan( b );
  1778. bIsSignalingNaN := float64_is_signaling_nan( b );
  1779. a.high := a.high or $00080000;
  1780. b.high := b.high or $00080000;
  1781. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1782. float_raise( float_flag_invalid );
  1783. if ( aIsSignalingNaN )<>0 then
  1784. Begin
  1785. if ( bIsSignalingNaN )<>0 then
  1786. goto returnLargerSignificand;
  1787. if bIsNan <> 0 then
  1788. c := b
  1789. else
  1790. c := a;
  1791. exit;
  1792. End
  1793. else if ( aIsNaN )<> 0 then
  1794. Begin
  1795. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1796. Begin
  1797. c := a;
  1798. exit;
  1799. End;
  1800. returnLargerSignificand:
  1801. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1802. Begin
  1803. c := b;
  1804. exit;
  1805. End;
  1806. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1807. Begin
  1808. c := a;
  1809. exit;
  1810. End;
  1811. if a.high < b.high then
  1812. c := a
  1813. else
  1814. c := b;
  1815. exit;
  1816. End
  1817. else
  1818. Begin
  1819. c := b;
  1820. exit;
  1821. End;
  1822. End;
  1823. {*----------------------------------------------------------------------------
  1824. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1825. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1826. | returns 0.
  1827. *----------------------------------------------------------------------------*}
  1828. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1829. begin
  1830. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1831. end;
  1832. {*----------------------------------------------------------------------------
  1833. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1834. | otherwise returns 0.
  1835. *----------------------------------------------------------------------------*}
  1836. function float128_is_nan( a : float128): flag;
  1837. begin
  1838. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1839. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1840. end;
  1841. {*----------------------------------------------------------------------------
  1842. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1843. | signaling NaN; otherwise returns 0.
  1844. *----------------------------------------------------------------------------*}
  1845. function float128_is_signaling_nan( a : float128): flag;
  1846. begin
  1847. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1848. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1849. end;
  1850. {*----------------------------------------------------------------------------
  1851. | Returns the result of converting the quadruple-precision floating-point NaN
  1852. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1853. | exception is raised.
  1854. *----------------------------------------------------------------------------*}
  1855. function float128ToCommonNaN( a : float128): commonNaNT;
  1856. var
  1857. z: commonNaNT;
  1858. qhigh,qlow : qword;
  1859. begin
  1860. if ( float128_is_signaling_nan( a )<>0) then
  1861. float_raise( float_flag_invalid );
  1862. z.sign := a.high shr 63;
  1863. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1864. z.high:=qhigh shr 32;
  1865. z.low:=qhigh and $ffffffff;
  1866. result:=z;
  1867. end;
  1868. {*----------------------------------------------------------------------------
  1869. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1870. | precision floating-point format.
  1871. *----------------------------------------------------------------------------*}
  1872. function commonNaNToFloat128( a : commonNaNT): float128;
  1873. var
  1874. z: float128;
  1875. begin
  1876. shift128Right( a.high, a.low, 16, z.high, z.low );
  1877. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1878. result:=z;
  1879. end;
  1880. {*----------------------------------------------------------------------------
  1881. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1882. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1883. | `b' is a signaling NaN, the invalid exception is raised.
  1884. *----------------------------------------------------------------------------*}
  1885. function propagateFloat128NaN( a: float128; b : float128): float128;
  1886. var
  1887. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1888. label
  1889. returnLargerSignificand;
  1890. begin
  1891. aIsNaN := float128_is_nan( a );
  1892. aIsSignalingNaN := float128_is_signaling_nan( a );
  1893. bIsNaN := float128_is_nan( b );
  1894. bIsSignalingNaN := float128_is_signaling_nan( b );
  1895. a.high := a.high or int64( $0000800000000000 );
  1896. b.high := b.high or int64( $0000800000000000 );
  1897. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1898. float_raise( float_flag_invalid );
  1899. if ( aIsSignalingNaN )<>0 then
  1900. begin
  1901. if ( bIsSignalingNaN )<>0 then
  1902. goto returnLargerSignificand;
  1903. if bIsNaN<>0 then
  1904. result := b
  1905. else
  1906. result := a;
  1907. exit;
  1908. end
  1909. else if ( aIsNaN )<>0 then
  1910. begin
  1911. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1912. begin
  1913. result := a;
  1914. exit;
  1915. end;
  1916. returnLargerSignificand:
  1917. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1918. begin
  1919. result := b;
  1920. exit;
  1921. end;
  1922. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1923. begin
  1924. result := a;
  1925. exit
  1926. end;
  1927. if ( a.high < b.high ) then
  1928. result := a
  1929. else
  1930. result := b;
  1931. exit;
  1932. end
  1933. else
  1934. result:=b;
  1935. end;
  1936. {$ELSE}
  1937. { Big endian code }
  1938. (*----------------------------------------------------------------------------
  1939. | Internal canonical NaN format.
  1940. *----------------------------------------------------------------------------*)
  1941. type
  1942. commonNANT = packed record
  1943. sign : flag;
  1944. high, low : bits32;
  1945. end;
  1946. (*----------------------------------------------------------------------------
  1947. | The pattern for a default generated single-precision NaN.
  1948. *----------------------------------------------------------------------------*)
  1949. const float32_default_nan = $7FFFFFFF;
  1950. (*----------------------------------------------------------------------------
  1951. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1952. | otherwise returns 0.
  1953. *----------------------------------------------------------------------------*)
  1954. function float32_is_nan(a: float32): flag;
  1955. begin
  1956. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1957. end;
  1958. (*----------------------------------------------------------------------------
  1959. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1960. | NaN; otherwise returns 0.
  1961. *----------------------------------------------------------------------------*)
  1962. function float32_is_signaling_nan(a: float32):flag;
  1963. begin
  1964. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1965. end;
  1966. (*----------------------------------------------------------------------------
  1967. | Returns the result of converting the single-precision floating-point NaN
  1968. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1969. | exception is raised.
  1970. *----------------------------------------------------------------------------*)
  1971. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1972. var
  1973. z: commonNANT;
  1974. begin
  1975. if float32_is_signaling_nan(a)<>0 then
  1976. float_raise(float_flag_invalid);
  1977. z.sign := a shr 31;
  1978. z.low := 0;
  1979. z.high := a shl 9;
  1980. c:=z;
  1981. end;
  1982. (*----------------------------------------------------------------------------
  1983. | Returns the result of converting the canonical NaN `a' to the single-
  1984. | precision floating-point format.
  1985. *----------------------------------------------------------------------------*)
  1986. function CommonNanToFloat32(a : CommonNaNT): float32;
  1987. begin
  1988. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1989. end;
  1990. (*----------------------------------------------------------------------------
  1991. | Takes two single-precision floating-point values `a' and `b', one of which
  1992. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1993. | signaling NaN, the invalid exception is raised.
  1994. *----------------------------------------------------------------------------*)
  1995. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1996. var
  1997. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1998. begin
  1999. aIsNaN := float32_is_nan( a );
  2000. aIsSignalingNaN := float32_is_signaling_nan( a );
  2001. bIsNaN := float32_is_nan( b );
  2002. bIsSignalingNaN := float32_is_signaling_nan( b );
  2003. a := a or $00400000;
  2004. b := b or $00400000;
  2005. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2006. float_raise( float_flag_invalid );
  2007. if bIsSignalingNaN<>0 then
  2008. propagateFloat32Nan := b
  2009. else if aIsSignalingNan<>0 then
  2010. propagateFloat32Nan := a
  2011. else if bIsNan<>0 then
  2012. propagateFloat32Nan := b
  2013. else
  2014. propagateFloat32Nan := a;
  2015. end;
  2016. (*----------------------------------------------------------------------------
  2017. | The pattern for a default generated double-precision NaN. The `high' and
  2018. | `low' values hold the most- and least-significant bits, respectively.
  2019. *----------------------------------------------------------------------------*)
  2020. const
  2021. float64_default_nan_high = $7FFFFFFF;
  2022. float64_default_nan_low = $FFFFFFFF;
  2023. (*----------------------------------------------------------------------------
  2024. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2025. | otherwise returns 0.
  2026. *----------------------------------------------------------------------------*)
  2027. function float64_is_nan(a: float64): flag;
  2028. begin
  2029. float64_is_nan := flag (
  2030. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2031. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2032. end;
  2033. (*----------------------------------------------------------------------------
  2034. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2035. | NaN; otherwise returns 0.
  2036. *----------------------------------------------------------------------------*)
  2037. function float64_is_signaling_nan( a:float64): flag;
  2038. begin
  2039. float64_is_signaling_nan := flag(
  2040. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2041. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2042. end;
  2043. (*----------------------------------------------------------------------------
  2044. | Returns the result of converting the double-precision floating-point NaN
  2045. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2046. | exception is raised.
  2047. *----------------------------------------------------------------------------*)
  2048. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2049. var
  2050. z : commonNaNT;
  2051. begin
  2052. if ( float64_is_signaling_nan( a )<>0 ) then
  2053. float_raise( float_flag_invalid );
  2054. z.sign := a.high shr 31;
  2055. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2056. c:=z;
  2057. end;
  2058. (*----------------------------------------------------------------------------
  2059. | Returns the result of converting the canonical NaN `a' to the double-
  2060. | precision floating-point format.
  2061. *----------------------------------------------------------------------------*)
  2062. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2063. var
  2064. z: float64;
  2065. begin
  2066. shift64Right( a.high, a.low, 12, z.high, z.low );
  2067. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2068. c:=z;
  2069. end;
  2070. (*----------------------------------------------------------------------------
  2071. | Takes two double-precision floating-point values `a' and `b', one of which
  2072. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2073. | signaling NaN, the invalid exception is raised.
  2074. *----------------------------------------------------------------------------*)
  2075. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2076. var
  2077. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2078. begin
  2079. aIsNaN := float64_is_nan( a );
  2080. aIsSignalingNaN := float64_is_signaling_nan( a );
  2081. bIsNaN := float64_is_nan( b );
  2082. bIsSignalingNaN := float64_is_signaling_nan( b );
  2083. a.high := a.high or $00080000;
  2084. b.high := b.high or $00080000;
  2085. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2086. float_raise( float_flag_invalid );
  2087. if bIsSignalingNaN<>0 then
  2088. c := b
  2089. else if aIsSignalingNan<>0 then
  2090. c := a
  2091. else if bIsNan<>0 then
  2092. c := b
  2093. else
  2094. c := a;
  2095. end;
  2096. {$ENDIF}
  2097. (****************************************************************************)
  2098. (* END ENDIAN SPECIFIC CODE *)
  2099. (****************************************************************************)
  2100. {*
  2101. -------------------------------------------------------------------------------
  2102. Returns the fraction bits of the single-precision floating-point value `a'.
  2103. -------------------------------------------------------------------------------
  2104. *}
  2105. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2106. Begin
  2107. ExtractFloat32Frac := A AND $007FFFFF;
  2108. End;
  2109. {*
  2110. -------------------------------------------------------------------------------
  2111. Returns the exponent bits of the single-precision floating-point value `a'.
  2112. -------------------------------------------------------------------------------
  2113. *}
  2114. Function extractFloat32Exp( a: float32 ): Int16;
  2115. Begin
  2116. extractFloat32Exp := (a shr 23) AND $FF;
  2117. End;
  2118. {*
  2119. -------------------------------------------------------------------------------
  2120. Returns the sign bit of the single-precision floating-point value `a'.
  2121. -------------------------------------------------------------------------------
  2122. *}
  2123. Function extractFloat32Sign( a: float32 ): Flag;
  2124. Begin
  2125. extractFloat32Sign := a shr 31;
  2126. End;
  2127. {*
  2128. -------------------------------------------------------------------------------
  2129. Normalizes the subnormal single-precision floating-point value represented
  2130. by the denormalized significand `aSig'. The normalized exponent and
  2131. significand are stored at the locations pointed to by `zExpPtr' and
  2132. `zSigPtr', respectively.
  2133. -------------------------------------------------------------------------------
  2134. *}
  2135. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2136. Var
  2137. ShiftCount : BYTE;
  2138. Begin
  2139. shiftCount := countLeadingZeros32( aSig ) - 8;
  2140. zSigPtr := aSig shl shiftCount;
  2141. zExpPtr := 1 - shiftCount;
  2142. End;
  2143. {*
  2144. -------------------------------------------------------------------------------
  2145. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2146. single-precision floating-point value, returning the result. After being
  2147. shifted into the proper positions, the three fields are simply added
  2148. together to form the result. This means that any integer portion of `zSig'
  2149. will be added into the exponent. Since a properly normalized significand
  2150. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2151. than the desired result exponent whenever `zSig' is a complete, normalized
  2152. significand.
  2153. -------------------------------------------------------------------------------
  2154. *}
  2155. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2156. Begin
  2157. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2158. + zSig;
  2159. End;
  2160. {*
  2161. -------------------------------------------------------------------------------
  2162. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2163. and significand `zSig', and returns the proper single-precision floating-
  2164. point value corresponding to the abstract input. Ordinarily, the abstract
  2165. value is simply rounded and packed into the single-precision format, with
  2166. the inexact exception raised if the abstract input cannot be represented
  2167. exactly. However, if the abstract value is too large, the overflow and
  2168. inexact exceptions are raised and an infinity or maximal finite value is
  2169. returned. If the abstract value is too small, the input value is rounded to
  2170. a subnormal number, and the underflow and inexact exceptions are raised if
  2171. the abstract input cannot be represented exactly as a subnormal single-
  2172. precision floating-point number.
  2173. The input significand `zSig' has its binary point between bits 30
  2174. and 29, which is 7 bits to the left of the usual location. This shifted
  2175. significand must be normalized or smaller. If `zSig' is not normalized,
  2176. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2177. and it must not require rounding. In the usual case that `zSig' is
  2178. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2179. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2180. Binary Floating-Point Arithmetic.
  2181. -------------------------------------------------------------------------------
  2182. *}
  2183. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2184. Var
  2185. roundingMode : BYTE;
  2186. roundNearestEven : Flag;
  2187. roundIncrement, roundBits : BYTE;
  2188. IsTiny : Flag;
  2189. Begin
  2190. roundingMode := float_rounding_mode;
  2191. if (roundingMode = float_round_nearest_even) then
  2192. Begin
  2193. roundNearestEven := Flag(TRUE);
  2194. end
  2195. else
  2196. roundNearestEven := Flag(FALSE);
  2197. roundIncrement := $40;
  2198. if ( Boolean(roundNearestEven) = FALSE) then
  2199. Begin
  2200. if ( roundingMode = float_round_to_zero ) Then
  2201. Begin
  2202. roundIncrement := 0;
  2203. End
  2204. else
  2205. Begin
  2206. roundIncrement := $7F;
  2207. if ( zSign <> 0 ) then
  2208. Begin
  2209. if roundingMode = float_round_up then roundIncrement := 0;
  2210. End
  2211. else
  2212. Begin
  2213. if roundingMode = float_round_down then roundIncrement := 0;
  2214. End;
  2215. End
  2216. End;
  2217. roundBits := zSig AND $7F;
  2218. if ($FD <= bits16 (zExp) ) then
  2219. Begin
  2220. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2221. Begin
  2222. float_raise( float_flag_overflow OR float_flag_inexact );
  2223. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2224. exit;
  2225. End;
  2226. if ( zExp < 0 ) then
  2227. Begin
  2228. isTiny :=
  2229. flag(( float_detect_tininess = float_tininess_before_rounding )
  2230. OR ( zExp < -1 )
  2231. OR ( (zSig + roundIncrement) < $80000000 ));
  2232. shift32RightJamming( zSig, - zExp, zSig );
  2233. zExp := 0;
  2234. roundBits := zSig AND $7F;
  2235. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2236. float_raise( float_flag_underflow );
  2237. End;
  2238. End;
  2239. if ( roundBits )<> 0 then
  2240. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2241. zSig := ( zSig + roundIncrement ) shr 7;
  2242. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2243. if ( zSig = 0 ) then zExp := 0;
  2244. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2245. exit;
  2246. End;
  2247. {*
  2248. -------------------------------------------------------------------------------
  2249. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2250. and significand `zSig', and returns the proper single-precision floating-
  2251. point value corresponding to the abstract input. This routine is just like
  2252. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2253. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2254. floating-point exponent.
  2255. -------------------------------------------------------------------------------
  2256. *}
  2257. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2258. Var
  2259. ShiftCount : int8;
  2260. Begin
  2261. shiftCount := countLeadingZeros32( zSig ) - 1;
  2262. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2263. End;
  2264. {*
  2265. -------------------------------------------------------------------------------
  2266. Returns the most-significant 20 fraction bits of the double-precision
  2267. floating-point value `a'.
  2268. -------------------------------------------------------------------------------
  2269. *}
  2270. Function extractFloat64Frac0(a: float64): bits32;
  2271. Begin
  2272. extractFloat64Frac0 := a.high and $000FFFFF;
  2273. End;
  2274. {*
  2275. -------------------------------------------------------------------------------
  2276. Returns the least-significant 32 fraction bits of the double-precision
  2277. floating-point value `a'.
  2278. -------------------------------------------------------------------------------
  2279. *}
  2280. Function extractFloat64Frac1(a: float64): bits32;
  2281. Begin
  2282. extractFloat64Frac1 := a.low;
  2283. End;
  2284. {*
  2285. -------------------------------------------------------------------------------
  2286. Returns the exponent bits of the double-precision floating-point value `a'.
  2287. -------------------------------------------------------------------------------
  2288. *}
  2289. Function extractFloat64Exp(a: float64): int16;
  2290. Begin
  2291. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2292. End;
  2293. {*
  2294. -------------------------------------------------------------------------------
  2295. Returns the sign bit of the double-precision floating-point value `a'.
  2296. -------------------------------------------------------------------------------
  2297. *}
  2298. Function extractFloat64Sign(a: float64) : flag;
  2299. Begin
  2300. extractFloat64Sign := a.high shr 31;
  2301. End;
  2302. {*
  2303. -------------------------------------------------------------------------------
  2304. Normalizes the subnormal double-precision floating-point value represented
  2305. by the denormalized significand formed by the concatenation of `aSig0' and
  2306. `aSig1'. The normalized exponent is stored at the location pointed to by
  2307. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2308. stored at the location pointed to by `zSig0Ptr', and the least significant
  2309. 32 bits of the normalized significand are stored at the location pointed to
  2310. by `zSig1Ptr'.
  2311. -------------------------------------------------------------------------------
  2312. *}
  2313. Procedure normalizeFloat64Subnormal(
  2314. aSig0: bits32;
  2315. aSig1: bits32;
  2316. VAR zExpPtr : Int16;
  2317. VAR zSig0Ptr : Bits32;
  2318. VAR zSig1Ptr : Bits32
  2319. );
  2320. Var
  2321. ShiftCount : Int8;
  2322. Begin
  2323. if ( aSig0 = 0 ) then
  2324. Begin
  2325. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2326. if ( shiftCount < 0 ) then
  2327. Begin
  2328. zSig0Ptr := aSig1 shr ( - shiftCount );
  2329. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2330. End
  2331. else
  2332. Begin
  2333. zSig0Ptr := aSig1 shl shiftCount;
  2334. zSig1Ptr := 0;
  2335. End;
  2336. zExpPtr := - shiftCount - 31;
  2337. End
  2338. else
  2339. Begin
  2340. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2341. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2342. zExpPtr := 1 - shiftCount;
  2343. End;
  2344. End;
  2345. {*
  2346. -------------------------------------------------------------------------------
  2347. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2348. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2349. point value, returning the result. After being shifted into the proper
  2350. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2351. together to form the most significant 32 bits of the result. This means
  2352. that any integer portion of `zSig0' will be added into the exponent. Since
  2353. a properly normalized significand will have an integer portion equal to 1,
  2354. the `zExp' input should be 1 less than the desired result exponent whenever
  2355. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2356. -------------------------------------------------------------------------------
  2357. *}
  2358. Procedure
  2359. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2360. var
  2361. z: Float64;
  2362. Begin
  2363. z.low := zSig1;
  2364. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2365. c := z;
  2366. End;
  2367. {*----------------------------------------------------------------------------
  2368. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2369. | double-precision floating-point value, returning the result. After being
  2370. | shifted into the proper positions, the three fields are simply added
  2371. | together to form the result. This means that any integer portion of `zSig'
  2372. | will be added into the exponent. Since a properly normalized significand
  2373. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2374. | than the desired result exponent whenever `zSig' is a complete, normalized
  2375. | significand.
  2376. *----------------------------------------------------------------------------*}
  2377. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2378. begin
  2379. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2380. end;
  2381. {*
  2382. -------------------------------------------------------------------------------
  2383. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2384. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2385. and `zSig2', and returns the proper double-precision floating-point value
  2386. corresponding to the abstract input. Ordinarily, the abstract value is
  2387. simply rounded and packed into the double-precision format, with the inexact
  2388. exception raised if the abstract input cannot be represented exactly.
  2389. However, if the abstract value is too large, the overflow and inexact
  2390. exceptions are raised and an infinity or maximal finite value is returned.
  2391. If the abstract value is too small, the input value is rounded to a
  2392. subnormal number, and the underflow and inexact exceptions are raised if the
  2393. abstract input cannot be represented exactly as a subnormal double-precision
  2394. floating-point number.
  2395. The input significand must be normalized or smaller. If the input
  2396. significand is not normalized, `zExp' must be 0; in that case, the result
  2397. returned is a subnormal number, and it must not require rounding. In the
  2398. usual case that the input significand is normalized, `zExp' must be 1 less
  2399. than the ``true'' floating-point exponent. The handling of underflow and
  2400. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2401. -------------------------------------------------------------------------------
  2402. *}
  2403. Procedure
  2404. roundAndPackFloat64(
  2405. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2406. Var
  2407. roundingMode : Int8;
  2408. roundNearestEven, increment, isTiny : Flag;
  2409. Begin
  2410. roundingMode := float_rounding_mode;
  2411. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2412. increment := flag( sbits32 (zSig2) < 0 );
  2413. if ( roundNearestEven = flag(FALSE) ) then
  2414. Begin
  2415. if ( roundingMode = float_round_to_zero ) then
  2416. increment := 0
  2417. else
  2418. Begin
  2419. if ( zSign )<> 0 then
  2420. Begin
  2421. increment := flag( roundingMode = float_round_down ) and zSig2;
  2422. End
  2423. else
  2424. Begin
  2425. increment := flag( roundingMode = float_round_up ) and zSig2;
  2426. End
  2427. End
  2428. End;
  2429. if ( $7FD <= bits16 (zExp) ) then
  2430. Begin
  2431. if (( $7FD < zExp )
  2432. or (( zExp = $7FD )
  2433. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2434. and (increment<>0)
  2435. )
  2436. ) then
  2437. Begin
  2438. float_raise( float_flag_overflow OR float_flag_inexact );
  2439. if (( roundingMode = float_round_to_zero )
  2440. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2441. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2442. ) then
  2443. Begin
  2444. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2445. exit;
  2446. End;
  2447. packFloat64( zSign, $7FF, 0, 0, c );
  2448. exit;
  2449. End;
  2450. if ( zExp < 0 ) then
  2451. Begin
  2452. isTiny :=
  2453. flag( float_detect_tininess = float_tininess_before_rounding )
  2454. or flag( zExp < -1 )
  2455. or flag(increment = 0)
  2456. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2457. shift64ExtraRightJamming(
  2458. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2459. zExp := 0;
  2460. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2461. if ( roundNearestEven )<>0 then
  2462. Begin
  2463. increment := flag( sbits32 (zSig2) < 0 );
  2464. End
  2465. else
  2466. Begin
  2467. if ( zSign )<>0 then
  2468. Begin
  2469. increment := flag( roundingMode = float_round_down ) and zSig2;
  2470. End
  2471. else
  2472. Begin
  2473. increment := flag( roundingMode = float_round_up ) and zSig2;
  2474. End
  2475. End;
  2476. End;
  2477. End;
  2478. if ( zSig2 )<>0 then
  2479. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2480. if ( increment )<>0 then
  2481. Begin
  2482. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2483. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2484. End
  2485. else
  2486. Begin
  2487. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2488. End;
  2489. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2490. End;
  2491. {*----------------------------------------------------------------------------
  2492. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2493. | and significand `zSig', and returns the proper double-precision floating-
  2494. | point value corresponding to the abstract input. Ordinarily, the abstract
  2495. | value is simply rounded and packed into the double-precision format, with
  2496. | the inexact exception raised if the abstract input cannot be represented
  2497. | exactly. However, if the abstract value is too large, the overflow and
  2498. | inexact exceptions are raised and an infinity or maximal finite value is
  2499. | returned. If the abstract value is too small, the input value is rounded
  2500. | to a subnormal number, and the underflow and inexact exceptions are raised
  2501. | if the abstract input cannot be represented exactly as a subnormal double-
  2502. | precision floating-point number.
  2503. | The input significand `zSig' has its binary point between bits 62
  2504. | and 61, which is 10 bits to the left of the usual location. This shifted
  2505. | significand must be normalized or smaller. If `zSig' is not normalized,
  2506. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2507. | and it must not require rounding. In the usual case that `zSig' is
  2508. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2509. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2510. | Binary Floating-Point Arithmetic.
  2511. *----------------------------------------------------------------------------*}
  2512. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2513. var
  2514. roundingMode: int8;
  2515. roundNearestEven: flag;
  2516. roundIncrement, roundBits: int16;
  2517. isTiny: flag;
  2518. begin
  2519. roundingMode := float_rounding_mode;
  2520. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2521. roundIncrement := $200;
  2522. if ( roundNearestEven=0 ) then
  2523. begin
  2524. if ( roundingMode = float_round_to_zero ) then
  2525. begin
  2526. roundIncrement := 0;
  2527. end
  2528. else begin
  2529. roundIncrement := $3FF;
  2530. if ( zSign<>0 ) then
  2531. begin
  2532. if ( roundingMode = float_round_up ) then
  2533. roundIncrement := 0;
  2534. end
  2535. else begin
  2536. if ( roundingMode = float_round_down ) then
  2537. roundIncrement := 0;
  2538. end
  2539. end
  2540. end;
  2541. roundBits := zSig and $3FF;
  2542. if ( $7FD <= bits16(zExp) ) then
  2543. begin
  2544. if ( ( $7FD < zExp )
  2545. or ( ( zExp = $7FD )
  2546. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2547. ) then
  2548. begin
  2549. float_raise( float_flag_overflow or float_flag_inexact );
  2550. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2551. exit;
  2552. end;
  2553. if ( zExp < 0 ) then
  2554. begin
  2555. isTiny := ord(
  2556. ( float_detect_tininess = float_tininess_before_rounding )
  2557. or ( zExp < -1 )
  2558. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2559. shift64RightJamming( zSig, - zExp, zSig );
  2560. zExp := 0;
  2561. roundBits := zSig and $3FF;
  2562. if ( isTiny and roundBits )<>0 then
  2563. float_raise( float_flag_underflow );
  2564. end
  2565. end;
  2566. if ( roundBits<>0 ) then
  2567. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2568. zSig := ( zSig + roundIncrement ) shr 10;
  2569. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2570. if ( zSig = 0 ) then
  2571. zExp := 0;
  2572. result:=packFloat64( zSign, zExp, zSig );
  2573. end;
  2574. {*
  2575. -------------------------------------------------------------------------------
  2576. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2577. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2578. returns the proper double-precision floating-point value corresponding
  2579. to the abstract input. This routine is just like `roundAndPackFloat64'
  2580. except that the input significand has fewer bits and does not have to be
  2581. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2582. point exponent.
  2583. -------------------------------------------------------------------------------
  2584. *}
  2585. Procedure
  2586. normalizeRoundAndPackFloat64(
  2587. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2588. Var
  2589. shiftCount : int8;
  2590. zSig2 : bits32;
  2591. Begin
  2592. if ( zSig0 = 0 ) then
  2593. Begin
  2594. zSig0 := zSig1;
  2595. zSig1 := 0;
  2596. zExp := zExp -32;
  2597. End;
  2598. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2599. if ( 0 <= shiftCount ) then
  2600. Begin
  2601. zSig2 := 0;
  2602. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2603. End
  2604. else
  2605. Begin
  2606. shift64ExtraRightJamming
  2607. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2608. End;
  2609. zExp := zExp - shiftCount;
  2610. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2611. End;
  2612. {*
  2613. -------------------------------------------------------------------------------
  2614. Returns the result of converting the 32-bit two's complement integer `a' to
  2615. the single-precision floating-point format. The conversion is performed
  2616. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2617. -------------------------------------------------------------------------------
  2618. *}
  2619. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2620. Var
  2621. zSign : Flag;
  2622. Begin
  2623. if ( a = 0 ) then
  2624. Begin
  2625. int32_to_float32.float32 := 0;
  2626. exit;
  2627. End;
  2628. if ( a = sbits32 ($80000000) ) then
  2629. Begin
  2630. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2631. exit;
  2632. end;
  2633. zSign := flag( a < 0 );
  2634. If zSign<>0 then
  2635. a := -a;
  2636. int32_to_float32.float32:=
  2637. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2638. End;
  2639. {*
  2640. -------------------------------------------------------------------------------
  2641. Returns the result of converting the 32-bit two's complement integer `a' to
  2642. the double-precision floating-point format. The conversion is performed
  2643. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2644. -------------------------------------------------------------------------------
  2645. *}
  2646. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2647. var
  2648. zSign : flag;
  2649. absA : bits32;
  2650. shiftCount : int8;
  2651. zSig0, zSig1 : bits32;
  2652. Begin
  2653. if ( a = 0 ) then
  2654. Begin
  2655. packFloat64( 0, 0, 0, 0, result );
  2656. exit;
  2657. end;
  2658. zSign := flag( a < 0 );
  2659. if ZSign<>0 then
  2660. AbsA := -a
  2661. else
  2662. AbsA := a;
  2663. shiftCount := countLeadingZeros32( absA ) - 11;
  2664. if ( 0 <= shiftCount ) then
  2665. Begin
  2666. zSig0 := absA shl shiftCount;
  2667. zSig1 := 0;
  2668. End
  2669. else
  2670. Begin
  2671. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2672. End;
  2673. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2674. End;
  2675. {*
  2676. -------------------------------------------------------------------------------
  2677. Returns the result of converting the single-precision floating-point value
  2678. `a' to the 32-bit two's complement integer format. The conversion is
  2679. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2680. Arithmetic---which means in particular that the conversion is rounded
  2681. according to the current rounding mode. If `a' is a NaN, the largest
  2682. positive integer is returned. Otherwise, if the conversion overflows, the
  2683. largest integer with the same sign as `a' is returned.
  2684. -------------------------------------------------------------------------------
  2685. *}
  2686. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2687. Var
  2688. aSign: flag;
  2689. aExp, shiftCount: int16;
  2690. aSig, aSigExtra: bits32;
  2691. z: int32;
  2692. roundingMode: int8;
  2693. Begin
  2694. aSig := extractFloat32Frac( a.float32 );
  2695. aExp := extractFloat32Exp( a.float32 );
  2696. aSign := extractFloat32Sign( a.float32 );
  2697. shiftCount := aExp - $96;
  2698. if ( 0 <= shiftCount ) then
  2699. Begin
  2700. if ( $9E <= aExp ) then
  2701. Begin
  2702. if ( a.float32 <> $CF000000 ) then
  2703. Begin
  2704. float_raise( float_flag_invalid );
  2705. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2706. Begin
  2707. float32_to_int32 := $7FFFFFFF;
  2708. exit;
  2709. End;
  2710. End;
  2711. float32_to_int32 := sbits32 ($80000000);
  2712. exit;
  2713. End;
  2714. z := ( aSig or $00800000 ) shl shiftCount;
  2715. if ( aSign<>0 ) then z := - z;
  2716. End
  2717. else
  2718. Begin
  2719. if ( aExp < $7E ) then
  2720. Begin
  2721. aSigExtra := aExp OR aSig;
  2722. z := 0;
  2723. End
  2724. else
  2725. Begin
  2726. aSig := aSig OR $00800000;
  2727. aSigExtra := aSig shl ( shiftCount and 31 );
  2728. z := aSig shr ( - shiftCount );
  2729. End;
  2730. if ( aSigExtra<>0 ) then
  2731. softfloat_exception_flags := softfloat_exception_flags
  2732. or float_flag_inexact;
  2733. roundingMode := float_rounding_mode;
  2734. if ( roundingMode = float_round_nearest_even ) then
  2735. Begin
  2736. if ( sbits32 (aSigExtra) < 0 ) then
  2737. Begin
  2738. Inc(z);
  2739. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2740. z := z and not 1;
  2741. End;
  2742. if ( aSign<>0 ) then
  2743. z := - z;
  2744. End
  2745. else
  2746. Begin
  2747. aSigExtra := flag( aSigExtra <> 0 );
  2748. if ( aSign<>0 ) then
  2749. Begin
  2750. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2751. z := - z;
  2752. End
  2753. else
  2754. Begin
  2755. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2756. End
  2757. End;
  2758. End;
  2759. float32_to_int32 := z;
  2760. End;
  2761. {*
  2762. -------------------------------------------------------------------------------
  2763. Returns the result of converting the single-precision floating-point value
  2764. `a' to the 32-bit two's complement integer format. The conversion is
  2765. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2766. Arithmetic, except that the conversion is always rounded toward zero.
  2767. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2768. the conversion overflows, the largest integer with the same sign as `a' is
  2769. returned.
  2770. -------------------------------------------------------------------------------
  2771. *}
  2772. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2773. Var
  2774. aSign : flag;
  2775. aExp, shiftCount : int16;
  2776. aSig : bits32;
  2777. z : int32;
  2778. Begin
  2779. aSig := extractFloat32Frac( a.float32 );
  2780. aExp := extractFloat32Exp( a.float32 );
  2781. aSign := extractFloat32Sign( a.float32 );
  2782. shiftCount := aExp - $9E;
  2783. if ( 0 <= shiftCount ) then
  2784. Begin
  2785. if ( a.float32 <> $CF000000 ) then
  2786. Begin
  2787. float_raise( float_flag_invalid );
  2788. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2789. Begin
  2790. float32_to_int32_round_to_zero := $7FFFFFFF;
  2791. exit;
  2792. end;
  2793. End;
  2794. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2795. exit;
  2796. End
  2797. else
  2798. if ( aExp <= $7E ) then
  2799. Begin
  2800. if ( aExp or aSig )<>0 then
  2801. softfloat_exception_flags :=
  2802. softfloat_exception_flags or float_flag_inexact;
  2803. float32_to_int32_round_to_zero := 0;
  2804. exit;
  2805. End;
  2806. aSig := ( aSig or $00800000 ) shl 8;
  2807. z := aSig shr ( - shiftCount );
  2808. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2809. Begin
  2810. softfloat_exception_flags :=
  2811. softfloat_exception_flags or float_flag_inexact;
  2812. End;
  2813. if ( aSign<>0 ) then z := - z;
  2814. float32_to_int32_round_to_zero := z;
  2815. End;
  2816. {*
  2817. -------------------------------------------------------------------------------
  2818. Returns the result of converting the single-precision floating-point value
  2819. `a' to the double-precision floating-point format. The conversion is
  2820. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2821. Arithmetic.
  2822. -------------------------------------------------------------------------------
  2823. *}
  2824. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2825. Var
  2826. aSign : flag;
  2827. aExp : int16;
  2828. aSig, zSig0, zSig1: bits32;
  2829. tmp : CommonNanT;
  2830. Begin
  2831. aSig := extractFloat32Frac( a.float32 );
  2832. aExp := extractFloat32Exp( a.float32 );
  2833. aSign := extractFloat32Sign( a.float32 );
  2834. if ( aExp = $FF ) then
  2835. Begin
  2836. if ( aSig<>0 ) then
  2837. Begin
  2838. float32ToCommonNaN(a.float32, tmp);
  2839. commonNaNToFloat64(tmp , result);
  2840. exit;
  2841. End;
  2842. packFloat64( aSign, $7FF, 0, 0, result);
  2843. exit;
  2844. End;
  2845. if ( aExp = 0 ) then
  2846. Begin
  2847. if ( aSig = 0 ) then
  2848. Begin
  2849. packFloat64( aSign, 0, 0, 0, result );
  2850. exit;
  2851. end;
  2852. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2853. Dec(aExp);
  2854. End;
  2855. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2856. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2857. End;
  2858. {*
  2859. -------------------------------------------------------------------------------
  2860. Rounds the single-precision floating-point value `a' to an integer,
  2861. and returns the result as a single-precision floating-point value. The
  2862. operation is performed according to the IEC/IEEE Standard for Binary
  2863. Floating-Point Arithmetic.
  2864. -------------------------------------------------------------------------------
  2865. *}
  2866. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2867. Var
  2868. aSign: flag;
  2869. aExp: int16;
  2870. lastBitMask, roundBitsMask: bits32;
  2871. roundingMode: int8;
  2872. z: float32;
  2873. Begin
  2874. aExp := extractFloat32Exp( a.float32 );
  2875. if ( $96 <= aExp ) then
  2876. Begin
  2877. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2878. Begin
  2879. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2880. exit;
  2881. End;
  2882. float32_round_to_int:=a;
  2883. exit;
  2884. End;
  2885. if ( aExp <= $7E ) then
  2886. Begin
  2887. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2888. Begin
  2889. float32_round_to_int:=a;
  2890. exit;
  2891. end;
  2892. softfloat_exception_flags
  2893. := softfloat_exception_flags OR float_flag_inexact;
  2894. aSign := extractFloat32Sign( a.float32 );
  2895. case ( float_rounding_mode ) of
  2896. float_round_nearest_even:
  2897. Begin
  2898. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2899. Begin
  2900. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2901. exit;
  2902. End;
  2903. End;
  2904. float_round_down:
  2905. Begin
  2906. if aSign <> 0 then
  2907. float32_round_to_int.float32 := $BF800000
  2908. else
  2909. float32_round_to_int.float32 := 0;
  2910. exit;
  2911. End;
  2912. float_round_up:
  2913. Begin
  2914. if aSign <> 0 then
  2915. float32_round_to_int.float32 := $80000000
  2916. else
  2917. float32_round_to_int.float32 := $3F800000;
  2918. exit;
  2919. End;
  2920. end;
  2921. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2922. End;
  2923. lastBitMask := 1;
  2924. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2925. lastBitMask := lastBitMask shl ($96 - aExp);
  2926. roundBitsMask := lastBitMask - 1;
  2927. z := a.float32;
  2928. roundingMode := float_rounding_mode;
  2929. if ( roundingMode = float_round_nearest_even ) then
  2930. Begin
  2931. z := z + (lastBitMask shr 1);
  2932. if ( ( z and roundBitsMask ) = 0 ) then
  2933. z := z and not lastBitMask;
  2934. End
  2935. else if ( roundingMode <> float_round_to_zero ) then
  2936. Begin
  2937. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2938. Begin
  2939. z := z + roundBitsMask;
  2940. End;
  2941. End;
  2942. z := z and not roundBitsMask;
  2943. if ( z <> a.float32 ) then
  2944. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2945. float32_round_to_int.float32 := z;
  2946. End;
  2947. {*
  2948. -------------------------------------------------------------------------------
  2949. Returns the result of adding the absolute values of the single-precision
  2950. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2951. before being returned. `zSign' is ignored if the result is a NaN.
  2952. The addition is performed according to the IEC/IEEE Standard for Binary
  2953. Floating-Point Arithmetic.
  2954. -------------------------------------------------------------------------------
  2955. *}
  2956. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2957. Var
  2958. aExp, bExp, zExp: int16;
  2959. aSig, bSig, zSig: bits32;
  2960. expDiff: int16;
  2961. label roundAndPack;
  2962. Begin
  2963. aSig:=extractFloat32Frac( a );
  2964. aExp:=extractFloat32Exp( a );
  2965. bSig:=extractFloat32Frac( b );
  2966. bExp := extractFloat32Exp( b );
  2967. expDiff := aExp - bExp;
  2968. aSig := aSig shl 6;
  2969. bSig := bSig shl 6;
  2970. if ( 0 < expDiff ) then
  2971. Begin
  2972. if ( aExp = $FF ) then
  2973. Begin
  2974. if ( aSig <> 0) then
  2975. Begin
  2976. addFloat32Sigs := propagateFloat32NaN( a, b );
  2977. exit;
  2978. End;
  2979. addFloat32Sigs := a;
  2980. exit;
  2981. End;
  2982. if ( bExp = 0 ) then
  2983. Begin
  2984. Dec(expDiff);
  2985. End
  2986. else
  2987. Begin
  2988. bSig := bSig or $20000000;
  2989. End;
  2990. shift32RightJamming( bSig, expDiff, bSig );
  2991. zExp := aExp;
  2992. End
  2993. else
  2994. If ( expDiff < 0 ) then
  2995. Begin
  2996. if ( bExp = $FF ) then
  2997. Begin
  2998. if ( bSig<>0 ) then
  2999. Begin
  3000. addFloat32Sigs := propagateFloat32NaN( a, b );
  3001. exit;
  3002. end;
  3003. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3004. exit;
  3005. End;
  3006. if ( aExp = 0 ) then
  3007. Begin
  3008. Inc(expDiff);
  3009. End
  3010. else
  3011. Begin
  3012. aSig := aSig OR $20000000;
  3013. End;
  3014. shift32RightJamming( aSig, - expDiff, aSig );
  3015. zExp := bExp;
  3016. End
  3017. else
  3018. Begin
  3019. if ( aExp = $FF ) then
  3020. Begin
  3021. if ( aSig OR bSig )<> 0 then
  3022. Begin
  3023. addFloat32Sigs := propagateFloat32NaN( a, b );
  3024. exit;
  3025. end;
  3026. addFloat32Sigs := a;
  3027. exit;
  3028. End;
  3029. if ( aExp = 0 ) then
  3030. Begin
  3031. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3032. exit;
  3033. end;
  3034. zSig := $40000000 + aSig + bSig;
  3035. zExp := aExp;
  3036. goto roundAndPack;
  3037. End;
  3038. aSig := aSig OR $20000000;
  3039. zSig := ( aSig + bSig ) shl 1;
  3040. Dec(zExp);
  3041. if ( sbits32 (zSig) < 0 ) then
  3042. Begin
  3043. zSig := aSig + bSig;
  3044. Inc(zExp);
  3045. End;
  3046. roundAndPack:
  3047. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3048. End;
  3049. {*
  3050. -------------------------------------------------------------------------------
  3051. Returns the result of subtracting the absolute values of the single-
  3052. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3053. difference is negated before being returned. `zSign' is ignored if the
  3054. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3055. Standard for Binary Floating-Point Arithmetic.
  3056. -------------------------------------------------------------------------------
  3057. *}
  3058. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3059. Var
  3060. aExp, bExp, zExp: int16;
  3061. aSig, bSig, zSig: bits32;
  3062. expDiff : int16;
  3063. label aExpBigger;
  3064. label bExpBigger;
  3065. label aBigger;
  3066. label bBigger;
  3067. label normalizeRoundAndPack;
  3068. Begin
  3069. aSig := extractFloat32Frac( a );
  3070. aExp := extractFloat32Exp( a );
  3071. bSig := extractFloat32Frac( b );
  3072. bExp := extractFloat32Exp( b );
  3073. expDiff := aExp - bExp;
  3074. aSig := aSig shl 7;
  3075. bSig := bSig shl 7;
  3076. if ( 0 < expDiff ) then goto aExpBigger;
  3077. if ( expDiff < 0 ) then goto bExpBigger;
  3078. if ( aExp = $FF ) then
  3079. Begin
  3080. if ( aSig OR bSig )<> 0 then
  3081. Begin
  3082. subFloat32Sigs := propagateFloat32NaN( a, b );
  3083. exit;
  3084. End;
  3085. float_raise( float_flag_invalid );
  3086. subFloat32Sigs := float32_default_nan;
  3087. exit;
  3088. End;
  3089. if ( aExp = 0 ) then
  3090. Begin
  3091. aExp := 1;
  3092. bExp := 1;
  3093. End;
  3094. if ( bSig < aSig ) Then goto aBigger;
  3095. if ( aSig < bSig ) Then goto bBigger;
  3096. subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
  3097. exit;
  3098. bExpBigger:
  3099. if ( bExp = $FF ) then
  3100. Begin
  3101. if ( bSig<>0 ) then
  3102. Begin
  3103. subFloat32Sigs := propagateFloat32NaN( a, b );
  3104. exit;
  3105. End;
  3106. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3107. exit;
  3108. End;
  3109. if ( aExp = 0 ) then
  3110. Begin
  3111. Inc(expDiff);
  3112. End
  3113. else
  3114. Begin
  3115. aSig := aSig OR $40000000;
  3116. End;
  3117. shift32RightJamming( aSig, - expDiff, aSig );
  3118. bSig := bSig OR $40000000;
  3119. bBigger:
  3120. zSig := bSig - aSig;
  3121. zExp := bExp;
  3122. zSign := zSign xor 1;
  3123. goto normalizeRoundAndPack;
  3124. aExpBigger:
  3125. if ( aExp = $FF ) then
  3126. Begin
  3127. if ( aSig <> 0) then
  3128. Begin
  3129. subFloat32Sigs := propagateFloat32NaN( a, b );
  3130. exit;
  3131. End;
  3132. subFloat32Sigs := a;
  3133. exit;
  3134. End;
  3135. if ( bExp = 0 ) then
  3136. Begin
  3137. Dec(expDiff);
  3138. End
  3139. else
  3140. Begin
  3141. bSig := bSig OR $40000000;
  3142. End;
  3143. shift32RightJamming( bSig, expDiff, bSig );
  3144. aSig := aSig OR $40000000;
  3145. aBigger:
  3146. zSig := aSig - bSig;
  3147. zExp := aExp;
  3148. normalizeRoundAndPack:
  3149. Dec(zExp);
  3150. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3151. End;
  3152. {*
  3153. -------------------------------------------------------------------------------
  3154. Returns the result of adding the single-precision floating-point values `a'
  3155. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3156. Binary Floating-Point Arithmetic.
  3157. -------------------------------------------------------------------------------
  3158. *}
  3159. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3160. Var
  3161. aSign, bSign: Flag;
  3162. Begin
  3163. aSign := extractFloat32Sign( a.float32 );
  3164. bSign := extractFloat32Sign( b.float32 );
  3165. if ( aSign = bSign ) then
  3166. Begin
  3167. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3168. End
  3169. else
  3170. Begin
  3171. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3172. End;
  3173. End;
  3174. {*
  3175. -------------------------------------------------------------------------------
  3176. Returns the result of subtracting the single-precision floating-point values
  3177. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3178. for Binary Floating-Point Arithmetic.
  3179. -------------------------------------------------------------------------------
  3180. *}
  3181. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3182. Var
  3183. aSign, bSign: flag;
  3184. Begin
  3185. aSign := extractFloat32Sign( a.float32 );
  3186. bSign := extractFloat32Sign( b.float32 );
  3187. if ( aSign = bSign ) then
  3188. Begin
  3189. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3190. End
  3191. else
  3192. Begin
  3193. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3194. End;
  3195. End;
  3196. {*
  3197. -------------------------------------------------------------------------------
  3198. Returns the result of multiplying the single-precision floating-point values
  3199. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3200. for Binary Floating-Point Arithmetic.
  3201. -------------------------------------------------------------------------------
  3202. *}
  3203. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3204. Var
  3205. aSign, bSign, zSign: flag;
  3206. aExp, bExp, zExp : int16;
  3207. aSig, bSig, zSig0, zSig1: bits32;
  3208. Begin
  3209. aSig := extractFloat32Frac( a.float32 );
  3210. aExp := extractFloat32Exp( a.float32 );
  3211. aSign := extractFloat32Sign( a.float32 );
  3212. bSig := extractFloat32Frac( b.float32 );
  3213. bExp := extractFloat32Exp( b.float32 );
  3214. bSign := extractFloat32Sign( b.float32 );
  3215. zSign := aSign xor bSign;
  3216. if ( aExp = $FF ) then
  3217. Begin
  3218. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3219. Begin
  3220. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3221. End;
  3222. if ( ( bExp OR bSig ) = 0 ) then
  3223. Begin
  3224. float_raise( float_flag_invalid );
  3225. float32_mul.float32 := float32_default_nan;
  3226. exit;
  3227. End;
  3228. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3229. exit;
  3230. End;
  3231. if ( bExp = $FF ) then
  3232. Begin
  3233. if ( bSig <> 0 ) then
  3234. Begin
  3235. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3236. exit;
  3237. End;
  3238. if ( ( aExp OR aSig ) = 0 ) then
  3239. Begin
  3240. float_raise( float_flag_invalid );
  3241. float32_mul.float32 := float32_default_nan;
  3242. exit;
  3243. End;
  3244. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3245. exit;
  3246. End;
  3247. if ( aExp = 0 ) then
  3248. Begin
  3249. if ( aSig = 0 ) then
  3250. Begin
  3251. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3252. exit;
  3253. End;
  3254. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3255. End;
  3256. if ( bExp = 0 ) then
  3257. Begin
  3258. if ( bSig = 0 ) then
  3259. Begin
  3260. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3261. exit;
  3262. End;
  3263. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3264. End;
  3265. zExp := aExp + bExp - $7F;
  3266. aSig := ( aSig OR $00800000 ) shl 7;
  3267. bSig := ( bSig OR $00800000 ) shl 8;
  3268. mul32To64( aSig, bSig, zSig0, zSig1 );
  3269. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3270. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3271. Begin
  3272. zSig0 := zSig0 shl 1;
  3273. Dec(zExp);
  3274. End;
  3275. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3276. End;
  3277. {*
  3278. -------------------------------------------------------------------------------
  3279. Returns the result of dividing the single-precision floating-point value `a'
  3280. by the corresponding value `b'. The operation is performed according to the
  3281. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3282. -------------------------------------------------------------------------------
  3283. *}
  3284. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3285. Var
  3286. aSign, bSign, zSign: flag;
  3287. aExp, bExp, zExp: int16;
  3288. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3289. Begin
  3290. aSig := extractFloat32Frac( a.float32 );
  3291. aExp := extractFloat32Exp( a.float32 );
  3292. aSign := extractFloat32Sign( a.float32 );
  3293. bSig := extractFloat32Frac( b.float32 );
  3294. bExp := extractFloat32Exp( b.float32 );
  3295. bSign := extractFloat32Sign( b.float32 );
  3296. zSign := aSign xor bSign;
  3297. if ( aExp = $FF ) then
  3298. Begin
  3299. if ( aSig <> 0 ) then
  3300. Begin
  3301. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3302. exit;
  3303. End;
  3304. if ( bExp = $FF ) then
  3305. Begin
  3306. if ( bSig <> 0) then
  3307. Begin
  3308. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3309. End;
  3310. float_raise( float_flag_invalid );
  3311. float32_div.float32 := float32_default_nan;
  3312. exit;
  3313. End;
  3314. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3315. exit;
  3316. End;
  3317. if ( bExp = $FF ) then
  3318. Begin
  3319. if ( bSig <> 0) then
  3320. Begin
  3321. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3322. exit;
  3323. End;
  3324. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3325. exit;
  3326. End;
  3327. if ( bExp = 0 ) Then
  3328. Begin
  3329. if ( bSig = 0 ) Then
  3330. Begin
  3331. if ( ( aExp OR aSig ) = 0 ) then
  3332. Begin
  3333. float_raise( float_flag_invalid );
  3334. float32_div.float32 := float32_default_nan;
  3335. exit;
  3336. End;
  3337. float_raise( float_flag_divbyzero );
  3338. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3339. exit;
  3340. End;
  3341. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3342. End;
  3343. if ( aExp = 0 ) Then
  3344. Begin
  3345. if ( aSig = 0 ) Then
  3346. Begin
  3347. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3348. exit;
  3349. End;
  3350. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3351. End;
  3352. zExp := aExp - bExp + $7D;
  3353. aSig := ( aSig OR $00800000 ) shl 7;
  3354. bSig := ( bSig OR $00800000 ) shl 8;
  3355. if ( bSig <= ( aSig + aSig ) ) then
  3356. Begin
  3357. aSig := aSig shr 1;
  3358. Inc(zExp);
  3359. End;
  3360. zSig := estimateDiv64To32( aSig, 0, bSig );
  3361. if ( ( zSig and $3F ) <= 2 ) then
  3362. Begin
  3363. mul32To64( bSig, zSig, term0, term1 );
  3364. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3365. while ( sbits32 (rem0) < 0 ) do
  3366. Begin
  3367. Dec(zSig);
  3368. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3369. End;
  3370. zSig := zSig or bits32( rem1 <> 0 );
  3371. End;
  3372. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3373. End;
  3374. {*
  3375. -------------------------------------------------------------------------------
  3376. Returns the remainder of the single-precision floating-point value `a'
  3377. with respect to the corresponding value `b'. The operation is performed
  3378. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3379. -------------------------------------------------------------------------------
  3380. *}
  3381. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3382. Var
  3383. aSign, bSign, zSign: flag;
  3384. aExp, bExp, expDiff: int16;
  3385. aSig, bSig, q, allZero, alternateASig: bits32;
  3386. sigMean: sbits32;
  3387. Begin
  3388. aSig := extractFloat32Frac( a.float32 );
  3389. aExp := extractFloat32Exp( a.float32 );
  3390. aSign := extractFloat32Sign( a.float32 );
  3391. bSig := extractFloat32Frac( b.float32 );
  3392. bExp := extractFloat32Exp( b.float32 );
  3393. bSign := extractFloat32Sign( b.float32 );
  3394. if ( aExp = $FF ) then
  3395. Begin
  3396. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3397. Begin
  3398. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3399. exit;
  3400. End;
  3401. float_raise( float_flag_invalid );
  3402. float32_rem.float32 := float32_default_nan;
  3403. exit;
  3404. End;
  3405. if ( bExp = $FF ) then
  3406. Begin
  3407. if ( bSig <> 0 ) then
  3408. Begin
  3409. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3410. exit;
  3411. End;
  3412. float32_rem := a;
  3413. exit;
  3414. End;
  3415. if ( bExp = 0 ) then
  3416. Begin
  3417. if ( bSig = 0 ) then
  3418. Begin
  3419. float_raise( float_flag_invalid );
  3420. float32_rem.float32 := float32_default_nan;
  3421. exit;
  3422. End;
  3423. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3424. End;
  3425. if ( aExp = 0 ) then
  3426. Begin
  3427. if ( aSig = 0 ) then
  3428. Begin
  3429. float32_rem := a;
  3430. exit;
  3431. End;
  3432. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3433. End;
  3434. expDiff := aExp - bExp;
  3435. aSig := ( aSig OR $00800000 ) shl 8;
  3436. bSig := ( bSig OR $00800000 ) shl 8;
  3437. if ( expDiff < 0 ) then
  3438. Begin
  3439. if ( expDiff < -1 ) then
  3440. Begin
  3441. float32_rem := a;
  3442. exit;
  3443. End;
  3444. aSig := aSig shr 1;
  3445. End;
  3446. q := bits32( bSig <= aSig );
  3447. if ( q <> 0) then
  3448. aSig := aSig - bSig;
  3449. expDiff := expDiff - 32;
  3450. while ( 0 < expDiff ) do
  3451. Begin
  3452. q := estimateDiv64To32( aSig, 0, bSig );
  3453. if (2 < q) then
  3454. q := q - 2
  3455. else
  3456. q := 0;
  3457. aSig := - ( ( bSig shr 2 ) * q );
  3458. expDiff := expDiff - 30;
  3459. End;
  3460. expDiff := expDiff + 32;
  3461. if ( 0 < expDiff ) then
  3462. Begin
  3463. q := estimateDiv64To32( aSig, 0, bSig );
  3464. if (2 < q) then
  3465. q := q - 2
  3466. else
  3467. q := 0;
  3468. q := q shr (32 - expDiff);
  3469. bSig := bSig shr 2;
  3470. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3471. End
  3472. else
  3473. Begin
  3474. aSig := aSig shr 2;
  3475. bSig := bSig shr 2;
  3476. End;
  3477. Repeat
  3478. alternateASig := aSig;
  3479. Inc(q);
  3480. aSig := aSig - bSig;
  3481. Until not ( 0 <= sbits32 (aSig) );
  3482. sigMean := aSig + alternateASig;
  3483. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3484. Begin
  3485. aSig := alternateASig;
  3486. End;
  3487. zSign := flag( sbits32 (aSig) < 0 );
  3488. if ( zSign<>0 ) then
  3489. aSig := - aSig;
  3490. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3491. End;
  3492. {*
  3493. -------------------------------------------------------------------------------
  3494. Returns the square root of the single-precision floating-point value `a'.
  3495. The operation is performed according to the IEC/IEEE Standard for Binary
  3496. Floating-Point Arithmetic.
  3497. -------------------------------------------------------------------------------
  3498. *}
  3499. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3500. Var
  3501. aSign : flag;
  3502. aExp, zExp : int16;
  3503. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3504. label roundAndPack;
  3505. Begin
  3506. aSig := extractFloat32Frac( a.float32 );
  3507. aExp := extractFloat32Exp( a.float32 );
  3508. aSign := extractFloat32Sign( a.float32 );
  3509. if ( aExp = $FF ) then
  3510. Begin
  3511. if ( aSig <> 0) then
  3512. Begin
  3513. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3514. exit;
  3515. End;
  3516. if ( aSign = 0) then
  3517. Begin
  3518. float32_sqrt := a;
  3519. exit;
  3520. End;
  3521. float_raise( float_flag_invalid );
  3522. float32_sqrt.float32 := float32_default_nan;
  3523. exit;
  3524. End;
  3525. if ( aSign <> 0) then
  3526. Begin
  3527. if ( ( aExp OR aSig ) = 0 ) then
  3528. Begin
  3529. float32_sqrt := a;
  3530. exit;
  3531. End;
  3532. float_raise( float_flag_invalid );
  3533. float32_sqrt.float32 := float32_default_nan;
  3534. exit;
  3535. End;
  3536. if ( aExp = 0 ) then
  3537. Begin
  3538. if ( aSig = 0 ) then
  3539. Begin
  3540. float32_sqrt.float32 := 0;
  3541. exit;
  3542. End;
  3543. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3544. End;
  3545. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3546. aSig := ( aSig OR $00800000 ) shl 8;
  3547. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3548. if ( ( zSig and $7F ) <= 5 ) then
  3549. Begin
  3550. if ( zSig < 2 ) then
  3551. Begin
  3552. zSig := $7FFFFFFF;
  3553. goto roundAndPack;
  3554. End
  3555. else
  3556. Begin
  3557. aSig := aSig shr (aExp and 1);
  3558. mul32To64( zSig, zSig, term0, term1 );
  3559. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3560. while ( sbits32 (rem0) < 0 ) do
  3561. Begin
  3562. Dec(zSig);
  3563. shortShift64Left( 0, zSig, 1, term0, term1 );
  3564. term1 := term1 or 1;
  3565. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3566. End;
  3567. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3568. End;
  3569. End;
  3570. shift32RightJamming( zSig, 1, zSig );
  3571. roundAndPack:
  3572. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3573. End;
  3574. {*
  3575. -------------------------------------------------------------------------------
  3576. Returns 1 if the single-precision floating-point value `a' is equal to
  3577. the corresponding value `b', and 0 otherwise. The comparison is performed
  3578. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3579. -------------------------------------------------------------------------------
  3580. *}
  3581. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3582. Begin
  3583. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3584. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3585. ) then
  3586. Begin
  3587. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3588. Begin
  3589. float_raise( float_flag_invalid );
  3590. End;
  3591. float32_eq := 0;
  3592. exit;
  3593. End;
  3594. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3595. End;
  3596. {*
  3597. -------------------------------------------------------------------------------
  3598. Returns 1 if the single-precision floating-point value `a' is less than
  3599. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3600. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3601. Arithmetic.
  3602. -------------------------------------------------------------------------------
  3603. *}
  3604. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3605. var
  3606. aSign, bSign: flag;
  3607. Begin
  3608. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3609. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3610. ) then
  3611. Begin
  3612. float_raise( float_flag_invalid );
  3613. float32_le := 0;
  3614. exit;
  3615. End;
  3616. aSign := extractFloat32Sign( a.float32 );
  3617. bSign := extractFloat32Sign( b.float32 );
  3618. if ( aSign <> bSign ) then
  3619. Begin
  3620. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3621. exit;
  3622. End;
  3623. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3624. End;
  3625. {*
  3626. -------------------------------------------------------------------------------
  3627. Returns 1 if the single-precision floating-point value `a' is less than
  3628. the corresponding value `b', and 0 otherwise. The comparison is performed
  3629. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3630. -------------------------------------------------------------------------------
  3631. *}
  3632. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3633. var
  3634. aSign, bSign: flag;
  3635. Begin
  3636. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3637. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3638. ) then
  3639. Begin
  3640. float_raise( float_flag_invalid );
  3641. float32_lt :=0;
  3642. exit;
  3643. End;
  3644. aSign := extractFloat32Sign( a.float32 );
  3645. bSign := extractFloat32Sign( b.float32 );
  3646. if ( aSign <> bSign ) then
  3647. Begin
  3648. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3649. exit;
  3650. End;
  3651. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3652. End;
  3653. {*
  3654. -------------------------------------------------------------------------------
  3655. Returns 1 if the single-precision floating-point value `a' is equal to
  3656. the corresponding value `b', and 0 otherwise. The invalid exception is
  3657. raised if either operand is a NaN. Otherwise, the comparison is performed
  3658. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3659. -------------------------------------------------------------------------------
  3660. *}
  3661. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3662. Begin
  3663. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3664. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3665. ) then
  3666. Begin
  3667. float_raise( float_flag_invalid );
  3668. float32_eq_signaling := 0;
  3669. exit;
  3670. End;
  3671. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3672. End;
  3673. {*
  3674. -------------------------------------------------------------------------------
  3675. Returns 1 if the single-precision floating-point value `a' is less than or
  3676. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3677. cause an exception. Otherwise, the comparison is performed according to the
  3678. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3679. -------------------------------------------------------------------------------
  3680. *}
  3681. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3682. Var
  3683. aSign, bSign: flag;
  3684. aExp, bExp: int16;
  3685. Begin
  3686. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3687. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3688. ) then
  3689. Begin
  3690. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3691. Begin
  3692. float_raise( float_flag_invalid );
  3693. End;
  3694. float32_le_quiet := 0;
  3695. exit;
  3696. End;
  3697. aSign := extractFloat32Sign( a );
  3698. bSign := extractFloat32Sign( b );
  3699. if ( aSign <> bSign ) then
  3700. Begin
  3701. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3702. exit;
  3703. End;
  3704. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3705. End;
  3706. {*
  3707. -------------------------------------------------------------------------------
  3708. Returns 1 if the single-precision floating-point value `a' is less than
  3709. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3710. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3711. Standard for Binary Floating-Point Arithmetic.
  3712. -------------------------------------------------------------------------------
  3713. *}
  3714. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3715. Var
  3716. aSign, bSign: flag;
  3717. Begin
  3718. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3719. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3720. ) then
  3721. Begin
  3722. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3723. Begin
  3724. float_raise( float_flag_invalid );
  3725. End;
  3726. float32_lt_quiet := 0;
  3727. exit;
  3728. End;
  3729. aSign := extractFloat32Sign( a );
  3730. bSign := extractFloat32Sign( b );
  3731. if ( aSign <> bSign ) then
  3732. Begin
  3733. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3734. exit;
  3735. End;
  3736. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3737. End;
  3738. {*
  3739. -------------------------------------------------------------------------------
  3740. Returns the result of converting the double-precision floating-point value
  3741. `a' to the 32-bit two's complement integer format. The conversion is
  3742. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3743. Arithmetic---which means in particular that the conversion is rounded
  3744. according to the current rounding mode. If `a' is a NaN, the largest
  3745. positive integer is returned. Otherwise, if the conversion overflows, the
  3746. largest integer with the same sign as `a' is returned.
  3747. -------------------------------------------------------------------------------
  3748. *}
  3749. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3750. var
  3751. aSign: flag;
  3752. aExp, shiftCount: int16;
  3753. aSig0, aSig1, absZ, aSigExtra: bits32;
  3754. z: int32;
  3755. roundingMode: int8;
  3756. label invalid;
  3757. Begin
  3758. aSig1 := extractFloat64Frac1( a );
  3759. aSig0 := extractFloat64Frac0( a );
  3760. aExp := extractFloat64Exp( a );
  3761. aSign := extractFloat64Sign( a );
  3762. shiftCount := aExp - $413;
  3763. if ( 0 <= shiftCount ) then
  3764. Begin
  3765. if ( $41E < aExp ) then
  3766. Begin
  3767. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3768. aSign := 0;
  3769. goto invalid;
  3770. End;
  3771. shortShift64Left(
  3772. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3773. if ( $80000000 < absZ ) then
  3774. goto invalid;
  3775. End
  3776. else
  3777. Begin
  3778. aSig1 := flag( aSig1 <> 0 );
  3779. if ( aExp < $3FE ) then
  3780. Begin
  3781. aSigExtra := aExp OR aSig0 OR aSig1;
  3782. absZ := 0;
  3783. End
  3784. else
  3785. Begin
  3786. aSig0 := aSig0 OR $00100000;
  3787. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3788. absZ := aSig0 shr ( - shiftCount );
  3789. End;
  3790. End;
  3791. roundingMode := float_rounding_mode;
  3792. if ( roundingMode = float_round_nearest_even ) then
  3793. Begin
  3794. if ( sbits32(aSigExtra) < 0 ) then
  3795. Begin
  3796. Inc(absZ);
  3797. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3798. absZ := absZ and not 1;
  3799. End;
  3800. if aSign <> 0 then
  3801. z := - absZ
  3802. else
  3803. z := absZ;
  3804. End
  3805. else
  3806. Begin
  3807. aSigExtra := bits32( aSigExtra <> 0 );
  3808. if ( aSign <> 0) then
  3809. Begin
  3810. z := - ( absZ
  3811. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3812. End
  3813. else
  3814. Begin
  3815. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3816. End
  3817. End;
  3818. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3819. Begin
  3820. invalid:
  3821. float_raise( float_flag_invalid );
  3822. if (aSign <> 0 ) then
  3823. float64_to_int32 := sbits32 ($80000000)
  3824. else
  3825. float64_to_int32 := $7FFFFFFF;
  3826. exit;
  3827. End;
  3828. if ( aSigExtra <> 0) then
  3829. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3830. float64_to_int32 := z;
  3831. End;
  3832. {*
  3833. -------------------------------------------------------------------------------
  3834. Returns the result of converting the double-precision floating-point value
  3835. `a' to the 32-bit two's complement integer format. The conversion is
  3836. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3837. Arithmetic, except that the conversion is always rounded toward zero.
  3838. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3839. the conversion overflows, the largest integer with the same sign as `a' is
  3840. returned.
  3841. -------------------------------------------------------------------------------
  3842. *}
  3843. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3844. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3845. Var
  3846. aSign: flag;
  3847. aExp, shiftCount: int16;
  3848. aSig0, aSig1, absZ, aSigExtra: bits32;
  3849. z: int32;
  3850. label invalid;
  3851. Begin
  3852. aSig1 := extractFloat64Frac1( a );
  3853. aSig0 := extractFloat64Frac0( a );
  3854. aExp := extractFloat64Exp( a );
  3855. aSign := extractFloat64Sign( a );
  3856. shiftCount := aExp - $413;
  3857. if ( 0 <= shiftCount ) then
  3858. Begin
  3859. if ( $41E < aExp ) then
  3860. Begin
  3861. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3862. aSign := 0;
  3863. goto invalid;
  3864. End;
  3865. shortShift64Left(
  3866. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3867. End
  3868. else
  3869. Begin
  3870. if ( aExp < $3FF ) then
  3871. Begin
  3872. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3873. Begin
  3874. softfloat_exception_flags :=
  3875. softfloat_exception_flags or float_flag_inexact;
  3876. End;
  3877. float64_to_int32_round_to_zero := 0;
  3878. exit;
  3879. End;
  3880. aSig0 := aSig0 or $00100000;
  3881. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3882. absZ := aSig0 shr ( - shiftCount );
  3883. End;
  3884. if aSign <> 0 then
  3885. z := - absZ
  3886. else
  3887. z := absZ;
  3888. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3889. Begin
  3890. invalid:
  3891. float_raise( float_flag_invalid );
  3892. if (aSign <> 0) then
  3893. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3894. else
  3895. float64_to_int32_round_to_zero := $7FFFFFFF;
  3896. exit;
  3897. End;
  3898. if ( aSigExtra <> 0) then
  3899. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3900. float64_to_int32_round_to_zero := z;
  3901. End;
  3902. {*
  3903. -------------------------------------------------------------------------------
  3904. Returns the result of converting the double-precision floating-point value
  3905. `a' to the single-precision floating-point format. The conversion is
  3906. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3907. Arithmetic.
  3908. -------------------------------------------------------------------------------
  3909. *}
  3910. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3911. Var
  3912. aSign: flag;
  3913. aExp: int16;
  3914. aSig0, aSig1, zSig: bits32;
  3915. allZero: bits32;
  3916. tmp : CommonNanT;
  3917. Begin
  3918. aSig1 := extractFloat64Frac1( a );
  3919. aSig0 := extractFloat64Frac0( a );
  3920. aExp := extractFloat64Exp( a );
  3921. aSign := extractFloat64Sign( a );
  3922. if ( aExp = $7FF ) then
  3923. Begin
  3924. if ( aSig0 OR aSig1 ) <> 0 then
  3925. Begin
  3926. float64ToCommonNaN( a, tmp );
  3927. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3928. exit;
  3929. End;
  3930. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3931. exit;
  3932. End;
  3933. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3934. if ( aExp <> 0) then
  3935. zSig := zSig OR $40000000;
  3936. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3937. End;
  3938. {*
  3939. -------------------------------------------------------------------------------
  3940. Rounds the double-precision floating-point value `a' to an integer,
  3941. and returns the result as a double-precision floating-point value. The
  3942. operation is performed according to the IEC/IEEE Standard for Binary
  3943. Floating-Point Arithmetic.
  3944. -------------------------------------------------------------------------------
  3945. *}
  3946. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3947. Var
  3948. aSign: flag;
  3949. aExp: int16;
  3950. lastBitMask, roundBitsMask: bits32;
  3951. roundingMode: int8;
  3952. z: float64;
  3953. Begin
  3954. aExp := extractFloat64Exp( a );
  3955. if ( $413 <= aExp ) then
  3956. Begin
  3957. if ( $433 <= aExp ) then
  3958. Begin
  3959. if ( ( aExp = $7FF )
  3960. AND
  3961. (
  3962. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3963. ) <>0)
  3964. ) then
  3965. Begin
  3966. propagateFloat64NaN( a, a, result );
  3967. exit;
  3968. End;
  3969. result := a;
  3970. exit;
  3971. End;
  3972. lastBitMask := 1;
  3973. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3974. roundBitsMask := lastBitMask - 1;
  3975. z := a;
  3976. roundingMode := float_rounding_mode;
  3977. if ( roundingMode = float_round_nearest_even ) then
  3978. Begin
  3979. if ( lastBitMask <> 0) then
  3980. Begin
  3981. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3982. if ( ( z.low and roundBitsMask ) = 0 ) then
  3983. z.low := z.low and not lastBitMask;
  3984. End
  3985. else
  3986. Begin
  3987. if ( sbits32 (z.low) < 0 ) then
  3988. Begin
  3989. Inc(z.high);
  3990. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3991. z.high := z.high and not 1;
  3992. End;
  3993. End;
  3994. End
  3995. else if ( roundingMode <> float_round_to_zero ) then
  3996. Begin
  3997. if ( extractFloat64Sign( z )
  3998. xor flag( roundingMode = float_round_up ) )<> 0 then
  3999. Begin
  4000. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4001. End;
  4002. End;
  4003. z.low := z.low and not roundBitsMask;
  4004. End
  4005. else
  4006. Begin
  4007. if ( aExp <= $3FE ) then
  4008. Begin
  4009. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4010. Begin
  4011. result := a;
  4012. exit;
  4013. End;
  4014. softfloat_exception_flags := softfloat_exception_flags or
  4015. float_flag_inexact;
  4016. aSign := extractFloat64Sign( a );
  4017. case ( float_rounding_mode ) of
  4018. float_round_nearest_even:
  4019. Begin
  4020. if ( ( aExp = $3FE )
  4021. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4022. ) then
  4023. Begin
  4024. packFloat64( aSign, $3FF, 0, 0, result );
  4025. exit;
  4026. End;
  4027. End;
  4028. float_round_down:
  4029. Begin
  4030. if aSign<>0 then
  4031. packFloat64( 1, $3FF, 0, 0, result )
  4032. else
  4033. packFloat64( 0, 0, 0, 0, result );
  4034. exit;
  4035. End;
  4036. float_round_up:
  4037. Begin
  4038. if aSign <> 0 then
  4039. packFloat64( 1, 0, 0, 0, result )
  4040. else
  4041. packFloat64( 0, $3FF, 0, 0, result );
  4042. exit;
  4043. End;
  4044. end;
  4045. packFloat64( aSign, 0, 0, 0, result );
  4046. exit;
  4047. End;
  4048. lastBitMask := 1;
  4049. lastBitMask := lastBitMask shl ($413 - aExp);
  4050. roundBitsMask := lastBitMask - 1;
  4051. z.low := 0;
  4052. z.high := a.high;
  4053. roundingMode := float_rounding_mode;
  4054. if ( roundingMode = float_round_nearest_even ) then
  4055. Begin
  4056. z.high := z.high + lastBitMask shr 1;
  4057. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4058. Begin
  4059. z.high := z.high and not lastBitMask;
  4060. End;
  4061. End
  4062. else if ( roundingMode <> float_round_to_zero ) then
  4063. Begin
  4064. if ( extractFloat64Sign( z )
  4065. xor flag( roundingMode = float_round_up ) )<> 0 then
  4066. Begin
  4067. z.high := z.high or bits32( a.low <> 0 );
  4068. z.high := z.high + roundBitsMask;
  4069. End;
  4070. End;
  4071. z.high := z.high and not roundBitsMask;
  4072. End;
  4073. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4074. Begin
  4075. softfloat_exception_flags :=
  4076. softfloat_exception_flags or float_flag_inexact;
  4077. End;
  4078. result := z;
  4079. End;
  4080. {*
  4081. -------------------------------------------------------------------------------
  4082. Returns the result of adding the absolute values of the double-precision
  4083. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4084. before being returned. `zSign' is ignored if the result is a NaN.
  4085. The addition is performed according to the IEC/IEEE Standard for Binary
  4086. Floating-Point Arithmetic.
  4087. -------------------------------------------------------------------------------
  4088. *}
  4089. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4090. Var
  4091. aExp, bExp, zExp: int16;
  4092. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4093. expDiff: int16;
  4094. label shiftRight1;
  4095. label roundAndPack;
  4096. Begin
  4097. aSig1 := extractFloat64Frac1( a );
  4098. aSig0 := extractFloat64Frac0( a );
  4099. aExp := extractFloat64Exp( a );
  4100. bSig1 := extractFloat64Frac1( b );
  4101. bSig0 := extractFloat64Frac0( b );
  4102. bExp := extractFloat64Exp( b );
  4103. expDiff := aExp - bExp;
  4104. if ( 0 < expDiff ) then
  4105. Begin
  4106. if ( aExp = $7FF ) then
  4107. Begin
  4108. if ( aSig0 OR aSig1 ) <> 0 then
  4109. Begin
  4110. propagateFloat64NaN( a, b, out );
  4111. exit;
  4112. end;
  4113. out := a;
  4114. exit;
  4115. End;
  4116. if ( bExp = 0 ) then
  4117. Begin
  4118. Dec(expDiff);
  4119. End
  4120. else
  4121. Begin
  4122. bSig0 := bSig0 or $00100000;
  4123. End;
  4124. shift64ExtraRightJamming(
  4125. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4126. zExp := aExp;
  4127. End
  4128. else if ( expDiff < 0 ) then
  4129. Begin
  4130. if ( bExp = $7FF ) then
  4131. Begin
  4132. if ( bSig0 OR bSig1 ) <> 0 then
  4133. Begin
  4134. propagateFloat64NaN( a, b, out );
  4135. exit;
  4136. End;
  4137. packFloat64( zSign, $7FF, 0, 0, out );
  4138. End;
  4139. if ( aExp = 0 ) then
  4140. Begin
  4141. Inc(expDiff);
  4142. End
  4143. else
  4144. Begin
  4145. aSig0 := aSig0 or $00100000;
  4146. End;
  4147. shift64ExtraRightJamming(
  4148. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4149. zExp := bExp;
  4150. End
  4151. else
  4152. Begin
  4153. if ( aExp = $7FF ) then
  4154. Begin
  4155. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4156. Begin
  4157. propagateFloat64NaN( a, b, out );
  4158. exit;
  4159. End;
  4160. out := a;
  4161. exit;
  4162. End;
  4163. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4164. if ( aExp = 0 ) then
  4165. Begin
  4166. packFloat64( zSign, 0, zSig0, zSig1, out );
  4167. exit;
  4168. End;
  4169. zSig2 := 0;
  4170. zSig0 := zSig0 or $00200000;
  4171. zExp := aExp;
  4172. goto shiftRight1;
  4173. End;
  4174. aSig0 := aSig0 or $00100000;
  4175. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4176. Dec(zExp);
  4177. if ( zSig0 < $00200000 ) then
  4178. goto roundAndPack;
  4179. Inc(zExp);
  4180. shiftRight1:
  4181. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4182. roundAndPack:
  4183. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4184. End;
  4185. {*
  4186. -------------------------------------------------------------------------------
  4187. Returns the result of subtracting the absolute values of the double-
  4188. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4189. difference is negated before being returned. `zSign' is ignored if the
  4190. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4191. Standard for Binary Floating-Point Arithmetic.
  4192. -------------------------------------------------------------------------------
  4193. *}
  4194. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4195. Var
  4196. aExp, bExp, zExp: int16;
  4197. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4198. expDiff: int16;
  4199. z: float64;
  4200. label aExpBigger;
  4201. label bExpBigger;
  4202. label aBigger;
  4203. label bBigger;
  4204. label normalizeRoundAndPack;
  4205. Begin
  4206. aSig1 := extractFloat64Frac1( a );
  4207. aSig0 := extractFloat64Frac0( a );
  4208. aExp := extractFloat64Exp( a );
  4209. bSig1 := extractFloat64Frac1( b );
  4210. bSig0 := extractFloat64Frac0( b );
  4211. bExp := extractFloat64Exp( b );
  4212. expDiff := aExp - bExp;
  4213. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4214. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4215. if ( 0 < expDiff ) then goto aExpBigger;
  4216. if ( expDiff < 0 ) then goto bExpBigger;
  4217. if ( aExp = $7FF ) then
  4218. Begin
  4219. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4220. Begin
  4221. propagateFloat64NaN( a, b, out );
  4222. exit;
  4223. End;
  4224. float_raise( float_flag_invalid );
  4225. z.low := float64_default_nan_low;
  4226. z.high := float64_default_nan_high;
  4227. out := z;
  4228. exit;
  4229. End;
  4230. if ( aExp = 0 ) then
  4231. Begin
  4232. aExp := 1;
  4233. bExp := 1;
  4234. End;
  4235. if ( bSig0 < aSig0 ) then goto aBigger;
  4236. if ( aSig0 < bSig0 ) then goto bBigger;
  4237. if ( bSig1 < aSig1 ) then goto aBigger;
  4238. if ( aSig1 < bSig1 ) then goto bBigger;
  4239. packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
  4240. exit;
  4241. bExpBigger:
  4242. if ( bExp = $7FF ) then
  4243. Begin
  4244. if ( bSig0 OR bSig1 ) <> 0 then
  4245. Begin
  4246. propagateFloat64NaN( a, b, out );
  4247. exit;
  4248. End;
  4249. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4250. exit;
  4251. End;
  4252. if ( aExp = 0 ) then
  4253. Begin
  4254. Inc(expDiff);
  4255. End
  4256. else
  4257. Begin
  4258. aSig0 := aSig0 or $40000000;
  4259. End;
  4260. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4261. bSig0 := bSig0 or $40000000;
  4262. bBigger:
  4263. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4264. zExp := bExp;
  4265. zSign := zSign xor 1;
  4266. goto normalizeRoundAndPack;
  4267. aExpBigger:
  4268. if ( aExp = $7FF ) then
  4269. Begin
  4270. if ( aSig0 OR aSig1 ) <> 0 then
  4271. Begin
  4272. propagateFloat64NaN( a, b, out );
  4273. exit;
  4274. End;
  4275. out := a;
  4276. exit;
  4277. End;
  4278. if ( bExp = 0 ) then
  4279. Begin
  4280. Dec(expDiff);
  4281. End
  4282. else
  4283. Begin
  4284. bSig0 := bSig0 or $40000000;
  4285. End;
  4286. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4287. aSig0 := aSig0 or $40000000;
  4288. aBigger:
  4289. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4290. zExp := aExp;
  4291. normalizeRoundAndPack:
  4292. Dec(zExp);
  4293. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4294. End;
  4295. {*
  4296. -------------------------------------------------------------------------------
  4297. Returns the result of adding the double-precision floating-point values `a'
  4298. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4299. Binary Floating-Point Arithmetic.
  4300. -------------------------------------------------------------------------------
  4301. *}
  4302. Function float64_add( a: float64; b : float64) : Float64;
  4303. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4304. Var
  4305. aSign, bSign: flag;
  4306. Begin
  4307. aSign := extractFloat64Sign( a );
  4308. bSign := extractFloat64Sign( b );
  4309. if ( aSign = bSign ) then
  4310. Begin
  4311. addFloat64Sigs( a, b, aSign, result );
  4312. End
  4313. else
  4314. Begin
  4315. subFloat64Sigs( a, b, aSign, result );
  4316. End;
  4317. End;
  4318. {*
  4319. -------------------------------------------------------------------------------
  4320. Returns the result of subtracting the double-precision floating-point values
  4321. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4322. for Binary Floating-Point Arithmetic.
  4323. -------------------------------------------------------------------------------
  4324. *}
  4325. Function float64_sub(a: float64; b : float64) : Float64;
  4326. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4327. Var
  4328. aSign, bSign: flag;
  4329. Begin
  4330. aSign := extractFloat64Sign( a );
  4331. bSign := extractFloat64Sign( b );
  4332. if ( aSign = bSign ) then
  4333. Begin
  4334. subFloat64Sigs( a, b, aSign, result );
  4335. End
  4336. else
  4337. Begin
  4338. addFloat64Sigs( a, b, aSign, result );
  4339. End;
  4340. End;
  4341. {*
  4342. -------------------------------------------------------------------------------
  4343. Returns the result of multiplying the double-precision floating-point values
  4344. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4345. for Binary Floating-Point Arithmetic.
  4346. -------------------------------------------------------------------------------
  4347. *}
  4348. Function float64_mul( a: float64; b:float64) : Float64;
  4349. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4350. Var
  4351. aSign, bSign, zSign: flag;
  4352. aExp, bExp, zExp: int16;
  4353. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4354. z: float64;
  4355. label invalid;
  4356. Begin
  4357. aSig1 := extractFloat64Frac1( a );
  4358. aSig0 := extractFloat64Frac0( a );
  4359. aExp := extractFloat64Exp( a );
  4360. aSign := extractFloat64Sign( a );
  4361. bSig1 := extractFloat64Frac1( b );
  4362. bSig0 := extractFloat64Frac0( b );
  4363. bExp := extractFloat64Exp( b );
  4364. bSign := extractFloat64Sign( b );
  4365. zSign := aSign xor bSign;
  4366. if ( aExp = $7FF ) then
  4367. Begin
  4368. if ( (( aSig0 OR aSig1 ) <>0)
  4369. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4370. Begin
  4371. propagateFloat64NaN( a, b, result );
  4372. exit;
  4373. End;
  4374. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4375. packFloat64( zSign, $7FF, 0, 0, result );
  4376. exit;
  4377. End;
  4378. if ( bExp = $7FF ) then
  4379. Begin
  4380. if ( bSig0 OR bSig1 )<> 0 then
  4381. Begin
  4382. propagateFloat64NaN( a, b, result );
  4383. exit;
  4384. End;
  4385. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4386. Begin
  4387. invalid:
  4388. float_raise( float_flag_invalid );
  4389. z.low := float64_default_nan_low;
  4390. z.high := float64_default_nan_high;
  4391. result := z;
  4392. exit;
  4393. End;
  4394. packFloat64( zSign, $7FF, 0, 0, result );
  4395. exit;
  4396. End;
  4397. if ( aExp = 0 ) then
  4398. Begin
  4399. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4400. Begin
  4401. packFloat64( zSign, 0, 0, 0, result );
  4402. exit;
  4403. End;
  4404. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4405. End;
  4406. if ( bExp = 0 ) then
  4407. Begin
  4408. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4409. Begin
  4410. packFloat64( zSign, 0, 0, 0, result );
  4411. exit;
  4412. End;
  4413. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4414. End;
  4415. zExp := aExp + bExp - $400;
  4416. aSig0 := aSig0 or $00100000;
  4417. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4418. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4419. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4420. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4421. if ( $00200000 <= zSig0 ) then
  4422. Begin
  4423. shift64ExtraRightJamming(
  4424. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4425. Inc(zExp);
  4426. End;
  4427. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4428. End;
  4429. {*
  4430. -------------------------------------------------------------------------------
  4431. Returns the result of dividing the double-precision floating-point value `a'
  4432. by the corresponding value `b'. The operation is performed according to the
  4433. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4434. -------------------------------------------------------------------------------
  4435. *}
  4436. Function float64_div(a: float64; b : float64) : Float64;
  4437. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4438. Var
  4439. aSign, bSign, zSign: flag;
  4440. aExp, bExp, zExp: int16;
  4441. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4442. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4443. z: float64;
  4444. label invalid;
  4445. Begin
  4446. aSig1 := extractFloat64Frac1( a );
  4447. aSig0 := extractFloat64Frac0( a );
  4448. aExp := extractFloat64Exp( a );
  4449. aSign := extractFloat64Sign( a );
  4450. bSig1 := extractFloat64Frac1( b );
  4451. bSig0 := extractFloat64Frac0( b );
  4452. bExp := extractFloat64Exp( b );
  4453. bSign := extractFloat64Sign( b );
  4454. zSign := aSign xor bSign;
  4455. if ( aExp = $7FF ) then
  4456. Begin
  4457. if ( aSig0 OR aSig1 )<> 0 then
  4458. Begin
  4459. propagateFloat64NaN( a, b, result );
  4460. exit;
  4461. end;
  4462. if ( bExp = $7FF ) then
  4463. Begin
  4464. if ( bSig0 OR bSig1 )<>0 then
  4465. Begin
  4466. propagateFloat64NaN( a, b, result );
  4467. exit;
  4468. End;
  4469. goto invalid;
  4470. End;
  4471. packFloat64( zSign, $7FF, 0, 0, result );
  4472. exit;
  4473. End;
  4474. if ( bExp = $7FF ) then
  4475. Begin
  4476. if ( bSig0 OR bSig1 )<> 0 then
  4477. Begin
  4478. propagateFloat64NaN( a, b, result );
  4479. exit;
  4480. End;
  4481. packFloat64( zSign, 0, 0, 0, result );
  4482. exit;
  4483. End;
  4484. if ( bExp = 0 ) then
  4485. Begin
  4486. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4487. Begin
  4488. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4489. Begin
  4490. invalid:
  4491. float_raise( float_flag_invalid );
  4492. z.low := float64_default_nan_low;
  4493. z.high := float64_default_nan_high;
  4494. result := z;
  4495. exit;
  4496. End;
  4497. float_raise( float_flag_divbyzero );
  4498. packFloat64( zSign, $7FF, 0, 0, result );
  4499. exit;
  4500. End;
  4501. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4502. End;
  4503. if ( aExp = 0 ) then
  4504. Begin
  4505. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4506. Begin
  4507. packFloat64( zSign, 0, 0, 0, result );
  4508. exit;
  4509. End;
  4510. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4511. End;
  4512. zExp := aExp - bExp + $3FD;
  4513. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4514. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4515. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4516. Begin
  4517. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4518. Inc(zExp);
  4519. End;
  4520. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4521. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4522. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4523. while ( sbits32 (rem0) < 0 ) do
  4524. Begin
  4525. Dec(zSig0);
  4526. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4527. End;
  4528. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4529. if ( ( zSig1 and $3FF ) <= 4 ) then
  4530. Begin
  4531. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4532. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4533. while ( sbits32 (rem1) < 0 ) do
  4534. Begin
  4535. Dec(zSig1);
  4536. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4537. End;
  4538. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4539. End;
  4540. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4541. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4542. End;
  4543. {*
  4544. -------------------------------------------------------------------------------
  4545. Returns the remainder of the double-precision floating-point value `a'
  4546. with respect to the corresponding value `b'. The operation is performed
  4547. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4548. -------------------------------------------------------------------------------
  4549. *}
  4550. Function float64_rem(a: float64; b : float64) : float64;
  4551. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4552. Var
  4553. aSign, bSign, zSign: flag;
  4554. aExp, bExp, expDiff: int16;
  4555. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4556. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4557. sigMean0: sbits32;
  4558. z: float64;
  4559. label invalid;
  4560. Begin
  4561. aSig1 := extractFloat64Frac1( a );
  4562. aSig0 := extractFloat64Frac0( a );
  4563. aExp := extractFloat64Exp( a );
  4564. aSign := extractFloat64Sign( a );
  4565. bSig1 := extractFloat64Frac1( b );
  4566. bSig0 := extractFloat64Frac0( b );
  4567. bExp := extractFloat64Exp( b );
  4568. bSign := extractFloat64Sign( b );
  4569. if ( aExp = $7FF ) then
  4570. Begin
  4571. if ((( aSig0 OR aSig1 )<>0)
  4572. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4573. Begin
  4574. propagateFloat64NaN( a, b, result );
  4575. exit;
  4576. End;
  4577. goto invalid;
  4578. End;
  4579. if ( bExp = $7FF ) then
  4580. Begin
  4581. if ( bSig0 OR bSig1 ) <> 0 then
  4582. Begin
  4583. propagateFloat64NaN( a, b, result );
  4584. exit;
  4585. End;
  4586. result := a;
  4587. exit;
  4588. End;
  4589. if ( bExp = 0 ) then
  4590. Begin
  4591. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4592. Begin
  4593. invalid:
  4594. float_raise( float_flag_invalid );
  4595. z.low := float64_default_nan_low;
  4596. z.high := float64_default_nan_high;
  4597. result := z;
  4598. exit;
  4599. End;
  4600. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4601. End;
  4602. if ( aExp = 0 ) then
  4603. Begin
  4604. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4605. Begin
  4606. result := a;
  4607. exit;
  4608. End;
  4609. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4610. End;
  4611. expDiff := aExp - bExp;
  4612. if ( expDiff < -1 ) then
  4613. Begin
  4614. result := a;
  4615. exit;
  4616. End;
  4617. shortShift64Left(
  4618. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4619. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4620. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4621. if ( q )<>0 then
  4622. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4623. expDiff := expDiff - 32;
  4624. while ( 0 < expDiff ) do
  4625. Begin
  4626. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4627. if 4 < q then
  4628. q:= q - 4
  4629. else
  4630. q := 0;
  4631. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4632. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4633. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4634. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4635. expDiff := expDiff - 29;
  4636. End;
  4637. if ( -32 < expDiff ) then
  4638. Begin
  4639. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4640. if 4 < q then
  4641. q := q - 4
  4642. else
  4643. q := 0;
  4644. q := q shr (- expDiff);
  4645. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4646. expDiff := expDiff + 24;
  4647. if ( expDiff < 0 ) then
  4648. Begin
  4649. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4650. End
  4651. else
  4652. Begin
  4653. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4654. End;
  4655. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4656. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4657. End
  4658. else
  4659. Begin
  4660. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4661. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4662. End;
  4663. Repeat
  4664. alternateASig0 := aSig0;
  4665. alternateASig1 := aSig1;
  4666. Inc(q);
  4667. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4668. Until not ( 0 <= sbits32 (aSig0) );
  4669. add64(
  4670. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4671. if ( ( sigMean0 < 0 )
  4672. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4673. Begin
  4674. aSig0 := alternateASig0;
  4675. aSig1 := alternateASig1;
  4676. End;
  4677. zSign := flag( sbits32 (aSig0) < 0 );
  4678. if ( zSign <> 0 ) then
  4679. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4680. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4681. End;
  4682. {*
  4683. -------------------------------------------------------------------------------
  4684. Returns the square root of the double-precision floating-point value `a'.
  4685. The operation is performed according to the IEC/IEEE Standard for Binary
  4686. Floating-Point Arithmetic.
  4687. -------------------------------------------------------------------------------
  4688. *}
  4689. Procedure float64_sqrt( a: float64; var out: float64 );
  4690. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4691. Var
  4692. aSign: flag;
  4693. aExp, zExp: int16;
  4694. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4695. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4696. z: float64;
  4697. label invalid;
  4698. Begin
  4699. aSig1 := extractFloat64Frac1( a );
  4700. aSig0 := extractFloat64Frac0( a );
  4701. aExp := extractFloat64Exp( a );
  4702. aSign := extractFloat64Sign( a );
  4703. if ( aExp = $7FF ) then
  4704. Begin
  4705. if ( aSig0 OR aSig1 ) <> 0 then
  4706. Begin
  4707. propagateFloat64NaN( a, a, out );
  4708. exit;
  4709. End;
  4710. if ( aSign = 0) then
  4711. Begin
  4712. out := a;
  4713. exit;
  4714. End;
  4715. goto invalid;
  4716. End;
  4717. if ( aSign <> 0 ) then
  4718. Begin
  4719. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4720. Begin
  4721. out := a;
  4722. exit;
  4723. End;
  4724. invalid:
  4725. float_raise( float_flag_invalid );
  4726. z.low := float64_default_nan_low;
  4727. z.high := float64_default_nan_high;
  4728. out := z;
  4729. exit;
  4730. End;
  4731. if ( aExp = 0 ) then
  4732. Begin
  4733. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4734. Begin
  4735. packFloat64( 0, 0, 0, 0, out );
  4736. exit;
  4737. End;
  4738. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4739. End;
  4740. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4741. aSig0 := aSig0 or $00100000;
  4742. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4743. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4744. if ( zSig0 = 0 ) then
  4745. zSig0 := $7FFFFFFF;
  4746. doubleZSig0 := zSig0 + zSig0;
  4747. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4748. mul32To64( zSig0, zSig0, term0, term1 );
  4749. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4750. while ( sbits32 (rem0) < 0 ) do
  4751. Begin
  4752. Dec(zSig0);
  4753. doubleZSig0 := doubleZSig0 - 2;
  4754. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4755. End;
  4756. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4757. if ( ( zSig1 and $1FF ) <= 5 ) then
  4758. Begin
  4759. if ( zSig1 = 0 ) then
  4760. zSig1 := 1;
  4761. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4762. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4763. mul32To64( zSig1, zSig1, term2, term3 );
  4764. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4765. while ( sbits32 (rem1) < 0 ) do
  4766. Begin
  4767. Dec(zSig1);
  4768. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4769. term3 := term3 or 1;
  4770. term2 := term2 or doubleZSig0;
  4771. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4772. End;
  4773. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4774. End;
  4775. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4776. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4777. End;
  4778. {*
  4779. -------------------------------------------------------------------------------
  4780. Returns 1 if the double-precision floating-point value `a' is equal to
  4781. the corresponding value `b', and 0 otherwise. The comparison is performed
  4782. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4783. -------------------------------------------------------------------------------
  4784. *}
  4785. Function float64_eq(a: float64; b: float64): flag;
  4786. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4787. Begin
  4788. if
  4789. (
  4790. ( extractFloat64Exp( a ) = $7FF )
  4791. AND
  4792. (
  4793. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4794. )
  4795. )
  4796. OR (
  4797. ( extractFloat64Exp( b ) = $7FF )
  4798. AND (
  4799. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4800. )
  4801. )
  4802. ) then
  4803. Begin
  4804. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4805. float_raise( float_flag_invalid );
  4806. float64_eq := 0;
  4807. exit;
  4808. End;
  4809. float64_eq := flag(
  4810. ( a.low = b.low )
  4811. AND ( ( a.high = b.high )
  4812. OR ( ( a.low = 0 )
  4813. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4814. ));
  4815. End;
  4816. {*
  4817. -------------------------------------------------------------------------------
  4818. Returns 1 if the double-precision floating-point value `a' is less than
  4819. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4820. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4821. Arithmetic.
  4822. -------------------------------------------------------------------------------
  4823. *}
  4824. Function float64_le(a: float64;b: float64): flag;
  4825. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4826. Var
  4827. aSign, bSign: flag;
  4828. Begin
  4829. if
  4830. (
  4831. ( extractFloat64Exp( a ) = $7FF )
  4832. AND
  4833. (
  4834. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4835. )
  4836. )
  4837. OR (
  4838. ( extractFloat64Exp( b ) = $7FF )
  4839. AND (
  4840. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4841. )
  4842. )
  4843. ) then
  4844. Begin
  4845. float_raise( float_flag_invalid );
  4846. float64_le := 0;
  4847. exit;
  4848. End;
  4849. aSign := extractFloat64Sign( a );
  4850. bSign := extractFloat64Sign( b );
  4851. if ( aSign <> bSign ) then
  4852. Begin
  4853. float64_le := flag(
  4854. (aSign <> 0)
  4855. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4856. = 0 ));
  4857. exit;
  4858. End;
  4859. if aSign <> 0 then
  4860. float64_le := le64( b.high, b.low, a.high, a.low )
  4861. else
  4862. float64_le := le64( a.high, a.low, b.high, b.low );
  4863. End;
  4864. {*
  4865. -------------------------------------------------------------------------------
  4866. Returns 1 if the double-precision floating-point value `a' is less than
  4867. the corresponding value `b', and 0 otherwise. The comparison is performed
  4868. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4869. -------------------------------------------------------------------------------
  4870. *}
  4871. Function float64_lt(a: float64;b: float64): flag;
  4872. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4873. Var
  4874. aSign, bSign: flag;
  4875. Begin
  4876. if
  4877. (
  4878. ( extractFloat64Exp( a ) = $7FF )
  4879. AND
  4880. (
  4881. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4882. )
  4883. )
  4884. OR (
  4885. ( extractFloat64Exp( b ) = $7FF )
  4886. AND (
  4887. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4888. )
  4889. )
  4890. ) then
  4891. Begin
  4892. float_raise( float_flag_invalid );
  4893. float64_lt := 0;
  4894. exit;
  4895. End;
  4896. aSign := extractFloat64Sign( a );
  4897. bSign := extractFloat64Sign( b );
  4898. if ( aSign <> bSign ) then
  4899. Begin
  4900. float64_lt := flag(
  4901. (aSign <> 0)
  4902. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4903. <> 0 ));
  4904. exit;
  4905. End;
  4906. if aSign <> 0 then
  4907. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4908. else
  4909. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4910. End;
  4911. {*
  4912. -------------------------------------------------------------------------------
  4913. Returns 1 if the double-precision floating-point value `a' is equal to
  4914. the corresponding value `b', and 0 otherwise. The invalid exception is
  4915. raised if either operand is a NaN. Otherwise, the comparison is performed
  4916. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4917. -------------------------------------------------------------------------------
  4918. *}
  4919. Function float64_eq_signaling( a: float64; b: float64): flag;
  4920. Begin
  4921. if
  4922. (
  4923. ( extractFloat64Exp( a ) = $7FF )
  4924. AND
  4925. (
  4926. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4927. )
  4928. )
  4929. OR (
  4930. ( extractFloat64Exp( b ) = $7FF )
  4931. AND (
  4932. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4933. )
  4934. )
  4935. ) then
  4936. Begin
  4937. float_raise( float_flag_invalid );
  4938. float64_eq_signaling := 0;
  4939. exit;
  4940. End;
  4941. float64_eq_signaling := flag(
  4942. ( a.low = b.low )
  4943. AND ( ( a.high = b.high )
  4944. OR ( ( a.low = 0 )
  4945. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4946. ));
  4947. End;
  4948. {*
  4949. -------------------------------------------------------------------------------
  4950. Returns 1 if the double-precision floating-point value `a' is less than or
  4951. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4952. cause an exception. Otherwise, the comparison is performed according to the
  4953. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4954. -------------------------------------------------------------------------------
  4955. *}
  4956. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4957. Var
  4958. aSign, bSign : flag;
  4959. Begin
  4960. if
  4961. (
  4962. ( extractFloat64Exp( a ) = $7FF )
  4963. AND
  4964. (
  4965. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4966. )
  4967. )
  4968. OR (
  4969. ( extractFloat64Exp( b ) = $7FF )
  4970. AND (
  4971. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4972. )
  4973. )
  4974. ) then
  4975. Begin
  4976. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4977. float_raise( float_flag_invalid );
  4978. float64_le_quiet := 0;
  4979. exit;
  4980. End;
  4981. aSign := extractFloat64Sign( a );
  4982. bSign := extractFloat64Sign( b );
  4983. if ( aSign <> bSign ) then
  4984. Begin
  4985. float64_le_quiet := flag
  4986. ((aSign <> 0)
  4987. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4988. = 0 ));
  4989. exit;
  4990. End;
  4991. if aSign <> 0 then
  4992. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4993. else
  4994. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4995. End;
  4996. {*
  4997. -------------------------------------------------------------------------------
  4998. Returns 1 if the double-precision floating-point value `a' is less than
  4999. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5000. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5001. Standard for Binary Floating-Point Arithmetic.
  5002. -------------------------------------------------------------------------------
  5003. *}
  5004. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5005. Var
  5006. aSign, bSign: flag;
  5007. Begin
  5008. if
  5009. (
  5010. ( extractFloat64Exp( a ) = $7FF )
  5011. AND
  5012. (
  5013. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5014. )
  5015. )
  5016. OR (
  5017. ( extractFloat64Exp( b ) = $7FF )
  5018. AND (
  5019. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5020. )
  5021. )
  5022. ) then
  5023. Begin
  5024. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5025. float_raise( float_flag_invalid );
  5026. float64_lt_quiet := 0;
  5027. exit;
  5028. End;
  5029. aSign := extractFloat64Sign( a );
  5030. bSign := extractFloat64Sign( b );
  5031. if ( aSign <> bSign ) then
  5032. Begin
  5033. float64_lt_quiet := flag(
  5034. (aSign<>0)
  5035. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5036. <> 0 ));
  5037. exit;
  5038. End;
  5039. If aSign <> 0 then
  5040. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5041. else
  5042. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5043. End;
  5044. {*----------------------------------------------------------------------------
  5045. | Returns the result of converting the 64-bit two's complement integer `a'
  5046. | to the single-precision floating-point format. The conversion is performed
  5047. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5048. *----------------------------------------------------------------------------*}
  5049. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5050. var
  5051. zSign : flag;
  5052. absA : uint64;
  5053. shiftCount: int8;
  5054. zSig : bits32;
  5055. intval : int64rec;
  5056. Begin
  5057. if ( a = 0 ) then
  5058. begin
  5059. int64_to_float32.float32 := 0;
  5060. exit;
  5061. end;
  5062. if a < 0 then
  5063. zSign := flag(TRUE)
  5064. else
  5065. zSign := flag(FALSE);
  5066. if zSign<>0 then
  5067. absA := -a
  5068. else
  5069. absA := a;
  5070. shiftCount := countLeadingZeros64( absA ) - 40;
  5071. if ( 0 <= shiftCount ) then
  5072. begin
  5073. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5074. end
  5075. else
  5076. begin
  5077. shiftCount := shiftCount + 7;
  5078. if ( shiftCount < 0 ) then
  5079. begin
  5080. intval.low := int64rec(AbsA).low;
  5081. intval.high := int64rec(AbsA).high;
  5082. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5083. intval.low, intval.high);
  5084. int64rec(absA).low := intval.low;
  5085. int64rec(absA).high := intval.high;
  5086. end
  5087. else
  5088. absA := absA shl shiftCount;
  5089. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5090. end;
  5091. End;
  5092. {*----------------------------------------------------------------------------
  5093. | Returns the result of converting the 64-bit two's complement integer `a'
  5094. | to the double-precision floating-point format. The conversion is performed
  5095. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5096. *----------------------------------------------------------------------------*}
  5097. function int64_to_float64( a: int64 ): float64;
  5098. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5099. var
  5100. zSign : flag;
  5101. float_result : float64;
  5102. intval : int64rec;
  5103. AbsA : bits64;
  5104. shiftcount : int8;
  5105. zSig0, zSig1 : bits32;
  5106. Begin
  5107. if ( a = 0 ) then
  5108. Begin
  5109. packFloat64( 0, 0, 0, 0, result );
  5110. exit;
  5111. end;
  5112. zSign := flag( a < 0 );
  5113. if ZSign<>0 then
  5114. AbsA := -a
  5115. else
  5116. AbsA := a;
  5117. shiftCount := countLeadingZeros64( absA ) - 11;
  5118. if ( 0 <= shiftCount ) then
  5119. Begin
  5120. absA := absA shl shiftcount;
  5121. zSig0:=int64rec(absA).high;
  5122. zSig1:=int64rec(absA).low;
  5123. End
  5124. else
  5125. Begin
  5126. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  5127. End;
  5128. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5129. int64_to_float64:= float_result;
  5130. End;
  5131. {*----------------------------------------------------------------------------
  5132. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5133. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5134. | Otherwise, returns 0.
  5135. *----------------------------------------------------------------------------*}
  5136. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5137. begin
  5138. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5139. end;
  5140. {*----------------------------------------------------------------------------
  5141. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5142. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5143. | Otherwise, returns 0.
  5144. *----------------------------------------------------------------------------*}
  5145. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5146. begin
  5147. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5148. end;
  5149. {*----------------------------------------------------------------------------
  5150. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5151. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5152. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5153. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5154. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5155. | the most-significant bit of the extra result, and the other 63 bits of the
  5156. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5157. | were all zero. This extra result is stored in the location pointed to by
  5158. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5159. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5160. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5161. | fixed-point value is shifted right by the number of bits given in `count',
  5162. | and the integer part of the result is returned at the locations pointed to
  5163. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5164. | corrupted as described above, and is returned at the location pointed to by
  5165. | `z2Ptr'.)
  5166. *----------------------------------------------------------------------------*}
  5167. procedure shift128ExtraRightJamming(
  5168. a0: bits64;
  5169. a1: bits64;
  5170. a2: bits64;
  5171. count: int16;
  5172. var z0Ptr: bits64;
  5173. var z1Ptr: bits64;
  5174. var z2Ptr: bits64);
  5175. var
  5176. z0, z1, z2: bits64;
  5177. negCount: int8;
  5178. begin
  5179. negCount := ( - count ) and 63;
  5180. if ( count = 0 ) then
  5181. begin
  5182. z2 := a2;
  5183. z1 := a1;
  5184. z0 := a0;
  5185. end
  5186. else begin
  5187. if ( count < 64 ) then
  5188. begin
  5189. z2 := a1 shr negCount;
  5190. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5191. z0 := a0 shr count;
  5192. end
  5193. else begin
  5194. if ( count = 64 ) then
  5195. begin
  5196. z2 := a1;
  5197. z1 := a0;
  5198. end
  5199. else begin
  5200. a2 := a2 or a1;
  5201. if ( count < 128 ) then
  5202. begin
  5203. z2 := a0 shl negCount;
  5204. z1 := a0 shr ( count and 63 );
  5205. end
  5206. else begin
  5207. if ( count = 128 ) then
  5208. z2 := a0
  5209. else
  5210. z2 := ord( a0 <> 0 );
  5211. z1 := 0;
  5212. end;
  5213. end;
  5214. z0 := 0;
  5215. end;
  5216. z2 := z2 or ord( a2 <> 0 );
  5217. end;
  5218. z2Ptr := z2;
  5219. z1Ptr := z1;
  5220. z0Ptr := z0;
  5221. end;
  5222. {*----------------------------------------------------------------------------
  5223. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5224. | _plus_ the number of bits given in `count'. The shifted result is at most
  5225. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5226. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5227. | shifted off is the most-significant bit of the extra result, and the other
  5228. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5229. | bits shifted off were all zero. This extra result is stored in the location
  5230. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5231. | (This routine makes more sense if `a0' and `a1' are considered to form
  5232. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5233. | point value is shifted right by the number of bits given in `count', and
  5234. | the integer part of the result is returned at the location pointed to by
  5235. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5236. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5237. *----------------------------------------------------------------------------*}
  5238. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5239. var
  5240. z0, z1: bits64;
  5241. negCount: int8;
  5242. begin
  5243. negCount := ( - count ) and 63;
  5244. if ( count = 0 ) then
  5245. begin
  5246. z1 := a1;
  5247. z0 := a0;
  5248. end
  5249. else if ( count < 64 ) then
  5250. begin
  5251. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5252. z0 := a0 shr count;
  5253. end
  5254. else begin
  5255. if ( count = 64 ) then
  5256. begin
  5257. z1 := a0 or ord( a1 <> 0 );
  5258. end
  5259. else begin
  5260. z1 := ord( ( a0 or a1 ) <> 0 );
  5261. end;
  5262. z0 := 0;
  5263. end;
  5264. z1Ptr := z1;
  5265. z0Ptr := z0;
  5266. end;
  5267. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5268. {*----------------------------------------------------------------------------
  5269. | Returns the fraction bits of the extended double-precision floating-point
  5270. | value `a'.
  5271. *----------------------------------------------------------------------------*}
  5272. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5273. begin
  5274. result:=a.low;
  5275. end;
  5276. {*----------------------------------------------------------------------------
  5277. | Returns the exponent bits of the extended double-precision floating-point
  5278. | value `a'.
  5279. *----------------------------------------------------------------------------*}
  5280. function extractFloatx80Exp(a : floatx80): int32;inline;
  5281. begin
  5282. result:=a.high and $7FFF;
  5283. end;
  5284. {*----------------------------------------------------------------------------
  5285. | Returns the sign bit of the extended double-precision floating-point value
  5286. | `a'.
  5287. *----------------------------------------------------------------------------*}
  5288. function extractFloatx80Sign(a : floatx80): flag;inline;
  5289. begin
  5290. result:=a.high shr 15;
  5291. end;
  5292. {*----------------------------------------------------------------------------
  5293. | Normalizes the subnormal extended double-precision floating-point value
  5294. | represented by the denormalized significand `aSig'. The normalized exponent
  5295. | and significand are stored at the locations pointed to by `zExpPtr' and
  5296. | `zSigPtr', respectively.
  5297. *----------------------------------------------------------------------------*}
  5298. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5299. var
  5300. shiftCount: int8;
  5301. begin
  5302. shiftCount := countLeadingZeros64( aSig );
  5303. zSigPtr := aSig shl shiftCount;
  5304. zExpPtr := 1 - shiftCount;
  5305. end;
  5306. {*----------------------------------------------------------------------------
  5307. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5308. | extended double-precision floating-point value, returning the result.
  5309. *----------------------------------------------------------------------------*}
  5310. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5311. var
  5312. z: floatx80;
  5313. begin
  5314. z.low := zSig;
  5315. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5316. result:=z;
  5317. end;
  5318. {*----------------------------------------------------------------------------
  5319. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5320. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5321. | and returns the proper extended double-precision floating-point value
  5322. | corresponding to the abstract input. Ordinarily, the abstract value is
  5323. | rounded and packed into the extended double-precision format, with the
  5324. | inexact exception raised if the abstract input cannot be represented
  5325. | exactly. However, if the abstract value is too large, the overflow and
  5326. | inexact exceptions are raised and an infinity or maximal finite value is
  5327. | returned. If the abstract value is too small, the input value is rounded to
  5328. | a subnormal number, and the underflow and inexact exceptions are raised if
  5329. | the abstract input cannot be represented exactly as a subnormal extended
  5330. | double-precision floating-point number.
  5331. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5332. | number of bits as single or double precision, respectively. Otherwise, the
  5333. | result is rounded to the full precision of the extended double-precision
  5334. | format.
  5335. | The input significand must be normalized or smaller. If the input
  5336. | significand is not normalized, `zExp' must be 0; in that case, the result
  5337. | returned is a subnormal number, and it must not require rounding. The
  5338. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5339. | Floating-Point Arithmetic.
  5340. *----------------------------------------------------------------------------*}
  5341. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5342. var
  5343. roundingMode: int8;
  5344. roundNearestEven, increment, isTiny: flag;
  5345. roundIncrement, roundMask, roundBits: int64;
  5346. label
  5347. precision80;
  5348. begin
  5349. roundingMode := float_rounding_mode;
  5350. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5351. if ( roundingPrecision = 80 ) then
  5352. goto precision80;
  5353. if ( roundingPrecision = 64 ) then
  5354. begin
  5355. roundIncrement := int64( $0000000000000400 );
  5356. roundMask := int64( $00000000000007FF );
  5357. end
  5358. else if ( roundingPrecision = 32 ) then
  5359. begin
  5360. roundIncrement := int64( $0000008000000000 );
  5361. roundMask := int64( $000000FFFFFFFFFF );
  5362. end
  5363. else begin
  5364. goto precision80;
  5365. end;
  5366. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5367. if ( not (roundNearestEven<>0) ) then
  5368. begin
  5369. if ( roundingMode = float_round_to_zero ) then
  5370. begin
  5371. roundIncrement := 0;
  5372. end
  5373. else begin
  5374. roundIncrement := roundMask;
  5375. if ( zSign<>0 ) then
  5376. begin
  5377. if ( roundingMode = float_round_up ) then
  5378. roundIncrement := 0;
  5379. end
  5380. else begin
  5381. if ( roundingMode = float_round_down ) then
  5382. roundIncrement := 0;
  5383. end;
  5384. end;
  5385. end;
  5386. roundBits := zSig0 and roundMask;
  5387. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5388. if ( ( $7FFE < zExp )
  5389. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5390. ) begin
  5391. goto overflow;
  5392. end;
  5393. if ( zExp <= 0 ) begin
  5394. isTiny =
  5395. ( float_detect_tininess = float_tininess_before_rounding )
  5396. or ( zExp < 0 )
  5397. or ( zSig0 <= zSig0 + roundIncrement );
  5398. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5399. zExp := 0;
  5400. roundBits := zSig0 and roundMask;
  5401. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5402. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5403. zSig0 += roundIncrement;
  5404. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5405. roundIncrement := roundMask + 1;
  5406. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5407. roundMask |= roundIncrement;
  5408. end;
  5409. zSig0 = ~ roundMask;
  5410. result:=packFloatx80( zSign, zExp, zSig0 );
  5411. end;
  5412. end;
  5413. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5414. zSig0 += roundIncrement;
  5415. if ( zSig0 < roundIncrement ) begin
  5416. ++zExp;
  5417. zSig0 := LIT64( $8000000000000000 );
  5418. end;
  5419. roundIncrement := roundMask + 1;
  5420. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5421. roundMask |= roundIncrement;
  5422. end;
  5423. zSig0 = ~ roundMask;
  5424. if ( zSig0 = 0 ) zExp := 0;
  5425. result:=packFloatx80( zSign, zExp, zSig0 );
  5426. precision80:
  5427. increment := ( (sbits64) zSig1 < 0 );
  5428. if ( ! roundNearestEven ) begin
  5429. if ( roundingMode = float_round_to_zero ) begin
  5430. increment := 0;
  5431. end;
  5432. else begin
  5433. if ( zSign ) begin
  5434. increment := ( roundingMode = float_round_down ) and zSig1;
  5435. end;
  5436. else begin
  5437. increment := ( roundingMode = float_round_up ) and zSig1;
  5438. end;
  5439. end;
  5440. end;
  5441. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5442. if ( ( $7FFE < zExp )
  5443. or ( ( zExp = $7FFE )
  5444. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5445. and increment
  5446. )
  5447. ) begin
  5448. roundMask := 0;
  5449. overflow:
  5450. float_raise( float_flag_overflow or float_flag_inexact );
  5451. if ( ( roundingMode = float_round_to_zero )
  5452. or ( zSign and ( roundingMode = float_round_up ) )
  5453. or ( ! zSign and ( roundingMode = float_round_down ) )
  5454. ) begin
  5455. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5456. end;
  5457. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5458. end;
  5459. if ( zExp <= 0 ) begin
  5460. isTiny =
  5461. ( float_detect_tininess = float_tininess_before_rounding )
  5462. or ( zExp < 0 )
  5463. or ! increment
  5464. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5465. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5466. zExp := 0;
  5467. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5468. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5469. if ( roundNearestEven ) begin
  5470. increment := ( (sbits64) zSig1 < 0 );
  5471. end;
  5472. else begin
  5473. if ( zSign ) begin
  5474. increment := ( roundingMode = float_round_down ) and zSig1;
  5475. end;
  5476. else begin
  5477. increment := ( roundingMode = float_round_up ) and zSig1;
  5478. end;
  5479. end;
  5480. if ( increment ) begin
  5481. ++zSig0;
  5482. zSig0 =
  5483. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5484. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5485. end;
  5486. result:=packFloatx80( zSign, zExp, zSig0 );
  5487. end;
  5488. end;
  5489. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5490. if ( increment ) begin
  5491. ++zSig0;
  5492. if ( zSig0 = 0 ) begin
  5493. ++zExp;
  5494. zSig0 := LIT64( $8000000000000000 );
  5495. end;
  5496. else begin
  5497. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5498. end;
  5499. end;
  5500. else begin
  5501. if ( zSig0 = 0 ) zExp := 0;
  5502. end;
  5503. result:=packFloatx80( zSign, zExp, zSig0 );
  5504. end;
  5505. {*----------------------------------------------------------------------------
  5506. | Takes an abstract floating-point value having sign `zSign', exponent
  5507. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5508. | and returns the proper extended double-precision floating-point value
  5509. | corresponding to the abstract input. This routine is just like
  5510. | `roundAndPackFloatx80' except that the input significand does not have to be
  5511. | normalized.
  5512. *----------------------------------------------------------------------------*}
  5513. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5514. var
  5515. shiftCount: int8;
  5516. begin
  5517. if ( zSig0 = 0 ) begin
  5518. zSig0 := zSig1;
  5519. zSig1 := 0;
  5520. zExp -= 64;
  5521. end;
  5522. shiftCount := countLeadingZeros64( zSig0 );
  5523. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5524. zExp := eExp - shiftCount;
  5525. return
  5526. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5527. end;
  5528. {*----------------------------------------------------------------------------
  5529. | Returns the result of converting the extended double-precision floating-
  5530. | point value `a' to the 32-bit two's complement integer format. The
  5531. | conversion is performed according to the IEC/IEEE Standard for Binary
  5532. | Floating-Point Arithmetic---which means in particular that the conversion
  5533. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5534. | largest positive integer is returned. Otherwise, if the conversion
  5535. | overflows, the largest integer with the same sign as `a' is returned.
  5536. *----------------------------------------------------------------------------*}
  5537. function floatx80_to_int32(a: floatx80): int32;
  5538. var
  5539. aSign: flag;
  5540. aExp, shiftCount: int32;
  5541. aSig: bits64;
  5542. begin
  5543. aSig := extractFloatx80Frac( a );
  5544. aExp := extractFloatx80Exp( a );
  5545. aSign := extractFloatx80Sign( a );
  5546. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5547. shiftCount := $4037 - aExp;
  5548. if ( shiftCount <= 0 ) shiftCount := 1;
  5549. shift64RightJamming( aSig, shiftCount, aSig );
  5550. result := roundAndPackInt32( aSign, aSig );
  5551. end;
  5552. {*----------------------------------------------------------------------------
  5553. | Returns the result of converting the extended double-precision floating-
  5554. | point value `a' to the 32-bit two's complement integer format. The
  5555. | conversion is performed according to the IEC/IEEE Standard for Binary
  5556. | Floating-Point Arithmetic, except that the conversion is always rounded
  5557. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5558. | Otherwise, if the conversion overflows, the largest integer with the same
  5559. | sign as `a' is returned.
  5560. *----------------------------------------------------------------------------*}
  5561. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5562. var
  5563. aSign: flag;
  5564. aExp, shiftCount: int32;
  5565. aSig, savedASig: bits64;
  5566. z: int32;
  5567. begin
  5568. aSig := extractFloatx80Frac( a );
  5569. aExp := extractFloatx80Exp( a );
  5570. aSign := extractFloatx80Sign( a );
  5571. if ( $401E < aExp ) begin
  5572. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5573. goto invalid;
  5574. end;
  5575. else if ( aExp < $3FFF ) begin
  5576. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5577. result := 0;
  5578. end;
  5579. shiftCount := $403E - aExp;
  5580. savedASig := aSig;
  5581. aSig >>= shiftCount;
  5582. z := aSig;
  5583. if ( aSign ) z := - z;
  5584. if ( ( z < 0 ) xor aSign ) begin
  5585. invalid:
  5586. float_raise( float_flag_invalid );
  5587. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5588. end;
  5589. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5590. softfloat_exception_flags or= float_flag_inexact;
  5591. end;
  5592. result := z;
  5593. end;
  5594. {*----------------------------------------------------------------------------
  5595. | Returns the result of converting the extended double-precision floating-
  5596. | point value `a' to the 64-bit two's complement integer format. The
  5597. | conversion is performed according to the IEC/IEEE Standard for Binary
  5598. | Floating-Point Arithmetic---which means in particular that the conversion
  5599. | is rounded according to the current rounding mode. If `a' is a NaN,
  5600. | the largest positive integer is returned. Otherwise, if the conversion
  5601. | overflows, the largest integer with the same sign as `a' is returned.
  5602. *----------------------------------------------------------------------------*}
  5603. function floatx80_to_int64(a: floatx80): int64;
  5604. var
  5605. aSign: flag;
  5606. aExp, shiftCount: int32;
  5607. aSig, aSigExtra: bits64;
  5608. begin
  5609. aSig := extractFloatx80Frac( a );
  5610. aExp := extractFloatx80Exp( a );
  5611. aSign := extractFloatx80Sign( a );
  5612. shiftCount := $403E - aExp;
  5613. if ( shiftCount <= 0 ) begin
  5614. if ( shiftCount ) begin
  5615. float_raise( float_flag_invalid );
  5616. if ( ! aSign
  5617. or ( ( aExp = $7FFF )
  5618. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5619. ) begin
  5620. result := LIT64( $7FFFFFFFFFFFFFFF );
  5621. end;
  5622. result := (sbits64) LIT64( $8000000000000000 );
  5623. end;
  5624. aSigExtra := 0;
  5625. end;
  5626. else begin
  5627. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5628. end;
  5629. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5630. end;
  5631. {*----------------------------------------------------------------------------
  5632. | Returns the result of converting the extended double-precision floating-
  5633. | point value `a' to the 64-bit two's complement integer format. The
  5634. | conversion is performed according to the IEC/IEEE Standard for Binary
  5635. | Floating-Point Arithmetic, except that the conversion is always rounded
  5636. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5637. | Otherwise, if the conversion overflows, the largest integer with the same
  5638. | sign as `a' is returned.
  5639. *----------------------------------------------------------------------------*}
  5640. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5641. var
  5642. aSign: flag;
  5643. aExp, shiftCount: int32;
  5644. aSig: bits64;
  5645. z: int64;
  5646. begin
  5647. aSig := extractFloatx80Frac( a );
  5648. aExp := extractFloatx80Exp( a );
  5649. aSign := extractFloatx80Sign( a );
  5650. shiftCount := aExp - $403E;
  5651. if ( 0 <= shiftCount ) begin
  5652. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5653. if ( ( a.high <> $C03E ) or aSig ) begin
  5654. float_raise( float_flag_invalid );
  5655. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5656. result := LIT64( $7FFFFFFFFFFFFFFF );
  5657. end;
  5658. end;
  5659. result := (sbits64) LIT64( $8000000000000000 );
  5660. end;
  5661. else if ( aExp < $3FFF ) begin
  5662. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5663. result := 0;
  5664. end;
  5665. z := aSig>>( - shiftCount );
  5666. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5667. softfloat_exception_flags or= float_flag_inexact;
  5668. end;
  5669. if ( aSign ) z := - z;
  5670. result := z;
  5671. end;
  5672. {*----------------------------------------------------------------------------
  5673. | Returns the result of converting the extended double-precision floating-
  5674. | point value `a' to the single-precision floating-point format. The
  5675. | conversion is performed according to the IEC/IEEE Standard for Binary
  5676. | Floating-Point Arithmetic.
  5677. *----------------------------------------------------------------------------*}
  5678. function floatx80_to_float32(a: floatx80): float32;
  5679. var
  5680. aSign: flag;
  5681. aExp: int32;
  5682. aSig: bits64;
  5683. begin
  5684. aSig := extractFloatx80Frac( a );
  5685. aExp := extractFloatx80Exp( a );
  5686. aSign := extractFloatx80Sign( a );
  5687. if ( aExp = $7FFF ) begin
  5688. if ( (bits64) ( aSig shl 1 ) ) begin
  5689. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5690. end;
  5691. result := packFloat32( aSign, $FF, 0 );
  5692. end;
  5693. shift64RightJamming( aSig, 33, aSig );
  5694. if ( aExp or aSig ) aExp -= $3F81;
  5695. result := roundAndPackFloat32( aSign, aExp, aSig );
  5696. end;
  5697. {*----------------------------------------------------------------------------
  5698. | Returns the result of converting the extended double-precision floating-
  5699. | point value `a' to the double-precision floating-point format. The
  5700. | conversion is performed according to the IEC/IEEE Standard for Binary
  5701. | Floating-Point Arithmetic.
  5702. *----------------------------------------------------------------------------*}
  5703. function floatx80_to_float64(a: floatx80): float64;
  5704. var
  5705. aSign: flag;
  5706. aExp: int32;
  5707. aSig, zSig: bits64;
  5708. begin
  5709. aSig := extractFloatx80Frac( a );
  5710. aExp := extractFloatx80Exp( a );
  5711. aSign := extractFloatx80Sign( a );
  5712. if ( aExp = $7FFF ) begin
  5713. if ( (bits64) ( aSig shl 1 ) ) begin
  5714. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5715. end;
  5716. result := packFloat64( aSign, $7FF, 0 );
  5717. end;
  5718. shift64RightJamming( aSig, 1, zSig );
  5719. if ( aExp or aSig ) aExp -= $3C01;
  5720. result := roundAndPackFloat64( aSign, aExp, zSig );
  5721. end;
  5722. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5723. {*----------------------------------------------------------------------------
  5724. | Returns the result of converting the extended double-precision floating-
  5725. | point value `a' to the quadruple-precision floating-point format. The
  5726. | conversion is performed according to the IEC/IEEE Standard for Binary
  5727. | Floating-Point Arithmetic.
  5728. *----------------------------------------------------------------------------*}
  5729. function floatx80_to_float128(a: floatx80): float128;
  5730. var
  5731. aSign: flag;
  5732. aExp: int16;
  5733. aSig, zSig0, zSig1: bits64;
  5734. begin
  5735. aSig := extractFloatx80Frac( a );
  5736. aExp := extractFloatx80Exp( a );
  5737. aSign := extractFloatx80Sign( a );
  5738. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5739. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5740. end;
  5741. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5742. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5743. end;
  5744. {$endif FPC_SOFTFLOAT_FLOAT128}
  5745. {*----------------------------------------------------------------------------
  5746. | Rounds the extended double-precision floating-point value `a' to an integer,
  5747. | and Returns the result as an extended quadruple-precision floating-point
  5748. | value. The operation is performed according to the IEC/IEEE Standard for
  5749. | Binary Floating-Point Arithmetic.
  5750. *----------------------------------------------------------------------------*}
  5751. function floatx80_round_to_int(a: floatx80): floatx80;
  5752. var
  5753. aSign: flag;
  5754. aExp: int32;
  5755. lastBitMask, roundBitsMask: bits64;
  5756. roundingMode: int8;
  5757. z: floatx80;
  5758. begin
  5759. aExp := extractFloatx80Exp( a );
  5760. if ( $403E <= aExp ) begin
  5761. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5762. result := propagateFloatx80NaN( a, a );
  5763. end;
  5764. result := a;
  5765. end;
  5766. if ( aExp < $3FFF ) begin
  5767. if ( ( aExp = 0 )
  5768. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5769. result := a;
  5770. end;
  5771. softfloat_exception_flags or= float_flag_inexact;
  5772. aSign := extractFloatx80Sign( a );
  5773. switch ( float_rounding_mode ) begin
  5774. case float_round_nearest_even:
  5775. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5776. ) begin
  5777. result :=
  5778. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5779. end;
  5780. break;
  5781. case float_round_down:
  5782. result :=
  5783. aSign ?
  5784. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5785. : packFloatx80( 0, 0, 0 );
  5786. case float_round_up:
  5787. result :=
  5788. aSign ? packFloatx80( 1, 0, 0 )
  5789. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5790. end;
  5791. result := packFloatx80( aSign, 0, 0 );
  5792. end;
  5793. lastBitMask := 1;
  5794. lastBitMask shl = $403E - aExp;
  5795. roundBitsMask := lastBitMask - 1;
  5796. z := a;
  5797. roundingMode := float_rounding_mode;
  5798. if ( roundingMode = float_round_nearest_even ) begin
  5799. z.low += lastBitMask>>1;
  5800. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5801. end;
  5802. else if ( roundingMode <> float_round_to_zero ) begin
  5803. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5804. z.low += roundBitsMask;
  5805. end;
  5806. end;
  5807. z.low = ~ roundBitsMask;
  5808. if ( z.low = 0 ) begin
  5809. ++z.high;
  5810. z.low := LIT64( $8000000000000000 );
  5811. end;
  5812. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5813. result := z;
  5814. end;
  5815. {*----------------------------------------------------------------------------
  5816. | Returns the result of adding the absolute values of the extended double-
  5817. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5818. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5819. | The addition is performed according to the IEC/IEEE Standard for Binary
  5820. | Floating-Point Arithmetic.
  5821. *----------------------------------------------------------------------------*}
  5822. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5823. var
  5824. aExp, bExp, zExp: int32;
  5825. aSig, bSig, zSig0, zSig1: bits64;
  5826. expDiff: int32;
  5827. begin
  5828. aSig := extractFloatx80Frac( a );
  5829. aExp := extractFloatx80Exp( a );
  5830. bSig := extractFloatx80Frac( b );
  5831. bExp := extractFloatx80Exp( b );
  5832. expDiff := aExp - bExp;
  5833. if ( 0 < expDiff ) begin
  5834. if ( aExp = $7FFF ) begin
  5835. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5836. result := a;
  5837. end;
  5838. if ( bExp = 0 ) --expDiff;
  5839. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5840. zExp := aExp;
  5841. end;
  5842. else if ( expDiff < 0 ) begin
  5843. if ( bExp = $7FFF ) begin
  5844. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5845. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5846. end;
  5847. if ( aExp = 0 ) ++expDiff;
  5848. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5849. zExp := bExp;
  5850. end;
  5851. else begin
  5852. if ( aExp = $7FFF ) begin
  5853. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5854. result := propagateFloatx80NaN( a, b );
  5855. end;
  5856. result := a;
  5857. end;
  5858. zSig1 := 0;
  5859. zSig0 := aSig + bSig;
  5860. if ( aExp = 0 ) begin
  5861. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5862. goto roundAndPack;
  5863. end;
  5864. zExp := aExp;
  5865. goto shiftRight1;
  5866. end;
  5867. zSig0 := aSig + bSig;
  5868. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5869. shiftRight1:
  5870. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5871. zSig0 or= LIT64( $8000000000000000 );
  5872. ++zExp;
  5873. roundAndPack:
  5874. result :=
  5875. roundAndPackFloatx80(
  5876. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5877. end;
  5878. {*----------------------------------------------------------------------------
  5879. | Returns the result of subtracting the absolute values of the extended
  5880. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5881. | difference is negated before being returned. `zSign' is ignored if the
  5882. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5883. | Standard for Binary Floating-Point Arithmetic.
  5884. *----------------------------------------------------------------------------*}
  5885. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5886. var
  5887. aExp, bExp, zExp: int32;
  5888. aSig, bSig, zSig0, zSig1: bits64;
  5889. expDiff: int32;
  5890. z: floatx80;
  5891. begin
  5892. aSig := extractFloatx80Frac( a );
  5893. aExp := extractFloatx80Exp( a );
  5894. bSig := extractFloatx80Frac( b );
  5895. bExp := extractFloatx80Exp( b );
  5896. expDiff := aExp - bExp;
  5897. if ( 0 < expDiff ) goto aExpBigger;
  5898. if ( expDiff < 0 ) goto bExpBigger;
  5899. if ( aExp = $7FFF ) begin
  5900. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5901. result := propagateFloatx80NaN( a, b );
  5902. end;
  5903. float_raise( float_flag_invalid );
  5904. z.low := floatx80_default_nan_low;
  5905. z.high := floatx80_default_nan_high;
  5906. result := z;
  5907. end;
  5908. if ( aExp = 0 ) begin
  5909. aExp := 1;
  5910. bExp := 1;
  5911. end;
  5912. zSig1 := 0;
  5913. if ( bSig < aSig ) goto aBigger;
  5914. if ( aSig < bSig ) goto bBigger;
  5915. result := packFloatx80( float_rounding_mode = float_round_down, 0, 0 );
  5916. bExpBigger:
  5917. if ( bExp = $7FFF ) begin
  5918. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5919. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5920. end;
  5921. if ( aExp = 0 ) ++expDiff;
  5922. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5923. bBigger:
  5924. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  5925. zExp := bExp;
  5926. zSign xor = 1;
  5927. goto normalizeRoundAndPack;
  5928. aExpBigger:
  5929. if ( aExp = $7FFF ) begin
  5930. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5931. result := a;
  5932. end;
  5933. if ( bExp = 0 ) --expDiff;
  5934. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5935. aBigger:
  5936. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  5937. zExp := aExp;
  5938. normalizeRoundAndPack:
  5939. result :=
  5940. normalizeRoundAndPackFloatx80(
  5941. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5942. end;
  5943. {*----------------------------------------------------------------------------
  5944. | Returns the result of adding the extended double-precision floating-point
  5945. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  5946. | Standard for Binary Floating-Point Arithmetic.
  5947. *----------------------------------------------------------------------------*}
  5948. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  5949. var
  5950. aSign, bSign: flag;
  5951. begin
  5952. aSign := extractFloatx80Sign( a );
  5953. bSign := extractFloatx80Sign( b );
  5954. if ( aSign = bSign ) begin
  5955. result := addFloatx80Sigs( a, b, aSign );
  5956. end;
  5957. else begin
  5958. result := subFloatx80Sigs( a, b, aSign );
  5959. end;
  5960. end;
  5961. {*----------------------------------------------------------------------------
  5962. | Returns the result of subtracting the extended double-precision floating-
  5963. | point values `a' and `b'. The operation is performed according to the
  5964. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5965. *----------------------------------------------------------------------------*}
  5966. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  5967. var
  5968. aSign, bSign: flag;
  5969. begin
  5970. aSign := extractFloatx80Sign( a );
  5971. bSign := extractFloatx80Sign( b );
  5972. if ( aSign = bSign ) begin
  5973. result := subFloatx80Sigs( a, b, aSign );
  5974. end;
  5975. else begin
  5976. result := addFloatx80Sigs( a, b, aSign );
  5977. end;
  5978. end;
  5979. {*----------------------------------------------------------------------------
  5980. | Returns the result of multiplying the extended double-precision floating-
  5981. | point values `a' and `b'. The operation is performed according to the
  5982. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5983. *----------------------------------------------------------------------------*}
  5984. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  5985. var
  5986. aSign, bSign, zSign: flag;
  5987. aExp, bExp, zExp: int32;
  5988. aSig, bSig, zSig0, zSig1: bits64;
  5989. z: floatx80;
  5990. begin
  5991. aSig := extractFloatx80Frac( a );
  5992. aExp := extractFloatx80Exp( a );
  5993. aSign := extractFloatx80Sign( a );
  5994. bSig := extractFloatx80Frac( b );
  5995. bExp := extractFloatx80Exp( b );
  5996. bSign := extractFloatx80Sign( b );
  5997. zSign := aSign xor bSign;
  5998. if ( aExp = $7FFF ) begin
  5999. if ( (bits64) ( aSig shl 1 )
  6000. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6001. result := propagateFloatx80NaN( a, b );
  6002. end;
  6003. if ( ( bExp or bSig ) = 0 ) goto invalid;
  6004. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6005. end;
  6006. if ( bExp = $7FFF ) begin
  6007. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6008. if ( ( aExp or aSig ) = 0 ) begin
  6009. invalid:
  6010. float_raise( float_flag_invalid );
  6011. z.low := floatx80_default_nan_low;
  6012. z.high := floatx80_default_nan_high;
  6013. result := z;
  6014. end;
  6015. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6016. end;
  6017. if ( aExp = 0 ) begin
  6018. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6019. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6020. end;
  6021. if ( bExp = 0 ) begin
  6022. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6023. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6024. end;
  6025. zExp := aExp + bExp - $3FFE;
  6026. mul64To128( aSig, bSig, zSig0, zSig1 );
  6027. if ( 0 < (sbits64) zSig0 ) begin
  6028. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6029. --zExp;
  6030. end;
  6031. result :=
  6032. roundAndPackFloatx80(
  6033. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6034. end;
  6035. {*----------------------------------------------------------------------------
  6036. | Returns the result of dividing the extended double-precision floating-point
  6037. | value `a' by the corresponding value `b'. The operation is performed
  6038. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6039. *----------------------------------------------------------------------------*}
  6040. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6041. var
  6042. aSign, bSign, zSign: flag;
  6043. aExp, bExp, zExp: int32;
  6044. aSig, bSig, zSig0, zSig1: bits64;
  6045. rem0, rem1, rem2, term0, term1, term2: bits64;
  6046. z: floatx80;
  6047. begin
  6048. aSig := extractFloatx80Frac( a );
  6049. aExp := extractFloatx80Exp( a );
  6050. aSign := extractFloatx80Sign( a );
  6051. bSig := extractFloatx80Frac( b );
  6052. bExp := extractFloatx80Exp( b );
  6053. bSign := extractFloatx80Sign( b );
  6054. zSign := aSign xor bSign;
  6055. if ( aExp = $7FFF ) begin
  6056. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6057. if ( bExp = $7FFF ) begin
  6058. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6059. goto invalid;
  6060. end;
  6061. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6062. end;
  6063. if ( bExp = $7FFF ) begin
  6064. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6065. result := packFloatx80( zSign, 0, 0 );
  6066. end;
  6067. if ( bExp = 0 ) begin
  6068. if ( bSig = 0 ) begin
  6069. if ( ( aExp or aSig ) = 0 ) begin
  6070. invalid:
  6071. float_raise( float_flag_invalid );
  6072. z.low := floatx80_default_nan_low;
  6073. z.high := floatx80_default_nan_high;
  6074. result := z;
  6075. end;
  6076. float_raise( float_flag_divbyzero );
  6077. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6078. end;
  6079. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6080. end;
  6081. if ( aExp = 0 ) begin
  6082. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6083. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6084. end;
  6085. zExp := aExp - bExp + $3FFE;
  6086. rem1 := 0;
  6087. if ( bSig <= aSig ) begin
  6088. shift128Right( aSig, 0, 1, aSig, rem1 );
  6089. ++zExp;
  6090. end;
  6091. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6092. mul64To128( bSig, zSig0, term0, term1 );
  6093. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6094. while ( (sbits64) rem0 < 0 ) begin
  6095. --zSig0;
  6096. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6097. end;
  6098. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6099. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6100. mul64To128( bSig, zSig1, term1, term2 );
  6101. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6102. while ( (sbits64) rem1 < 0 ) begin
  6103. --zSig1;
  6104. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6105. end;
  6106. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6107. end;
  6108. result :=
  6109. roundAndPackFloatx80(
  6110. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6111. end;
  6112. {*----------------------------------------------------------------------------
  6113. | Returns the remainder of the extended double-precision floating-point value
  6114. | `a' with respect to the corresponding value `b'. The operation is performed
  6115. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6116. *----------------------------------------------------------------------------*}
  6117. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6118. var
  6119. aSign, bSign, zSign: flag;
  6120. aExp, bExp, expDiff: int32;
  6121. aSig0, aSig1, bSig: bits64;
  6122. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6123. z: floatx80;
  6124. begin
  6125. aSig0 := extractFloatx80Frac( a );
  6126. aExp := extractFloatx80Exp( a );
  6127. aSign := extractFloatx80Sign( a );
  6128. bSig := extractFloatx80Frac( b );
  6129. bExp := extractFloatx80Exp( b );
  6130. bSign := extractFloatx80Sign( b );
  6131. if ( aExp = $7FFF ) begin
  6132. if ( (bits64) ( aSig0 shl 1 )
  6133. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6134. result := propagateFloatx80NaN( a, b );
  6135. end;
  6136. goto invalid;
  6137. end;
  6138. if ( bExp = $7FFF ) begin
  6139. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6140. result := a;
  6141. end;
  6142. if ( bExp = 0 ) begin
  6143. if ( bSig = 0 ) begin
  6144. invalid:
  6145. float_raise( float_flag_invalid );
  6146. z.low := floatx80_default_nan_low;
  6147. z.high := floatx80_default_nan_high;
  6148. result := z;
  6149. end;
  6150. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6151. end;
  6152. if ( aExp = 0 ) begin
  6153. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6154. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6155. end;
  6156. bSig or= LIT64( $8000000000000000 );
  6157. zSign := aSign;
  6158. expDiff := aExp - bExp;
  6159. aSig1 := 0;
  6160. if ( expDiff < 0 ) begin
  6161. if ( expDiff < -1 ) result := a;
  6162. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6163. expDiff := 0;
  6164. end;
  6165. q := ( bSig <= aSig0 );
  6166. if ( q ) aSig0 -= bSig;
  6167. expDiff -= 64;
  6168. while ( 0 < expDiff ) begin
  6169. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6170. q := ( 2 < q ) ? q - 2 : 0;
  6171. mul64To128( bSig, q, term0, term1 );
  6172. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6173. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6174. expDiff -= 62;
  6175. end;
  6176. expDiff += 64;
  6177. if ( 0 < expDiff ) begin
  6178. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6179. q := ( 2 < q ) ? q - 2 : 0;
  6180. q >>= 64 - expDiff;
  6181. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6182. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6183. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6184. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6185. ++q;
  6186. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6187. end;
  6188. end;
  6189. else begin
  6190. term1 := 0;
  6191. term0 := bSig;
  6192. end;
  6193. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6194. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6195. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6196. and ( q and 1 ) )
  6197. ) begin
  6198. aSig0 := alternateASig0;
  6199. aSig1 := alternateASig1;
  6200. zSign := ! zSign;
  6201. end;
  6202. result :=
  6203. normalizeRoundAndPackFloatx80(
  6204. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6205. end;
  6206. {*----------------------------------------------------------------------------
  6207. | Returns the square root of the extended double-precision floating-point
  6208. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6209. | for Binary Floating-Point Arithmetic.
  6210. *----------------------------------------------------------------------------*}
  6211. function floatx80_sqrt(a: floatx80): floatx80;
  6212. var
  6213. aSign: flag;
  6214. aExp, zExp: int32;
  6215. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6216. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6217. z: floatx80;
  6218. label
  6219. invalid;
  6220. begin
  6221. aSig0 := extractFloatx80Frac( a );
  6222. aExp := extractFloatx80Exp( a );
  6223. aSign := extractFloatx80Sign( a );
  6224. if ( aExp = $7FFF ) begin
  6225. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6226. if ( ! aSign ) result := a;
  6227. goto invalid;
  6228. end;
  6229. if ( aSign ) begin
  6230. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6231. invalid:
  6232. float_raise( float_flag_invalid );
  6233. z.low := floatx80_default_nan_low;
  6234. z.high := floatx80_default_nan_high;
  6235. result := z;
  6236. end;
  6237. if ( aExp = 0 ) begin
  6238. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6239. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6240. end;
  6241. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6242. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6243. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6244. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6245. doubleZSig0 := zSig0 shl 1;
  6246. mul64To128( zSig0, zSig0, term0, term1 );
  6247. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6248. while ( (sbits64) rem0 < 0 ) begin
  6249. --zSig0;
  6250. doubleZSig0 -= 2;
  6251. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6252. end;
  6253. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6254. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6255. if ( zSig1 = 0 ) zSig1 := 1;
  6256. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6257. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6258. mul64To128( zSig1, zSig1, term2, term3 );
  6259. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6260. while ( (sbits64) rem1 < 0 ) begin
  6261. --zSig1;
  6262. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6263. term3 or= 1;
  6264. term2 or= doubleZSig0;
  6265. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6266. end;
  6267. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6268. end;
  6269. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6270. zSig0 or= doubleZSig0;
  6271. result :=
  6272. roundAndPackFloatx80(
  6273. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6274. end;
  6275. {*----------------------------------------------------------------------------
  6276. | Returns 1 if the extended double-precision floating-point value `a' is
  6277. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6278. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6279. | Arithmetic.
  6280. *----------------------------------------------------------------------------*}
  6281. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6282. begin
  6283. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6284. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6285. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6286. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6287. ) begin
  6288. if ( floatx80_is_signaling_nan( a )
  6289. or floatx80_is_signaling_nan( b ) ) begin
  6290. float_raise( float_flag_invalid );
  6291. end;
  6292. result := 0;
  6293. end;
  6294. result :=
  6295. ( a.low = b.low )
  6296. and ( ( a.high = b.high )
  6297. or ( ( a.low = 0 )
  6298. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6299. );
  6300. end;
  6301. {*----------------------------------------------------------------------------
  6302. | Returns 1 if the extended double-precision floating-point value `a' is
  6303. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6304. | comparison is performed according to the IEC/IEEE Standard for Binary
  6305. | Floating-Point Arithmetic.
  6306. *----------------------------------------------------------------------------*}
  6307. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6308. var
  6309. aSign, bSign: flag;
  6310. begin
  6311. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6312. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6313. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6314. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6315. ) begin
  6316. float_raise( float_flag_invalid );
  6317. result := 0;
  6318. end;
  6319. aSign := extractFloatx80Sign( a );
  6320. bSign := extractFloatx80Sign( b );
  6321. if ( aSign <> bSign ) begin
  6322. result :=
  6323. aSign
  6324. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6325. = 0 );
  6326. end;
  6327. result :=
  6328. aSign ? le128( b.high, b.low, a.high, a.low )
  6329. : le128( a.high, a.low, b.high, b.low );
  6330. end;
  6331. {*----------------------------------------------------------------------------
  6332. | Returns 1 if the extended double-precision floating-point value `a' is
  6333. | less than the corresponding value `b', and 0 otherwise. The comparison
  6334. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6335. | Arithmetic.
  6336. *----------------------------------------------------------------------------*}
  6337. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6338. var
  6339. aSign, bSign: flag;
  6340. begin
  6341. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6342. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6343. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6344. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6345. ) begin
  6346. float_raise( float_flag_invalid );
  6347. result := 0;
  6348. end;
  6349. aSign := extractFloatx80Sign( a );
  6350. bSign := extractFloatx80Sign( b );
  6351. if ( aSign <> bSign ) begin
  6352. result :=
  6353. aSign
  6354. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6355. <> 0 );
  6356. end;
  6357. result :=
  6358. aSign ? lt128( b.high, b.low, a.high, a.low )
  6359. : lt128( a.high, a.low, b.high, b.low );
  6360. end;
  6361. {*----------------------------------------------------------------------------
  6362. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6363. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6364. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6365. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6366. *----------------------------------------------------------------------------*}
  6367. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6368. begin
  6369. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6370. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6371. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6372. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6373. ) begin
  6374. float_raise( float_flag_invalid );
  6375. result := 0;
  6376. end;
  6377. result :=
  6378. ( a.low = b.low )
  6379. and ( ( a.high = b.high )
  6380. or ( ( a.low = 0 )
  6381. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6382. );
  6383. end;
  6384. {*----------------------------------------------------------------------------
  6385. | Returns 1 if the extended double-precision floating-point value `a' is less
  6386. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6387. | do not cause an exception. Otherwise, the comparison is performed according
  6388. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6389. *----------------------------------------------------------------------------*}
  6390. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6391. var
  6392. aSign, bSign: flag;
  6393. begin
  6394. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6395. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6396. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6397. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6398. ) begin
  6399. if ( floatx80_is_signaling_nan( a )
  6400. or floatx80_is_signaling_nan( b ) ) begin
  6401. float_raise( float_flag_invalid );
  6402. end;
  6403. result := 0;
  6404. end;
  6405. aSign := extractFloatx80Sign( a );
  6406. bSign := extractFloatx80Sign( b );
  6407. if ( aSign <> bSign ) begin
  6408. result :=
  6409. aSign
  6410. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6411. = 0 );
  6412. end;
  6413. result :=
  6414. aSign ? le128( b.high, b.low, a.high, a.low )
  6415. : le128( a.high, a.low, b.high, b.low );
  6416. end;
  6417. {*----------------------------------------------------------------------------
  6418. | Returns 1 if the extended double-precision floating-point value `a' is less
  6419. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6420. | an exception. Otherwise, the comparison is performed according to the
  6421. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6422. *----------------------------------------------------------------------------*}
  6423. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6424. var
  6425. aSign, bSign: flag;
  6426. begin
  6427. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6428. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6429. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6430. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6431. ) begin
  6432. if ( floatx80_is_signaling_nan( a )
  6433. or floatx80_is_signaling_nan( b ) ) begin
  6434. float_raise( float_flag_invalid );
  6435. end;
  6436. result := 0;
  6437. end;
  6438. aSign := extractFloatx80Sign( a );
  6439. bSign := extractFloatx80Sign( b );
  6440. if ( aSign <> bSign ) begin
  6441. result :=
  6442. aSign
  6443. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6444. <> 0 );
  6445. end;
  6446. result :=
  6447. aSign ? lt128( b.high, b.low, a.high, a.low )
  6448. : lt128( a.high, a.low, b.high, b.low );
  6449. end;
  6450. {$endif FPC_SOFTFLOAT_FLOATX80}
  6451. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6452. {*----------------------------------------------------------------------------
  6453. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6454. | floating-point value `a'.
  6455. *----------------------------------------------------------------------------*}
  6456. function extractFloat128Frac1(a : float128): bits64;
  6457. begin
  6458. result:=a.low;
  6459. end;
  6460. {*----------------------------------------------------------------------------
  6461. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6462. | floating-point value `a'.
  6463. *----------------------------------------------------------------------------*}
  6464. function extractFloat128Frac0(a : float128): bits64;
  6465. begin
  6466. result:=a.high and int64($0000FFFFFFFFFFFF);
  6467. end;
  6468. {*----------------------------------------------------------------------------
  6469. | Returns the exponent bits of the quadruple-precision floating-point value
  6470. | `a'.
  6471. *----------------------------------------------------------------------------*}
  6472. function extractFloat128Exp(a : float128): int32;
  6473. begin
  6474. result:=( a.high shr 48 ) and $7FFF;
  6475. end;
  6476. {*----------------------------------------------------------------------------
  6477. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6478. *----------------------------------------------------------------------------*}
  6479. function extractFloat128Sign(a : float128): flag;
  6480. begin
  6481. result:=a.high shr 63;
  6482. end;
  6483. {*----------------------------------------------------------------------------
  6484. | Normalizes the subnormal quadruple-precision floating-point value
  6485. | represented by the denormalized significand formed by the concatenation of
  6486. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6487. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6488. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6489. | least significant 64 bits of the normalized significand are stored at the
  6490. | location pointed to by `zSig1Ptr'.
  6491. *----------------------------------------------------------------------------*}
  6492. procedure normalizeFloat128Subnormal(
  6493. aSig0: bits64;
  6494. aSig1: bits64;
  6495. var zExpPtr: int32;
  6496. var zSig0Ptr: bits64;
  6497. var zSig1Ptr: bits64);
  6498. var
  6499. shiftCount: int8;
  6500. begin
  6501. if ( aSig0 = 0 ) then
  6502. begin
  6503. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6504. if ( shiftCount < 0 ) then
  6505. begin
  6506. zSig0Ptr := aSig1 shr ( - shiftCount );
  6507. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6508. end
  6509. else begin
  6510. zSig0Ptr := aSig1 shl shiftCount;
  6511. zSig1Ptr := 0;
  6512. end;
  6513. zExpPtr := - shiftCount - 63;
  6514. end
  6515. else begin
  6516. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6517. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6518. zExpPtr := 1 - shiftCount;
  6519. end;
  6520. end;
  6521. {*----------------------------------------------------------------------------
  6522. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6523. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6524. | floating-point value, returning the result. After being shifted into the
  6525. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6526. | added together to form the most significant 32 bits of the result. This
  6527. | means that any integer portion of `zSig0' will be added into the exponent.
  6528. | Since a properly normalized significand will have an integer portion equal
  6529. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6530. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6531. | significand.
  6532. *----------------------------------------------------------------------------*}
  6533. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6534. var
  6535. z: float128;
  6536. begin
  6537. z.low := zSig1;
  6538. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6539. result:=z;
  6540. end;
  6541. {*----------------------------------------------------------------------------
  6542. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6543. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6544. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6545. | corresponding to the abstract input. Ordinarily, the abstract value is
  6546. | simply rounded and packed into the quadruple-precision format, with the
  6547. | inexact exception raised if the abstract input cannot be represented
  6548. | exactly. However, if the abstract value is too large, the overflow and
  6549. | inexact exceptions are raised and an infinity or maximal finite value is
  6550. | returned. If the abstract value is too small, the input value is rounded to
  6551. | a subnormal number, and the underflow and inexact exceptions are raised if
  6552. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6553. | precision floating-point number.
  6554. | The input significand must be normalized or smaller. If the input
  6555. | significand is not normalized, `zExp' must be 0; in that case, the result
  6556. | returned is a subnormal number, and it must not require rounding. In the
  6557. | usual case that the input significand is normalized, `zExp' must be 1 less
  6558. | than the ``true'' floating-point exponent. The handling of underflow and
  6559. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6560. *----------------------------------------------------------------------------*}
  6561. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6562. var
  6563. roundingMode: int8;
  6564. roundNearestEven, increment, isTiny: flag;
  6565. begin
  6566. roundingMode := float_rounding_mode;
  6567. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6568. increment := ord( sbits64(zSig2) < 0 );
  6569. if ( roundNearestEven=0 ) then
  6570. begin
  6571. if ( roundingMode = float_round_to_zero ) then
  6572. begin
  6573. increment := 0;
  6574. end
  6575. else begin
  6576. if ( zSign<>0 ) then
  6577. begin
  6578. increment := ord( roundingMode = float_round_down ) and zSig2;
  6579. end
  6580. else begin
  6581. increment := ord( roundingMode = float_round_up ) and zSig2;
  6582. end;
  6583. end;
  6584. end;
  6585. if ( $7FFD <= bits32(zExp) ) then
  6586. begin
  6587. if ( ord( $7FFD < zExp )
  6588. or ( ord( zExp = $7FFD )
  6589. and eq128(
  6590. int64( $0001FFFFFFFFFFFF ),
  6591. int64( $FFFFFFFFFFFFFFFF ),
  6592. zSig0,
  6593. zSig1
  6594. )
  6595. and increment
  6596. )
  6597. )<>0 then
  6598. begin
  6599. float_raise( float_flag_overflow or float_flag_inexact );
  6600. if ( ord( roundingMode = float_round_to_zero )
  6601. or ( zSign and ord( roundingMode = float_round_up ) )
  6602. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6603. )<>0 then
  6604. begin
  6605. result :=
  6606. packFloat128(
  6607. zSign,
  6608. $7FFE,
  6609. int64( $0000FFFFFFFFFFFF ),
  6610. int64( $FFFFFFFFFFFFFFFF )
  6611. );
  6612. end;
  6613. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6614. end;
  6615. if ( zExp < 0 ) then
  6616. begin
  6617. isTiny :=
  6618. ord(( float_detect_tininess = float_tininess_before_rounding )
  6619. or ( zExp < -1 )
  6620. or not( increment<>0 )
  6621. or boolean(lt128(
  6622. zSig0,
  6623. zSig1,
  6624. int64( $0001FFFFFFFFFFFF ),
  6625. int64( $FFFFFFFFFFFFFFFF )
  6626. )));
  6627. shift128ExtraRightJamming(
  6628. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6629. zExp := 0;
  6630. if ( isTiny and zSig2 )<>0 then
  6631. float_raise( float_flag_underflow );
  6632. if ( roundNearestEven<>0 ) then
  6633. begin
  6634. increment := ord( sbits64(zSig2) < 0 );
  6635. end
  6636. else begin
  6637. if ( zSign<>0 ) then
  6638. begin
  6639. increment := ord( roundingMode = float_round_down ) and zSig2;
  6640. end
  6641. else begin
  6642. increment := ord( roundingMode = float_round_up ) and zSig2;
  6643. end;
  6644. end;
  6645. end;
  6646. end;
  6647. if ( zSig2<>0 ) then
  6648. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6649. if ( increment<>0 ) then
  6650. begin
  6651. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6652. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6653. end
  6654. else begin
  6655. if ( ( zSig0 or zSig1 ) = 0 ) then
  6656. zExp := 0;
  6657. end;
  6658. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6659. end;
  6660. {*----------------------------------------------------------------------------
  6661. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6662. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6663. | returns the proper quadruple-precision floating-point value corresponding
  6664. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6665. | except that the input significand has fewer bits and does not have to be
  6666. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6667. | point exponent.
  6668. *----------------------------------------------------------------------------*}
  6669. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6670. var
  6671. shiftCount: int8;
  6672. zSig2: bits64;
  6673. begin
  6674. if ( zSig0 = 0 ) then
  6675. begin
  6676. zSig0 := zSig1;
  6677. zSig1 := 0;
  6678. dec(zExp, 64);
  6679. end;
  6680. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6681. if ( 0 <= shiftCount ) then
  6682. begin
  6683. zSig2 := 0;
  6684. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6685. end
  6686. else begin
  6687. shift128ExtraRightJamming(
  6688. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6689. end;
  6690. dec(zExp, shiftCount);
  6691. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6692. end;
  6693. {*----------------------------------------------------------------------------
  6694. | Returns the result of converting the quadruple-precision floating-point
  6695. | value `a' to the 32-bit two's complement integer format. The conversion
  6696. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6697. | Arithmetic---which means in particular that the conversion is rounded
  6698. | according to the current rounding mode. If `a' is a NaN, the largest
  6699. | positive integer is returned. Otherwise, if the conversion overflows, the
  6700. | largest integer with the same sign as `a' is returned.
  6701. *----------------------------------------------------------------------------*}
  6702. function float128_to_int32(a: float128): int32;
  6703. var
  6704. aSign: flag;
  6705. aExp, shiftCount: int32;
  6706. aSig0, aSig1: bits64;
  6707. begin
  6708. aSig1 := extractFloat128Frac1( a );
  6709. aSig0 := extractFloat128Frac0( a );
  6710. aExp := extractFloat128Exp( a );
  6711. aSign := extractFloat128Sign( a );
  6712. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6713. aSign := 0;
  6714. if ( aExp<>0 ) then
  6715. aSig0 := aSig0 or int64( $0001000000000000 );
  6716. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6717. shiftCount := $4028 - aExp;
  6718. if ( 0 < shiftCount ) then
  6719. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6720. result := roundAndPackInt32( aSign, aSig0 );
  6721. end;
  6722. {*----------------------------------------------------------------------------
  6723. | Returns the result of converting the quadruple-precision floating-point
  6724. | value `a' to the 32-bit two's complement integer format. The conversion
  6725. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6726. | Arithmetic, except that the conversion is always rounded toward zero. If
  6727. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6728. | conversion overflows, the largest integer with the same sign as `a' is
  6729. | returned.
  6730. *----------------------------------------------------------------------------*}
  6731. function float128_to_int32_round_to_zero(a: float128): int32;
  6732. var
  6733. aSign: flag;
  6734. aExp, shiftCount: int32;
  6735. aSig0, aSig1, savedASig: bits64;
  6736. z: int32;
  6737. label
  6738. invalid;
  6739. begin
  6740. aSig1 := extractFloat128Frac1( a );
  6741. aSig0 := extractFloat128Frac0( a );
  6742. aExp := extractFloat128Exp( a );
  6743. aSign := extractFloat128Sign( a );
  6744. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6745. if ( $401E < aExp ) then
  6746. begin
  6747. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6748. aSign := 0;
  6749. goto invalid;
  6750. end
  6751. else if ( aExp < $3FFF ) then
  6752. begin
  6753. if ( aExp or aSig0 )<>0 then
  6754. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6755. result := 0;
  6756. exit;
  6757. end;
  6758. aSig0 := aSig0 or int64( $0001000000000000 );
  6759. shiftCount := $402F - aExp;
  6760. savedASig := aSig0;
  6761. aSig0 := aSig0 shr shiftCount;
  6762. z := aSig0;
  6763. if ( aSign )<>0 then
  6764. z := - z;
  6765. if ( ord( z < 0 ) xor aSign )<>0 then
  6766. begin
  6767. invalid:
  6768. float_raise( float_flag_invalid );
  6769. if aSign<>0 then
  6770. result:=$80000000
  6771. else
  6772. result:=$7FFFFFFF;
  6773. exit;
  6774. end;
  6775. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6776. begin
  6777. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6778. end;
  6779. result := z;
  6780. end;
  6781. {*----------------------------------------------------------------------------
  6782. | Returns the result of converting the quadruple-precision floating-point
  6783. | value `a' to the 64-bit two's complement integer format. The conversion
  6784. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6785. | Arithmetic---which means in particular that the conversion is rounded
  6786. | according to the current rounding mode. If `a' is a NaN, the largest
  6787. | positive integer is returned. Otherwise, if the conversion overflows, the
  6788. | largest integer with the same sign as `a' is returned.
  6789. *----------------------------------------------------------------------------*}
  6790. function float128_to_int64(a: float128): int64;
  6791. var
  6792. aSign: flag;
  6793. aExp, shiftCount: int32;
  6794. aSig0, aSig1: bits64;
  6795. begin
  6796. aSig1 := extractFloat128Frac1( a );
  6797. aSig0 := extractFloat128Frac0( a );
  6798. aExp := extractFloat128Exp( a );
  6799. aSign := extractFloat128Sign( a );
  6800. if ( aExp<>0 ) then
  6801. aSig0 := aSig0 or int64( $0001000000000000 );
  6802. shiftCount := $402F - aExp;
  6803. if ( shiftCount <= 0 ) then
  6804. begin
  6805. if ( $403E < aExp ) then
  6806. begin
  6807. float_raise( float_flag_invalid );
  6808. if ( (aSign=0)
  6809. or ( ( aExp = $7FFF )
  6810. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6811. )
  6812. ) then
  6813. begin
  6814. result := int64( $7FFFFFFFFFFFFFFF );
  6815. end;
  6816. result := int64( $8000000000000000 );
  6817. end;
  6818. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6819. end
  6820. else begin
  6821. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6822. end;
  6823. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6824. end;
  6825. {*----------------------------------------------------------------------------
  6826. | Returns the result of converting the quadruple-precision floating-point
  6827. | value `a' to the 64-bit two's complement integer format. The conversion
  6828. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6829. | Arithmetic, except that the conversion is always rounded toward zero.
  6830. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6831. | the conversion overflows, the largest integer with the same sign as `a' is
  6832. | returned.
  6833. *----------------------------------------------------------------------------*}
  6834. function float128_to_int64_round_to_zero(a: float128): int64;
  6835. var
  6836. aSign: flag;
  6837. aExp, shiftCount: int32;
  6838. aSig0, aSig1: bits64;
  6839. z: int64;
  6840. begin
  6841. aSig1 := extractFloat128Frac1( a );
  6842. aSig0 := extractFloat128Frac0( a );
  6843. aExp := extractFloat128Exp( a );
  6844. aSign := extractFloat128Sign( a );
  6845. if ( aExp<>0 ) then
  6846. aSig0 := aSig0 or int64( $0001000000000000 );
  6847. shiftCount := aExp - $402F;
  6848. if ( 0 < shiftCount ) then
  6849. begin
  6850. if ( $403E <= aExp ) then
  6851. begin
  6852. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6853. if ( ( a.high = int64( $C03E000000000000 ) )
  6854. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6855. begin
  6856. if ( aSig1<>0 ) then
  6857. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6858. end
  6859. else begin
  6860. float_raise( float_flag_invalid );
  6861. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6862. begin
  6863. result := int64( $7FFFFFFFFFFFFFFF );
  6864. exit;
  6865. end;
  6866. end;
  6867. result := int64( $8000000000000000 );
  6868. exit;
  6869. end;
  6870. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6871. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6872. begin
  6873. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6874. end;
  6875. end
  6876. else begin
  6877. if ( aExp < $3FFF ) then
  6878. begin
  6879. if ( aExp or aSig0 or aSig1 )<>0 then
  6880. begin
  6881. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6882. end;
  6883. result := 0;
  6884. exit;
  6885. end;
  6886. z := aSig0 shr ( - shiftCount );
  6887. if ( (aSig1<>0)
  6888. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6889. begin
  6890. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6891. end;
  6892. end;
  6893. if ( aSign<>0 ) then
  6894. z := - z;
  6895. result := z;
  6896. end;
  6897. {*----------------------------------------------------------------------------
  6898. | Returns the result of converting the quadruple-precision floating-point
  6899. | value `a' to the single-precision floating-point format. The conversion
  6900. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6901. | Arithmetic.
  6902. *----------------------------------------------------------------------------*}
  6903. function float128_to_float32(a: float128): float32;
  6904. var
  6905. aSign: flag;
  6906. aExp: int32;
  6907. aSig0, aSig1: bits64;
  6908. zSig: bits32;
  6909. begin
  6910. aSig1 := extractFloat128Frac1( a );
  6911. aSig0 := extractFloat128Frac0( a );
  6912. aExp := extractFloat128Exp( a );
  6913. aSign := extractFloat128Sign( a );
  6914. if ( aExp = $7FFF ) then
  6915. begin
  6916. if ( aSig0 or aSig1 )<>0 then
  6917. begin
  6918. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6919. exit;
  6920. end;
  6921. result := packFloat32( aSign, $FF, 0 );
  6922. exit;
  6923. end;
  6924. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6925. shift64RightJamming( aSig0, 18, aSig0 );
  6926. zSig := aSig0;
  6927. if ( aExp or zSig )<>0 then
  6928. begin
  6929. zSig := zSig or $40000000;
  6930. dec(aExp,$3F81);
  6931. end;
  6932. result := roundAndPackFloat32( aSign, aExp, zSig );
  6933. end;
  6934. {*----------------------------------------------------------------------------
  6935. | Returns the result of converting the quadruple-precision floating-point
  6936. | value `a' to the double-precision floating-point format. The conversion
  6937. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6938. | Arithmetic.
  6939. *----------------------------------------------------------------------------*}
  6940. function float128_to_float64(a: float128): float64;
  6941. var
  6942. aSign: flag;
  6943. aExp: int32;
  6944. aSig0, aSig1: bits64;
  6945. begin
  6946. aSig1 := extractFloat128Frac1( a );
  6947. aSig0 := extractFloat128Frac0( a );
  6948. aExp := extractFloat128Exp( a );
  6949. aSign := extractFloat128Sign( a );
  6950. if ( aExp = $7FFF ) then
  6951. begin
  6952. if ( aSig0 or aSig1 )<>0 then
  6953. begin
  6954. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  6955. exit;
  6956. end;
  6957. result:=packFloat64( aSign, $7FF, 0);
  6958. exit;
  6959. end;
  6960. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  6961. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6962. if ( aExp or aSig0 )<>0 then
  6963. begin
  6964. aSig0 := aSig0 or int64( $4000000000000000 );
  6965. dec(aExp,$3C01);
  6966. end;
  6967. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  6968. end;
  6969. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  6970. {*----------------------------------------------------------------------------
  6971. | Returns the result of converting the quadruple-precision floating-point
  6972. | value `a' to the extended double-precision floating-point format. The
  6973. | conversion is performed according to the IEC/IEEE Standard for Binary
  6974. | Floating-Point Arithmetic.
  6975. *----------------------------------------------------------------------------*}
  6976. function float128_to_floatx80(a: float128): floatx80;
  6977. var
  6978. aSign: flag;
  6979. aExp: int32;
  6980. aSig0, aSig1: bits64;
  6981. begin
  6982. aSig1 := extractFloat128Frac1( a );
  6983. aSig0 := extractFloat128Frac0( a );
  6984. aExp := extractFloat128Exp( a );
  6985. aSign := extractFloat128Sign( a );
  6986. if ( aExp = $7FFF ) begin
  6987. if ( aSig0 or aSig1 ) begin
  6988. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  6989. exit;
  6990. end;
  6991. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  6992. exit;
  6993. end;
  6994. if ( aExp = 0 ) begin
  6995. if ( ( aSig0 or aSig1 ) = 0 ) then
  6996. begin
  6997. result := packFloatx80( aSign, 0, 0 );
  6998. exit;
  6999. end;
  7000. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7001. end;
  7002. else begin
  7003. aSig0 or= int64( $0001000000000000 );
  7004. end;
  7005. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7006. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7007. end;
  7008. {$endif FPC_SOFTFLOAT_FLOATX80}
  7009. {*----------------------------------------------------------------------------
  7010. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7011. | Returns the result as a quadruple-precision floating-point value. The
  7012. | operation is performed according to the IEC/IEEE Standard for Binary
  7013. | Floating-Point Arithmetic.
  7014. *----------------------------------------------------------------------------*}
  7015. function float128_round_to_int(a: float128): float128;
  7016. var
  7017. aSign: flag;
  7018. aExp: int32;
  7019. lastBitMask, roundBitsMask: bits64;
  7020. roundingMode: int8;
  7021. z: float128;
  7022. begin
  7023. aExp := extractFloat128Exp( a );
  7024. if ( $402F <= aExp ) then
  7025. begin
  7026. if ( $406F <= aExp ) then
  7027. begin
  7028. if ( ( aExp = $7FFF )
  7029. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7030. ) then
  7031. begin
  7032. result := propagateFloat128NaN( a, a );
  7033. exit;
  7034. end;
  7035. result := a;
  7036. exit;
  7037. end;
  7038. lastBitMask := 1;
  7039. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7040. roundBitsMask := lastBitMask - 1;
  7041. z := a;
  7042. roundingMode := float_rounding_mode;
  7043. if ( roundingMode = float_round_nearest_even ) then
  7044. begin
  7045. if ( lastBitMask )<>0 then
  7046. begin
  7047. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7048. if ( ( z.low and roundBitsMask ) = 0 ) then
  7049. z.low := z.low and not(lastBitMask);
  7050. end
  7051. else begin
  7052. if ( sbits64(z.low) < 0 ) then
  7053. begin
  7054. inc(z.high);
  7055. if ( bits64( z.low shl 1 ) = 0 ) then
  7056. z.high := z.high and not(1);
  7057. end;
  7058. end;
  7059. end
  7060. else if ( roundingMode <> float_round_to_zero ) then
  7061. begin
  7062. if ( extractFloat128Sign( z )
  7063. xor ord( roundingMode = float_round_up ) )<>0 then
  7064. begin
  7065. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7066. end;
  7067. end;
  7068. z.low := z.low and not(roundBitsMask);
  7069. end
  7070. else begin
  7071. if ( aExp < $3FFF ) then
  7072. begin
  7073. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7074. begin
  7075. result := a;
  7076. exit;
  7077. end;
  7078. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7079. aSign := extractFloat128Sign( a );
  7080. case float_rounding_mode of
  7081. float_round_nearest_even:
  7082. if ( ( aExp = $3FFE )
  7083. and ( (extractFloat128Frac0( a )<>0)
  7084. or (extractFloat128Frac1( a )<>0) )
  7085. ) then begin
  7086. begin
  7087. result := packFloat128( aSign, $3FFF, 0, 0 );
  7088. exit;
  7089. end;
  7090. end;
  7091. float_round_down:
  7092. begin
  7093. if aSign<>0 then
  7094. result:=packFloat128( 1, $3FFF, 0, 0 )
  7095. else
  7096. result:=packFloat128( 0, 0, 0, 0 );
  7097. exit;
  7098. end;
  7099. float_round_up:
  7100. begin
  7101. if aSign<>0 then
  7102. result := packFloat128( 1, 0, 0, 0 )
  7103. else
  7104. result:=packFloat128( 0, $3FFF, 0, 0 );
  7105. exit;
  7106. end;
  7107. end;
  7108. result := packFloat128( aSign, 0, 0, 0 );
  7109. exit;
  7110. end;
  7111. lastBitMask := 1;
  7112. lastBitMask := lastBitMask shl ($402F - aExp);
  7113. roundBitsMask := lastBitMask - 1;
  7114. z.low := 0;
  7115. z.high := a.high;
  7116. roundingMode := float_rounding_mode;
  7117. if ( roundingMode = float_round_nearest_even ) then begin
  7118. inc(z.high,lastBitMask shr 1);
  7119. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7120. z.high := z.high and not(lastBitMask);
  7121. end;
  7122. end
  7123. else if ( roundingMode <> float_round_to_zero ) then begin
  7124. if ( (extractFloat128Sign( z )<>0)
  7125. xor ( roundingMode = float_round_up ) ) then begin
  7126. z.high := z.high or ord( a.low <> 0 );
  7127. z.high := z.high+roundBitsMask;
  7128. end;
  7129. end;
  7130. z.high := z.high and not(roundBitsMask);
  7131. end;
  7132. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7133. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7134. end;
  7135. result := z;
  7136. end;
  7137. {*----------------------------------------------------------------------------
  7138. | Returns the result of adding the absolute values of the quadruple-precision
  7139. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7140. | before being returned. `zSign' is ignored if the result is a NaN.
  7141. | The addition is performed according to the IEC/IEEE Standard for Binary
  7142. | Floating-Point Arithmetic.
  7143. *----------------------------------------------------------------------------*}
  7144. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7145. var
  7146. aExp, bExp, zExp: int32;
  7147. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7148. expDiff: int32;
  7149. label
  7150. shiftRight1,roundAndPack;
  7151. begin
  7152. aSig1 := extractFloat128Frac1( a );
  7153. aSig0 := extractFloat128Frac0( a );
  7154. aExp := extractFloat128Exp( a );
  7155. bSig1 := extractFloat128Frac1( b );
  7156. bSig0 := extractFloat128Frac0( b );
  7157. bExp := extractFloat128Exp( b );
  7158. expDiff := aExp - bExp;
  7159. if ( 0 < expDiff ) then begin
  7160. if ( aExp = $7FFF ) then begin
  7161. if ( aSig0 or aSig1 )<>0 then
  7162. begin
  7163. result := propagateFloat128NaN( a, b );
  7164. exit;
  7165. end;
  7166. result := a;
  7167. exit;
  7168. end;
  7169. if ( bExp = 0 ) then begin
  7170. dec(expDiff);
  7171. end
  7172. else begin
  7173. bSig0 := bSig0 or int64( $0001000000000000 );
  7174. end;
  7175. shift128ExtraRightJamming(
  7176. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7177. zExp := aExp;
  7178. end
  7179. else if ( expDiff < 0 ) then begin
  7180. if ( bExp = $7FFF ) then begin
  7181. if ( bSig0 or bSig1 )<>0 then
  7182. begin
  7183. result := propagateFloat128NaN( a, b );
  7184. exit;
  7185. end;
  7186. result := packFloat128( zSign, $7FFF, 0, 0 );
  7187. exit;
  7188. end;
  7189. if ( aExp = 0 ) then begin
  7190. inc(expDiff);
  7191. end
  7192. else begin
  7193. aSig0 := aSig0 or int64( $0001000000000000 );
  7194. end;
  7195. shift128ExtraRightJamming(
  7196. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7197. zExp := bExp;
  7198. end
  7199. else begin
  7200. if ( aExp = $7FFF ) then begin
  7201. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7202. result := propagateFloat128NaN( a, b );
  7203. exit;
  7204. end;
  7205. result := a;
  7206. exit;
  7207. end;
  7208. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7209. if ( aExp = 0 ) then
  7210. begin
  7211. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7212. exit;
  7213. end;
  7214. zSig2 := 0;
  7215. zSig0 := zSig0 or int64( $0002000000000000 );
  7216. zExp := aExp;
  7217. goto shiftRight1;
  7218. end;
  7219. aSig0 := aSig0 or int64( $0001000000000000 );
  7220. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7221. dec(zExp);
  7222. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7223. inc(zExp);
  7224. shiftRight1:
  7225. shift128ExtraRightJamming(
  7226. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7227. roundAndPack:
  7228. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7229. end;
  7230. {*----------------------------------------------------------------------------
  7231. | Returns the result of subtracting the absolute values of the quadruple-
  7232. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7233. | difference is negated before being returned. `zSign' is ignored if the
  7234. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7235. | Standard for Binary Floating-Point Arithmetic.
  7236. *----------------------------------------------------------------------------*}
  7237. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7238. var
  7239. aExp, bExp, zExp: int32;
  7240. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7241. expDiff: int32;
  7242. z: float128;
  7243. label
  7244. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7245. begin
  7246. aSig1 := extractFloat128Frac1( a );
  7247. aSig0 := extractFloat128Frac0( a );
  7248. aExp := extractFloat128Exp( a );
  7249. bSig1 := extractFloat128Frac1( b );
  7250. bSig0 := extractFloat128Frac0( b );
  7251. bExp := extractFloat128Exp( b );
  7252. expDiff := aExp - bExp;
  7253. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7254. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7255. if ( 0 < expDiff ) then goto aExpBigger;
  7256. if ( expDiff < 0 ) then goto bExpBigger;
  7257. if ( aExp = $7FFF ) then begin
  7258. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7259. result := propagateFloat128NaN( a, b );
  7260. exit;
  7261. end;
  7262. float_raise( float_flag_invalid );
  7263. z.low := float128_default_nan_low;
  7264. z.high := float128_default_nan_high;
  7265. result := z;
  7266. exit;
  7267. end;
  7268. if ( aExp = 0 ) then begin
  7269. aExp := 1;
  7270. bExp := 1;
  7271. end;
  7272. if ( bSig0 < aSig0 ) then goto aBigger;
  7273. if ( aSig0 < bSig0 ) then goto bBigger;
  7274. if ( bSig1 < aSig1 ) then goto aBigger;
  7275. if ( aSig1 < bSig1 ) then goto bBigger;
  7276. result := packFloat128( ord(float_rounding_mode = float_round_down), 0, 0, 0 );
  7277. exit;
  7278. bExpBigger:
  7279. if ( bExp = $7FFF ) then begin
  7280. if ( bSig0 or bSig1 )<>0 then
  7281. begin
  7282. result := propagateFloat128NaN( a, b );
  7283. exit;
  7284. end;
  7285. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7286. exit;
  7287. end;
  7288. if ( aExp = 0 ) then begin
  7289. inc(expDiff);
  7290. end
  7291. else begin
  7292. aSig0 := aSig0 or int64( $4000000000000000 );
  7293. end;
  7294. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7295. bSig0 := bSig0 or int64( $4000000000000000 );
  7296. bBigger:
  7297. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7298. zExp := bExp;
  7299. zSign := zSign xor 1;
  7300. goto normalizeRoundAndPack;
  7301. aExpBigger:
  7302. if ( aExp = $7FFF ) then begin
  7303. if ( aSig0 or aSig1 )<>0 then
  7304. begin
  7305. result := propagateFloat128NaN( a, b );
  7306. exit;
  7307. end;
  7308. result := a;
  7309. exit;
  7310. end;
  7311. if ( bExp = 0 ) then begin
  7312. dec(expDiff);
  7313. end
  7314. else begin
  7315. bSig0 := bSig0 or int64( $4000000000000000 );
  7316. end;
  7317. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7318. aSig0 := aSig0 or int64( $4000000000000000 );
  7319. aBigger:
  7320. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7321. zExp := aExp;
  7322. normalizeRoundAndPack:
  7323. dec(zExp);
  7324. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7325. end;
  7326. {*----------------------------------------------------------------------------
  7327. | Returns the result of adding the quadruple-precision floating-point values
  7328. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7329. | for Binary Floating-Point Arithmetic.
  7330. *----------------------------------------------------------------------------*}
  7331. function float128_add(a: float128; b: float128): float128;
  7332. var
  7333. aSign, bSign: flag;
  7334. begin
  7335. aSign := extractFloat128Sign( a );
  7336. bSign := extractFloat128Sign( b );
  7337. if ( aSign = bSign ) then begin
  7338. result := addFloat128Sigs( a, b, aSign );
  7339. end
  7340. else begin
  7341. result := subFloat128Sigs( a, b, aSign );
  7342. end;
  7343. end;
  7344. {*----------------------------------------------------------------------------
  7345. | Returns the result of subtracting the quadruple-precision floating-point
  7346. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7347. | Standard for Binary Floating-Point Arithmetic.
  7348. *----------------------------------------------------------------------------*}
  7349. function float128_sub(a: float128; b: float128): float128;
  7350. var
  7351. aSign, bSign: flag;
  7352. begin
  7353. aSign := extractFloat128Sign( a );
  7354. bSign := extractFloat128Sign( b );
  7355. if ( aSign = bSign ) then begin
  7356. result := subFloat128Sigs( a, b, aSign );
  7357. end
  7358. else begin
  7359. result := addFloat128Sigs( a, b, aSign );
  7360. end;
  7361. end;
  7362. {*----------------------------------------------------------------------------
  7363. | Returns the result of multiplying the quadruple-precision floating-point
  7364. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7365. | Standard for Binary Floating-Point Arithmetic.
  7366. *----------------------------------------------------------------------------*}
  7367. function float128_mul(a: float128; b: float128): float128;
  7368. var
  7369. aSign, bSign, zSign: flag;
  7370. aExp, bExp, zExp: int32;
  7371. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7372. z: float128;
  7373. label
  7374. invalid;
  7375. begin
  7376. aSig1 := extractFloat128Frac1( a );
  7377. aSig0 := extractFloat128Frac0( a );
  7378. aExp := extractFloat128Exp( a );
  7379. aSign := extractFloat128Sign( a );
  7380. bSig1 := extractFloat128Frac1( b );
  7381. bSig0 := extractFloat128Frac0( b );
  7382. bExp := extractFloat128Exp( b );
  7383. bSign := extractFloat128Sign( b );
  7384. zSign := aSign xor bSign;
  7385. if ( aExp = $7FFF ) then begin
  7386. if ( (( aSig0 or aSig1 )<>0)
  7387. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7388. result := propagateFloat128NaN( a, b );
  7389. exit;
  7390. end;
  7391. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7392. result := packFloat128( zSign, $7FFF, 0, 0 );
  7393. exit;
  7394. end;
  7395. if ( bExp = $7FFF ) then begin
  7396. if ( bSig0 or bSig1 )<>0 then
  7397. begin
  7398. result := propagateFloat128NaN( a, b );
  7399. exit;
  7400. end;
  7401. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7402. invalid:
  7403. float_raise( float_flag_invalid );
  7404. z.low := float128_default_nan_low;
  7405. z.high := float128_default_nan_high;
  7406. result := z;
  7407. exit;
  7408. end;
  7409. result := packFloat128( zSign, $7FFF, 0, 0 );
  7410. exit;
  7411. end;
  7412. if ( aExp = 0 ) then begin
  7413. if ( ( aSig0 or aSig1 ) = 0 ) then
  7414. begin
  7415. result := packFloat128( zSign, 0, 0, 0 );
  7416. exit;
  7417. end;
  7418. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7419. end;
  7420. if ( bExp = 0 ) then begin
  7421. if ( ( bSig0 or bSig1 ) = 0 ) then
  7422. begin
  7423. result := packFloat128( zSign, 0, 0, 0 );
  7424. exit;
  7425. end;
  7426. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7427. end;
  7428. zExp := aExp + bExp - $4000;
  7429. aSig0 := aSig0 or int64( $0001000000000000 );
  7430. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7431. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7432. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7433. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7434. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7435. shift128ExtraRightJamming(
  7436. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7437. inc(zExp);
  7438. end;
  7439. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7440. end;
  7441. {*----------------------------------------------------------------------------
  7442. | Returns the result of dividing the quadruple-precision floating-point value
  7443. | `a' by the corresponding value `b'. The operation is performed according to
  7444. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7445. *----------------------------------------------------------------------------*}
  7446. function float128_div(a: float128; b: float128): float128;
  7447. var
  7448. aSign, bSign, zSign: flag;
  7449. aExp, bExp, zExp: int32;
  7450. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7451. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7452. z: float128;
  7453. label
  7454. invalid;
  7455. begin
  7456. aSig1 := extractFloat128Frac1( a );
  7457. aSig0 := extractFloat128Frac0( a );
  7458. aExp := extractFloat128Exp( a );
  7459. aSign := extractFloat128Sign( a );
  7460. bSig1 := extractFloat128Frac1( b );
  7461. bSig0 := extractFloat128Frac0( b );
  7462. bExp := extractFloat128Exp( b );
  7463. bSign := extractFloat128Sign( b );
  7464. zSign := aSign xor bSign;
  7465. if ( aExp = $7FFF ) then begin
  7466. if ( aSig0 or aSig1 )<>0 then
  7467. begin
  7468. result := propagateFloat128NaN( a, b );
  7469. exit;
  7470. end;
  7471. if ( bExp = $7FFF ) then begin
  7472. if ( bSig0 or bSig1 )<>0 then
  7473. begin
  7474. result := propagateFloat128NaN( a, b );
  7475. exit;
  7476. end;
  7477. goto invalid;
  7478. end;
  7479. result := packFloat128( zSign, $7FFF, 0, 0 );
  7480. exit;
  7481. end;
  7482. if ( bExp = $7FFF ) then begin
  7483. if ( bSig0 or bSig1 )<>0 then
  7484. begin
  7485. result := propagateFloat128NaN( a, b );
  7486. exit;
  7487. end;
  7488. result := packFloat128( zSign, 0, 0, 0 );
  7489. exit;
  7490. end;
  7491. if ( bExp = 0 ) then begin
  7492. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7493. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7494. invalid:
  7495. float_raise( float_flag_invalid );
  7496. z.low := float128_default_nan_low;
  7497. z.high := float128_default_nan_high;
  7498. result := z;
  7499. exit;
  7500. end;
  7501. float_raise( float_flag_divbyzero );
  7502. result := packFloat128( zSign, $7FFF, 0, 0 );
  7503. exit;
  7504. end;
  7505. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7506. end;
  7507. if ( aExp = 0 ) then begin
  7508. if ( ( aSig0 or aSig1 ) = 0 ) then
  7509. begin
  7510. result := packFloat128( zSign, 0, 0, 0 );
  7511. exit;
  7512. end;
  7513. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7514. end;
  7515. zExp := aExp - bExp + $3FFD;
  7516. shortShift128Left(
  7517. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7518. shortShift128Left(
  7519. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7520. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7521. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7522. inc(zExp);
  7523. end;
  7524. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7525. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7526. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7527. while ( sbits64(rem0) < 0 ) do begin
  7528. dec(zSig0);
  7529. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7530. end;
  7531. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7532. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7533. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7534. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7535. while ( sbits64(rem1) < 0 ) do begin
  7536. dec(zSig1);
  7537. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7538. end;
  7539. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7540. end;
  7541. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7542. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7543. end;
  7544. {*----------------------------------------------------------------------------
  7545. | Returns the remainder of the quadruple-precision floating-point value `a'
  7546. | with respect to the corresponding value `b'. The operation is performed
  7547. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7548. *----------------------------------------------------------------------------*}
  7549. function float128_rem(a: float128; b: float128): float128;
  7550. var
  7551. aSign, bSign, zSign: flag;
  7552. aExp, bExp, expDiff: int32;
  7553. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7554. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7555. sigMean0: sbits64;
  7556. z: float128;
  7557. label
  7558. invalid;
  7559. begin
  7560. aSig1 := extractFloat128Frac1( a );
  7561. aSig0 := extractFloat128Frac0( a );
  7562. aExp := extractFloat128Exp( a );
  7563. aSign := extractFloat128Sign( a );
  7564. bSig1 := extractFloat128Frac1( b );
  7565. bSig0 := extractFloat128Frac0( b );
  7566. bExp := extractFloat128Exp( b );
  7567. bSign := extractFloat128Sign( b );
  7568. if ( aExp = $7FFF ) then begin
  7569. if ( (( aSig0 or aSig1 )<>0)
  7570. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7571. result := propagateFloat128NaN( a, b );
  7572. exit;
  7573. end;
  7574. goto invalid;
  7575. end;
  7576. if ( bExp = $7FFF ) then begin
  7577. if ( bSig0 or bSig1 )<>0 then
  7578. begin
  7579. result := propagateFloat128NaN( a, b );
  7580. exit;
  7581. end;
  7582. result := a;
  7583. exit;
  7584. end;
  7585. if ( bExp = 0 ) then begin
  7586. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7587. invalid:
  7588. float_raise( float_flag_invalid );
  7589. z.low := float128_default_nan_low;
  7590. z.high := float128_default_nan_high;
  7591. result := z;
  7592. exit;
  7593. end;
  7594. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7595. end;
  7596. if ( aExp = 0 ) then begin
  7597. if ( ( aSig0 or aSig1 ) = 0 ) then
  7598. begin
  7599. result := a;
  7600. exit;
  7601. end;
  7602. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7603. end;
  7604. expDiff := aExp - bExp;
  7605. if ( expDiff < -1 ) then
  7606. begin
  7607. result := a;
  7608. exit;
  7609. end;
  7610. shortShift128Left(
  7611. aSig0 or int64( $0001000000000000 ),
  7612. aSig1,
  7613. 15 - ord( expDiff < 0 ),
  7614. aSig0,
  7615. aSig1
  7616. );
  7617. shortShift128Left(
  7618. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7619. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7620. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7621. dec(expDiff,64);
  7622. while ( 0 < expDiff ) do begin
  7623. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7624. if ( 4 < q ) then
  7625. q := q - 4
  7626. else
  7627. q := 0;
  7628. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7629. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7630. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7631. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7632. dec(expDiff,61);
  7633. end;
  7634. if ( -64 < expDiff ) then begin
  7635. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7636. if ( 4 < q ) then
  7637. q := q - 4
  7638. else
  7639. q := 0;
  7640. q := q shr (- expDiff);
  7641. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7642. inc(expDiff,52);
  7643. if ( expDiff < 0 ) then begin
  7644. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7645. end
  7646. else begin
  7647. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7648. end;
  7649. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7650. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7651. end
  7652. else begin
  7653. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7654. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7655. end;
  7656. repeat
  7657. alternateASig0 := aSig0;
  7658. alternateASig1 := aSig1;
  7659. inc(q);
  7660. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7661. until not( 0 <= sbits64(aSig0) );
  7662. add128(
  7663. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7664. if ( ( sigMean0 < 0 )
  7665. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7666. aSig0 := alternateASig0;
  7667. aSig1 := alternateASig1;
  7668. end;
  7669. zSign := ord( sbits64(aSig0) < 0 );
  7670. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7671. result :=
  7672. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7673. end;
  7674. {*----------------------------------------------------------------------------
  7675. | Returns the square root of the quadruple-precision floating-point value `a'.
  7676. | The operation is performed according to the IEC/IEEE Standard for Binary
  7677. | Floating-Point Arithmetic.
  7678. *----------------------------------------------------------------------------*}
  7679. function float128_sqrt(a: float128): float128;
  7680. var
  7681. aSign: flag;
  7682. aExp, zExp: int32;
  7683. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7684. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7685. z: float128;
  7686. label
  7687. invalid;
  7688. begin
  7689. aSig1 := extractFloat128Frac1( a );
  7690. aSig0 := extractFloat128Frac0( a );
  7691. aExp := extractFloat128Exp( a );
  7692. aSign := extractFloat128Sign( a );
  7693. if ( aExp = $7FFF ) then begin
  7694. if ( aSig0 or aSig1 )<>0 then
  7695. begin
  7696. result := propagateFloat128NaN( a, a );
  7697. exit;
  7698. end;
  7699. if ( aSign=0 ) then
  7700. begin
  7701. result := a;
  7702. exit;
  7703. end;
  7704. goto invalid;
  7705. end;
  7706. if ( aSign<>0 ) then begin
  7707. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7708. begin
  7709. result := a;
  7710. exit;
  7711. end;
  7712. invalid:
  7713. float_raise( float_flag_invalid );
  7714. z.low := float128_default_nan_low;
  7715. z.high := float128_default_nan_high;
  7716. result := z;
  7717. exit;
  7718. end;
  7719. if ( aExp = 0 ) then begin
  7720. if ( ( aSig0 or aSig1 ) = 0 ) then
  7721. begin
  7722. result := packFloat128( 0, 0, 0, 0 );
  7723. exit;
  7724. end;
  7725. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7726. end;
  7727. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7728. aSig0 := aSig0 or int64( $0001000000000000 );
  7729. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7730. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7731. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7732. doubleZSig0 := zSig0 shl 1;
  7733. mul64To128( zSig0, zSig0, term0, term1 );
  7734. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7735. while ( sbits64(rem0) < 0 ) do begin
  7736. dec(zSig0);
  7737. dec(doubleZSig0,2);
  7738. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7739. end;
  7740. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7741. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7742. if ( zSig1 = 0 ) then zSig1 := 1;
  7743. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7744. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7745. mul64To128( zSig1, zSig1, term2, term3 );
  7746. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7747. while ( sbits64(rem1) < 0 ) do begin
  7748. dec(zSig1);
  7749. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7750. term3 := term3 or 1;
  7751. term2 := term2 or doubleZSig0;
  7752. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7753. end;
  7754. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7755. end;
  7756. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7757. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7758. end;
  7759. {*----------------------------------------------------------------------------
  7760. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7761. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7762. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7763. *----------------------------------------------------------------------------*}
  7764. function float128_eq(a: float128; b: float128): flag;
  7765. begin
  7766. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7767. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7768. or ( ( extractFloat128Exp( b ) = $7FFF )
  7769. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7770. ) then begin
  7771. if ( (float128_is_signaling_nan( a )<>0)
  7772. or (float128_is_signaling_nan( b )<>0) ) then begin
  7773. float_raise( float_flag_invalid );
  7774. end;
  7775. result := 0;
  7776. exit;
  7777. end;
  7778. result := ord(
  7779. ( a.low = b.low )
  7780. and ( ( a.high = b.high )
  7781. or ( ( a.low = 0 )
  7782. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7783. ));
  7784. end;
  7785. {*----------------------------------------------------------------------------
  7786. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7787. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7788. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7789. | Arithmetic.
  7790. *----------------------------------------------------------------------------*}
  7791. function float128_le(a: float128; b: float128): flag;
  7792. var
  7793. aSign, bSign: flag;
  7794. begin
  7795. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7796. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7797. or ( ( extractFloat128Exp( b ) = $7FFF )
  7798. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7799. ) then begin
  7800. float_raise( float_flag_invalid );
  7801. result := 0;
  7802. exit;
  7803. end;
  7804. aSign := extractFloat128Sign( a );
  7805. bSign := extractFloat128Sign( b );
  7806. if ( aSign <> bSign ) then begin
  7807. result := ord(
  7808. (aSign<>0)
  7809. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7810. = 0 ));
  7811. exit;
  7812. end;
  7813. if aSign<>0 then
  7814. result := le128( b.high, b.low, a.high, a.low )
  7815. else
  7816. result := le128( a.high, a.low, b.high, b.low );
  7817. end;
  7818. {*----------------------------------------------------------------------------
  7819. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7820. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7821. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7822. *----------------------------------------------------------------------------*}
  7823. function float128_lt(a: float128; b: float128): flag;
  7824. var
  7825. aSign, bSign: flag;
  7826. begin
  7827. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7828. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7829. or ( ( extractFloat128Exp( b ) = $7FFF )
  7830. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7831. ) then begin
  7832. float_raise( float_flag_invalid );
  7833. result := 0;
  7834. exit;
  7835. end;
  7836. aSign := extractFloat128Sign( a );
  7837. bSign := extractFloat128Sign( b );
  7838. if ( aSign <> bSign ) then begin
  7839. result := ord(
  7840. (aSign<>0)
  7841. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7842. <> 0 ));
  7843. exit;
  7844. end;
  7845. if aSign<>0 then
  7846. result := lt128( b.high, b.low, a.high, a.low )
  7847. else
  7848. result := lt128( a.high, a.low, b.high, b.low );
  7849. end;
  7850. {*----------------------------------------------------------------------------
  7851. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7852. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7853. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7854. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7855. *----------------------------------------------------------------------------*}
  7856. function float128_eq_signaling(a: float128; b: float128): flag;
  7857. begin
  7858. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7859. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7860. or ( ( extractFloat128Exp( b ) = $7FFF )
  7861. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7862. ) then begin
  7863. float_raise( float_flag_invalid );
  7864. result := 0;
  7865. exit;
  7866. end;
  7867. result := ord(
  7868. ( a.low = b.low )
  7869. and ( ( a.high = b.high )
  7870. or ( ( a.low = 0 )
  7871. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7872. ));
  7873. end;
  7874. {*----------------------------------------------------------------------------
  7875. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7876. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7877. | cause an exception. Otherwise, the comparison is performed according to the
  7878. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7879. *----------------------------------------------------------------------------*}
  7880. function float128_le_quiet(a: float128; b: float128): flag;
  7881. var
  7882. aSign, bSign: flag;
  7883. begin
  7884. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7885. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7886. or ( ( extractFloat128Exp( b ) = $7FFF )
  7887. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7888. ) then begin
  7889. if ( (float128_is_signaling_nan( a )<>0)
  7890. or (float128_is_signaling_nan( b )<>0) ) then begin
  7891. float_raise( float_flag_invalid );
  7892. end;
  7893. result := 0;
  7894. exit;
  7895. end;
  7896. aSign := extractFloat128Sign( a );
  7897. bSign := extractFloat128Sign( b );
  7898. if ( aSign <> bSign ) then begin
  7899. result := ord(
  7900. (aSign<>0)
  7901. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7902. = 0 ));
  7903. exit;
  7904. end;
  7905. if aSign<>0 then
  7906. result := le128( b.high, b.low, a.high, a.low )
  7907. else
  7908. result := le128( a.high, a.low, b.high, b.low );
  7909. end;
  7910. {*----------------------------------------------------------------------------
  7911. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7912. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7913. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7914. | Standard for Binary Floating-Point Arithmetic.
  7915. *----------------------------------------------------------------------------*}
  7916. function float128_lt_quiet(a: float128; b: float128): flag;
  7917. var
  7918. aSign, bSign: flag;
  7919. begin
  7920. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7921. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7922. or ( ( extractFloat128Exp( b ) = $7FFF )
  7923. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7924. ) then begin
  7925. if ( (float128_is_signaling_nan( a )<>0)
  7926. or (float128_is_signaling_nan( b )<>0) ) then begin
  7927. float_raise( float_flag_invalid );
  7928. end;
  7929. result := 0;
  7930. exit;
  7931. end;
  7932. aSign := extractFloat128Sign( a );
  7933. bSign := extractFloat128Sign( b );
  7934. if ( aSign <> bSign ) then begin
  7935. result := ord(
  7936. (aSign<>0)
  7937. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7938. <> 0 ));
  7939. exit;
  7940. end;
  7941. if aSign<>0 then
  7942. result:=lt128( b.high, b.low, a.high, a.low )
  7943. else
  7944. result:=lt128( a.high, a.low, b.high, b.low );
  7945. end;
  7946. {$endif FPC_SOFTFLOAT_FLOAT128}
  7947. {$endif not(defined(fpc_softfpu_interface))}
  7948. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  7949. end.
  7950. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}