softfpu.pp 294 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. {$define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. { we use here a record in the function header because
  79. the record allows bitwise conversion to single }
  80. float32rec = record
  81. float32 : float32;
  82. end;
  83. flag = byte;
  84. uint8 = byte;
  85. int8 = shortint;
  86. uint16 = word;
  87. int16 = smallint;
  88. uint32 = longword;
  89. int32 = longint;
  90. bits8 = byte;
  91. sbits8 = shortint;
  92. bits16 = word;
  93. sbits16 = smallint;
  94. sbits32 = longint;
  95. bits32 = longword;
  96. {$ifndef fpc}
  97. qword = int64;
  98. {$endif}
  99. { now part of the system unit
  100. uint64 = qword;
  101. }
  102. bits64 = qword;
  103. sbits64 = int64;
  104. {$ifdef ENDIAN_LITTLE}
  105. float64 = packed record
  106. low: bits32;
  107. high: bits32;
  108. end;
  109. int64rec = packed record
  110. low: bits32;
  111. high: bits32;
  112. end;
  113. floatx80 = packed record
  114. low : qword;
  115. high : word;
  116. end;
  117. float128 = packed record
  118. low : qword;
  119. high : qword;
  120. end;
  121. {$else}
  122. float64 = packed record
  123. high,low : bits32;
  124. end;
  125. int64rec = packed record
  126. high,low : bits32;
  127. end;
  128. floatx80 = packed record
  129. high : word;
  130. low : qword;
  131. end;
  132. float128 = packed record
  133. high : qword;
  134. low : qword;
  135. end;
  136. {$endif}
  137. {*
  138. -------------------------------------------------------------------------------
  139. Returns 1 if the double-precision floating-point value `a' is less than
  140. the corresponding value `b', and 0 otherwise. The comparison is performed
  141. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  142. -------------------------------------------------------------------------------
  143. *}
  144. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  145. {*
  146. -------------------------------------------------------------------------------
  147. Returns 1 if the double-precision floating-point value `a' is less than
  148. or equal to the corresponding value `b', and 0 otherwise. The comparison
  149. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  150. Arithmetic.
  151. -------------------------------------------------------------------------------
  152. *}
  153. Function float64_le(a: float64;b: float64): flag; compilerproc;
  154. {*
  155. -------------------------------------------------------------------------------
  156. Returns 1 if the double-precision floating-point value `a' is equal to
  157. the corresponding value `b', and 0 otherwise. The comparison is performed
  158. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  159. -------------------------------------------------------------------------------
  160. *}
  161. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  162. {*
  163. -------------------------------------------------------------------------------
  164. Returns the square root of the double-precision floating-point value `a'.
  165. The operation is performed according to the IEC/IEEE Standard for Binary
  166. Floating-Point Arithmetic.
  167. -------------------------------------------------------------------------------
  168. *}
  169. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  170. {*
  171. -------------------------------------------------------------------------------
  172. Returns the remainder of the double-precision floating-point value `a'
  173. with respect to the corresponding value `b'. The operation is performed
  174. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  175. -------------------------------------------------------------------------------
  176. *}
  177. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  178. {*
  179. -------------------------------------------------------------------------------
  180. Returns the result of dividing the double-precision floating-point value `a'
  181. by the corresponding value `b'. The operation is performed according to the
  182. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of multiplying the double-precision floating-point values
  189. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  190. for Binary Floating-Point Arithmetic.
  191. -------------------------------------------------------------------------------
  192. *}
  193. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  194. {*
  195. -------------------------------------------------------------------------------
  196. Returns the result of subtracting the double-precision floating-point values
  197. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  198. for Binary Floating-Point Arithmetic.
  199. -------------------------------------------------------------------------------
  200. *}
  201. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  202. {*
  203. -------------------------------------------------------------------------------
  204. Returns the result of adding the double-precision floating-point values `a'
  205. and `b'. The operation is performed according to the IEC/IEEE Standard for
  206. Binary Floating-Point Arithmetic.
  207. -------------------------------------------------------------------------------
  208. *}
  209. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  210. {*
  211. -------------------------------------------------------------------------------
  212. Rounds the double-precision floating-point value `a' to an integer,
  213. and returns the result as a double-precision floating-point value. The
  214. operation is performed according to the IEC/IEEE Standard for Binary
  215. Floating-Point Arithmetic.
  216. -------------------------------------------------------------------------------
  217. *}
  218. Function float64_round_to_int(a: float64) : float64; compilerproc;
  219. {*
  220. -------------------------------------------------------------------------------
  221. Returns the result of converting the double-precision floating-point value
  222. `a' to the single-precision floating-point format. The conversion is
  223. performed according to the IEC/IEEE Standard for Binary Floating-Point
  224. Arithmetic.
  225. -------------------------------------------------------------------------------
  226. *}
  227. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  228. {*
  229. -------------------------------------------------------------------------------
  230. Returns the result of converting the double-precision floating-point value
  231. `a' to the 32-bit two's complement integer format. The conversion is
  232. performed according to the IEC/IEEE Standard for Binary Floating-Point
  233. Arithmetic, except that the conversion is always rounded toward zero.
  234. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  235. the conversion overflows, the largest integer with the same sign as `a' is
  236. returned.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the 32-bit two's complement integer format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic---which means in particular that the conversion is rounded
  246. according to the current rounding mode. If `a' is a NaN, the largest
  247. positive integer is returned. Otherwise, if the conversion overflows, the
  248. largest integer with the same sign as `a' is returned.
  249. -------------------------------------------------------------------------------
  250. *}
  251. Function float64_to_int32(a: float64): int32; compilerproc;
  252. {*
  253. -------------------------------------------------------------------------------
  254. Returns 1 if the single-precision floating-point value `a' is less than
  255. the corresponding value `b', and 0 otherwise. The comparison is performed
  256. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  257. -------------------------------------------------------------------------------
  258. *}
  259. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  260. {*
  261. -------------------------------------------------------------------------------
  262. Returns 1 if the single-precision floating-point value `a' is less than
  263. or equal to the corresponding value `b', and 0 otherwise. The comparison
  264. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  265. Arithmetic.
  266. -------------------------------------------------------------------------------
  267. *}
  268. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  269. {*
  270. -------------------------------------------------------------------------------
  271. Returns 1 if the single-precision floating-point value `a' is equal to
  272. the corresponding value `b', and 0 otherwise. The comparison is performed
  273. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns the square root of the single-precision floating-point value `a'.
  280. The operation is performed according to the IEC/IEEE Standard for Binary
  281. Floating-Point Arithmetic.
  282. -------------------------------------------------------------------------------
  283. *}
  284. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  285. {*
  286. -------------------------------------------------------------------------------
  287. Returns the remainder of the single-precision floating-point value `a'
  288. with respect to the corresponding value `b'. The operation is performed
  289. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  290. -------------------------------------------------------------------------------
  291. *}
  292. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  293. {*
  294. -------------------------------------------------------------------------------
  295. Returns the result of dividing the single-precision floating-point value `a'
  296. by the corresponding value `b'. The operation is performed according to the
  297. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of multiplying the single-precision floating-point values
  304. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  305. for Binary Floating-Point Arithmetic.
  306. -------------------------------------------------------------------------------
  307. *}
  308. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  309. {*
  310. -------------------------------------------------------------------------------
  311. Returns the result of subtracting the single-precision floating-point values
  312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  313. for Binary Floating-Point Arithmetic.
  314. -------------------------------------------------------------------------------
  315. *}
  316. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  317. {*
  318. -------------------------------------------------------------------------------
  319. Returns the result of adding the single-precision floating-point values `a'
  320. and `b'. The operation is performed according to the IEC/IEEE Standard for
  321. Binary Floating-Point Arithmetic.
  322. -------------------------------------------------------------------------------
  323. *}
  324. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  325. {*
  326. -------------------------------------------------------------------------------
  327. Rounds the single-precision floating-point value `a' to an integer,
  328. and returns the result as a single-precision floating-point value. The
  329. operation is performed according to the IEC/IEEE Standard for Binary
  330. Floating-Point Arithmetic.
  331. -------------------------------------------------------------------------------
  332. *}
  333. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  334. {*
  335. -------------------------------------------------------------------------------
  336. Returns the result of converting the single-precision floating-point value
  337. `a' to the double-precision floating-point format. The conversion is
  338. performed according to the IEC/IEEE Standard for Binary Floating-Point
  339. Arithmetic.
  340. -------------------------------------------------------------------------------
  341. *}
  342. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  343. {*
  344. -------------------------------------------------------------------------------
  345. Returns the result of converting the single-precision floating-point value
  346. `a' to the 32-bit two's complement integer format. The conversion is
  347. performed according to the IEC/IEEE Standard for Binary Floating-Point
  348. Arithmetic, except that the conversion is always rounded toward zero.
  349. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  350. the conversion overflows, the largest integer with the same sign as `a' is
  351. returned.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the 32-bit two's complement integer format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic---which means in particular that the conversion is rounded
  361. according to the current rounding mode. If `a' is a NaN, the largest
  362. positive integer is returned. Otherwise, if the conversion overflows, the
  363. largest integer with the same sign as `a' is returned.
  364. -------------------------------------------------------------------------------
  365. *}
  366. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  367. {*
  368. -------------------------------------------------------------------------------
  369. Returns the result of converting the 32-bit two's complement integer `a' to
  370. the double-precision floating-point format. The conversion is performed
  371. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  372. -------------------------------------------------------------------------------
  373. *}
  374. Function int32_to_float64( a: int32) : float64; compilerproc;
  375. {*
  376. -------------------------------------------------------------------------------
  377. Returns the result of converting the 32-bit two's complement integer `a' to
  378. the single-precision floating-point format. The conversion is performed
  379. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  380. -------------------------------------------------------------------------------
  381. *}
  382. Function int32_to_float32( a: int32): float32rec; compilerproc;
  383. {*----------------------------------------------------------------------------
  384. | Returns the result of converting the 64-bit two's complement integer `a'
  385. | to the double-precision floating-point format. The conversion is performed
  386. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. *----------------------------------------------------------------------------*}
  388. Function int64_to_float64( a: int64 ): float64; compilerproc;
  389. {*----------------------------------------------------------------------------
  390. | Returns the result of converting the 64-bit two's complement integer `a'
  391. | to the single-precision floating-point format. The conversion is performed
  392. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. *----------------------------------------------------------------------------*}
  394. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  395. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  396. function float128_is_nan( a : float128): flag;
  397. function float128_is_signaling_nan( a : float128): flag;
  398. function float128_to_int32(a: float128): int32;
  399. function float128_to_int32_round_to_zero(a: float128): int32;
  400. function float128_to_int64(a: float128): int64;
  401. function float128_to_int64_round_to_zero(a: float128): int64;
  402. function float128_to_float32(a: float128): float32;
  403. function float128_to_float64(a: float128): float64;
  404. function float64_to_float128( a : float64) : float128;
  405. {$ifdef FPC_SOFTFLOAT_FLOAT80}
  406. function float128_to_floatx80(a: float128): floatx80;
  407. {$endif FPC_SOFTFLOAT_FLOAT80}
  408. function float128_round_to_int(a: float128): float128;
  409. function float128_add(a: float128; b: float128): float128;
  410. function float128_sub(a: float128; b: float128): float128;
  411. function float128_mul(a: float128; b: float128): float128;
  412. function float128_div(a: float128; b: float128): float128;
  413. function float128_rem(a: float128; b: float128): float128;
  414. function float128_sqrt(a: float128): float128;
  415. function float128_eq(a: float128; b: float128): flag;
  416. function float128_le(a: float128; b: float128): flag;
  417. function float128_lt(a: float128; b: float128): flag;
  418. function float128_eq_signaling(a: float128; b: float128): flag;
  419. function float128_le_quiet(a: float128; b: float128): flag;
  420. function float128_lt_quiet(a: float128; b: float128): flag;
  421. {$endif FPC_SOFTFLOAT_FLOAT128}
  422. CONST
  423. {-------------------------------------------------------------------------------
  424. Software IEC/IEEE floating-point underflow tininess-detection mode.
  425. -------------------------------------------------------------------------------
  426. *}
  427. float_tininess_after_rounding = 0;
  428. float_tininess_before_rounding = 1;
  429. {*
  430. -------------------------------------------------------------------------------
  431. Software IEC/IEEE floating-point rounding mode.
  432. -------------------------------------------------------------------------------
  433. *}
  434. {
  435. Round to nearest.
  436. This is the default mode. It should be used unless there is a specific
  437. need for one of the others. In this mode results are rounded to the
  438. nearest representable value. If the result is midway between two
  439. representable values, the even representable is chosen. Even here
  440. means the lowest-order bit is zero. This rounding mode prevents
  441. statistical bias and guarantees numeric stability: round-off errors
  442. in a lengthy calculation will remain smaller than half of FLT_EPSILON.
  443. Round toward plus Infinity.
  444. All results are rounded to the smallest representable value which is
  445. greater than the result.
  446. Round toward minus Infinity.
  447. All results are rounded to the largest representable value which is
  448. less than the result.
  449. Round toward zero.
  450. All results are rounded to the largest representable value whose
  451. magnitude is less than that of the result. In other words, if the
  452. result is negative it is rounded up; if it is positive, it is
  453. rounded down.
  454. }
  455. float_round_nearest_even = 0;
  456. float_round_down = 1;
  457. float_round_up = 2;
  458. float_round_to_zero = 3;
  459. {*
  460. -------------------------------------------------------------------------------
  461. Floating-point rounding mode and exception flags.
  462. -------------------------------------------------------------------------------
  463. *}
  464. const
  465. float_rounding_mode : Byte = float_round_nearest_even;
  466. {*
  467. -------------------------------------------------------------------------------
  468. Underflow tininess-detection mode, statically initialized to default value.
  469. (The declaration in `softfloat.h' must match the `int8' type here.)
  470. -------------------------------------------------------------------------------
  471. *}
  472. const float_detect_tininess: int8 = float_tininess_after_rounding;
  473. {$endif not(defined(fpc_softfpu_implementation))}
  474. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  475. implementation
  476. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  477. {$if not(defined(fpc_softfpu_interface))}
  478. (*****************************************************************************)
  479. (*----------------------------------------------------------------------------*)
  480. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  481. (* division and square root approximations. (Can be specialized to target if *)
  482. (* desired.) *)
  483. (* ---------------------------------------------------------------------------*)
  484. (*****************************************************************************)
  485. {*----------------------------------------------------------------------------
  486. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  487. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  488. | input. If `zSign' is 1, the input is negated before being converted to an
  489. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  490. | is simply rounded to an integer, with the inexact exception raised if the
  491. | input cannot be represented exactly as an integer. However, if the fixed-
  492. | point input is too large, the invalid exception is raised and the largest
  493. | positive or negative integer is returned.
  494. *----------------------------------------------------------------------------*}
  495. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  496. var
  497. roundingMode: int8;
  498. roundNearestEven: flag;
  499. roundIncrement, roundBits: int8;
  500. z: int32;
  501. begin
  502. roundingMode := float_rounding_mode;
  503. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  504. roundIncrement := $40;
  505. if ( roundNearestEven=0 ) then
  506. begin
  507. if ( roundingMode = float_round_to_zero ) then
  508. begin
  509. roundIncrement := 0;
  510. end
  511. else begin
  512. roundIncrement := $7F;
  513. if ( zSign<>0 ) then
  514. begin
  515. if ( roundingMode = float_round_up ) then
  516. roundIncrement := 0;
  517. end
  518. else begin
  519. if ( roundingMode = float_round_down ) then
  520. roundIncrement := 0;
  521. end;
  522. end;
  523. end;
  524. roundBits := absZ and $7F;
  525. absZ := ( absZ + roundIncrement ) shr 7;
  526. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  527. z := absZ;
  528. if ( zSign<>0 ) then
  529. z := - z;
  530. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  531. begin
  532. float_raise( float_flag_invalid );
  533. if zSign<>0 then
  534. result:=sbits32($80000000)
  535. else
  536. result:=$7FFFFFFF;
  537. exit;
  538. end;
  539. if ( roundBits<>0 ) then
  540. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  541. result:=z;
  542. end;
  543. {*----------------------------------------------------------------------------
  544. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  545. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  546. | and returns the properly rounded 64-bit integer corresponding to the input.
  547. | If `zSign' is 1, the input is negated before being converted to an integer.
  548. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  549. | the inexact exception raised if the input cannot be represented exactly as
  550. | an integer. However, if the fixed-point input is too large, the invalid
  551. | exception is raised and the largest positive or negative integer is
  552. | returned.
  553. *----------------------------------------------------------------------------*}
  554. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  555. var
  556. roundingMode: int8;
  557. roundNearestEven, increment: flag;
  558. z: int64;
  559. label
  560. overflow;
  561. begin
  562. roundingMode := float_rounding_mode;
  563. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  564. increment := ord( sbits64(absZ1) < 0 );
  565. if ( roundNearestEven=0 ) then
  566. begin
  567. if ( roundingMode = float_round_to_zero ) then
  568. begin
  569. increment := 0;
  570. end
  571. else begin
  572. if ( zSign<>0 ) then
  573. begin
  574. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  575. end
  576. else begin
  577. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  578. end;
  579. end;
  580. end;
  581. if ( increment<>0 ) then
  582. begin
  583. inc(absZ0);
  584. if ( absZ0 = 0 ) then
  585. goto overflow;
  586. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  587. end;
  588. z := absZ0;
  589. if ( zSign<>0 ) then
  590. z := - z;
  591. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  592. begin
  593. overflow:
  594. float_raise( float_flag_invalid );
  595. if zSign<>0 then
  596. result:=int64($8000000000000000)
  597. else
  598. result:=int64($7FFFFFFFFFFFFFFF);
  599. end;
  600. if ( absZ1<>0 ) then
  601. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  602. result:=z;
  603. end;
  604. {*
  605. -------------------------------------------------------------------------------
  606. Shifts `a' right by the number of bits given in `count'. If any nonzero
  607. bits are shifted off, they are ``jammed'' into the least significant bit of
  608. the result by setting the least significant bit to 1. The value of `count'
  609. can be arbitrarily large; in particular, if `count' is greater than 32, the
  610. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  611. The result is stored in the location pointed to by `zPtr'.
  612. -------------------------------------------------------------------------------
  613. *}
  614. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  615. var
  616. z: Bits32;
  617. Begin
  618. if ( count = 0 ) then
  619. z := a
  620. else
  621. if ( count < 32 ) then
  622. Begin
  623. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  624. End
  625. else
  626. Begin
  627. z := bits32( a <> 0 );
  628. End;
  629. zPtr := z;
  630. End;
  631. {*----------------------------------------------------------------------------
  632. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  633. | number of bits given in `count'. Any bits shifted off are lost. The value
  634. | of `count' can be arbitrarily large; in particular, if `count' is greater
  635. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  636. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  637. *----------------------------------------------------------------------------*}
  638. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  639. var
  640. z0, z1: bits64;
  641. negCount: int8;
  642. begin
  643. negCount := ( - count ) and 63;
  644. if ( count = 0 ) then
  645. begin
  646. z1 := a1;
  647. z0 := a0;
  648. end
  649. else if ( count < 64 ) then
  650. begin
  651. z1 := ( a0 shl negCount ) or ( a1 shr count );
  652. z0 := a0 shr count;
  653. end
  654. else
  655. begin
  656. if ( count shl 64 )<>0 then
  657. z1 := a0 shr ( count and 63 )
  658. else
  659. z1 := 0;
  660. z0 := 0;
  661. end;
  662. z1Ptr := z1;
  663. z0Ptr := z0;
  664. end;
  665. {*----------------------------------------------------------------------------
  666. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  667. | number of bits given in `count'. If any nonzero bits are shifted off, they
  668. | are ``jammed'' into the least significant bit of the result by setting the
  669. | least significant bit to 1. The value of `count' can be arbitrarily large;
  670. | in particular, if `count' is greater than 128, the result will be either
  671. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  672. | nonzero. The result is broken into two 64-bit pieces which are stored at
  673. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  674. *----------------------------------------------------------------------------*}
  675. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  676. var
  677. z0,z1 : bits64;
  678. negCount : int8;
  679. begin
  680. negCount := ( - count ) and 63;
  681. if ( count = 0 ) then begin
  682. z1 := a1;
  683. z0 := a0;
  684. end
  685. else if ( count < 64 ) then begin
  686. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  687. z0 := a0>>count;
  688. end
  689. else begin
  690. if ( count = 64 ) then begin
  691. z1 := a0 or ord( a1 <> 0 );
  692. end
  693. else if ( count < 128 ) then begin
  694. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  695. end
  696. else begin
  697. z1 := ord( ( a0 or a1 ) <> 0 );
  698. end;
  699. z0 := 0;
  700. end;
  701. z1Ptr := z1;
  702. z0Ptr := z0;
  703. end;
  704. {*
  705. -------------------------------------------------------------------------------
  706. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  707. number of bits given in `count'. Any bits shifted off are lost. The value
  708. of `count' can be arbitrarily large; in particular, if `count' is greater
  709. than 64, the result will be 0. The result is broken into two 32-bit pieces
  710. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  711. -------------------------------------------------------------------------------
  712. *}
  713. Procedure
  714. shift64Right(
  715. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  716. Var
  717. z0, z1: bits32;
  718. negCount : int8;
  719. Begin
  720. negCount := ( - count ) AND 31;
  721. if ( count = 0 ) then
  722. Begin
  723. z1 := a1;
  724. z0 := a0;
  725. End
  726. else if ( count < 32 ) then
  727. Begin
  728. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  729. z0 := a0 shr count;
  730. End
  731. else
  732. Begin
  733. if (count < 64) then
  734. z1 := ( a0 shr ( count AND 31 ) )
  735. else
  736. z1 := 0;
  737. z0 := 0;
  738. End;
  739. z1Ptr := z1;
  740. z0Ptr := z0;
  741. End;
  742. {*
  743. -------------------------------------------------------------------------------
  744. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  745. number of bits given in `count'. If any nonzero bits are shifted off, they
  746. are ``jammed'' into the least significant bit of the result by setting the
  747. least significant bit to 1. The value of `count' can be arbitrarily large;
  748. in particular, if `count' is greater than 64, the result will be either 0
  749. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  750. nonzero. The result is broken into two 32-bit pieces which are stored at
  751. the locations pointed to by `z0Ptr' and `z1Ptr'.
  752. -------------------------------------------------------------------------------
  753. *}
  754. Procedure
  755. shift64RightJamming(
  756. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  757. VAR
  758. z0, z1 : bits32;
  759. negCount : int8;
  760. Begin
  761. negCount := ( - count ) AND 31;
  762. if ( count = 0 ) then
  763. Begin
  764. z1 := a1;
  765. z0 := a0;
  766. End
  767. else
  768. if ( count < 32 ) then
  769. Begin
  770. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  771. z0 := a0 shr count;
  772. End
  773. else
  774. Begin
  775. if ( count = 32 ) then
  776. Begin
  777. z1 := a0 OR bits32( a1 <> 0 );
  778. End
  779. else
  780. if ( count < 64 ) Then
  781. Begin
  782. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  783. End
  784. else
  785. Begin
  786. z1 := bits32( ( a0 OR a1 ) <> 0 );
  787. End;
  788. z0 := 0;
  789. End;
  790. z1Ptr := z1;
  791. z0Ptr := z0;
  792. End;
  793. {*----------------------------------------------------------------------------
  794. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  795. | bits are shifted off, they are ``jammed'' into the least significant bit of
  796. | the result by setting the least significant bit to 1. The value of `count'
  797. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  798. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  799. | The result is stored in the location pointed to by `zPtr'.
  800. *----------------------------------------------------------------------------*}
  801. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  802. var
  803. z: bits64;
  804. begin
  805. if ( count = 0 ) then
  806. begin
  807. z := a;
  808. end
  809. else if ( count < 64 ) then
  810. begin
  811. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  812. end
  813. else
  814. begin
  815. z := ord( a <> 0 );
  816. end;
  817. zPtr := z;
  818. end;
  819. {*
  820. -------------------------------------------------------------------------------
  821. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  822. by 32 _plus_ the number of bits given in `count'. The shifted result is
  823. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  824. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  825. off form a third 32-bit result as follows: The _last_ bit shifted off is
  826. the most-significant bit of the extra result, and the other 31 bits of the
  827. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  828. were all zero. This extra result is stored in the location pointed to by
  829. `z2Ptr'. The value of `count' can be arbitrarily large.
  830. (This routine makes more sense if `a0', `a1', and `a2' are considered
  831. to form a fixed-point value with binary point between `a1' and `a2'. This
  832. fixed-point value is shifted right by the number of bits given in `count',
  833. and the integer part of the result is returned at the locations pointed to
  834. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  835. corrupted as described above, and is returned at the location pointed to by
  836. `z2Ptr'.)
  837. -------------------------------------------------------------------------------
  838. }
  839. Procedure
  840. shift64ExtraRightJamming(
  841. a0: bits32;
  842. a1: bits32;
  843. a2: bits32;
  844. count: int16;
  845. VAR z0Ptr: bits32;
  846. VAR z1Ptr: bits32;
  847. VAR z2Ptr: bits32
  848. );
  849. Var
  850. z0, z1, z2: bits32;
  851. negCount : int8;
  852. Begin
  853. negCount := ( - count ) AND 31;
  854. if ( count = 0 ) then
  855. Begin
  856. z2 := a2;
  857. z1 := a1;
  858. z0 := a0;
  859. End
  860. else
  861. Begin
  862. if ( count < 32 ) Then
  863. Begin
  864. z2 := a1 shl negCount;
  865. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  866. z0 := a0 shr count;
  867. End
  868. else
  869. Begin
  870. if ( count = 32 ) then
  871. Begin
  872. z2 := a1;
  873. z1 := a0;
  874. End
  875. else
  876. Begin
  877. a2 := a2 or a1;
  878. if ( count < 64 ) then
  879. Begin
  880. z2 := a0 shl negCount;
  881. z1 := a0 shr ( count AND 31 );
  882. End
  883. else
  884. Begin
  885. if count = 64 then
  886. z2 := a0
  887. else
  888. z2 := bits32(a0 <> 0);
  889. z1 := 0;
  890. End;
  891. End;
  892. z0 := 0;
  893. End;
  894. z2 := z2 or bits32( a2 <> 0 );
  895. End;
  896. z2Ptr := z2;
  897. z1Ptr := z1;
  898. z0Ptr := z0;
  899. End;
  900. {*
  901. -------------------------------------------------------------------------------
  902. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  903. number of bits given in `count'. Any bits shifted off are lost. The value
  904. of `count' must be less than 32. The result is broken into two 32-bit
  905. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  906. -------------------------------------------------------------------------------
  907. *}
  908. Procedure
  909. shortShift64Left(
  910. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  911. Begin
  912. z1Ptr := a1 shl count;
  913. if count = 0 then
  914. z0Ptr := a0
  915. else
  916. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  917. End;
  918. {*
  919. -------------------------------------------------------------------------------
  920. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  921. by the number of bits given in `count'. Any bits shifted off are lost.
  922. The value of `count' must be less than 32. The result is broken into three
  923. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  924. `z1Ptr', and `z2Ptr'.
  925. -------------------------------------------------------------------------------
  926. *}
  927. Procedure
  928. shortShift96Left(
  929. a0: bits32;
  930. a1: bits32;
  931. a2: bits32;
  932. count: int16;
  933. VAR z0Ptr: bits32;
  934. VAR z1Ptr: bits32;
  935. VAR z2Ptr: bits32
  936. );
  937. Var
  938. z0, z1, z2: bits32;
  939. negCount: int8;
  940. Begin
  941. z2 := a2 shl count;
  942. z1 := a1 shl count;
  943. z0 := a0 shl count;
  944. if ( 0 < count ) then
  945. Begin
  946. negCount := ( ( - count ) AND 31 );
  947. z1 := z1 or (a2 shr negCount);
  948. z0 := z0 or (a1 shr negCount);
  949. End;
  950. z2Ptr := z2;
  951. z1Ptr := z1;
  952. z0Ptr := z0;
  953. End;
  954. {*----------------------------------------------------------------------------
  955. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  956. | number of bits given in `count'. Any bits shifted off are lost. The value
  957. | of `count' must be less than 64. The result is broken into two 64-bit
  958. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  959. *----------------------------------------------------------------------------*}
  960. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  961. begin
  962. z1Ptr := a1 shl count;
  963. if count=0 then
  964. z0Ptr:=a0
  965. else
  966. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  967. end;
  968. {*
  969. -------------------------------------------------------------------------------
  970. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  971. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  972. any carry out is lost. The result is broken into two 32-bit pieces which
  973. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  974. -------------------------------------------------------------------------------
  975. *}
  976. Procedure
  977. add64(
  978. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  979. Var
  980. z1: bits32;
  981. Begin
  982. z1 := a1 + b1;
  983. z1Ptr := z1;
  984. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  985. End;
  986. {*
  987. -------------------------------------------------------------------------------
  988. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  989. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  990. modulo 2^96, so any carry out is lost. The result is broken into three
  991. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  992. `z1Ptr', and `z2Ptr'.
  993. -------------------------------------------------------------------------------
  994. *}
  995. Procedure
  996. add96(
  997. a0: bits32;
  998. a1: bits32;
  999. a2: bits32;
  1000. b0: bits32;
  1001. b1: bits32;
  1002. b2: bits32;
  1003. VAR z0Ptr: bits32;
  1004. VAR z1Ptr: bits32;
  1005. VAR z2Ptr: bits32
  1006. );
  1007. var
  1008. z0, z1, z2: bits32;
  1009. carry0, carry1: int8;
  1010. Begin
  1011. z2 := a2 + b2;
  1012. carry1 := int8( z2 < a2 );
  1013. z1 := a1 + b1;
  1014. carry0 := int8( z1 < a1 );
  1015. z0 := a0 + b0;
  1016. z1 := z1 + carry1;
  1017. z0 := z0 + bits32( z1 < carry1 );
  1018. z0 := z0 + carry0;
  1019. z2Ptr := z2;
  1020. z1Ptr := z1;
  1021. z0Ptr := z0;
  1022. End;
  1023. {*----------------------------------------------------------------------------
  1024. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1025. | by the number of bits given in `count'. Any bits shifted off are lost.
  1026. | The value of `count' must be less than 64. The result is broken into three
  1027. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1028. | `z1Ptr', and `z2Ptr'.
  1029. *----------------------------------------------------------------------------*}
  1030. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1031. var
  1032. z0, z1, z2 : bits64;
  1033. negCount : int8;
  1034. begin
  1035. z2 := a2 shl count;
  1036. z1 := a1 shl count;
  1037. z0 := a0 shl count;
  1038. if ( 0 < count ) then
  1039. begin
  1040. negCount := ( ( - count ) and 63 );
  1041. z1 := z1 or (a2 shr negCount);
  1042. z0 := z0 or (a1 shr negCount);
  1043. end;
  1044. z2Ptr := z2;
  1045. z1Ptr := z1;
  1046. z0Ptr := z0;
  1047. end;
  1048. {*----------------------------------------------------------------------------
  1049. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1050. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1051. | any carry out is lost. The result is broken into two 64-bit pieces which
  1052. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1053. *----------------------------------------------------------------------------*}
  1054. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1055. var
  1056. z1 : bits64;
  1057. begin
  1058. z1 := a1 + b1;
  1059. z1Ptr := z1;
  1060. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1061. end;
  1062. {*----------------------------------------------------------------------------
  1063. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1064. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1065. | modulo 2^192, so any carry out is lost. The result is broken into three
  1066. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1067. | `z1Ptr', and `z2Ptr'.
  1068. *----------------------------------------------------------------------------*}
  1069. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1070. var
  1071. z0, z1, z2 : bits64;
  1072. carry0, carry1 : int8;
  1073. begin
  1074. z2 := a2 + b2;
  1075. carry1 := ord( z2 < a2 );
  1076. z1 := a1 + b1;
  1077. carry0 := ord( z1 < a1 );
  1078. z0 := a0 + b0;
  1079. inc(z1, carry1);
  1080. inc(z0, ord( z1 < carry1 ));
  1081. inc(z0, carry0);
  1082. z2Ptr := z2;
  1083. z1Ptr := z1;
  1084. z0Ptr := z0;
  1085. end;
  1086. {*
  1087. -------------------------------------------------------------------------------
  1088. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1089. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1090. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1091. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1092. `z1Ptr'.
  1093. -------------------------------------------------------------------------------
  1094. *}
  1095. Procedure
  1096. sub64(
  1097. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1098. Begin
  1099. z1Ptr := a1 - b1;
  1100. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1101. End;
  1102. {*
  1103. -------------------------------------------------------------------------------
  1104. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1105. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1106. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1107. into three 32-bit pieces which are stored at the locations pointed to by
  1108. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1109. -------------------------------------------------------------------------------
  1110. *}
  1111. Procedure
  1112. sub96(
  1113. a0:bits32;
  1114. a1:bits32;
  1115. a2:bits32;
  1116. b0:bits32;
  1117. b1:bits32;
  1118. b2:bits32;
  1119. VAR z0Ptr:bits32;
  1120. VAR z1Ptr:bits32;
  1121. VAR z2Ptr:bits32
  1122. );
  1123. Var
  1124. z0, z1, z2: bits32;
  1125. borrow0, borrow1: int8;
  1126. Begin
  1127. z2 := a2 - b2;
  1128. borrow1 := int8( a2 < b2 );
  1129. z1 := a1 - b1;
  1130. borrow0 := int8( a1 < b1 );
  1131. z0 := a0 - b0;
  1132. z0 := z0 - bits32( z1 < borrow1 );
  1133. z1 := z1 - borrow1;
  1134. z0 := z0 -borrow0;
  1135. z2Ptr := z2;
  1136. z1Ptr := z1;
  1137. z0Ptr := z0;
  1138. End;
  1139. {*----------------------------------------------------------------------------
  1140. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1141. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1142. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1143. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1144. | `z1Ptr'.
  1145. *----------------------------------------------------------------------------*}
  1146. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1147. begin
  1148. z1Ptr := a1 - b1;
  1149. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1150. end;
  1151. {*----------------------------------------------------------------------------
  1152. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1153. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1154. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1155. | result is broken into three 64-bit pieces which are stored at the locations
  1156. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1157. *----------------------------------------------------------------------------*}
  1158. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1159. var
  1160. z0, z1, z2 : bits64;
  1161. borrow0, borrow1 : int8;
  1162. begin
  1163. z2 := a2 - b2;
  1164. borrow1 := ord( a2 < b2 );
  1165. z1 := a1 - b1;
  1166. borrow0 := ord( a1 < b1 );
  1167. z0 := a0 - b0;
  1168. dec(z0, ord( z1 < borrow1 ));
  1169. dec(z1, borrow1);
  1170. dec(z0, borrow0);
  1171. z2Ptr := z2;
  1172. z1Ptr := z1;
  1173. z0Ptr := z0;
  1174. end;
  1175. {*
  1176. -------------------------------------------------------------------------------
  1177. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1178. into two 32-bit pieces which are stored at the locations pointed to by
  1179. `z0Ptr' and `z1Ptr'.
  1180. -------------------------------------------------------------------------------
  1181. *}
  1182. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1183. :bits32 );
  1184. Var
  1185. aHigh, aLow, bHigh, bLow: bits16;
  1186. z0, zMiddleA, zMiddleB, z1: bits32;
  1187. Begin
  1188. aLow := a and $ffff;
  1189. aHigh := a shr 16;
  1190. bLow := b and $ffff;
  1191. bHigh := b shr 16;
  1192. z1 := ( bits32( aLow) ) * bLow;
  1193. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1194. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1195. z0 := ( bits32 (aHigh) ) * bHigh;
  1196. zMiddleA := zMiddleA + zMiddleB;
  1197. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1198. zMiddleA := zmiddleA shl 16;
  1199. z1 := z1 + zMiddleA;
  1200. z0 := z0 + bits32( z1 < zMiddleA );
  1201. z1Ptr := z1;
  1202. z0Ptr := z0;
  1203. End;
  1204. {*
  1205. -------------------------------------------------------------------------------
  1206. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1207. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1208. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1209. `z2Ptr'.
  1210. -------------------------------------------------------------------------------
  1211. *}
  1212. Procedure
  1213. mul64By32To96(
  1214. a0:bits32;
  1215. a1:bits32;
  1216. b:bits32;
  1217. VAR z0Ptr:bits32;
  1218. VAR z1Ptr:bits32;
  1219. VAR z2Ptr:bits32
  1220. );
  1221. Var
  1222. z0, z1, z2, more1: bits32;
  1223. Begin
  1224. mul32To64( a1, b, z1, z2 );
  1225. mul32To64( a0, b, z0, more1 );
  1226. add64( z0, more1, 0, z1, z0, z1 );
  1227. z2Ptr := z2;
  1228. z1Ptr := z1;
  1229. z0Ptr := z0;
  1230. End;
  1231. {*
  1232. -------------------------------------------------------------------------------
  1233. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1234. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1235. product. The product is broken into four 32-bit pieces which are stored at
  1236. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1237. -------------------------------------------------------------------------------
  1238. *}
  1239. Procedure
  1240. mul64To128(
  1241. a0:bits32;
  1242. a1:bits32;
  1243. b0:bits32;
  1244. b1:bits32;
  1245. VAR z0Ptr:bits32;
  1246. VAR z1Ptr:bits32;
  1247. VAR z2Ptr:bits32;
  1248. VAR z3Ptr:bits32
  1249. );
  1250. Var
  1251. z0, z1, z2, z3: bits32;
  1252. more1, more2: bits32;
  1253. Begin
  1254. mul32To64( a1, b1, z2, z3 );
  1255. mul32To64( a1, b0, z1, more2 );
  1256. add64( z1, more2, 0, z2, z1, z2 );
  1257. mul32To64( a0, b0, z0, more1 );
  1258. add64( z0, more1, 0, z1, z0, z1 );
  1259. mul32To64( a0, b1, more1, more2 );
  1260. add64( more1, more2, 0, z2, more1, z2 );
  1261. add64( z0, z1, 0, more1, z0, z1 );
  1262. z3Ptr := z3;
  1263. z2Ptr := z2;
  1264. z1Ptr := z1;
  1265. z0Ptr := z0;
  1266. End;
  1267. {*----------------------------------------------------------------------------
  1268. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1269. | into two 64-bit pieces which are stored at the locations pointed to by
  1270. | `z0Ptr' and `z1Ptr'.
  1271. *----------------------------------------------------------------------------*}
  1272. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1273. var
  1274. aHigh, aLow, bHigh, bLow : bits32;
  1275. z0, zMiddleA, zMiddleB, z1 : bits64;
  1276. begin
  1277. aLow := a;
  1278. aHigh := a shr 32;
  1279. bLow := b;
  1280. bHigh := b shr 32;
  1281. z1 := ( bits64(aLow) ) * bLow;
  1282. zMiddleA := ( bits64( aLow )) * bHigh;
  1283. zMiddleB := ( bits64( aHigh )) * bLow;
  1284. z0 := ( bits64(aHigh) ) * bHigh;
  1285. inc(zMiddleA, zMiddleB);
  1286. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1287. zMiddleA := zMiddleA shl 32;
  1288. inc(z1, zMiddleA);
  1289. inc(z0, ord( z1 < zMiddleA ));
  1290. z1Ptr := z1;
  1291. z0Ptr := z0;
  1292. end;
  1293. {*----------------------------------------------------------------------------
  1294. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1295. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1296. | product. The product is broken into four 64-bit pieces which are stored at
  1297. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1298. *----------------------------------------------------------------------------*}
  1299. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1300. var
  1301. z0,z1,z2,z3,more1,more2 : bits64;
  1302. begin
  1303. mul64To128( a1, b1, z2, z3 );
  1304. mul64To128( a1, b0, z1, more2 );
  1305. add128( z1, more2, 0, z2, z1, z2 );
  1306. mul64To128( a0, b0, z0, more1 );
  1307. add128( z0, more1, 0, z1, z0, z1 );
  1308. mul64To128( a0, b1, more1, more2 );
  1309. add128( more1, more2, 0, z2, more1, z2 );
  1310. add128( z0, z1, 0, more1, z0, z1 );
  1311. z3Ptr := z3;
  1312. z2Ptr := z2;
  1313. z1Ptr := z1;
  1314. z0Ptr := z0;
  1315. end;
  1316. {*----------------------------------------------------------------------------
  1317. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1318. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1319. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1320. | `z2Ptr'.
  1321. *----------------------------------------------------------------------------*}
  1322. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1323. var
  1324. z0, z1, z2, more1 : bits64;
  1325. begin
  1326. mul64To128( a1, b, z1, z2 );
  1327. mul64To128( a0, b, z0, more1 );
  1328. add128( z0, more1, 0, z1, z0, z1 );
  1329. z2Ptr := z2;
  1330. z1Ptr := z1;
  1331. z0Ptr := z0;
  1332. end;
  1333. {*----------------------------------------------------------------------------
  1334. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1335. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1336. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1337. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1338. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1339. | unsigned integer is returned.
  1340. *----------------------------------------------------------------------------*}
  1341. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1342. var
  1343. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1344. begin
  1345. if ( b <= a0 ) then
  1346. begin
  1347. result:=qword( $FFFFFFFFFFFFFFFF );
  1348. exit;
  1349. end;
  1350. b0 := b shr 32;
  1351. if ( b0 shl 32 <= a0 ) then
  1352. z:=qword( $FFFFFFFF00000000 )
  1353. else
  1354. z:=( a0 div b0 ) shl 32;
  1355. mul64To128( b, z, term0, term1 );
  1356. sub128( a0, a1, term0, term1, rem0, rem1 );
  1357. while ( ( sbits64(rem0) ) < 0 ) do begin
  1358. dec(z,qword( $100000000 ));
  1359. b1 := b shl 32;
  1360. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1361. end;
  1362. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1363. if ( b0 shl 32 <= rem0 ) then
  1364. z:=z or $FFFFFFFF
  1365. else
  1366. z:=z or rem0 div b0;
  1367. result:=z;
  1368. end;
  1369. {*
  1370. -------------------------------------------------------------------------------
  1371. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1372. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1373. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1374. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1375. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1376. unsigned integer is returned.
  1377. -------------------------------------------------------------------------------
  1378. *}
  1379. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1380. Var
  1381. b0, b1: bits32;
  1382. rem0, rem1, term0, term1: bits32;
  1383. z: bits32;
  1384. Begin
  1385. if ( b <= a0 ) then
  1386. Begin
  1387. estimateDiv64To32 := $FFFFFFFF;
  1388. exit;
  1389. End;
  1390. b0 := b shr 16;
  1391. if ( b0 shl 16 <= a0 ) then
  1392. z:= $FFFF0000
  1393. else
  1394. z:= ( a0 div b0 ) shl 16;
  1395. mul32To64( b, z, term0, term1 );
  1396. sub64( a0, a1, term0, term1, rem0, rem1 );
  1397. while ( ( sbits32 (rem0) ) < 0 ) do
  1398. Begin
  1399. z := z - $10000;
  1400. b1 := b shl 16;
  1401. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1402. End;
  1403. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1404. if ( b0 shl 16 <= rem0 ) then
  1405. z := z or $FFFF
  1406. else
  1407. z := z or (rem0 div b0);
  1408. estimateDiv64To32 := z;
  1409. End;
  1410. {*
  1411. -------------------------------------------------------------------------------
  1412. Returns an approximation to the square root of the 32-bit significand given
  1413. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1414. `aExp' (the least significant bit) is 1, the integer returned approximates
  1415. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1416. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1417. case, the approximation returned lies strictly within +/-2 of the exact
  1418. value.
  1419. -------------------------------------------------------------------------------
  1420. *}
  1421. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1422. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1423. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1424. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1425. );
  1426. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1427. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1428. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1429. );
  1430. Var
  1431. index: int8;
  1432. z: bits32;
  1433. Begin
  1434. index := ( a shr 27 ) AND 15;
  1435. if ( aExp AND 1 ) <> 0 then
  1436. Begin
  1437. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1438. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1439. a := a shr 1;
  1440. End
  1441. else
  1442. Begin
  1443. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1444. z := a div z + z;
  1445. if ( $20000 <= z ) then
  1446. z := $FFFF8000
  1447. else
  1448. z := ( z shl 15 );
  1449. if ( z <= a ) then
  1450. Begin
  1451. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1452. exit;
  1453. End;
  1454. End;
  1455. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1456. End;
  1457. {*
  1458. -------------------------------------------------------------------------------
  1459. Returns the number of leading 0 bits before the most-significant 1 bit of
  1460. `a'. If `a' is zero, 32 is returned.
  1461. -------------------------------------------------------------------------------
  1462. *}
  1463. Function countLeadingZeros32( a:bits32 ): int8;
  1464. const countLeadingZerosHigh:array[0..255] of int8 = (
  1465. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1466. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1467. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1468. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1469. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1470. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1471. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1472. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1473. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1474. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1475. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1476. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1477. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1478. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1479. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1480. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1481. );
  1482. Var
  1483. shiftCount: int8;
  1484. Begin
  1485. shiftCount := 0;
  1486. if ( a < $10000 ) then
  1487. Begin
  1488. shiftCount := shiftcount + 16;
  1489. a := a shl 16;
  1490. End;
  1491. if ( a < $1000000 ) then
  1492. Begin
  1493. shiftCount := shiftcount + 8;
  1494. a := a shl 8;
  1495. end;
  1496. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1497. countLeadingZeros32:= shiftCount;
  1498. End;
  1499. {*----------------------------------------------------------------------------
  1500. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1501. | `a'. If `a' is zero, 64 is returned.
  1502. *----------------------------------------------------------------------------*}
  1503. function countLeadingZeros64( a : bits64): int8;
  1504. var
  1505. shiftcount : int8;
  1506. Begin
  1507. shiftCount := 0;
  1508. if ( a < (bits64(1) shl 32 )) then
  1509. shiftCount := shiftcount + 32
  1510. else
  1511. a := a shr 32;
  1512. shiftCount := shiftCount + countLeadingZeros32( a );
  1513. countLeadingZeros64:= shiftCount;
  1514. End;
  1515. {*
  1516. -------------------------------------------------------------------------------
  1517. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1518. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1519. returns 0.
  1520. -------------------------------------------------------------------------------
  1521. *}
  1522. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1523. Begin
  1524. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1525. End;
  1526. {*
  1527. -------------------------------------------------------------------------------
  1528. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1529. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1530. Otherwise, returns 0.
  1531. -------------------------------------------------------------------------------
  1532. *}
  1533. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1534. Begin
  1535. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1536. End;
  1537. {*
  1538. -------------------------------------------------------------------------------
  1539. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1540. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1541. returns 0.
  1542. -------------------------------------------------------------------------------
  1543. *}
  1544. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1545. Begin
  1546. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1547. End;
  1548. {*
  1549. -------------------------------------------------------------------------------
  1550. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1551. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1552. returns 0.
  1553. -------------------------------------------------------------------------------
  1554. *}
  1555. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1556. Begin
  1557. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1558. End;
  1559. const
  1560. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1561. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1562. (*****************************************************************************)
  1563. (* End Low-Level arithmetic *)
  1564. (*****************************************************************************)
  1565. {*
  1566. -------------------------------------------------------------------------------
  1567. Functions and definitions to determine: (1) whether tininess for underflow
  1568. is detected before or after rounding by default, (2) what (if anything)
  1569. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1570. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1571. are propagated from function inputs to output. These details are ENDIAN
  1572. specific
  1573. -------------------------------------------------------------------------------
  1574. *}
  1575. {$IFDEF ENDIAN_LITTLE}
  1576. {*
  1577. -------------------------------------------------------------------------------
  1578. Internal canonical NaN format.
  1579. -------------------------------------------------------------------------------
  1580. *}
  1581. TYPE
  1582. commonNaNT = packed record
  1583. sign: flag;
  1584. high, low : bits32;
  1585. end;
  1586. {*
  1587. -------------------------------------------------------------------------------
  1588. The pattern for a default generated single-precision NaN.
  1589. -------------------------------------------------------------------------------
  1590. *}
  1591. const float32_default_nan = $FFC00000;
  1592. {*
  1593. -------------------------------------------------------------------------------
  1594. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1595. otherwise returns 0.
  1596. -------------------------------------------------------------------------------
  1597. *}
  1598. Function float32_is_nan( a : float32 ): flag;
  1599. Begin
  1600. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1601. End;
  1602. {*
  1603. -------------------------------------------------------------------------------
  1604. Returns 1 if the single-precision floating-point value `a' is a signaling
  1605. NaN; otherwise returns 0.
  1606. -------------------------------------------------------------------------------
  1607. *}
  1608. Function float32_is_signaling_nan( a : float32 ): flag;
  1609. Begin
  1610. float32_is_signaling_nan := flag
  1611. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1612. End;
  1613. {*
  1614. -------------------------------------------------------------------------------
  1615. Returns the result of converting the single-precision floating-point NaN
  1616. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1617. exception is raised.
  1618. -------------------------------------------------------------------------------
  1619. *}
  1620. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1621. var
  1622. z : commonNaNT ;
  1623. Begin
  1624. if ( float32_is_signaling_nan( a ) <> 0) then
  1625. float_raise( float_flag_invalid );
  1626. z.sign := a shr 31;
  1627. z.low := 0;
  1628. z.high := a shl 9;
  1629. c := z;
  1630. End;
  1631. {*
  1632. -------------------------------------------------------------------------------
  1633. Returns the result of converting the canonical NaN `a' to the single-
  1634. precision floating-point format.
  1635. -------------------------------------------------------------------------------
  1636. *}
  1637. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1638. Begin
  1639. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1640. End;
  1641. {*
  1642. -------------------------------------------------------------------------------
  1643. Takes two single-precision floating-point values `a' and `b', one of which
  1644. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1645. signaling NaN, the invalid exception is raised.
  1646. -------------------------------------------------------------------------------
  1647. *}
  1648. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1649. Var
  1650. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1651. label returnLargerSignificand;
  1652. Begin
  1653. aIsNaN := float32_is_nan( a );
  1654. aIsSignalingNaN := float32_is_signaling_nan( a );
  1655. bIsNaN := float32_is_nan( b );
  1656. bIsSignalingNaN := float32_is_signaling_nan( b );
  1657. a := a or $00400000;
  1658. b := b or $00400000;
  1659. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1660. float_raise( float_flag_invalid );
  1661. if ( aIsSignalingNaN )<> 0 then
  1662. Begin
  1663. if ( bIsSignalingNaN ) <> 0 then
  1664. goto returnLargerSignificand;
  1665. if bIsNan <> 0 then
  1666. propagateFloat32NaN := b
  1667. else
  1668. propagateFloat32NaN := a;
  1669. exit;
  1670. End
  1671. else if ( aIsNaN <> 0) then
  1672. Begin
  1673. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1674. Begin
  1675. propagateFloat32NaN := a;
  1676. exit;
  1677. End;
  1678. returnLargerSignificand:
  1679. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1680. Begin
  1681. propagateFloat32NaN := b;
  1682. exit;
  1683. End;
  1684. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1685. Begin
  1686. propagateFloat32NaN := a;
  1687. End;
  1688. if a < b then
  1689. propagateFloat32NaN := a
  1690. else
  1691. propagateFloat32NaN := b;
  1692. exit;
  1693. End
  1694. else
  1695. Begin
  1696. propagateFloat32NaN := b;
  1697. exit;
  1698. End;
  1699. End;
  1700. {*
  1701. -------------------------------------------------------------------------------
  1702. The pattern for a default generated double-precision NaN. The `high' and
  1703. `low' values hold the most- and least-significant bits, respectively.
  1704. -------------------------------------------------------------------------------
  1705. *}
  1706. const
  1707. float64_default_nan_high = $FFF80000;
  1708. float64_default_nan_low = $00000000;
  1709. {*
  1710. -------------------------------------------------------------------------------
  1711. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1712. otherwise returns 0.
  1713. -------------------------------------------------------------------------------
  1714. *}
  1715. Function float64_is_nan( a : float64 ) : flag;
  1716. Begin
  1717. float64_is_nan :=
  1718. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1719. and ( a.low or ( a.high and $000FFFFF ) );
  1720. End;
  1721. {*
  1722. -------------------------------------------------------------------------------
  1723. Returns 1 if the double-precision floating-point value `a' is a signaling
  1724. NaN; otherwise returns 0.
  1725. -------------------------------------------------------------------------------
  1726. *}
  1727. Function float64_is_signaling_nan( a : float64 ): flag;
  1728. Begin
  1729. float64_is_signaling_nan :=
  1730. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1731. and ( a.low or ( a.high and $0007FFFF ) );
  1732. End;
  1733. {*
  1734. -------------------------------------------------------------------------------
  1735. Returns the result of converting the double-precision floating-point NaN
  1736. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1737. exception is raised.
  1738. -------------------------------------------------------------------------------
  1739. *}
  1740. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1741. Var
  1742. z : commonNaNT;
  1743. Begin
  1744. if ( float64_is_signaling_nan( a )<>0 ) then
  1745. float_raise( float_flag_invalid );
  1746. z.sign := a.high shr 31;
  1747. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1748. c := z;
  1749. End;
  1750. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1751. Var
  1752. z : commonNaNT;
  1753. Begin
  1754. if ( float64_is_signaling_nan( a )<>0 ) then
  1755. float_raise( float_flag_invalid );
  1756. z.sign := a.high shr 31;
  1757. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1758. result := z;
  1759. End;
  1760. {*
  1761. -------------------------------------------------------------------------------
  1762. Returns the result of converting the canonical NaN `a' to the double-
  1763. precision floating-point format.
  1764. -------------------------------------------------------------------------------
  1765. *}
  1766. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1767. Var
  1768. z: float64;
  1769. Begin
  1770. shift64Right( a.high, a.low, 12, z.high, z.low );
  1771. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1772. c := z;
  1773. End;
  1774. {*
  1775. -------------------------------------------------------------------------------
  1776. Takes two double-precision floating-point values `a' and `b', one of which
  1777. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1778. signaling NaN, the invalid exception is raised.
  1779. -------------------------------------------------------------------------------
  1780. *}
  1781. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1782. Var
  1783. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1784. label returnLargerSignificand;
  1785. Begin
  1786. aIsNaN := float64_is_nan( a );
  1787. aIsSignalingNaN := float64_is_signaling_nan( a );
  1788. bIsNaN := float64_is_nan( b );
  1789. bIsSignalingNaN := float64_is_signaling_nan( b );
  1790. a.high := a.high or $00080000;
  1791. b.high := b.high or $00080000;
  1792. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1793. float_raise( float_flag_invalid );
  1794. if ( aIsSignalingNaN )<>0 then
  1795. Begin
  1796. if ( bIsSignalingNaN )<>0 then
  1797. goto returnLargerSignificand;
  1798. if bIsNan <> 0 then
  1799. c := b
  1800. else
  1801. c := a;
  1802. exit;
  1803. End
  1804. else if ( aIsNaN )<> 0 then
  1805. Begin
  1806. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1807. Begin
  1808. c := a;
  1809. exit;
  1810. End;
  1811. returnLargerSignificand:
  1812. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1813. Begin
  1814. c := b;
  1815. exit;
  1816. End;
  1817. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1818. Begin
  1819. c := a;
  1820. exit;
  1821. End;
  1822. if a.high < b.high then
  1823. c := a
  1824. else
  1825. c := b;
  1826. exit;
  1827. End
  1828. else
  1829. Begin
  1830. c := b;
  1831. exit;
  1832. End;
  1833. End;
  1834. {*----------------------------------------------------------------------------
  1835. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1836. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1837. | returns 0.
  1838. *----------------------------------------------------------------------------*}
  1839. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1840. begin
  1841. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1842. end;
  1843. {*----------------------------------------------------------------------------
  1844. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1845. | otherwise returns 0.
  1846. *----------------------------------------------------------------------------*}
  1847. function float128_is_nan( a : float128): flag;
  1848. begin
  1849. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1850. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1851. end;
  1852. {*----------------------------------------------------------------------------
  1853. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1854. | signaling NaN; otherwise returns 0.
  1855. *----------------------------------------------------------------------------*}
  1856. function float128_is_signaling_nan( a : float128): flag;
  1857. begin
  1858. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1859. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1860. end;
  1861. {*----------------------------------------------------------------------------
  1862. | Returns the result of converting the quadruple-precision floating-point NaN
  1863. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1864. | exception is raised.
  1865. *----------------------------------------------------------------------------*}
  1866. function float128ToCommonNaN( a : float128): commonNaNT;
  1867. var
  1868. z: commonNaNT;
  1869. qhigh,qlow : qword;
  1870. begin
  1871. if ( float128_is_signaling_nan( a )<>0) then
  1872. float_raise( float_flag_invalid );
  1873. z.sign := a.high shr 63;
  1874. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1875. z.high:=qhigh shr 32;
  1876. z.low:=qhigh and $ffffffff;
  1877. result:=z;
  1878. end;
  1879. {*----------------------------------------------------------------------------
  1880. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1881. | precision floating-point format.
  1882. *----------------------------------------------------------------------------*}
  1883. function commonNaNToFloat128( a : commonNaNT): float128;
  1884. var
  1885. z: float128;
  1886. begin
  1887. shift128Right( a.high, a.low, 16, z.high, z.low );
  1888. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1889. result:=z;
  1890. end;
  1891. {*----------------------------------------------------------------------------
  1892. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1893. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1894. | `b' is a signaling NaN, the invalid exception is raised.
  1895. *----------------------------------------------------------------------------*}
  1896. function propagateFloat128NaN( a: float128; b : float128): float128;
  1897. var
  1898. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1899. label
  1900. returnLargerSignificand;
  1901. begin
  1902. aIsNaN := float128_is_nan( a );
  1903. aIsSignalingNaN := float128_is_signaling_nan( a );
  1904. bIsNaN := float128_is_nan( b );
  1905. bIsSignalingNaN := float128_is_signaling_nan( b );
  1906. a.high := a.high or int64( $0000800000000000 );
  1907. b.high := b.high or int64( $0000800000000000 );
  1908. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1909. float_raise( float_flag_invalid );
  1910. if ( aIsSignalingNaN )<>0 then
  1911. begin
  1912. if ( bIsSignalingNaN )<>0 then
  1913. goto returnLargerSignificand;
  1914. if bIsNaN<>0 then
  1915. result := b
  1916. else
  1917. result := a;
  1918. exit;
  1919. end
  1920. else if ( aIsNaN )<>0 then
  1921. begin
  1922. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1923. begin
  1924. result := a;
  1925. exit;
  1926. end;
  1927. returnLargerSignificand:
  1928. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1929. begin
  1930. result := b;
  1931. exit;
  1932. end;
  1933. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1934. begin
  1935. result := a;
  1936. exit
  1937. end;
  1938. if ( a.high < b.high ) then
  1939. result := a
  1940. else
  1941. result := b;
  1942. exit;
  1943. end
  1944. else
  1945. result:=b;
  1946. end;
  1947. {$ELSE}
  1948. { Big endian code }
  1949. (*----------------------------------------------------------------------------
  1950. | Internal canonical NaN format.
  1951. *----------------------------------------------------------------------------*)
  1952. type
  1953. commonNANT = packed record
  1954. sign : flag;
  1955. high, low : bits32;
  1956. end;
  1957. (*----------------------------------------------------------------------------
  1958. | The pattern for a default generated single-precision NaN.
  1959. *----------------------------------------------------------------------------*)
  1960. const float32_default_nan = $7FFFFFFF;
  1961. (*----------------------------------------------------------------------------
  1962. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1963. | otherwise returns 0.
  1964. *----------------------------------------------------------------------------*)
  1965. function float32_is_nan(a: float32): flag;
  1966. begin
  1967. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1968. end;
  1969. (*----------------------------------------------------------------------------
  1970. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1971. | NaN; otherwise returns 0.
  1972. *----------------------------------------------------------------------------*)
  1973. function float32_is_signaling_nan(a: float32):flag;
  1974. begin
  1975. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1976. end;
  1977. (*----------------------------------------------------------------------------
  1978. | Returns the result of converting the single-precision floating-point NaN
  1979. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1980. | exception is raised.
  1981. *----------------------------------------------------------------------------*)
  1982. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1983. var
  1984. z: commonNANT;
  1985. begin
  1986. if float32_is_signaling_nan(a)<>0 then
  1987. float_raise(float_flag_invalid);
  1988. z.sign := a shr 31;
  1989. z.low := 0;
  1990. z.high := a shl 9;
  1991. c:=z;
  1992. end;
  1993. (*----------------------------------------------------------------------------
  1994. | Returns the result of converting the canonical NaN `a' to the single-
  1995. | precision floating-point format.
  1996. *----------------------------------------------------------------------------*)
  1997. function CommonNanToFloat32(a : CommonNaNT): float32;
  1998. begin
  1999. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2000. end;
  2001. (*----------------------------------------------------------------------------
  2002. | Takes two single-precision floating-point values `a' and `b', one of which
  2003. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2004. | signaling NaN, the invalid exception is raised.
  2005. *----------------------------------------------------------------------------*)
  2006. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2007. var
  2008. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2009. begin
  2010. aIsNaN := float32_is_nan( a );
  2011. aIsSignalingNaN := float32_is_signaling_nan( a );
  2012. bIsNaN := float32_is_nan( b );
  2013. bIsSignalingNaN := float32_is_signaling_nan( b );
  2014. a := a or $00400000;
  2015. b := b or $00400000;
  2016. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2017. float_raise( float_flag_invalid );
  2018. if bIsSignalingNaN<>0 then
  2019. propagateFloat32Nan := b
  2020. else if aIsSignalingNan<>0 then
  2021. propagateFloat32Nan := a
  2022. else if bIsNan<>0 then
  2023. propagateFloat32Nan := b
  2024. else
  2025. propagateFloat32Nan := a;
  2026. end;
  2027. (*----------------------------------------------------------------------------
  2028. | The pattern for a default generated double-precision NaN. The `high' and
  2029. | `low' values hold the most- and least-significant bits, respectively.
  2030. *----------------------------------------------------------------------------*)
  2031. const
  2032. float64_default_nan_high = $7FFFFFFF;
  2033. float64_default_nan_low = $FFFFFFFF;
  2034. (*----------------------------------------------------------------------------
  2035. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2036. | otherwise returns 0.
  2037. *----------------------------------------------------------------------------*)
  2038. function float64_is_nan(a: float64): flag;
  2039. begin
  2040. float64_is_nan := flag (
  2041. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2042. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2043. end;
  2044. (*----------------------------------------------------------------------------
  2045. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2046. | NaN; otherwise returns 0.
  2047. *----------------------------------------------------------------------------*)
  2048. function float64_is_signaling_nan( a:float64): flag;
  2049. begin
  2050. float64_is_signaling_nan := flag(
  2051. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2052. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2053. end;
  2054. (*----------------------------------------------------------------------------
  2055. | Returns the result of converting the double-precision floating-point NaN
  2056. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2057. | exception is raised.
  2058. *----------------------------------------------------------------------------*)
  2059. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2060. var
  2061. z : commonNaNT;
  2062. begin
  2063. if ( float64_is_signaling_nan( a )<>0 ) then
  2064. float_raise( float_flag_invalid );
  2065. z.sign := a.high shr 31;
  2066. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2067. c:=z;
  2068. end;
  2069. (*----------------------------------------------------------------------------
  2070. | Returns the result of converting the canonical NaN `a' to the double-
  2071. | precision floating-point format.
  2072. *----------------------------------------------------------------------------*)
  2073. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2074. var
  2075. z: float64;
  2076. begin
  2077. shift64Right( a.high, a.low, 12, z.high, z.low );
  2078. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2079. c:=z;
  2080. end;
  2081. (*----------------------------------------------------------------------------
  2082. | Takes two double-precision floating-point values `a' and `b', one of which
  2083. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2084. | signaling NaN, the invalid exception is raised.
  2085. *----------------------------------------------------------------------------*)
  2086. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2087. var
  2088. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2089. begin
  2090. aIsNaN := float64_is_nan( a );
  2091. aIsSignalingNaN := float64_is_signaling_nan( a );
  2092. bIsNaN := float64_is_nan( b );
  2093. bIsSignalingNaN := float64_is_signaling_nan( b );
  2094. a.high := a.high or $00080000;
  2095. b.high := b.high or $00080000;
  2096. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2097. float_raise( float_flag_invalid );
  2098. if bIsSignalingNaN<>0 then
  2099. c := b
  2100. else if aIsSignalingNan<>0 then
  2101. c := a
  2102. else if bIsNan<>0 then
  2103. c := b
  2104. else
  2105. c := a;
  2106. end;
  2107. {$ENDIF}
  2108. (****************************************************************************)
  2109. (* END ENDIAN SPECIFIC CODE *)
  2110. (****************************************************************************)
  2111. {*
  2112. -------------------------------------------------------------------------------
  2113. Returns the fraction bits of the single-precision floating-point value `a'.
  2114. -------------------------------------------------------------------------------
  2115. *}
  2116. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2117. Begin
  2118. ExtractFloat32Frac := A AND $007FFFFF;
  2119. End;
  2120. {*
  2121. -------------------------------------------------------------------------------
  2122. Returns the exponent bits of the single-precision floating-point value `a'.
  2123. -------------------------------------------------------------------------------
  2124. *}
  2125. Function extractFloat32Exp( a: float32 ): Int16;
  2126. Begin
  2127. extractFloat32Exp := (a shr 23) AND $FF;
  2128. End;
  2129. {*
  2130. -------------------------------------------------------------------------------
  2131. Returns the sign bit of the single-precision floating-point value `a'.
  2132. -------------------------------------------------------------------------------
  2133. *}
  2134. Function extractFloat32Sign( a: float32 ): Flag;
  2135. Begin
  2136. extractFloat32Sign := a shr 31;
  2137. End;
  2138. {*
  2139. -------------------------------------------------------------------------------
  2140. Normalizes the subnormal single-precision floating-point value represented
  2141. by the denormalized significand `aSig'. The normalized exponent and
  2142. significand are stored at the locations pointed to by `zExpPtr' and
  2143. `zSigPtr', respectively.
  2144. -------------------------------------------------------------------------------
  2145. *}
  2146. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2147. Var
  2148. ShiftCount : BYTE;
  2149. Begin
  2150. shiftCount := countLeadingZeros32( aSig ) - 8;
  2151. zSigPtr := aSig shl shiftCount;
  2152. zExpPtr := 1 - shiftCount;
  2153. End;
  2154. {*
  2155. -------------------------------------------------------------------------------
  2156. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2157. single-precision floating-point value, returning the result. After being
  2158. shifted into the proper positions, the three fields are simply added
  2159. together to form the result. This means that any integer portion of `zSig'
  2160. will be added into the exponent. Since a properly normalized significand
  2161. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2162. than the desired result exponent whenever `zSig' is a complete, normalized
  2163. significand.
  2164. -------------------------------------------------------------------------------
  2165. *}
  2166. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2167. Begin
  2168. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2169. + zSig;
  2170. End;
  2171. {*
  2172. -------------------------------------------------------------------------------
  2173. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2174. and significand `zSig', and returns the proper single-precision floating-
  2175. point value corresponding to the abstract input. Ordinarily, the abstract
  2176. value is simply rounded and packed into the single-precision format, with
  2177. the inexact exception raised if the abstract input cannot be represented
  2178. exactly. However, if the abstract value is too large, the overflow and
  2179. inexact exceptions are raised and an infinity or maximal finite value is
  2180. returned. If the abstract value is too small, the input value is rounded to
  2181. a subnormal number, and the underflow and inexact exceptions are raised if
  2182. the abstract input cannot be represented exactly as a subnormal single-
  2183. precision floating-point number.
  2184. The input significand `zSig' has its binary point between bits 30
  2185. and 29, which is 7 bits to the left of the usual location. This shifted
  2186. significand must be normalized or smaller. If `zSig' is not normalized,
  2187. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2188. and it must not require rounding. In the usual case that `zSig' is
  2189. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2190. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2191. Binary Floating-Point Arithmetic.
  2192. -------------------------------------------------------------------------------
  2193. *}
  2194. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2195. Var
  2196. roundingMode : BYTE;
  2197. roundNearestEven : Flag;
  2198. roundIncrement, roundBits : BYTE;
  2199. IsTiny : Flag;
  2200. Begin
  2201. roundingMode := float_rounding_mode;
  2202. if (roundingMode = float_round_nearest_even) then
  2203. Begin
  2204. roundNearestEven := Flag(TRUE);
  2205. end
  2206. else
  2207. roundNearestEven := Flag(FALSE);
  2208. roundIncrement := $40;
  2209. if ( Boolean(roundNearestEven) = FALSE) then
  2210. Begin
  2211. if ( roundingMode = float_round_to_zero ) Then
  2212. Begin
  2213. roundIncrement := 0;
  2214. End
  2215. else
  2216. Begin
  2217. roundIncrement := $7F;
  2218. if ( zSign <> 0 ) then
  2219. Begin
  2220. if roundingMode = float_round_up then roundIncrement := 0;
  2221. End
  2222. else
  2223. Begin
  2224. if roundingMode = float_round_down then roundIncrement := 0;
  2225. End;
  2226. End
  2227. End;
  2228. roundBits := zSig AND $7F;
  2229. if ($FD <= bits16 (zExp) ) then
  2230. Begin
  2231. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2232. Begin
  2233. float_raise( float_flag_overflow OR float_flag_inexact );
  2234. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2235. exit;
  2236. End;
  2237. if ( zExp < 0 ) then
  2238. Begin
  2239. isTiny :=
  2240. flag(( float_detect_tininess = float_tininess_before_rounding )
  2241. OR ( zExp < -1 )
  2242. OR ( (zSig + roundIncrement) < $80000000 ));
  2243. shift32RightJamming( zSig, - zExp, zSig );
  2244. zExp := 0;
  2245. roundBits := zSig AND $7F;
  2246. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2247. float_raise( float_flag_underflow );
  2248. End;
  2249. End;
  2250. if ( roundBits )<> 0 then
  2251. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2252. zSig := ( zSig + roundIncrement ) shr 7;
  2253. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2254. if ( zSig = 0 ) then zExp := 0;
  2255. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2256. exit;
  2257. End;
  2258. {*
  2259. -------------------------------------------------------------------------------
  2260. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2261. and significand `zSig', and returns the proper single-precision floating-
  2262. point value corresponding to the abstract input. This routine is just like
  2263. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2264. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2265. floating-point exponent.
  2266. -------------------------------------------------------------------------------
  2267. *}
  2268. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2269. Var
  2270. ShiftCount : int8;
  2271. Begin
  2272. shiftCount := countLeadingZeros32( zSig ) - 1;
  2273. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2274. End;
  2275. {*
  2276. -------------------------------------------------------------------------------
  2277. Returns the most-significant 20 fraction bits of the double-precision
  2278. floating-point value `a'.
  2279. -------------------------------------------------------------------------------
  2280. *}
  2281. Function extractFloat64Frac0(a: float64): bits32;
  2282. Begin
  2283. extractFloat64Frac0 := a.high and $000FFFFF;
  2284. End;
  2285. {*
  2286. -------------------------------------------------------------------------------
  2287. Returns the least-significant 32 fraction bits of the double-precision
  2288. floating-point value `a'.
  2289. -------------------------------------------------------------------------------
  2290. *}
  2291. Function extractFloat64Frac1(a: float64): bits32;
  2292. Begin
  2293. extractFloat64Frac1 := a.low;
  2294. End;
  2295. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2296. Function extractFloat64Frac(a: float64): bits64;
  2297. Begin
  2298. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2299. End;
  2300. {*
  2301. -------------------------------------------------------------------------------
  2302. Returns the exponent bits of the double-precision floating-point value `a'.
  2303. -------------------------------------------------------------------------------
  2304. *}
  2305. Function extractFloat64Exp(a: float64): int16;
  2306. Begin
  2307. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2308. End;
  2309. {*
  2310. -------------------------------------------------------------------------------
  2311. Returns the sign bit of the double-precision floating-point value `a'.
  2312. -------------------------------------------------------------------------------
  2313. *}
  2314. Function extractFloat64Sign(a: float64) : flag;
  2315. Begin
  2316. extractFloat64Sign := a.high shr 31;
  2317. End;
  2318. {*
  2319. -------------------------------------------------------------------------------
  2320. Normalizes the subnormal double-precision floating-point value represented
  2321. by the denormalized significand formed by the concatenation of `aSig0' and
  2322. `aSig1'. The normalized exponent is stored at the location pointed to by
  2323. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2324. stored at the location pointed to by `zSig0Ptr', and the least significant
  2325. 32 bits of the normalized significand are stored at the location pointed to
  2326. by `zSig1Ptr'.
  2327. -------------------------------------------------------------------------------
  2328. *}
  2329. Procedure normalizeFloat64Subnormal(
  2330. aSig0: bits32;
  2331. aSig1: bits32;
  2332. VAR zExpPtr : Int16;
  2333. VAR zSig0Ptr : Bits32;
  2334. VAR zSig1Ptr : Bits32
  2335. );
  2336. Var
  2337. ShiftCount : Int8;
  2338. Begin
  2339. if ( aSig0 = 0 ) then
  2340. Begin
  2341. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2342. if ( shiftCount < 0 ) then
  2343. Begin
  2344. zSig0Ptr := aSig1 shr ( - shiftCount );
  2345. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2346. End
  2347. else
  2348. Begin
  2349. zSig0Ptr := aSig1 shl shiftCount;
  2350. zSig1Ptr := 0;
  2351. End;
  2352. zExpPtr := - shiftCount - 31;
  2353. End
  2354. else
  2355. Begin
  2356. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2357. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2358. zExpPtr := 1 - shiftCount;
  2359. End;
  2360. End;
  2361. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2362. var
  2363. shiftCount : int8;
  2364. begin
  2365. shiftCount := countLeadingZeros64( aSig ) - 11;
  2366. zSigPtr := aSig shl shiftCount;
  2367. zExpPtr := 1 - shiftCount;
  2368. end;
  2369. {*
  2370. -------------------------------------------------------------------------------
  2371. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2372. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2373. point value, returning the result. After being shifted into the proper
  2374. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2375. together to form the most significant 32 bits of the result. This means
  2376. that any integer portion of `zSig0' will be added into the exponent. Since
  2377. a properly normalized significand will have an integer portion equal to 1,
  2378. the `zExp' input should be 1 less than the desired result exponent whenever
  2379. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2380. -------------------------------------------------------------------------------
  2381. *}
  2382. Procedure
  2383. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2384. var
  2385. z: Float64;
  2386. Begin
  2387. z.low := zSig1;
  2388. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2389. c := z;
  2390. End;
  2391. {*----------------------------------------------------------------------------
  2392. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2393. | double-precision floating-point value, returning the result. After being
  2394. | shifted into the proper positions, the three fields are simply added
  2395. | together to form the result. This means that any integer portion of `zSig'
  2396. | will be added into the exponent. Since a properly normalized significand
  2397. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2398. | than the desired result exponent whenever `zSig' is a complete, normalized
  2399. | significand.
  2400. *----------------------------------------------------------------------------*}
  2401. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2402. begin
  2403. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2404. end;
  2405. {*
  2406. -------------------------------------------------------------------------------
  2407. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2408. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2409. and `zSig2', and returns the proper double-precision floating-point value
  2410. corresponding to the abstract input. Ordinarily, the abstract value is
  2411. simply rounded and packed into the double-precision format, with the inexact
  2412. exception raised if the abstract input cannot be represented exactly.
  2413. However, if the abstract value is too large, the overflow and inexact
  2414. exceptions are raised and an infinity or maximal finite value is returned.
  2415. If the abstract value is too small, the input value is rounded to a
  2416. subnormal number, and the underflow and inexact exceptions are raised if the
  2417. abstract input cannot be represented exactly as a subnormal double-precision
  2418. floating-point number.
  2419. The input significand must be normalized or smaller. If the input
  2420. significand is not normalized, `zExp' must be 0; in that case, the result
  2421. returned is a subnormal number, and it must not require rounding. In the
  2422. usual case that the input significand is normalized, `zExp' must be 1 less
  2423. than the ``true'' floating-point exponent. The handling of underflow and
  2424. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2425. -------------------------------------------------------------------------------
  2426. *}
  2427. Procedure
  2428. roundAndPackFloat64(
  2429. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2430. Var
  2431. roundingMode : Int8;
  2432. roundNearestEven, increment, isTiny : Flag;
  2433. Begin
  2434. roundingMode := float_rounding_mode;
  2435. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2436. increment := flag( sbits32 (zSig2) < 0 );
  2437. if ( roundNearestEven = flag(FALSE) ) then
  2438. Begin
  2439. if ( roundingMode = float_round_to_zero ) then
  2440. increment := 0
  2441. else
  2442. Begin
  2443. if ( zSign )<> 0 then
  2444. Begin
  2445. increment := flag( roundingMode = float_round_down ) and zSig2;
  2446. End
  2447. else
  2448. Begin
  2449. increment := flag( roundingMode = float_round_up ) and zSig2;
  2450. End
  2451. End
  2452. End;
  2453. if ( $7FD <= bits16 (zExp) ) then
  2454. Begin
  2455. if (( $7FD < zExp )
  2456. or (( zExp = $7FD )
  2457. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2458. and (increment<>0)
  2459. )
  2460. ) then
  2461. Begin
  2462. float_raise( float_flag_overflow OR float_flag_inexact );
  2463. if (( roundingMode = float_round_to_zero )
  2464. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2465. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2466. ) then
  2467. Begin
  2468. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2469. exit;
  2470. End;
  2471. packFloat64( zSign, $7FF, 0, 0, c );
  2472. exit;
  2473. End;
  2474. if ( zExp < 0 ) then
  2475. Begin
  2476. isTiny :=
  2477. flag( float_detect_tininess = float_tininess_before_rounding )
  2478. or flag( zExp < -1 )
  2479. or flag(increment = 0)
  2480. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2481. shift64ExtraRightJamming(
  2482. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2483. zExp := 0;
  2484. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2485. if ( roundNearestEven )<>0 then
  2486. Begin
  2487. increment := flag( sbits32 (zSig2) < 0 );
  2488. End
  2489. else
  2490. Begin
  2491. if ( zSign )<>0 then
  2492. Begin
  2493. increment := flag( roundingMode = float_round_down ) and zSig2;
  2494. End
  2495. else
  2496. Begin
  2497. increment := flag( roundingMode = float_round_up ) and zSig2;
  2498. End
  2499. End;
  2500. End;
  2501. End;
  2502. if ( zSig2 )<>0 then
  2503. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2504. if ( increment )<>0 then
  2505. Begin
  2506. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2507. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2508. End
  2509. else
  2510. Begin
  2511. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2512. End;
  2513. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2514. End;
  2515. {*----------------------------------------------------------------------------
  2516. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2517. | and significand `zSig', and returns the proper double-precision floating-
  2518. | point value corresponding to the abstract input. Ordinarily, the abstract
  2519. | value is simply rounded and packed into the double-precision format, with
  2520. | the inexact exception raised if the abstract input cannot be represented
  2521. | exactly. However, if the abstract value is too large, the overflow and
  2522. | inexact exceptions are raised and an infinity or maximal finite value is
  2523. | returned. If the abstract value is too small, the input value is rounded
  2524. | to a subnormal number, and the underflow and inexact exceptions are raised
  2525. | if the abstract input cannot be represented exactly as a subnormal double-
  2526. | precision floating-point number.
  2527. | The input significand `zSig' has its binary point between bits 62
  2528. | and 61, which is 10 bits to the left of the usual location. This shifted
  2529. | significand must be normalized or smaller. If `zSig' is not normalized,
  2530. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2531. | and it must not require rounding. In the usual case that `zSig' is
  2532. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2533. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2534. | Binary Floating-Point Arithmetic.
  2535. *----------------------------------------------------------------------------*}
  2536. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2537. var
  2538. roundingMode: int8;
  2539. roundNearestEven: flag;
  2540. roundIncrement, roundBits: int16;
  2541. isTiny: flag;
  2542. begin
  2543. roundingMode := float_rounding_mode;
  2544. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2545. roundIncrement := $200;
  2546. if ( roundNearestEven=0 ) then
  2547. begin
  2548. if ( roundingMode = float_round_to_zero ) then
  2549. begin
  2550. roundIncrement := 0;
  2551. end
  2552. else begin
  2553. roundIncrement := $3FF;
  2554. if ( zSign<>0 ) then
  2555. begin
  2556. if ( roundingMode = float_round_up ) then
  2557. roundIncrement := 0;
  2558. end
  2559. else begin
  2560. if ( roundingMode = float_round_down ) then
  2561. roundIncrement := 0;
  2562. end
  2563. end
  2564. end;
  2565. roundBits := zSig and $3FF;
  2566. if ( $7FD <= bits16(zExp) ) then
  2567. begin
  2568. if ( ( $7FD < zExp )
  2569. or ( ( zExp = $7FD )
  2570. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2571. ) then
  2572. begin
  2573. float_raise( float_flag_overflow or float_flag_inexact );
  2574. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2575. exit;
  2576. end;
  2577. if ( zExp < 0 ) then
  2578. begin
  2579. isTiny := ord(
  2580. ( float_detect_tininess = float_tininess_before_rounding )
  2581. or ( zExp < -1 )
  2582. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2583. shift64RightJamming( zSig, - zExp, zSig );
  2584. zExp := 0;
  2585. roundBits := zSig and $3FF;
  2586. if ( isTiny and roundBits )<>0 then
  2587. float_raise( float_flag_underflow );
  2588. end
  2589. end;
  2590. if ( roundBits<>0 ) then
  2591. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2592. zSig := ( zSig + roundIncrement ) shr 10;
  2593. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2594. if ( zSig = 0 ) then
  2595. zExp := 0;
  2596. result:=packFloat64( zSign, zExp, zSig );
  2597. end;
  2598. {*
  2599. -------------------------------------------------------------------------------
  2600. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2601. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2602. returns the proper double-precision floating-point value corresponding
  2603. to the abstract input. This routine is just like `roundAndPackFloat64'
  2604. except that the input significand has fewer bits and does not have to be
  2605. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2606. point exponent.
  2607. -------------------------------------------------------------------------------
  2608. *}
  2609. Procedure
  2610. normalizeRoundAndPackFloat64(
  2611. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2612. Var
  2613. shiftCount : int8;
  2614. zSig2 : bits32;
  2615. Begin
  2616. if ( zSig0 = 0 ) then
  2617. Begin
  2618. zSig0 := zSig1;
  2619. zSig1 := 0;
  2620. zExp := zExp -32;
  2621. End;
  2622. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2623. if ( 0 <= shiftCount ) then
  2624. Begin
  2625. zSig2 := 0;
  2626. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2627. End
  2628. else
  2629. Begin
  2630. shift64ExtraRightJamming
  2631. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2632. End;
  2633. zExp := zExp - shiftCount;
  2634. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2635. End;
  2636. {*
  2637. -------------------------------------------------------------------------------
  2638. Returns the result of converting the 32-bit two's complement integer `a' to
  2639. the single-precision floating-point format. The conversion is performed
  2640. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2641. -------------------------------------------------------------------------------
  2642. *}
  2643. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2644. Var
  2645. zSign : Flag;
  2646. Begin
  2647. if ( a = 0 ) then
  2648. Begin
  2649. int32_to_float32.float32 := 0;
  2650. exit;
  2651. End;
  2652. if ( a = sbits32 ($80000000) ) then
  2653. Begin
  2654. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2655. exit;
  2656. end;
  2657. zSign := flag( a < 0 );
  2658. If zSign<>0 then
  2659. a := -a;
  2660. int32_to_float32.float32:=
  2661. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2662. End;
  2663. {*
  2664. -------------------------------------------------------------------------------
  2665. Returns the result of converting the 32-bit two's complement integer `a' to
  2666. the double-precision floating-point format. The conversion is performed
  2667. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2668. -------------------------------------------------------------------------------
  2669. *}
  2670. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2671. var
  2672. zSign : flag;
  2673. absA : bits32;
  2674. shiftCount : int8;
  2675. zSig0, zSig1 : bits32;
  2676. Begin
  2677. if ( a = 0 ) then
  2678. Begin
  2679. packFloat64( 0, 0, 0, 0, result );
  2680. exit;
  2681. end;
  2682. zSign := flag( a < 0 );
  2683. if ZSign<>0 then
  2684. AbsA := -a
  2685. else
  2686. AbsA := a;
  2687. shiftCount := countLeadingZeros32( absA ) - 11;
  2688. if ( 0 <= shiftCount ) then
  2689. Begin
  2690. zSig0 := absA shl shiftCount;
  2691. zSig1 := 0;
  2692. End
  2693. else
  2694. Begin
  2695. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2696. End;
  2697. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2698. End;
  2699. {*
  2700. -------------------------------------------------------------------------------
  2701. Returns the result of converting the single-precision floating-point value
  2702. `a' to the 32-bit two's complement integer format. The conversion is
  2703. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2704. Arithmetic---which means in particular that the conversion is rounded
  2705. according to the current rounding mode. If `a' is a NaN, the largest
  2706. positive integer is returned. Otherwise, if the conversion overflows, the
  2707. largest integer with the same sign as `a' is returned.
  2708. -------------------------------------------------------------------------------
  2709. *}
  2710. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2711. Var
  2712. aSign: flag;
  2713. aExp, shiftCount: int16;
  2714. aSig, aSigExtra: bits32;
  2715. z: int32;
  2716. roundingMode: int8;
  2717. Begin
  2718. aSig := extractFloat32Frac( a.float32 );
  2719. aExp := extractFloat32Exp( a.float32 );
  2720. aSign := extractFloat32Sign( a.float32 );
  2721. shiftCount := aExp - $96;
  2722. if ( 0 <= shiftCount ) then
  2723. Begin
  2724. if ( $9E <= aExp ) then
  2725. Begin
  2726. if ( a.float32 <> $CF000000 ) then
  2727. Begin
  2728. float_raise( float_flag_invalid );
  2729. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2730. Begin
  2731. float32_to_int32 := $7FFFFFFF;
  2732. exit;
  2733. End;
  2734. End;
  2735. float32_to_int32 := sbits32 ($80000000);
  2736. exit;
  2737. End;
  2738. z := ( aSig or $00800000 ) shl shiftCount;
  2739. if ( aSign<>0 ) then z := - z;
  2740. End
  2741. else
  2742. Begin
  2743. if ( aExp < $7E ) then
  2744. Begin
  2745. aSigExtra := aExp OR aSig;
  2746. z := 0;
  2747. End
  2748. else
  2749. Begin
  2750. aSig := aSig OR $00800000;
  2751. aSigExtra := aSig shl ( shiftCount and 31 );
  2752. z := aSig shr ( - shiftCount );
  2753. End;
  2754. if ( aSigExtra<>0 ) then
  2755. softfloat_exception_flags := softfloat_exception_flags
  2756. or float_flag_inexact;
  2757. roundingMode := float_rounding_mode;
  2758. if ( roundingMode = float_round_nearest_even ) then
  2759. Begin
  2760. if ( sbits32 (aSigExtra) < 0 ) then
  2761. Begin
  2762. Inc(z);
  2763. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2764. z := z and not 1;
  2765. End;
  2766. if ( aSign<>0 ) then
  2767. z := - z;
  2768. End
  2769. else
  2770. Begin
  2771. aSigExtra := flag( aSigExtra <> 0 );
  2772. if ( aSign<>0 ) then
  2773. Begin
  2774. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2775. z := - z;
  2776. End
  2777. else
  2778. Begin
  2779. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2780. End
  2781. End;
  2782. End;
  2783. float32_to_int32 := z;
  2784. End;
  2785. {*
  2786. -------------------------------------------------------------------------------
  2787. Returns the result of converting the single-precision floating-point value
  2788. `a' to the 32-bit two's complement integer format. The conversion is
  2789. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2790. Arithmetic, except that the conversion is always rounded toward zero.
  2791. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2792. the conversion overflows, the largest integer with the same sign as `a' is
  2793. returned.
  2794. -------------------------------------------------------------------------------
  2795. *}
  2796. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2797. Var
  2798. aSign : flag;
  2799. aExp, shiftCount : int16;
  2800. aSig : bits32;
  2801. z : int32;
  2802. Begin
  2803. aSig := extractFloat32Frac( a.float32 );
  2804. aExp := extractFloat32Exp( a.float32 );
  2805. aSign := extractFloat32Sign( a.float32 );
  2806. shiftCount := aExp - $9E;
  2807. if ( 0 <= shiftCount ) then
  2808. Begin
  2809. if ( a.float32 <> $CF000000 ) then
  2810. Begin
  2811. float_raise( float_flag_invalid );
  2812. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2813. Begin
  2814. float32_to_int32_round_to_zero := $7FFFFFFF;
  2815. exit;
  2816. end;
  2817. End;
  2818. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2819. exit;
  2820. End
  2821. else
  2822. if ( aExp <= $7E ) then
  2823. Begin
  2824. if ( aExp or aSig )<>0 then
  2825. softfloat_exception_flags :=
  2826. softfloat_exception_flags or float_flag_inexact;
  2827. float32_to_int32_round_to_zero := 0;
  2828. exit;
  2829. End;
  2830. aSig := ( aSig or $00800000 ) shl 8;
  2831. z := aSig shr ( - shiftCount );
  2832. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2833. Begin
  2834. softfloat_exception_flags :=
  2835. softfloat_exception_flags or float_flag_inexact;
  2836. End;
  2837. if ( aSign<>0 ) then z := - z;
  2838. float32_to_int32_round_to_zero := z;
  2839. End;
  2840. {*
  2841. -------------------------------------------------------------------------------
  2842. Returns the result of converting the single-precision floating-point value
  2843. `a' to the double-precision floating-point format. The conversion is
  2844. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2845. Arithmetic.
  2846. -------------------------------------------------------------------------------
  2847. *}
  2848. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2849. Var
  2850. aSign : flag;
  2851. aExp : int16;
  2852. aSig, zSig0, zSig1: bits32;
  2853. tmp : CommonNanT;
  2854. Begin
  2855. aSig := extractFloat32Frac( a.float32 );
  2856. aExp := extractFloat32Exp( a.float32 );
  2857. aSign := extractFloat32Sign( a.float32 );
  2858. if ( aExp = $FF ) then
  2859. Begin
  2860. if ( aSig<>0 ) then
  2861. Begin
  2862. float32ToCommonNaN(a.float32, tmp);
  2863. commonNaNToFloat64(tmp , result);
  2864. exit;
  2865. End;
  2866. packFloat64( aSign, $7FF, 0, 0, result);
  2867. exit;
  2868. End;
  2869. if ( aExp = 0 ) then
  2870. Begin
  2871. if ( aSig = 0 ) then
  2872. Begin
  2873. packFloat64( aSign, 0, 0, 0, result );
  2874. exit;
  2875. end;
  2876. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2877. Dec(aExp);
  2878. End;
  2879. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2880. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2881. End;
  2882. {*
  2883. -------------------------------------------------------------------------------
  2884. Rounds the single-precision floating-point value `a' to an integer,
  2885. and returns the result as a single-precision floating-point value. The
  2886. operation is performed according to the IEC/IEEE Standard for Binary
  2887. Floating-Point Arithmetic.
  2888. -------------------------------------------------------------------------------
  2889. *}
  2890. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2891. Var
  2892. aSign: flag;
  2893. aExp: int16;
  2894. lastBitMask, roundBitsMask: bits32;
  2895. roundingMode: int8;
  2896. z: float32;
  2897. Begin
  2898. aExp := extractFloat32Exp( a.float32 );
  2899. if ( $96 <= aExp ) then
  2900. Begin
  2901. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2902. Begin
  2903. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2904. exit;
  2905. End;
  2906. float32_round_to_int:=a;
  2907. exit;
  2908. End;
  2909. if ( aExp <= $7E ) then
  2910. Begin
  2911. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2912. Begin
  2913. float32_round_to_int:=a;
  2914. exit;
  2915. end;
  2916. softfloat_exception_flags
  2917. := softfloat_exception_flags OR float_flag_inexact;
  2918. aSign := extractFloat32Sign( a.float32 );
  2919. case ( float_rounding_mode ) of
  2920. float_round_nearest_even:
  2921. Begin
  2922. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2923. Begin
  2924. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2925. exit;
  2926. End;
  2927. End;
  2928. float_round_down:
  2929. Begin
  2930. if aSign <> 0 then
  2931. float32_round_to_int.float32 := $BF800000
  2932. else
  2933. float32_round_to_int.float32 := 0;
  2934. exit;
  2935. End;
  2936. float_round_up:
  2937. Begin
  2938. if aSign <> 0 then
  2939. float32_round_to_int.float32 := $80000000
  2940. else
  2941. float32_round_to_int.float32 := $3F800000;
  2942. exit;
  2943. End;
  2944. end;
  2945. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2946. End;
  2947. lastBitMask := 1;
  2948. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2949. lastBitMask := lastBitMask shl ($96 - aExp);
  2950. roundBitsMask := lastBitMask - 1;
  2951. z := a.float32;
  2952. roundingMode := float_rounding_mode;
  2953. if ( roundingMode = float_round_nearest_even ) then
  2954. Begin
  2955. z := z + (lastBitMask shr 1);
  2956. if ( ( z and roundBitsMask ) = 0 ) then
  2957. z := z and not lastBitMask;
  2958. End
  2959. else if ( roundingMode <> float_round_to_zero ) then
  2960. Begin
  2961. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2962. Begin
  2963. z := z + roundBitsMask;
  2964. End;
  2965. End;
  2966. z := z and not roundBitsMask;
  2967. if ( z <> a.float32 ) then
  2968. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2969. float32_round_to_int.float32 := z;
  2970. End;
  2971. {*
  2972. -------------------------------------------------------------------------------
  2973. Returns the result of adding the absolute values of the single-precision
  2974. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2975. before being returned. `zSign' is ignored if the result is a NaN.
  2976. The addition is performed according to the IEC/IEEE Standard for Binary
  2977. Floating-Point Arithmetic.
  2978. -------------------------------------------------------------------------------
  2979. *}
  2980. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2981. Var
  2982. aExp, bExp, zExp: int16;
  2983. aSig, bSig, zSig: bits32;
  2984. expDiff: int16;
  2985. label roundAndPack;
  2986. Begin
  2987. aSig:=extractFloat32Frac( a );
  2988. aExp:=extractFloat32Exp( a );
  2989. bSig:=extractFloat32Frac( b );
  2990. bExp := extractFloat32Exp( b );
  2991. expDiff := aExp - bExp;
  2992. aSig := aSig shl 6;
  2993. bSig := bSig shl 6;
  2994. if ( 0 < expDiff ) then
  2995. Begin
  2996. if ( aExp = $FF ) then
  2997. Begin
  2998. if ( aSig <> 0) then
  2999. Begin
  3000. addFloat32Sigs := propagateFloat32NaN( a, b );
  3001. exit;
  3002. End;
  3003. addFloat32Sigs := a;
  3004. exit;
  3005. End;
  3006. if ( bExp = 0 ) then
  3007. Begin
  3008. Dec(expDiff);
  3009. End
  3010. else
  3011. Begin
  3012. bSig := bSig or $20000000;
  3013. End;
  3014. shift32RightJamming( bSig, expDiff, bSig );
  3015. zExp := aExp;
  3016. End
  3017. else
  3018. If ( expDiff < 0 ) then
  3019. Begin
  3020. if ( bExp = $FF ) then
  3021. Begin
  3022. if ( bSig<>0 ) then
  3023. Begin
  3024. addFloat32Sigs := propagateFloat32NaN( a, b );
  3025. exit;
  3026. end;
  3027. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3028. exit;
  3029. End;
  3030. if ( aExp = 0 ) then
  3031. Begin
  3032. Inc(expDiff);
  3033. End
  3034. else
  3035. Begin
  3036. aSig := aSig OR $20000000;
  3037. End;
  3038. shift32RightJamming( aSig, - expDiff, aSig );
  3039. zExp := bExp;
  3040. End
  3041. else
  3042. Begin
  3043. if ( aExp = $FF ) then
  3044. Begin
  3045. if ( aSig OR bSig )<> 0 then
  3046. Begin
  3047. addFloat32Sigs := propagateFloat32NaN( a, b );
  3048. exit;
  3049. end;
  3050. addFloat32Sigs := a;
  3051. exit;
  3052. End;
  3053. if ( aExp = 0 ) then
  3054. Begin
  3055. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3056. exit;
  3057. end;
  3058. zSig := $40000000 + aSig + bSig;
  3059. zExp := aExp;
  3060. goto roundAndPack;
  3061. End;
  3062. aSig := aSig OR $20000000;
  3063. zSig := ( aSig + bSig ) shl 1;
  3064. Dec(zExp);
  3065. if ( sbits32 (zSig) < 0 ) then
  3066. Begin
  3067. zSig := aSig + bSig;
  3068. Inc(zExp);
  3069. End;
  3070. roundAndPack:
  3071. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3072. End;
  3073. {*
  3074. -------------------------------------------------------------------------------
  3075. Returns the result of subtracting the absolute values of the single-
  3076. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3077. difference is negated before being returned. `zSign' is ignored if the
  3078. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3079. Standard for Binary Floating-Point Arithmetic.
  3080. -------------------------------------------------------------------------------
  3081. *}
  3082. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3083. Var
  3084. aExp, bExp, zExp: int16;
  3085. aSig, bSig, zSig: bits32;
  3086. expDiff : int16;
  3087. label aExpBigger;
  3088. label bExpBigger;
  3089. label aBigger;
  3090. label bBigger;
  3091. label normalizeRoundAndPack;
  3092. Begin
  3093. aSig := extractFloat32Frac( a );
  3094. aExp := extractFloat32Exp( a );
  3095. bSig := extractFloat32Frac( b );
  3096. bExp := extractFloat32Exp( b );
  3097. expDiff := aExp - bExp;
  3098. aSig := aSig shl 7;
  3099. bSig := bSig shl 7;
  3100. if ( 0 < expDiff ) then goto aExpBigger;
  3101. if ( expDiff < 0 ) then goto bExpBigger;
  3102. if ( aExp = $FF ) then
  3103. Begin
  3104. if ( aSig OR bSig )<> 0 then
  3105. Begin
  3106. subFloat32Sigs := propagateFloat32NaN( a, b );
  3107. exit;
  3108. End;
  3109. float_raise( float_flag_invalid );
  3110. subFloat32Sigs := float32_default_nan;
  3111. exit;
  3112. End;
  3113. if ( aExp = 0 ) then
  3114. Begin
  3115. aExp := 1;
  3116. bExp := 1;
  3117. End;
  3118. if ( bSig < aSig ) Then goto aBigger;
  3119. if ( aSig < bSig ) Then goto bBigger;
  3120. subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
  3121. exit;
  3122. bExpBigger:
  3123. if ( bExp = $FF ) then
  3124. Begin
  3125. if ( bSig<>0 ) then
  3126. Begin
  3127. subFloat32Sigs := propagateFloat32NaN( a, b );
  3128. exit;
  3129. End;
  3130. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3131. exit;
  3132. End;
  3133. if ( aExp = 0 ) then
  3134. Begin
  3135. Inc(expDiff);
  3136. End
  3137. else
  3138. Begin
  3139. aSig := aSig OR $40000000;
  3140. End;
  3141. shift32RightJamming( aSig, - expDiff, aSig );
  3142. bSig := bSig OR $40000000;
  3143. bBigger:
  3144. zSig := bSig - aSig;
  3145. zExp := bExp;
  3146. zSign := zSign xor 1;
  3147. goto normalizeRoundAndPack;
  3148. aExpBigger:
  3149. if ( aExp = $FF ) then
  3150. Begin
  3151. if ( aSig <> 0) then
  3152. Begin
  3153. subFloat32Sigs := propagateFloat32NaN( a, b );
  3154. exit;
  3155. End;
  3156. subFloat32Sigs := a;
  3157. exit;
  3158. End;
  3159. if ( bExp = 0 ) then
  3160. Begin
  3161. Dec(expDiff);
  3162. End
  3163. else
  3164. Begin
  3165. bSig := bSig OR $40000000;
  3166. End;
  3167. shift32RightJamming( bSig, expDiff, bSig );
  3168. aSig := aSig OR $40000000;
  3169. aBigger:
  3170. zSig := aSig - bSig;
  3171. zExp := aExp;
  3172. normalizeRoundAndPack:
  3173. Dec(zExp);
  3174. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3175. End;
  3176. {*
  3177. -------------------------------------------------------------------------------
  3178. Returns the result of adding the single-precision floating-point values `a'
  3179. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3180. Binary Floating-Point Arithmetic.
  3181. -------------------------------------------------------------------------------
  3182. *}
  3183. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3184. Var
  3185. aSign, bSign: Flag;
  3186. Begin
  3187. aSign := extractFloat32Sign( a.float32 );
  3188. bSign := extractFloat32Sign( b.float32 );
  3189. if ( aSign = bSign ) then
  3190. Begin
  3191. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3192. End
  3193. else
  3194. Begin
  3195. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3196. End;
  3197. End;
  3198. {*
  3199. -------------------------------------------------------------------------------
  3200. Returns the result of subtracting the single-precision floating-point values
  3201. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3202. for Binary Floating-Point Arithmetic.
  3203. -------------------------------------------------------------------------------
  3204. *}
  3205. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3206. Var
  3207. aSign, bSign: flag;
  3208. Begin
  3209. aSign := extractFloat32Sign( a.float32 );
  3210. bSign := extractFloat32Sign( b.float32 );
  3211. if ( aSign = bSign ) then
  3212. Begin
  3213. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3214. End
  3215. else
  3216. Begin
  3217. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3218. End;
  3219. End;
  3220. {*
  3221. -------------------------------------------------------------------------------
  3222. Returns the result of multiplying the single-precision floating-point values
  3223. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3224. for Binary Floating-Point Arithmetic.
  3225. -------------------------------------------------------------------------------
  3226. *}
  3227. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3228. Var
  3229. aSign, bSign, zSign: flag;
  3230. aExp, bExp, zExp : int16;
  3231. aSig, bSig, zSig0, zSig1: bits32;
  3232. Begin
  3233. aSig := extractFloat32Frac( a.float32 );
  3234. aExp := extractFloat32Exp( a.float32 );
  3235. aSign := extractFloat32Sign( a.float32 );
  3236. bSig := extractFloat32Frac( b.float32 );
  3237. bExp := extractFloat32Exp( b.float32 );
  3238. bSign := extractFloat32Sign( b.float32 );
  3239. zSign := aSign xor bSign;
  3240. if ( aExp = $FF ) then
  3241. Begin
  3242. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3243. Begin
  3244. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3245. End;
  3246. if ( ( bExp OR bSig ) = 0 ) then
  3247. Begin
  3248. float_raise( float_flag_invalid );
  3249. float32_mul.float32 := float32_default_nan;
  3250. exit;
  3251. End;
  3252. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3253. exit;
  3254. End;
  3255. if ( bExp = $FF ) then
  3256. Begin
  3257. if ( bSig <> 0 ) then
  3258. Begin
  3259. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3260. exit;
  3261. End;
  3262. if ( ( aExp OR aSig ) = 0 ) then
  3263. Begin
  3264. float_raise( float_flag_invalid );
  3265. float32_mul.float32 := float32_default_nan;
  3266. exit;
  3267. End;
  3268. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3269. exit;
  3270. End;
  3271. if ( aExp = 0 ) then
  3272. Begin
  3273. if ( aSig = 0 ) then
  3274. Begin
  3275. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3276. exit;
  3277. End;
  3278. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3279. End;
  3280. if ( bExp = 0 ) then
  3281. Begin
  3282. if ( bSig = 0 ) then
  3283. Begin
  3284. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3285. exit;
  3286. End;
  3287. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3288. End;
  3289. zExp := aExp + bExp - $7F;
  3290. aSig := ( aSig OR $00800000 ) shl 7;
  3291. bSig := ( bSig OR $00800000 ) shl 8;
  3292. mul32To64( aSig, bSig, zSig0, zSig1 );
  3293. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3294. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3295. Begin
  3296. zSig0 := zSig0 shl 1;
  3297. Dec(zExp);
  3298. End;
  3299. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3300. End;
  3301. {*
  3302. -------------------------------------------------------------------------------
  3303. Returns the result of dividing the single-precision floating-point value `a'
  3304. by the corresponding value `b'. The operation is performed according to the
  3305. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3306. -------------------------------------------------------------------------------
  3307. *}
  3308. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3309. Var
  3310. aSign, bSign, zSign: flag;
  3311. aExp, bExp, zExp: int16;
  3312. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3313. Begin
  3314. aSig := extractFloat32Frac( a.float32 );
  3315. aExp := extractFloat32Exp( a.float32 );
  3316. aSign := extractFloat32Sign( a.float32 );
  3317. bSig := extractFloat32Frac( b.float32 );
  3318. bExp := extractFloat32Exp( b.float32 );
  3319. bSign := extractFloat32Sign( b.float32 );
  3320. zSign := aSign xor bSign;
  3321. if ( aExp = $FF ) then
  3322. Begin
  3323. if ( aSig <> 0 ) then
  3324. Begin
  3325. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3326. exit;
  3327. End;
  3328. if ( bExp = $FF ) then
  3329. Begin
  3330. if ( bSig <> 0) then
  3331. Begin
  3332. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3333. End;
  3334. float_raise( float_flag_invalid );
  3335. float32_div.float32 := float32_default_nan;
  3336. exit;
  3337. End;
  3338. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3339. exit;
  3340. End;
  3341. if ( bExp = $FF ) then
  3342. Begin
  3343. if ( bSig <> 0) then
  3344. Begin
  3345. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3346. exit;
  3347. End;
  3348. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3349. exit;
  3350. End;
  3351. if ( bExp = 0 ) Then
  3352. Begin
  3353. if ( bSig = 0 ) Then
  3354. Begin
  3355. if ( ( aExp OR aSig ) = 0 ) then
  3356. Begin
  3357. float_raise( float_flag_invalid );
  3358. float32_div.float32 := float32_default_nan;
  3359. exit;
  3360. End;
  3361. float_raise( float_flag_divbyzero );
  3362. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3363. exit;
  3364. End;
  3365. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3366. End;
  3367. if ( aExp = 0 ) Then
  3368. Begin
  3369. if ( aSig = 0 ) Then
  3370. Begin
  3371. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3372. exit;
  3373. End;
  3374. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3375. End;
  3376. zExp := aExp - bExp + $7D;
  3377. aSig := ( aSig OR $00800000 ) shl 7;
  3378. bSig := ( bSig OR $00800000 ) shl 8;
  3379. if ( bSig <= ( aSig + aSig ) ) then
  3380. Begin
  3381. aSig := aSig shr 1;
  3382. Inc(zExp);
  3383. End;
  3384. zSig := estimateDiv64To32( aSig, 0, bSig );
  3385. if ( ( zSig and $3F ) <= 2 ) then
  3386. Begin
  3387. mul32To64( bSig, zSig, term0, term1 );
  3388. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3389. while ( sbits32 (rem0) < 0 ) do
  3390. Begin
  3391. Dec(zSig);
  3392. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3393. End;
  3394. zSig := zSig or bits32( rem1 <> 0 );
  3395. End;
  3396. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3397. End;
  3398. {*
  3399. -------------------------------------------------------------------------------
  3400. Returns the remainder of the single-precision floating-point value `a'
  3401. with respect to the corresponding value `b'. The operation is performed
  3402. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3403. -------------------------------------------------------------------------------
  3404. *}
  3405. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3406. Var
  3407. aSign, bSign, zSign: flag;
  3408. aExp, bExp, expDiff: int16;
  3409. aSig, bSig, q, allZero, alternateASig: bits32;
  3410. sigMean: sbits32;
  3411. Begin
  3412. aSig := extractFloat32Frac( a.float32 );
  3413. aExp := extractFloat32Exp( a.float32 );
  3414. aSign := extractFloat32Sign( a.float32 );
  3415. bSig := extractFloat32Frac( b.float32 );
  3416. bExp := extractFloat32Exp( b.float32 );
  3417. bSign := extractFloat32Sign( b.float32 );
  3418. if ( aExp = $FF ) then
  3419. Begin
  3420. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3421. Begin
  3422. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3423. exit;
  3424. End;
  3425. float_raise( float_flag_invalid );
  3426. float32_rem.float32 := float32_default_nan;
  3427. exit;
  3428. End;
  3429. if ( bExp = $FF ) then
  3430. Begin
  3431. if ( bSig <> 0 ) then
  3432. Begin
  3433. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3434. exit;
  3435. End;
  3436. float32_rem := a;
  3437. exit;
  3438. End;
  3439. if ( bExp = 0 ) then
  3440. Begin
  3441. if ( bSig = 0 ) then
  3442. Begin
  3443. float_raise( float_flag_invalid );
  3444. float32_rem.float32 := float32_default_nan;
  3445. exit;
  3446. End;
  3447. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3448. End;
  3449. if ( aExp = 0 ) then
  3450. Begin
  3451. if ( aSig = 0 ) then
  3452. Begin
  3453. float32_rem := a;
  3454. exit;
  3455. End;
  3456. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3457. End;
  3458. expDiff := aExp - bExp;
  3459. aSig := ( aSig OR $00800000 ) shl 8;
  3460. bSig := ( bSig OR $00800000 ) shl 8;
  3461. if ( expDiff < 0 ) then
  3462. Begin
  3463. if ( expDiff < -1 ) then
  3464. Begin
  3465. float32_rem := a;
  3466. exit;
  3467. End;
  3468. aSig := aSig shr 1;
  3469. End;
  3470. q := bits32( bSig <= aSig );
  3471. if ( q <> 0) then
  3472. aSig := aSig - bSig;
  3473. expDiff := expDiff - 32;
  3474. while ( 0 < expDiff ) do
  3475. Begin
  3476. q := estimateDiv64To32( aSig, 0, bSig );
  3477. if (2 < q) then
  3478. q := q - 2
  3479. else
  3480. q := 0;
  3481. aSig := - ( ( bSig shr 2 ) * q );
  3482. expDiff := expDiff - 30;
  3483. End;
  3484. expDiff := expDiff + 32;
  3485. if ( 0 < expDiff ) then
  3486. Begin
  3487. q := estimateDiv64To32( aSig, 0, bSig );
  3488. if (2 < q) then
  3489. q := q - 2
  3490. else
  3491. q := 0;
  3492. q := q shr (32 - expDiff);
  3493. bSig := bSig shr 2;
  3494. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3495. End
  3496. else
  3497. Begin
  3498. aSig := aSig shr 2;
  3499. bSig := bSig shr 2;
  3500. End;
  3501. Repeat
  3502. alternateASig := aSig;
  3503. Inc(q);
  3504. aSig := aSig - bSig;
  3505. Until not ( 0 <= sbits32 (aSig) );
  3506. sigMean := aSig + alternateASig;
  3507. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3508. Begin
  3509. aSig := alternateASig;
  3510. End;
  3511. zSign := flag( sbits32 (aSig) < 0 );
  3512. if ( zSign<>0 ) then
  3513. aSig := - aSig;
  3514. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3515. End;
  3516. {*
  3517. -------------------------------------------------------------------------------
  3518. Returns the square root of the single-precision floating-point value `a'.
  3519. The operation is performed according to the IEC/IEEE Standard for Binary
  3520. Floating-Point Arithmetic.
  3521. -------------------------------------------------------------------------------
  3522. *}
  3523. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3524. Var
  3525. aSign : flag;
  3526. aExp, zExp : int16;
  3527. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3528. label roundAndPack;
  3529. Begin
  3530. aSig := extractFloat32Frac( a.float32 );
  3531. aExp := extractFloat32Exp( a.float32 );
  3532. aSign := extractFloat32Sign( a.float32 );
  3533. if ( aExp = $FF ) then
  3534. Begin
  3535. if ( aSig <> 0) then
  3536. Begin
  3537. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3538. exit;
  3539. End;
  3540. if ( aSign = 0) then
  3541. Begin
  3542. float32_sqrt := a;
  3543. exit;
  3544. End;
  3545. float_raise( float_flag_invalid );
  3546. float32_sqrt.float32 := float32_default_nan;
  3547. exit;
  3548. End;
  3549. if ( aSign <> 0) then
  3550. Begin
  3551. if ( ( aExp OR aSig ) = 0 ) then
  3552. Begin
  3553. float32_sqrt := a;
  3554. exit;
  3555. End;
  3556. float_raise( float_flag_invalid );
  3557. float32_sqrt.float32 := float32_default_nan;
  3558. exit;
  3559. End;
  3560. if ( aExp = 0 ) then
  3561. Begin
  3562. if ( aSig = 0 ) then
  3563. Begin
  3564. float32_sqrt.float32 := 0;
  3565. exit;
  3566. End;
  3567. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3568. End;
  3569. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3570. aSig := ( aSig OR $00800000 ) shl 8;
  3571. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3572. if ( ( zSig and $7F ) <= 5 ) then
  3573. Begin
  3574. if ( zSig < 2 ) then
  3575. Begin
  3576. zSig := $7FFFFFFF;
  3577. goto roundAndPack;
  3578. End
  3579. else
  3580. Begin
  3581. aSig := aSig shr (aExp and 1);
  3582. mul32To64( zSig, zSig, term0, term1 );
  3583. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3584. while ( sbits32 (rem0) < 0 ) do
  3585. Begin
  3586. Dec(zSig);
  3587. shortShift64Left( 0, zSig, 1, term0, term1 );
  3588. term1 := term1 or 1;
  3589. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3590. End;
  3591. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3592. End;
  3593. End;
  3594. shift32RightJamming( zSig, 1, zSig );
  3595. roundAndPack:
  3596. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3597. End;
  3598. {*
  3599. -------------------------------------------------------------------------------
  3600. Returns 1 if the single-precision floating-point value `a' is equal to
  3601. the corresponding value `b', and 0 otherwise. The comparison is performed
  3602. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3603. -------------------------------------------------------------------------------
  3604. *}
  3605. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3606. Begin
  3607. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3608. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3609. ) then
  3610. Begin
  3611. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3612. Begin
  3613. float_raise( float_flag_invalid );
  3614. End;
  3615. float32_eq := 0;
  3616. exit;
  3617. End;
  3618. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3619. End;
  3620. {*
  3621. -------------------------------------------------------------------------------
  3622. Returns 1 if the single-precision floating-point value `a' is less than
  3623. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3624. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3625. Arithmetic.
  3626. -------------------------------------------------------------------------------
  3627. *}
  3628. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3629. var
  3630. aSign, bSign: flag;
  3631. Begin
  3632. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3633. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3634. ) then
  3635. Begin
  3636. float_raise( float_flag_invalid );
  3637. float32_le := 0;
  3638. exit;
  3639. End;
  3640. aSign := extractFloat32Sign( a.float32 );
  3641. bSign := extractFloat32Sign( b.float32 );
  3642. if ( aSign <> bSign ) then
  3643. Begin
  3644. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3645. exit;
  3646. End;
  3647. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3648. End;
  3649. {*
  3650. -------------------------------------------------------------------------------
  3651. Returns 1 if the single-precision floating-point value `a' is less than
  3652. the corresponding value `b', and 0 otherwise. The comparison is performed
  3653. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3654. -------------------------------------------------------------------------------
  3655. *}
  3656. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3657. var
  3658. aSign, bSign: flag;
  3659. Begin
  3660. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3661. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3662. ) then
  3663. Begin
  3664. float_raise( float_flag_invalid );
  3665. float32_lt :=0;
  3666. exit;
  3667. End;
  3668. aSign := extractFloat32Sign( a.float32 );
  3669. bSign := extractFloat32Sign( b.float32 );
  3670. if ( aSign <> bSign ) then
  3671. Begin
  3672. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3673. exit;
  3674. End;
  3675. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3676. End;
  3677. {*
  3678. -------------------------------------------------------------------------------
  3679. Returns 1 if the single-precision floating-point value `a' is equal to
  3680. the corresponding value `b', and 0 otherwise. The invalid exception is
  3681. raised if either operand is a NaN. Otherwise, the comparison is performed
  3682. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3683. -------------------------------------------------------------------------------
  3684. *}
  3685. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3686. Begin
  3687. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3688. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3689. ) then
  3690. Begin
  3691. float_raise( float_flag_invalid );
  3692. float32_eq_signaling := 0;
  3693. exit;
  3694. End;
  3695. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3696. End;
  3697. {*
  3698. -------------------------------------------------------------------------------
  3699. Returns 1 if the single-precision floating-point value `a' is less than or
  3700. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3701. cause an exception. Otherwise, the comparison is performed according to the
  3702. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3703. -------------------------------------------------------------------------------
  3704. *}
  3705. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3706. Var
  3707. aSign, bSign: flag;
  3708. aExp, bExp: int16;
  3709. Begin
  3710. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3711. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3712. ) then
  3713. Begin
  3714. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3715. Begin
  3716. float_raise( float_flag_invalid );
  3717. End;
  3718. float32_le_quiet := 0;
  3719. exit;
  3720. End;
  3721. aSign := extractFloat32Sign( a );
  3722. bSign := extractFloat32Sign( b );
  3723. if ( aSign <> bSign ) then
  3724. Begin
  3725. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3726. exit;
  3727. End;
  3728. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3729. End;
  3730. {*
  3731. -------------------------------------------------------------------------------
  3732. Returns 1 if the single-precision floating-point value `a' is less than
  3733. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3734. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3735. Standard for Binary Floating-Point Arithmetic.
  3736. -------------------------------------------------------------------------------
  3737. *}
  3738. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3739. Var
  3740. aSign, bSign: flag;
  3741. Begin
  3742. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3743. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3744. ) then
  3745. Begin
  3746. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3747. Begin
  3748. float_raise( float_flag_invalid );
  3749. End;
  3750. float32_lt_quiet := 0;
  3751. exit;
  3752. End;
  3753. aSign := extractFloat32Sign( a );
  3754. bSign := extractFloat32Sign( b );
  3755. if ( aSign <> bSign ) then
  3756. Begin
  3757. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3758. exit;
  3759. End;
  3760. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3761. End;
  3762. {*
  3763. -------------------------------------------------------------------------------
  3764. Returns the result of converting the double-precision floating-point value
  3765. `a' to the 32-bit two's complement integer format. The conversion is
  3766. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3767. Arithmetic---which means in particular that the conversion is rounded
  3768. according to the current rounding mode. If `a' is a NaN, the largest
  3769. positive integer is returned. Otherwise, if the conversion overflows, the
  3770. largest integer with the same sign as `a' is returned.
  3771. -------------------------------------------------------------------------------
  3772. *}
  3773. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3774. var
  3775. aSign: flag;
  3776. aExp, shiftCount: int16;
  3777. aSig0, aSig1, absZ, aSigExtra: bits32;
  3778. z: int32;
  3779. roundingMode: int8;
  3780. label invalid;
  3781. Begin
  3782. aSig1 := extractFloat64Frac1( a );
  3783. aSig0 := extractFloat64Frac0( a );
  3784. aExp := extractFloat64Exp( a );
  3785. aSign := extractFloat64Sign( a );
  3786. shiftCount := aExp - $413;
  3787. if ( 0 <= shiftCount ) then
  3788. Begin
  3789. if ( $41E < aExp ) then
  3790. Begin
  3791. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3792. aSign := 0;
  3793. goto invalid;
  3794. End;
  3795. shortShift64Left(
  3796. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3797. if ( $80000000 < absZ ) then
  3798. goto invalid;
  3799. End
  3800. else
  3801. Begin
  3802. aSig1 := flag( aSig1 <> 0 );
  3803. if ( aExp < $3FE ) then
  3804. Begin
  3805. aSigExtra := aExp OR aSig0 OR aSig1;
  3806. absZ := 0;
  3807. End
  3808. else
  3809. Begin
  3810. aSig0 := aSig0 OR $00100000;
  3811. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3812. absZ := aSig0 shr ( - shiftCount );
  3813. End;
  3814. End;
  3815. roundingMode := float_rounding_mode;
  3816. if ( roundingMode = float_round_nearest_even ) then
  3817. Begin
  3818. if ( sbits32(aSigExtra) < 0 ) then
  3819. Begin
  3820. Inc(absZ);
  3821. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3822. absZ := absZ and not 1;
  3823. End;
  3824. if aSign <> 0 then
  3825. z := - absZ
  3826. else
  3827. z := absZ;
  3828. End
  3829. else
  3830. Begin
  3831. aSigExtra := bits32( aSigExtra <> 0 );
  3832. if ( aSign <> 0) then
  3833. Begin
  3834. z := - ( absZ
  3835. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3836. End
  3837. else
  3838. Begin
  3839. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3840. End
  3841. End;
  3842. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3843. Begin
  3844. invalid:
  3845. float_raise( float_flag_invalid );
  3846. if (aSign <> 0 ) then
  3847. float64_to_int32 := sbits32 ($80000000)
  3848. else
  3849. float64_to_int32 := $7FFFFFFF;
  3850. exit;
  3851. End;
  3852. if ( aSigExtra <> 0) then
  3853. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3854. float64_to_int32 := z;
  3855. End;
  3856. {*
  3857. -------------------------------------------------------------------------------
  3858. Returns the result of converting the double-precision floating-point value
  3859. `a' to the 32-bit two's complement integer format. The conversion is
  3860. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3861. Arithmetic, except that the conversion is always rounded toward zero.
  3862. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3863. the conversion overflows, the largest integer with the same sign as `a' is
  3864. returned.
  3865. -------------------------------------------------------------------------------
  3866. *}
  3867. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3868. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3869. Var
  3870. aSign: flag;
  3871. aExp, shiftCount: int16;
  3872. aSig0, aSig1, absZ, aSigExtra: bits32;
  3873. z: int32;
  3874. label invalid;
  3875. Begin
  3876. aSig1 := extractFloat64Frac1( a );
  3877. aSig0 := extractFloat64Frac0( a );
  3878. aExp := extractFloat64Exp( a );
  3879. aSign := extractFloat64Sign( a );
  3880. shiftCount := aExp - $413;
  3881. if ( 0 <= shiftCount ) then
  3882. Begin
  3883. if ( $41E < aExp ) then
  3884. Begin
  3885. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3886. aSign := 0;
  3887. goto invalid;
  3888. End;
  3889. shortShift64Left(
  3890. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3891. End
  3892. else
  3893. Begin
  3894. if ( aExp < $3FF ) then
  3895. Begin
  3896. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3897. Begin
  3898. softfloat_exception_flags :=
  3899. softfloat_exception_flags or float_flag_inexact;
  3900. End;
  3901. float64_to_int32_round_to_zero := 0;
  3902. exit;
  3903. End;
  3904. aSig0 := aSig0 or $00100000;
  3905. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3906. absZ := aSig0 shr ( - shiftCount );
  3907. End;
  3908. if aSign <> 0 then
  3909. z := - absZ
  3910. else
  3911. z := absZ;
  3912. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3913. Begin
  3914. invalid:
  3915. float_raise( float_flag_invalid );
  3916. if (aSign <> 0) then
  3917. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3918. else
  3919. float64_to_int32_round_to_zero := $7FFFFFFF;
  3920. exit;
  3921. End;
  3922. if ( aSigExtra <> 0) then
  3923. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3924. float64_to_int32_round_to_zero := z;
  3925. End;
  3926. {*
  3927. -------------------------------------------------------------------------------
  3928. Returns the result of converting the double-precision floating-point value
  3929. `a' to the single-precision floating-point format. The conversion is
  3930. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3931. Arithmetic.
  3932. -------------------------------------------------------------------------------
  3933. *}
  3934. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3935. Var
  3936. aSign: flag;
  3937. aExp: int16;
  3938. aSig0, aSig1, zSig: bits32;
  3939. allZero: bits32;
  3940. tmp : CommonNanT;
  3941. Begin
  3942. aSig1 := extractFloat64Frac1( a );
  3943. aSig0 := extractFloat64Frac0( a );
  3944. aExp := extractFloat64Exp( a );
  3945. aSign := extractFloat64Sign( a );
  3946. if ( aExp = $7FF ) then
  3947. Begin
  3948. if ( aSig0 OR aSig1 ) <> 0 then
  3949. Begin
  3950. float64ToCommonNaN( a, tmp );
  3951. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3952. exit;
  3953. End;
  3954. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3955. exit;
  3956. End;
  3957. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3958. if ( aExp <> 0) then
  3959. zSig := zSig OR $40000000;
  3960. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3961. End;
  3962. {*
  3963. -------------------------------------------------------------------------------
  3964. Rounds the double-precision floating-point value `a' to an integer,
  3965. and returns the result as a double-precision floating-point value. The
  3966. operation is performed according to the IEC/IEEE Standard for Binary
  3967. Floating-Point Arithmetic.
  3968. -------------------------------------------------------------------------------
  3969. *}
  3970. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3971. Var
  3972. aSign: flag;
  3973. aExp: int16;
  3974. lastBitMask, roundBitsMask: bits32;
  3975. roundingMode: int8;
  3976. z: float64;
  3977. Begin
  3978. aExp := extractFloat64Exp( a );
  3979. if ( $413 <= aExp ) then
  3980. Begin
  3981. if ( $433 <= aExp ) then
  3982. Begin
  3983. if ( ( aExp = $7FF )
  3984. AND
  3985. (
  3986. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3987. ) <>0)
  3988. ) then
  3989. Begin
  3990. propagateFloat64NaN( a, a, result );
  3991. exit;
  3992. End;
  3993. result := a;
  3994. exit;
  3995. End;
  3996. lastBitMask := 1;
  3997. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3998. roundBitsMask := lastBitMask - 1;
  3999. z := a;
  4000. roundingMode := float_rounding_mode;
  4001. if ( roundingMode = float_round_nearest_even ) then
  4002. Begin
  4003. if ( lastBitMask <> 0) then
  4004. Begin
  4005. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4006. if ( ( z.low and roundBitsMask ) = 0 ) then
  4007. z.low := z.low and not lastBitMask;
  4008. End
  4009. else
  4010. Begin
  4011. if ( sbits32 (z.low) < 0 ) then
  4012. Begin
  4013. Inc(z.high);
  4014. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4015. z.high := z.high and not 1;
  4016. End;
  4017. End;
  4018. End
  4019. else if ( roundingMode <> float_round_to_zero ) then
  4020. Begin
  4021. if ( extractFloat64Sign( z )
  4022. xor flag( roundingMode = float_round_up ) )<> 0 then
  4023. Begin
  4024. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4025. End;
  4026. End;
  4027. z.low := z.low and not roundBitsMask;
  4028. End
  4029. else
  4030. Begin
  4031. if ( aExp <= $3FE ) then
  4032. Begin
  4033. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4034. Begin
  4035. result := a;
  4036. exit;
  4037. End;
  4038. softfloat_exception_flags := softfloat_exception_flags or
  4039. float_flag_inexact;
  4040. aSign := extractFloat64Sign( a );
  4041. case ( float_rounding_mode ) of
  4042. float_round_nearest_even:
  4043. Begin
  4044. if ( ( aExp = $3FE )
  4045. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4046. ) then
  4047. Begin
  4048. packFloat64( aSign, $3FF, 0, 0, result );
  4049. exit;
  4050. End;
  4051. End;
  4052. float_round_down:
  4053. Begin
  4054. if aSign<>0 then
  4055. packFloat64( 1, $3FF, 0, 0, result )
  4056. else
  4057. packFloat64( 0, 0, 0, 0, result );
  4058. exit;
  4059. End;
  4060. float_round_up:
  4061. Begin
  4062. if aSign <> 0 then
  4063. packFloat64( 1, 0, 0, 0, result )
  4064. else
  4065. packFloat64( 0, $3FF, 0, 0, result );
  4066. exit;
  4067. End;
  4068. end;
  4069. packFloat64( aSign, 0, 0, 0, result );
  4070. exit;
  4071. End;
  4072. lastBitMask := 1;
  4073. lastBitMask := lastBitMask shl ($413 - aExp);
  4074. roundBitsMask := lastBitMask - 1;
  4075. z.low := 0;
  4076. z.high := a.high;
  4077. roundingMode := float_rounding_mode;
  4078. if ( roundingMode = float_round_nearest_even ) then
  4079. Begin
  4080. z.high := z.high + lastBitMask shr 1;
  4081. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4082. Begin
  4083. z.high := z.high and not lastBitMask;
  4084. End;
  4085. End
  4086. else if ( roundingMode <> float_round_to_zero ) then
  4087. Begin
  4088. if ( extractFloat64Sign( z )
  4089. xor flag( roundingMode = float_round_up ) )<> 0 then
  4090. Begin
  4091. z.high := z.high or bits32( a.low <> 0 );
  4092. z.high := z.high + roundBitsMask;
  4093. End;
  4094. End;
  4095. z.high := z.high and not roundBitsMask;
  4096. End;
  4097. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4098. Begin
  4099. softfloat_exception_flags :=
  4100. softfloat_exception_flags or float_flag_inexact;
  4101. End;
  4102. result := z;
  4103. End;
  4104. {*
  4105. -------------------------------------------------------------------------------
  4106. Returns the result of adding the absolute values of the double-precision
  4107. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4108. before being returned. `zSign' is ignored if the result is a NaN.
  4109. The addition is performed according to the IEC/IEEE Standard for Binary
  4110. Floating-Point Arithmetic.
  4111. -------------------------------------------------------------------------------
  4112. *}
  4113. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4114. Var
  4115. aExp, bExp, zExp: int16;
  4116. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4117. expDiff: int16;
  4118. label shiftRight1;
  4119. label roundAndPack;
  4120. Begin
  4121. aSig1 := extractFloat64Frac1( a );
  4122. aSig0 := extractFloat64Frac0( a );
  4123. aExp := extractFloat64Exp( a );
  4124. bSig1 := extractFloat64Frac1( b );
  4125. bSig0 := extractFloat64Frac0( b );
  4126. bExp := extractFloat64Exp( b );
  4127. expDiff := aExp - bExp;
  4128. if ( 0 < expDiff ) then
  4129. Begin
  4130. if ( aExp = $7FF ) then
  4131. Begin
  4132. if ( aSig0 OR aSig1 ) <> 0 then
  4133. Begin
  4134. propagateFloat64NaN( a, b, out );
  4135. exit;
  4136. end;
  4137. out := a;
  4138. exit;
  4139. End;
  4140. if ( bExp = 0 ) then
  4141. Begin
  4142. Dec(expDiff);
  4143. End
  4144. else
  4145. Begin
  4146. bSig0 := bSig0 or $00100000;
  4147. End;
  4148. shift64ExtraRightJamming(
  4149. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4150. zExp := aExp;
  4151. End
  4152. else if ( expDiff < 0 ) then
  4153. Begin
  4154. if ( bExp = $7FF ) then
  4155. Begin
  4156. if ( bSig0 OR bSig1 ) <> 0 then
  4157. Begin
  4158. propagateFloat64NaN( a, b, out );
  4159. exit;
  4160. End;
  4161. packFloat64( zSign, $7FF, 0, 0, out );
  4162. End;
  4163. if ( aExp = 0 ) then
  4164. Begin
  4165. Inc(expDiff);
  4166. End
  4167. else
  4168. Begin
  4169. aSig0 := aSig0 or $00100000;
  4170. End;
  4171. shift64ExtraRightJamming(
  4172. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4173. zExp := bExp;
  4174. End
  4175. else
  4176. Begin
  4177. if ( aExp = $7FF ) then
  4178. Begin
  4179. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4180. Begin
  4181. propagateFloat64NaN( a, b, out );
  4182. exit;
  4183. End;
  4184. out := a;
  4185. exit;
  4186. End;
  4187. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4188. if ( aExp = 0 ) then
  4189. Begin
  4190. packFloat64( zSign, 0, zSig0, zSig1, out );
  4191. exit;
  4192. End;
  4193. zSig2 := 0;
  4194. zSig0 := zSig0 or $00200000;
  4195. zExp := aExp;
  4196. goto shiftRight1;
  4197. End;
  4198. aSig0 := aSig0 or $00100000;
  4199. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4200. Dec(zExp);
  4201. if ( zSig0 < $00200000 ) then
  4202. goto roundAndPack;
  4203. Inc(zExp);
  4204. shiftRight1:
  4205. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4206. roundAndPack:
  4207. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4208. End;
  4209. {*
  4210. -------------------------------------------------------------------------------
  4211. Returns the result of subtracting the absolute values of the double-
  4212. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4213. difference is negated before being returned. `zSign' is ignored if the
  4214. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4215. Standard for Binary Floating-Point Arithmetic.
  4216. -------------------------------------------------------------------------------
  4217. *}
  4218. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4219. Var
  4220. aExp, bExp, zExp: int16;
  4221. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4222. expDiff: int16;
  4223. z: float64;
  4224. label aExpBigger;
  4225. label bExpBigger;
  4226. label aBigger;
  4227. label bBigger;
  4228. label normalizeRoundAndPack;
  4229. Begin
  4230. aSig1 := extractFloat64Frac1( a );
  4231. aSig0 := extractFloat64Frac0( a );
  4232. aExp := extractFloat64Exp( a );
  4233. bSig1 := extractFloat64Frac1( b );
  4234. bSig0 := extractFloat64Frac0( b );
  4235. bExp := extractFloat64Exp( b );
  4236. expDiff := aExp - bExp;
  4237. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4238. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4239. if ( 0 < expDiff ) then goto aExpBigger;
  4240. if ( expDiff < 0 ) then goto bExpBigger;
  4241. if ( aExp = $7FF ) then
  4242. Begin
  4243. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4244. Begin
  4245. propagateFloat64NaN( a, b, out );
  4246. exit;
  4247. End;
  4248. float_raise( float_flag_invalid );
  4249. z.low := float64_default_nan_low;
  4250. z.high := float64_default_nan_high;
  4251. out := z;
  4252. exit;
  4253. End;
  4254. if ( aExp = 0 ) then
  4255. Begin
  4256. aExp := 1;
  4257. bExp := 1;
  4258. End;
  4259. if ( bSig0 < aSig0 ) then goto aBigger;
  4260. if ( aSig0 < bSig0 ) then goto bBigger;
  4261. if ( bSig1 < aSig1 ) then goto aBigger;
  4262. if ( aSig1 < bSig1 ) then goto bBigger;
  4263. packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
  4264. exit;
  4265. bExpBigger:
  4266. if ( bExp = $7FF ) then
  4267. Begin
  4268. if ( bSig0 OR bSig1 ) <> 0 then
  4269. Begin
  4270. propagateFloat64NaN( a, b, out );
  4271. exit;
  4272. End;
  4273. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4274. exit;
  4275. End;
  4276. if ( aExp = 0 ) then
  4277. Begin
  4278. Inc(expDiff);
  4279. End
  4280. else
  4281. Begin
  4282. aSig0 := aSig0 or $40000000;
  4283. End;
  4284. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4285. bSig0 := bSig0 or $40000000;
  4286. bBigger:
  4287. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4288. zExp := bExp;
  4289. zSign := zSign xor 1;
  4290. goto normalizeRoundAndPack;
  4291. aExpBigger:
  4292. if ( aExp = $7FF ) then
  4293. Begin
  4294. if ( aSig0 OR aSig1 ) <> 0 then
  4295. Begin
  4296. propagateFloat64NaN( a, b, out );
  4297. exit;
  4298. End;
  4299. out := a;
  4300. exit;
  4301. End;
  4302. if ( bExp = 0 ) then
  4303. Begin
  4304. Dec(expDiff);
  4305. End
  4306. else
  4307. Begin
  4308. bSig0 := bSig0 or $40000000;
  4309. End;
  4310. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4311. aSig0 := aSig0 or $40000000;
  4312. aBigger:
  4313. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4314. zExp := aExp;
  4315. normalizeRoundAndPack:
  4316. Dec(zExp);
  4317. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4318. End;
  4319. {*
  4320. -------------------------------------------------------------------------------
  4321. Returns the result of adding the double-precision floating-point values `a'
  4322. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4323. Binary Floating-Point Arithmetic.
  4324. -------------------------------------------------------------------------------
  4325. *}
  4326. Function float64_add( a: float64; b : float64) : Float64;
  4327. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4328. Var
  4329. aSign, bSign: flag;
  4330. Begin
  4331. aSign := extractFloat64Sign( a );
  4332. bSign := extractFloat64Sign( b );
  4333. if ( aSign = bSign ) then
  4334. Begin
  4335. addFloat64Sigs( a, b, aSign, result );
  4336. End
  4337. else
  4338. Begin
  4339. subFloat64Sigs( a, b, aSign, result );
  4340. End;
  4341. End;
  4342. {*
  4343. -------------------------------------------------------------------------------
  4344. Returns the result of subtracting the double-precision floating-point values
  4345. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4346. for Binary Floating-Point Arithmetic.
  4347. -------------------------------------------------------------------------------
  4348. *}
  4349. Function float64_sub(a: float64; b : float64) : Float64;
  4350. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4351. Var
  4352. aSign, bSign: flag;
  4353. Begin
  4354. aSign := extractFloat64Sign( a );
  4355. bSign := extractFloat64Sign( b );
  4356. if ( aSign = bSign ) then
  4357. Begin
  4358. subFloat64Sigs( a, b, aSign, result );
  4359. End
  4360. else
  4361. Begin
  4362. addFloat64Sigs( a, b, aSign, result );
  4363. End;
  4364. End;
  4365. {*
  4366. -------------------------------------------------------------------------------
  4367. Returns the result of multiplying the double-precision floating-point values
  4368. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4369. for Binary Floating-Point Arithmetic.
  4370. -------------------------------------------------------------------------------
  4371. *}
  4372. Function float64_mul( a: float64; b:float64) : Float64;
  4373. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4374. Var
  4375. aSign, bSign, zSign: flag;
  4376. aExp, bExp, zExp: int16;
  4377. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4378. z: float64;
  4379. label invalid;
  4380. Begin
  4381. aSig1 := extractFloat64Frac1( a );
  4382. aSig0 := extractFloat64Frac0( a );
  4383. aExp := extractFloat64Exp( a );
  4384. aSign := extractFloat64Sign( a );
  4385. bSig1 := extractFloat64Frac1( b );
  4386. bSig0 := extractFloat64Frac0( b );
  4387. bExp := extractFloat64Exp( b );
  4388. bSign := extractFloat64Sign( b );
  4389. zSign := aSign xor bSign;
  4390. if ( aExp = $7FF ) then
  4391. Begin
  4392. if ( (( aSig0 OR aSig1 ) <>0)
  4393. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4394. Begin
  4395. propagateFloat64NaN( a, b, result );
  4396. exit;
  4397. End;
  4398. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4399. packFloat64( zSign, $7FF, 0, 0, result );
  4400. exit;
  4401. End;
  4402. if ( bExp = $7FF ) then
  4403. Begin
  4404. if ( bSig0 OR bSig1 )<> 0 then
  4405. Begin
  4406. propagateFloat64NaN( a, b, result );
  4407. exit;
  4408. End;
  4409. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4410. Begin
  4411. invalid:
  4412. float_raise( float_flag_invalid );
  4413. z.low := float64_default_nan_low;
  4414. z.high := float64_default_nan_high;
  4415. result := z;
  4416. exit;
  4417. End;
  4418. packFloat64( zSign, $7FF, 0, 0, result );
  4419. exit;
  4420. End;
  4421. if ( aExp = 0 ) then
  4422. Begin
  4423. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4424. Begin
  4425. packFloat64( zSign, 0, 0, 0, result );
  4426. exit;
  4427. End;
  4428. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4429. End;
  4430. if ( bExp = 0 ) then
  4431. Begin
  4432. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4433. Begin
  4434. packFloat64( zSign, 0, 0, 0, result );
  4435. exit;
  4436. End;
  4437. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4438. End;
  4439. zExp := aExp + bExp - $400;
  4440. aSig0 := aSig0 or $00100000;
  4441. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4442. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4443. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4444. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4445. if ( $00200000 <= zSig0 ) then
  4446. Begin
  4447. shift64ExtraRightJamming(
  4448. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4449. Inc(zExp);
  4450. End;
  4451. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4452. End;
  4453. {*
  4454. -------------------------------------------------------------------------------
  4455. Returns the result of dividing the double-precision floating-point value `a'
  4456. by the corresponding value `b'. The operation is performed according to the
  4457. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4458. -------------------------------------------------------------------------------
  4459. *}
  4460. Function float64_div(a: float64; b : float64) : Float64;
  4461. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4462. Var
  4463. aSign, bSign, zSign: flag;
  4464. aExp, bExp, zExp: int16;
  4465. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4466. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4467. z: float64;
  4468. label invalid;
  4469. Begin
  4470. aSig1 := extractFloat64Frac1( a );
  4471. aSig0 := extractFloat64Frac0( a );
  4472. aExp := extractFloat64Exp( a );
  4473. aSign := extractFloat64Sign( a );
  4474. bSig1 := extractFloat64Frac1( b );
  4475. bSig0 := extractFloat64Frac0( b );
  4476. bExp := extractFloat64Exp( b );
  4477. bSign := extractFloat64Sign( b );
  4478. zSign := aSign xor bSign;
  4479. if ( aExp = $7FF ) then
  4480. Begin
  4481. if ( aSig0 OR aSig1 )<> 0 then
  4482. Begin
  4483. propagateFloat64NaN( a, b, result );
  4484. exit;
  4485. end;
  4486. if ( bExp = $7FF ) then
  4487. Begin
  4488. if ( bSig0 OR bSig1 )<>0 then
  4489. Begin
  4490. propagateFloat64NaN( a, b, result );
  4491. exit;
  4492. End;
  4493. goto invalid;
  4494. End;
  4495. packFloat64( zSign, $7FF, 0, 0, result );
  4496. exit;
  4497. End;
  4498. if ( bExp = $7FF ) then
  4499. Begin
  4500. if ( bSig0 OR bSig1 )<> 0 then
  4501. Begin
  4502. propagateFloat64NaN( a, b, result );
  4503. exit;
  4504. End;
  4505. packFloat64( zSign, 0, 0, 0, result );
  4506. exit;
  4507. End;
  4508. if ( bExp = 0 ) then
  4509. Begin
  4510. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4511. Begin
  4512. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4513. Begin
  4514. invalid:
  4515. float_raise( float_flag_invalid );
  4516. z.low := float64_default_nan_low;
  4517. z.high := float64_default_nan_high;
  4518. result := z;
  4519. exit;
  4520. End;
  4521. float_raise( float_flag_divbyzero );
  4522. packFloat64( zSign, $7FF, 0, 0, result );
  4523. exit;
  4524. End;
  4525. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4526. End;
  4527. if ( aExp = 0 ) then
  4528. Begin
  4529. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4530. Begin
  4531. packFloat64( zSign, 0, 0, 0, result );
  4532. exit;
  4533. End;
  4534. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4535. End;
  4536. zExp := aExp - bExp + $3FD;
  4537. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4538. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4539. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4540. Begin
  4541. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4542. Inc(zExp);
  4543. End;
  4544. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4545. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4546. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4547. while ( sbits32 (rem0) < 0 ) do
  4548. Begin
  4549. Dec(zSig0);
  4550. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4551. End;
  4552. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4553. if ( ( zSig1 and $3FF ) <= 4 ) then
  4554. Begin
  4555. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4556. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4557. while ( sbits32 (rem1) < 0 ) do
  4558. Begin
  4559. Dec(zSig1);
  4560. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4561. End;
  4562. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4563. End;
  4564. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4565. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4566. End;
  4567. {*
  4568. -------------------------------------------------------------------------------
  4569. Returns the remainder of the double-precision floating-point value `a'
  4570. with respect to the corresponding value `b'. The operation is performed
  4571. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4572. -------------------------------------------------------------------------------
  4573. *}
  4574. Function float64_rem(a: float64; b : float64) : float64;
  4575. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4576. Var
  4577. aSign, bSign, zSign: flag;
  4578. aExp, bExp, expDiff: int16;
  4579. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4580. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4581. sigMean0: sbits32;
  4582. z: float64;
  4583. label invalid;
  4584. Begin
  4585. aSig1 := extractFloat64Frac1( a );
  4586. aSig0 := extractFloat64Frac0( a );
  4587. aExp := extractFloat64Exp( a );
  4588. aSign := extractFloat64Sign( a );
  4589. bSig1 := extractFloat64Frac1( b );
  4590. bSig0 := extractFloat64Frac0( b );
  4591. bExp := extractFloat64Exp( b );
  4592. bSign := extractFloat64Sign( b );
  4593. if ( aExp = $7FF ) then
  4594. Begin
  4595. if ((( aSig0 OR aSig1 )<>0)
  4596. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4597. Begin
  4598. propagateFloat64NaN( a, b, result );
  4599. exit;
  4600. End;
  4601. goto invalid;
  4602. End;
  4603. if ( bExp = $7FF ) then
  4604. Begin
  4605. if ( bSig0 OR bSig1 ) <> 0 then
  4606. Begin
  4607. propagateFloat64NaN( a, b, result );
  4608. exit;
  4609. End;
  4610. result := a;
  4611. exit;
  4612. End;
  4613. if ( bExp = 0 ) then
  4614. Begin
  4615. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4616. Begin
  4617. invalid:
  4618. float_raise( float_flag_invalid );
  4619. z.low := float64_default_nan_low;
  4620. z.high := float64_default_nan_high;
  4621. result := z;
  4622. exit;
  4623. End;
  4624. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4625. End;
  4626. if ( aExp = 0 ) then
  4627. Begin
  4628. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4629. Begin
  4630. result := a;
  4631. exit;
  4632. End;
  4633. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4634. End;
  4635. expDiff := aExp - bExp;
  4636. if ( expDiff < -1 ) then
  4637. Begin
  4638. result := a;
  4639. exit;
  4640. End;
  4641. shortShift64Left(
  4642. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4643. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4644. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4645. if ( q )<>0 then
  4646. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4647. expDiff := expDiff - 32;
  4648. while ( 0 < expDiff ) do
  4649. Begin
  4650. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4651. if 4 < q then
  4652. q:= q - 4
  4653. else
  4654. q := 0;
  4655. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4656. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4657. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4658. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4659. expDiff := expDiff - 29;
  4660. End;
  4661. if ( -32 < expDiff ) then
  4662. Begin
  4663. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4664. if 4 < q then
  4665. q := q - 4
  4666. else
  4667. q := 0;
  4668. q := q shr (- expDiff);
  4669. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4670. expDiff := expDiff + 24;
  4671. if ( expDiff < 0 ) then
  4672. Begin
  4673. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4674. End
  4675. else
  4676. Begin
  4677. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4678. End;
  4679. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4680. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4681. End
  4682. else
  4683. Begin
  4684. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4685. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4686. End;
  4687. Repeat
  4688. alternateASig0 := aSig0;
  4689. alternateASig1 := aSig1;
  4690. Inc(q);
  4691. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4692. Until not ( 0 <= sbits32 (aSig0) );
  4693. add64(
  4694. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4695. if ( ( sigMean0 < 0 )
  4696. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4697. Begin
  4698. aSig0 := alternateASig0;
  4699. aSig1 := alternateASig1;
  4700. End;
  4701. zSign := flag( sbits32 (aSig0) < 0 );
  4702. if ( zSign <> 0 ) then
  4703. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4704. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4705. End;
  4706. {*
  4707. -------------------------------------------------------------------------------
  4708. Returns the square root of the double-precision floating-point value `a'.
  4709. The operation is performed according to the IEC/IEEE Standard for Binary
  4710. Floating-Point Arithmetic.
  4711. -------------------------------------------------------------------------------
  4712. *}
  4713. Procedure float64_sqrt( a: float64; var out: float64 );
  4714. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4715. Var
  4716. aSign: flag;
  4717. aExp, zExp: int16;
  4718. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4719. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4720. z: float64;
  4721. label invalid;
  4722. Begin
  4723. aSig1 := extractFloat64Frac1( a );
  4724. aSig0 := extractFloat64Frac0( a );
  4725. aExp := extractFloat64Exp( a );
  4726. aSign := extractFloat64Sign( a );
  4727. if ( aExp = $7FF ) then
  4728. Begin
  4729. if ( aSig0 OR aSig1 ) <> 0 then
  4730. Begin
  4731. propagateFloat64NaN( a, a, out );
  4732. exit;
  4733. End;
  4734. if ( aSign = 0) then
  4735. Begin
  4736. out := a;
  4737. exit;
  4738. End;
  4739. goto invalid;
  4740. End;
  4741. if ( aSign <> 0 ) then
  4742. Begin
  4743. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4744. Begin
  4745. out := a;
  4746. exit;
  4747. End;
  4748. invalid:
  4749. float_raise( float_flag_invalid );
  4750. z.low := float64_default_nan_low;
  4751. z.high := float64_default_nan_high;
  4752. out := z;
  4753. exit;
  4754. End;
  4755. if ( aExp = 0 ) then
  4756. Begin
  4757. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4758. Begin
  4759. packFloat64( 0, 0, 0, 0, out );
  4760. exit;
  4761. End;
  4762. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4763. End;
  4764. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4765. aSig0 := aSig0 or $00100000;
  4766. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4767. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4768. if ( zSig0 = 0 ) then
  4769. zSig0 := $7FFFFFFF;
  4770. doubleZSig0 := zSig0 + zSig0;
  4771. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4772. mul32To64( zSig0, zSig0, term0, term1 );
  4773. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4774. while ( sbits32 (rem0) < 0 ) do
  4775. Begin
  4776. Dec(zSig0);
  4777. doubleZSig0 := doubleZSig0 - 2;
  4778. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4779. End;
  4780. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4781. if ( ( zSig1 and $1FF ) <= 5 ) then
  4782. Begin
  4783. if ( zSig1 = 0 ) then
  4784. zSig1 := 1;
  4785. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4786. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4787. mul32To64( zSig1, zSig1, term2, term3 );
  4788. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4789. while ( sbits32 (rem1) < 0 ) do
  4790. Begin
  4791. Dec(zSig1);
  4792. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4793. term3 := term3 or 1;
  4794. term2 := term2 or doubleZSig0;
  4795. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4796. End;
  4797. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4798. End;
  4799. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4800. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4801. End;
  4802. {*
  4803. -------------------------------------------------------------------------------
  4804. Returns 1 if the double-precision floating-point value `a' is equal to
  4805. the corresponding value `b', and 0 otherwise. The comparison is performed
  4806. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4807. -------------------------------------------------------------------------------
  4808. *}
  4809. Function float64_eq(a: float64; b: float64): flag;
  4810. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4811. Begin
  4812. if
  4813. (
  4814. ( extractFloat64Exp( a ) = $7FF )
  4815. AND
  4816. (
  4817. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4818. )
  4819. )
  4820. OR (
  4821. ( extractFloat64Exp( b ) = $7FF )
  4822. AND (
  4823. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4824. )
  4825. )
  4826. ) then
  4827. Begin
  4828. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4829. float_raise( float_flag_invalid );
  4830. float64_eq := 0;
  4831. exit;
  4832. End;
  4833. float64_eq := flag(
  4834. ( a.low = b.low )
  4835. AND ( ( a.high = b.high )
  4836. OR ( ( a.low = 0 )
  4837. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4838. ));
  4839. End;
  4840. {*
  4841. -------------------------------------------------------------------------------
  4842. Returns 1 if the double-precision floating-point value `a' is less than
  4843. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4844. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4845. Arithmetic.
  4846. -------------------------------------------------------------------------------
  4847. *}
  4848. Function float64_le(a: float64;b: float64): flag;
  4849. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4850. Var
  4851. aSign, bSign: flag;
  4852. Begin
  4853. if
  4854. (
  4855. ( extractFloat64Exp( a ) = $7FF )
  4856. AND
  4857. (
  4858. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4859. )
  4860. )
  4861. OR (
  4862. ( extractFloat64Exp( b ) = $7FF )
  4863. AND (
  4864. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4865. )
  4866. )
  4867. ) then
  4868. Begin
  4869. float_raise( float_flag_invalid );
  4870. float64_le := 0;
  4871. exit;
  4872. End;
  4873. aSign := extractFloat64Sign( a );
  4874. bSign := extractFloat64Sign( b );
  4875. if ( aSign <> bSign ) then
  4876. Begin
  4877. float64_le := flag(
  4878. (aSign <> 0)
  4879. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4880. = 0 ));
  4881. exit;
  4882. End;
  4883. if aSign <> 0 then
  4884. float64_le := le64( b.high, b.low, a.high, a.low )
  4885. else
  4886. float64_le := le64( a.high, a.low, b.high, b.low );
  4887. End;
  4888. {*
  4889. -------------------------------------------------------------------------------
  4890. Returns 1 if the double-precision floating-point value `a' is less than
  4891. the corresponding value `b', and 0 otherwise. The comparison is performed
  4892. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4893. -------------------------------------------------------------------------------
  4894. *}
  4895. Function float64_lt(a: float64;b: float64): flag;
  4896. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4897. Var
  4898. aSign, bSign: flag;
  4899. Begin
  4900. if
  4901. (
  4902. ( extractFloat64Exp( a ) = $7FF )
  4903. AND
  4904. (
  4905. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4906. )
  4907. )
  4908. OR (
  4909. ( extractFloat64Exp( b ) = $7FF )
  4910. AND (
  4911. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4912. )
  4913. )
  4914. ) then
  4915. Begin
  4916. float_raise( float_flag_invalid );
  4917. float64_lt := 0;
  4918. exit;
  4919. End;
  4920. aSign := extractFloat64Sign( a );
  4921. bSign := extractFloat64Sign( b );
  4922. if ( aSign <> bSign ) then
  4923. Begin
  4924. float64_lt := flag(
  4925. (aSign <> 0)
  4926. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4927. <> 0 ));
  4928. exit;
  4929. End;
  4930. if aSign <> 0 then
  4931. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4932. else
  4933. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4934. End;
  4935. {*
  4936. -------------------------------------------------------------------------------
  4937. Returns 1 if the double-precision floating-point value `a' is equal to
  4938. the corresponding value `b', and 0 otherwise. The invalid exception is
  4939. raised if either operand is a NaN. Otherwise, the comparison is performed
  4940. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4941. -------------------------------------------------------------------------------
  4942. *}
  4943. Function float64_eq_signaling( a: float64; b: float64): flag;
  4944. Begin
  4945. if
  4946. (
  4947. ( extractFloat64Exp( a ) = $7FF )
  4948. AND
  4949. (
  4950. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4951. )
  4952. )
  4953. OR (
  4954. ( extractFloat64Exp( b ) = $7FF )
  4955. AND (
  4956. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4957. )
  4958. )
  4959. ) then
  4960. Begin
  4961. float_raise( float_flag_invalid );
  4962. float64_eq_signaling := 0;
  4963. exit;
  4964. End;
  4965. float64_eq_signaling := flag(
  4966. ( a.low = b.low )
  4967. AND ( ( a.high = b.high )
  4968. OR ( ( a.low = 0 )
  4969. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4970. ));
  4971. End;
  4972. {*
  4973. -------------------------------------------------------------------------------
  4974. Returns 1 if the double-precision floating-point value `a' is less than or
  4975. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4976. cause an exception. Otherwise, the comparison is performed according to the
  4977. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4978. -------------------------------------------------------------------------------
  4979. *}
  4980. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4981. Var
  4982. aSign, bSign : flag;
  4983. Begin
  4984. if
  4985. (
  4986. ( extractFloat64Exp( a ) = $7FF )
  4987. AND
  4988. (
  4989. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4990. )
  4991. )
  4992. OR (
  4993. ( extractFloat64Exp( b ) = $7FF )
  4994. AND (
  4995. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4996. )
  4997. )
  4998. ) then
  4999. Begin
  5000. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5001. float_raise( float_flag_invalid );
  5002. float64_le_quiet := 0;
  5003. exit;
  5004. End;
  5005. aSign := extractFloat64Sign( a );
  5006. bSign := extractFloat64Sign( b );
  5007. if ( aSign <> bSign ) then
  5008. Begin
  5009. float64_le_quiet := flag
  5010. ((aSign <> 0)
  5011. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5012. = 0 ));
  5013. exit;
  5014. End;
  5015. if aSign <> 0 then
  5016. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5017. else
  5018. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5019. End;
  5020. {*
  5021. -------------------------------------------------------------------------------
  5022. Returns 1 if the double-precision floating-point value `a' is less than
  5023. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5024. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5025. Standard for Binary Floating-Point Arithmetic.
  5026. -------------------------------------------------------------------------------
  5027. *}
  5028. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5029. Var
  5030. aSign, bSign: flag;
  5031. Begin
  5032. if
  5033. (
  5034. ( extractFloat64Exp( a ) = $7FF )
  5035. AND
  5036. (
  5037. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5038. )
  5039. )
  5040. OR (
  5041. ( extractFloat64Exp( b ) = $7FF )
  5042. AND (
  5043. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5044. )
  5045. )
  5046. ) then
  5047. Begin
  5048. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5049. float_raise( float_flag_invalid );
  5050. float64_lt_quiet := 0;
  5051. exit;
  5052. End;
  5053. aSign := extractFloat64Sign( a );
  5054. bSign := extractFloat64Sign( b );
  5055. if ( aSign <> bSign ) then
  5056. Begin
  5057. float64_lt_quiet := flag(
  5058. (aSign<>0)
  5059. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5060. <> 0 ));
  5061. exit;
  5062. End;
  5063. If aSign <> 0 then
  5064. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5065. else
  5066. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5067. End;
  5068. {*----------------------------------------------------------------------------
  5069. | Returns the result of converting the 64-bit two's complement integer `a'
  5070. | to the single-precision floating-point format. The conversion is performed
  5071. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5072. *----------------------------------------------------------------------------*}
  5073. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5074. var
  5075. zSign : flag;
  5076. absA : uint64;
  5077. shiftCount: int8;
  5078. zSig : bits32;
  5079. intval : int64rec;
  5080. Begin
  5081. if ( a = 0 ) then
  5082. begin
  5083. int64_to_float32.float32 := 0;
  5084. exit;
  5085. end;
  5086. if a < 0 then
  5087. zSign := flag(TRUE)
  5088. else
  5089. zSign := flag(FALSE);
  5090. if zSign<>0 then
  5091. absA := -a
  5092. else
  5093. absA := a;
  5094. shiftCount := countLeadingZeros64( absA ) - 40;
  5095. if ( 0 <= shiftCount ) then
  5096. begin
  5097. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5098. end
  5099. else
  5100. begin
  5101. shiftCount := shiftCount + 7;
  5102. if ( shiftCount < 0 ) then
  5103. begin
  5104. intval.low := int64rec(AbsA).low;
  5105. intval.high := int64rec(AbsA).high;
  5106. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5107. intval.low, intval.high);
  5108. int64rec(absA).low := intval.low;
  5109. int64rec(absA).high := intval.high;
  5110. end
  5111. else
  5112. absA := absA shl shiftCount;
  5113. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5114. end;
  5115. End;
  5116. {*----------------------------------------------------------------------------
  5117. | Returns the result of converting the 64-bit two's complement integer `a'
  5118. | to the double-precision floating-point format. The conversion is performed
  5119. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5120. *----------------------------------------------------------------------------*}
  5121. function int64_to_float64( a: int64 ): float64;
  5122. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5123. var
  5124. zSign : flag;
  5125. float_result : float64;
  5126. intval : int64rec;
  5127. AbsA : bits64;
  5128. shiftcount : int8;
  5129. zSig0, zSig1 : bits32;
  5130. Begin
  5131. if ( a = 0 ) then
  5132. Begin
  5133. packFloat64( 0, 0, 0, 0, result );
  5134. exit;
  5135. end;
  5136. zSign := flag( a < 0 );
  5137. if ZSign<>0 then
  5138. AbsA := -a
  5139. else
  5140. AbsA := a;
  5141. shiftCount := countLeadingZeros64( absA ) - 11;
  5142. if ( 0 <= shiftCount ) then
  5143. Begin
  5144. absA := absA shl shiftcount;
  5145. zSig0:=int64rec(absA).high;
  5146. zSig1:=int64rec(absA).low;
  5147. End
  5148. else
  5149. Begin
  5150. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  5151. End;
  5152. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5153. int64_to_float64:= float_result;
  5154. End;
  5155. {*----------------------------------------------------------------------------
  5156. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5157. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5158. | Otherwise, returns 0.
  5159. *----------------------------------------------------------------------------*}
  5160. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5161. begin
  5162. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5163. end;
  5164. {*----------------------------------------------------------------------------
  5165. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5166. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5167. | Otherwise, returns 0.
  5168. *----------------------------------------------------------------------------*}
  5169. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5170. begin
  5171. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5172. end;
  5173. {*----------------------------------------------------------------------------
  5174. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5175. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5176. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5177. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5178. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5179. | the most-significant bit of the extra result, and the other 63 bits of the
  5180. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5181. | were all zero. This extra result is stored in the location pointed to by
  5182. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5183. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5184. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5185. | fixed-point value is shifted right by the number of bits given in `count',
  5186. | and the integer part of the result is returned at the locations pointed to
  5187. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5188. | corrupted as described above, and is returned at the location pointed to by
  5189. | `z2Ptr'.)
  5190. *----------------------------------------------------------------------------*}
  5191. procedure shift128ExtraRightJamming(
  5192. a0: bits64;
  5193. a1: bits64;
  5194. a2: bits64;
  5195. count: int16;
  5196. var z0Ptr: bits64;
  5197. var z1Ptr: bits64;
  5198. var z2Ptr: bits64);
  5199. var
  5200. z0, z1, z2: bits64;
  5201. negCount: int8;
  5202. begin
  5203. negCount := ( - count ) and 63;
  5204. if ( count = 0 ) then
  5205. begin
  5206. z2 := a2;
  5207. z1 := a1;
  5208. z0 := a0;
  5209. end
  5210. else begin
  5211. if ( count < 64 ) then
  5212. begin
  5213. z2 := a1 shr negCount;
  5214. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5215. z0 := a0 shr count;
  5216. end
  5217. else begin
  5218. if ( count = 64 ) then
  5219. begin
  5220. z2 := a1;
  5221. z1 := a0;
  5222. end
  5223. else begin
  5224. a2 := a2 or a1;
  5225. if ( count < 128 ) then
  5226. begin
  5227. z2 := a0 shl negCount;
  5228. z1 := a0 shr ( count and 63 );
  5229. end
  5230. else begin
  5231. if ( count = 128 ) then
  5232. z2 := a0
  5233. else
  5234. z2 := ord( a0 <> 0 );
  5235. z1 := 0;
  5236. end;
  5237. end;
  5238. z0 := 0;
  5239. end;
  5240. z2 := z2 or ord( a2 <> 0 );
  5241. end;
  5242. z2Ptr := z2;
  5243. z1Ptr := z1;
  5244. z0Ptr := z0;
  5245. end;
  5246. {*----------------------------------------------------------------------------
  5247. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5248. | _plus_ the number of bits given in `count'. The shifted result is at most
  5249. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5250. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5251. | shifted off is the most-significant bit of the extra result, and the other
  5252. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5253. | bits shifted off were all zero. This extra result is stored in the location
  5254. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5255. | (This routine makes more sense if `a0' and `a1' are considered to form
  5256. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5257. | point value is shifted right by the number of bits given in `count', and
  5258. | the integer part of the result is returned at the location pointed to by
  5259. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5260. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5261. *----------------------------------------------------------------------------*}
  5262. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5263. var
  5264. z0, z1: bits64;
  5265. negCount: int8;
  5266. begin
  5267. negCount := ( - count ) and 63;
  5268. if ( count = 0 ) then
  5269. begin
  5270. z1 := a1;
  5271. z0 := a0;
  5272. end
  5273. else if ( count < 64 ) then
  5274. begin
  5275. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5276. z0 := a0 shr count;
  5277. end
  5278. else begin
  5279. if ( count = 64 ) then
  5280. begin
  5281. z1 := a0 or ord( a1 <> 0 );
  5282. end
  5283. else begin
  5284. z1 := ord( ( a0 or a1 ) <> 0 );
  5285. end;
  5286. z0 := 0;
  5287. end;
  5288. z1Ptr := z1;
  5289. z0Ptr := z0;
  5290. end;
  5291. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5292. {*----------------------------------------------------------------------------
  5293. | Returns the fraction bits of the extended double-precision floating-point
  5294. | value `a'.
  5295. *----------------------------------------------------------------------------*}
  5296. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5297. begin
  5298. result:=a.low;
  5299. end;
  5300. {*----------------------------------------------------------------------------
  5301. | Returns the exponent bits of the extended double-precision floating-point
  5302. | value `a'.
  5303. *----------------------------------------------------------------------------*}
  5304. function extractFloatx80Exp(a : floatx80): int32;inline;
  5305. begin
  5306. result:=a.high and $7FFF;
  5307. end;
  5308. {*----------------------------------------------------------------------------
  5309. | Returns the sign bit of the extended double-precision floating-point value
  5310. | `a'.
  5311. *----------------------------------------------------------------------------*}
  5312. function extractFloatx80Sign(a : floatx80): flag;inline;
  5313. begin
  5314. result:=a.high shr 15;
  5315. end;
  5316. {*----------------------------------------------------------------------------
  5317. | Normalizes the subnormal extended double-precision floating-point value
  5318. | represented by the denormalized significand `aSig'. The normalized exponent
  5319. | and significand are stored at the locations pointed to by `zExpPtr' and
  5320. | `zSigPtr', respectively.
  5321. *----------------------------------------------------------------------------*}
  5322. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5323. var
  5324. shiftCount: int8;
  5325. begin
  5326. shiftCount := countLeadingZeros64( aSig );
  5327. zSigPtr := aSig shl shiftCount;
  5328. zExpPtr := 1 - shiftCount;
  5329. end;
  5330. {*----------------------------------------------------------------------------
  5331. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5332. | extended double-precision floating-point value, returning the result.
  5333. *----------------------------------------------------------------------------*}
  5334. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5335. var
  5336. z: floatx80;
  5337. begin
  5338. z.low := zSig;
  5339. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5340. result:=z;
  5341. end;
  5342. {*----------------------------------------------------------------------------
  5343. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5344. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5345. | and returns the proper extended double-precision floating-point value
  5346. | corresponding to the abstract input. Ordinarily, the abstract value is
  5347. | rounded and packed into the extended double-precision format, with the
  5348. | inexact exception raised if the abstract input cannot be represented
  5349. | exactly. However, if the abstract value is too large, the overflow and
  5350. | inexact exceptions are raised and an infinity or maximal finite value is
  5351. | returned. If the abstract value is too small, the input value is rounded to
  5352. | a subnormal number, and the underflow and inexact exceptions are raised if
  5353. | the abstract input cannot be represented exactly as a subnormal extended
  5354. | double-precision floating-point number.
  5355. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5356. | number of bits as single or double precision, respectively. Otherwise, the
  5357. | result is rounded to the full precision of the extended double-precision
  5358. | format.
  5359. | The input significand must be normalized or smaller. If the input
  5360. | significand is not normalized, `zExp' must be 0; in that case, the result
  5361. | returned is a subnormal number, and it must not require rounding. The
  5362. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5363. | Floating-Point Arithmetic.
  5364. *----------------------------------------------------------------------------*}
  5365. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5366. var
  5367. roundingMode: int8;
  5368. roundNearestEven, increment, isTiny: flag;
  5369. roundIncrement, roundMask, roundBits: int64;
  5370. label
  5371. precision80;
  5372. begin
  5373. roundingMode := float_rounding_mode;
  5374. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5375. if ( roundingPrecision = 80 ) then
  5376. goto precision80;
  5377. if ( roundingPrecision = 64 ) then
  5378. begin
  5379. roundIncrement := int64( $0000000000000400 );
  5380. roundMask := int64( $00000000000007FF );
  5381. end
  5382. else if ( roundingPrecision = 32 ) then
  5383. begin
  5384. roundIncrement := int64( $0000008000000000 );
  5385. roundMask := int64( $000000FFFFFFFFFF );
  5386. end
  5387. else begin
  5388. goto precision80;
  5389. end;
  5390. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5391. if ( not (roundNearestEven<>0) ) then
  5392. begin
  5393. if ( roundingMode = float_round_to_zero ) then
  5394. begin
  5395. roundIncrement := 0;
  5396. end
  5397. else begin
  5398. roundIncrement := roundMask;
  5399. if ( zSign<>0 ) then
  5400. begin
  5401. if ( roundingMode = float_round_up ) then
  5402. roundIncrement := 0;
  5403. end
  5404. else begin
  5405. if ( roundingMode = float_round_down ) then
  5406. roundIncrement := 0;
  5407. end;
  5408. end;
  5409. end;
  5410. roundBits := zSig0 and roundMask;
  5411. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5412. if ( ( $7FFE < zExp )
  5413. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5414. ) begin
  5415. goto overflow;
  5416. end;
  5417. if ( zExp <= 0 ) begin
  5418. isTiny =
  5419. ( float_detect_tininess = float_tininess_before_rounding )
  5420. or ( zExp < 0 )
  5421. or ( zSig0 <= zSig0 + roundIncrement );
  5422. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5423. zExp := 0;
  5424. roundBits := zSig0 and roundMask;
  5425. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5426. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5427. zSig0 += roundIncrement;
  5428. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5429. roundIncrement := roundMask + 1;
  5430. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5431. roundMask |= roundIncrement;
  5432. end;
  5433. zSig0 = ~ roundMask;
  5434. result:=packFloatx80( zSign, zExp, zSig0 );
  5435. end;
  5436. end;
  5437. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5438. zSig0 += roundIncrement;
  5439. if ( zSig0 < roundIncrement ) begin
  5440. ++zExp;
  5441. zSig0 := LIT64( $8000000000000000 );
  5442. end;
  5443. roundIncrement := roundMask + 1;
  5444. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5445. roundMask |= roundIncrement;
  5446. end;
  5447. zSig0 = ~ roundMask;
  5448. if ( zSig0 = 0 ) zExp := 0;
  5449. result:=packFloatx80( zSign, zExp, zSig0 );
  5450. precision80:
  5451. increment := ( (sbits64) zSig1 < 0 );
  5452. if ( ! roundNearestEven ) begin
  5453. if ( roundingMode = float_round_to_zero ) begin
  5454. increment := 0;
  5455. end;
  5456. else begin
  5457. if ( zSign ) begin
  5458. increment := ( roundingMode = float_round_down ) and zSig1;
  5459. end;
  5460. else begin
  5461. increment := ( roundingMode = float_round_up ) and zSig1;
  5462. end;
  5463. end;
  5464. end;
  5465. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5466. if ( ( $7FFE < zExp )
  5467. or ( ( zExp = $7FFE )
  5468. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5469. and increment
  5470. )
  5471. ) begin
  5472. roundMask := 0;
  5473. overflow:
  5474. float_raise( float_flag_overflow or float_flag_inexact );
  5475. if ( ( roundingMode = float_round_to_zero )
  5476. or ( zSign and ( roundingMode = float_round_up ) )
  5477. or ( ! zSign and ( roundingMode = float_round_down ) )
  5478. ) begin
  5479. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5480. end;
  5481. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5482. end;
  5483. if ( zExp <= 0 ) begin
  5484. isTiny =
  5485. ( float_detect_tininess = float_tininess_before_rounding )
  5486. or ( zExp < 0 )
  5487. or ! increment
  5488. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5489. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5490. zExp := 0;
  5491. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5492. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5493. if ( roundNearestEven ) begin
  5494. increment := ( (sbits64) zSig1 < 0 );
  5495. end;
  5496. else begin
  5497. if ( zSign ) begin
  5498. increment := ( roundingMode = float_round_down ) and zSig1;
  5499. end;
  5500. else begin
  5501. increment := ( roundingMode = float_round_up ) and zSig1;
  5502. end;
  5503. end;
  5504. if ( increment ) begin
  5505. ++zSig0;
  5506. zSig0 =
  5507. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5508. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5509. end;
  5510. result:=packFloatx80( zSign, zExp, zSig0 );
  5511. end;
  5512. end;
  5513. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5514. if ( increment ) begin
  5515. ++zSig0;
  5516. if ( zSig0 = 0 ) begin
  5517. ++zExp;
  5518. zSig0 := LIT64( $8000000000000000 );
  5519. end;
  5520. else begin
  5521. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5522. end;
  5523. end;
  5524. else begin
  5525. if ( zSig0 = 0 ) zExp := 0;
  5526. end;
  5527. result:=packFloatx80( zSign, zExp, zSig0 );
  5528. end;
  5529. {*----------------------------------------------------------------------------
  5530. | Takes an abstract floating-point value having sign `zSign', exponent
  5531. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5532. | and returns the proper extended double-precision floating-point value
  5533. | corresponding to the abstract input. This routine is just like
  5534. | `roundAndPackFloatx80' except that the input significand does not have to be
  5535. | normalized.
  5536. *----------------------------------------------------------------------------*}
  5537. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5538. var
  5539. shiftCount: int8;
  5540. begin
  5541. if ( zSig0 = 0 ) begin
  5542. zSig0 := zSig1;
  5543. zSig1 := 0;
  5544. zExp -= 64;
  5545. end;
  5546. shiftCount := countLeadingZeros64( zSig0 );
  5547. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5548. zExp := eExp - shiftCount;
  5549. return
  5550. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5551. end;
  5552. {*----------------------------------------------------------------------------
  5553. | Returns the result of converting the extended double-precision floating-
  5554. | point value `a' to the 32-bit two's complement integer format. The
  5555. | conversion is performed according to the IEC/IEEE Standard for Binary
  5556. | Floating-Point Arithmetic---which means in particular that the conversion
  5557. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5558. | largest positive integer is returned. Otherwise, if the conversion
  5559. | overflows, the largest integer with the same sign as `a' is returned.
  5560. *----------------------------------------------------------------------------*}
  5561. function floatx80_to_int32(a: floatx80): int32;
  5562. var
  5563. aSign: flag;
  5564. aExp, shiftCount: int32;
  5565. aSig: bits64;
  5566. begin
  5567. aSig := extractFloatx80Frac( a );
  5568. aExp := extractFloatx80Exp( a );
  5569. aSign := extractFloatx80Sign( a );
  5570. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5571. shiftCount := $4037 - aExp;
  5572. if ( shiftCount <= 0 ) shiftCount := 1;
  5573. shift64RightJamming( aSig, shiftCount, aSig );
  5574. result := roundAndPackInt32( aSign, aSig );
  5575. end;
  5576. {*----------------------------------------------------------------------------
  5577. | Returns the result of converting the extended double-precision floating-
  5578. | point value `a' to the 32-bit two's complement integer format. The
  5579. | conversion is performed according to the IEC/IEEE Standard for Binary
  5580. | Floating-Point Arithmetic, except that the conversion is always rounded
  5581. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5582. | Otherwise, if the conversion overflows, the largest integer with the same
  5583. | sign as `a' is returned.
  5584. *----------------------------------------------------------------------------*}
  5585. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5586. var
  5587. aSign: flag;
  5588. aExp, shiftCount: int32;
  5589. aSig, savedASig: bits64;
  5590. z: int32;
  5591. begin
  5592. aSig := extractFloatx80Frac( a );
  5593. aExp := extractFloatx80Exp( a );
  5594. aSign := extractFloatx80Sign( a );
  5595. if ( $401E < aExp ) begin
  5596. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5597. goto invalid;
  5598. end;
  5599. else if ( aExp < $3FFF ) begin
  5600. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5601. result := 0;
  5602. end;
  5603. shiftCount := $403E - aExp;
  5604. savedASig := aSig;
  5605. aSig >>= shiftCount;
  5606. z := aSig;
  5607. if ( aSign ) z := - z;
  5608. if ( ( z < 0 ) xor aSign ) begin
  5609. invalid:
  5610. float_raise( float_flag_invalid );
  5611. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5612. end;
  5613. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5614. softfloat_exception_flags or= float_flag_inexact;
  5615. end;
  5616. result := z;
  5617. end;
  5618. {*----------------------------------------------------------------------------
  5619. | Returns the result of converting the extended double-precision floating-
  5620. | point value `a' to the 64-bit two's complement integer format. The
  5621. | conversion is performed according to the IEC/IEEE Standard for Binary
  5622. | Floating-Point Arithmetic---which means in particular that the conversion
  5623. | is rounded according to the current rounding mode. If `a' is a NaN,
  5624. | the largest positive integer is returned. Otherwise, if the conversion
  5625. | overflows, the largest integer with the same sign as `a' is returned.
  5626. *----------------------------------------------------------------------------*}
  5627. function floatx80_to_int64(a: floatx80): int64;
  5628. var
  5629. aSign: flag;
  5630. aExp, shiftCount: int32;
  5631. aSig, aSigExtra: bits64;
  5632. begin
  5633. aSig := extractFloatx80Frac( a );
  5634. aExp := extractFloatx80Exp( a );
  5635. aSign := extractFloatx80Sign( a );
  5636. shiftCount := $403E - aExp;
  5637. if ( shiftCount <= 0 ) begin
  5638. if ( shiftCount ) begin
  5639. float_raise( float_flag_invalid );
  5640. if ( ! aSign
  5641. or ( ( aExp = $7FFF )
  5642. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5643. ) begin
  5644. result := LIT64( $7FFFFFFFFFFFFFFF );
  5645. end;
  5646. result := (sbits64) LIT64( $8000000000000000 );
  5647. end;
  5648. aSigExtra := 0;
  5649. end;
  5650. else begin
  5651. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5652. end;
  5653. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5654. end;
  5655. {*----------------------------------------------------------------------------
  5656. | Returns the result of converting the extended double-precision floating-
  5657. | point value `a' to the 64-bit two's complement integer format. The
  5658. | conversion is performed according to the IEC/IEEE Standard for Binary
  5659. | Floating-Point Arithmetic, except that the conversion is always rounded
  5660. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5661. | Otherwise, if the conversion overflows, the largest integer with the same
  5662. | sign as `a' is returned.
  5663. *----------------------------------------------------------------------------*}
  5664. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5665. var
  5666. aSign: flag;
  5667. aExp, shiftCount: int32;
  5668. aSig: bits64;
  5669. z: int64;
  5670. begin
  5671. aSig := extractFloatx80Frac( a );
  5672. aExp := extractFloatx80Exp( a );
  5673. aSign := extractFloatx80Sign( a );
  5674. shiftCount := aExp - $403E;
  5675. if ( 0 <= shiftCount ) begin
  5676. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5677. if ( ( a.high <> $C03E ) or aSig ) begin
  5678. float_raise( float_flag_invalid );
  5679. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5680. result := LIT64( $7FFFFFFFFFFFFFFF );
  5681. end;
  5682. end;
  5683. result := (sbits64) LIT64( $8000000000000000 );
  5684. end;
  5685. else if ( aExp < $3FFF ) begin
  5686. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5687. result := 0;
  5688. end;
  5689. z := aSig>>( - shiftCount );
  5690. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5691. softfloat_exception_flags or= float_flag_inexact;
  5692. end;
  5693. if ( aSign ) z := - z;
  5694. result := z;
  5695. end;
  5696. {*----------------------------------------------------------------------------
  5697. | Returns the result of converting the extended double-precision floating-
  5698. | point value `a' to the single-precision floating-point format. The
  5699. | conversion is performed according to the IEC/IEEE Standard for Binary
  5700. | Floating-Point Arithmetic.
  5701. *----------------------------------------------------------------------------*}
  5702. function floatx80_to_float32(a: floatx80): float32;
  5703. var
  5704. aSign: flag;
  5705. aExp: int32;
  5706. aSig: bits64;
  5707. begin
  5708. aSig := extractFloatx80Frac( a );
  5709. aExp := extractFloatx80Exp( a );
  5710. aSign := extractFloatx80Sign( a );
  5711. if ( aExp = $7FFF ) begin
  5712. if ( (bits64) ( aSig shl 1 ) ) begin
  5713. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5714. end;
  5715. result := packFloat32( aSign, $FF, 0 );
  5716. end;
  5717. shift64RightJamming( aSig, 33, aSig );
  5718. if ( aExp or aSig ) aExp -= $3F81;
  5719. result := roundAndPackFloat32( aSign, aExp, aSig );
  5720. end;
  5721. {*----------------------------------------------------------------------------
  5722. | Returns the result of converting the extended double-precision floating-
  5723. | point value `a' to the double-precision floating-point format. The
  5724. | conversion is performed according to the IEC/IEEE Standard for Binary
  5725. | Floating-Point Arithmetic.
  5726. *----------------------------------------------------------------------------*}
  5727. function floatx80_to_float64(a: floatx80): float64;
  5728. var
  5729. aSign: flag;
  5730. aExp: int32;
  5731. aSig, zSig: bits64;
  5732. begin
  5733. aSig := extractFloatx80Frac( a );
  5734. aExp := extractFloatx80Exp( a );
  5735. aSign := extractFloatx80Sign( a );
  5736. if ( aExp = $7FFF ) begin
  5737. if ( (bits64) ( aSig shl 1 ) ) begin
  5738. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5739. end;
  5740. result := packFloat64( aSign, $7FF, 0 );
  5741. end;
  5742. shift64RightJamming( aSig, 1, zSig );
  5743. if ( aExp or aSig ) aExp -= $3C01;
  5744. result := roundAndPackFloat64( aSign, aExp, zSig );
  5745. end;
  5746. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5747. {*----------------------------------------------------------------------------
  5748. | Returns the result of converting the extended double-precision floating-
  5749. | point value `a' to the quadruple-precision floating-point format. The
  5750. | conversion is performed according to the IEC/IEEE Standard for Binary
  5751. | Floating-Point Arithmetic.
  5752. *----------------------------------------------------------------------------*}
  5753. function floatx80_to_float128(a: floatx80): float128;
  5754. var
  5755. aSign: flag;
  5756. aExp: int16;
  5757. aSig, zSig0, zSig1: bits64;
  5758. begin
  5759. aSig := extractFloatx80Frac( a );
  5760. aExp := extractFloatx80Exp( a );
  5761. aSign := extractFloatx80Sign( a );
  5762. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5763. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5764. end;
  5765. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5766. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5767. end;
  5768. {$endif FPC_SOFTFLOAT_FLOAT128}
  5769. {*----------------------------------------------------------------------------
  5770. | Rounds the extended double-precision floating-point value `a' to an integer,
  5771. | and Returns the result as an extended quadruple-precision floating-point
  5772. | value. The operation is performed according to the IEC/IEEE Standard for
  5773. | Binary Floating-Point Arithmetic.
  5774. *----------------------------------------------------------------------------*}
  5775. function floatx80_round_to_int(a: floatx80): floatx80;
  5776. var
  5777. aSign: flag;
  5778. aExp: int32;
  5779. lastBitMask, roundBitsMask: bits64;
  5780. roundingMode: int8;
  5781. z: floatx80;
  5782. begin
  5783. aExp := extractFloatx80Exp( a );
  5784. if ( $403E <= aExp ) begin
  5785. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5786. result := propagateFloatx80NaN( a, a );
  5787. end;
  5788. result := a;
  5789. end;
  5790. if ( aExp < $3FFF ) begin
  5791. if ( ( aExp = 0 )
  5792. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5793. result := a;
  5794. end;
  5795. softfloat_exception_flags or= float_flag_inexact;
  5796. aSign := extractFloatx80Sign( a );
  5797. switch ( float_rounding_mode ) begin
  5798. case float_round_nearest_even:
  5799. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5800. ) begin
  5801. result :=
  5802. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5803. end;
  5804. break;
  5805. case float_round_down:
  5806. result :=
  5807. aSign ?
  5808. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5809. : packFloatx80( 0, 0, 0 );
  5810. case float_round_up:
  5811. result :=
  5812. aSign ? packFloatx80( 1, 0, 0 )
  5813. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5814. end;
  5815. result := packFloatx80( aSign, 0, 0 );
  5816. end;
  5817. lastBitMask := 1;
  5818. lastBitMask shl = $403E - aExp;
  5819. roundBitsMask := lastBitMask - 1;
  5820. z := a;
  5821. roundingMode := float_rounding_mode;
  5822. if ( roundingMode = float_round_nearest_even ) begin
  5823. z.low += lastBitMask>>1;
  5824. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5825. end;
  5826. else if ( roundingMode <> float_round_to_zero ) begin
  5827. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5828. z.low += roundBitsMask;
  5829. end;
  5830. end;
  5831. z.low = ~ roundBitsMask;
  5832. if ( z.low = 0 ) begin
  5833. ++z.high;
  5834. z.low := LIT64( $8000000000000000 );
  5835. end;
  5836. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5837. result := z;
  5838. end;
  5839. {*----------------------------------------------------------------------------
  5840. | Returns the result of adding the absolute values of the extended double-
  5841. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5842. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5843. | The addition is performed according to the IEC/IEEE Standard for Binary
  5844. | Floating-Point Arithmetic.
  5845. *----------------------------------------------------------------------------*}
  5846. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5847. var
  5848. aExp, bExp, zExp: int32;
  5849. aSig, bSig, zSig0, zSig1: bits64;
  5850. expDiff: int32;
  5851. begin
  5852. aSig := extractFloatx80Frac( a );
  5853. aExp := extractFloatx80Exp( a );
  5854. bSig := extractFloatx80Frac( b );
  5855. bExp := extractFloatx80Exp( b );
  5856. expDiff := aExp - bExp;
  5857. if ( 0 < expDiff ) begin
  5858. if ( aExp = $7FFF ) begin
  5859. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5860. result := a;
  5861. end;
  5862. if ( bExp = 0 ) --expDiff;
  5863. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5864. zExp := aExp;
  5865. end;
  5866. else if ( expDiff < 0 ) begin
  5867. if ( bExp = $7FFF ) begin
  5868. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5869. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5870. end;
  5871. if ( aExp = 0 ) ++expDiff;
  5872. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5873. zExp := bExp;
  5874. end;
  5875. else begin
  5876. if ( aExp = $7FFF ) begin
  5877. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5878. result := propagateFloatx80NaN( a, b );
  5879. end;
  5880. result := a;
  5881. end;
  5882. zSig1 := 0;
  5883. zSig0 := aSig + bSig;
  5884. if ( aExp = 0 ) begin
  5885. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5886. goto roundAndPack;
  5887. end;
  5888. zExp := aExp;
  5889. goto shiftRight1;
  5890. end;
  5891. zSig0 := aSig + bSig;
  5892. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5893. shiftRight1:
  5894. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5895. zSig0 or= LIT64( $8000000000000000 );
  5896. ++zExp;
  5897. roundAndPack:
  5898. result :=
  5899. roundAndPackFloatx80(
  5900. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5901. end;
  5902. {*----------------------------------------------------------------------------
  5903. | Returns the result of subtracting the absolute values of the extended
  5904. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5905. | difference is negated before being returned. `zSign' is ignored if the
  5906. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5907. | Standard for Binary Floating-Point Arithmetic.
  5908. *----------------------------------------------------------------------------*}
  5909. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5910. var
  5911. aExp, bExp, zExp: int32;
  5912. aSig, bSig, zSig0, zSig1: bits64;
  5913. expDiff: int32;
  5914. z: floatx80;
  5915. begin
  5916. aSig := extractFloatx80Frac( a );
  5917. aExp := extractFloatx80Exp( a );
  5918. bSig := extractFloatx80Frac( b );
  5919. bExp := extractFloatx80Exp( b );
  5920. expDiff := aExp - bExp;
  5921. if ( 0 < expDiff ) goto aExpBigger;
  5922. if ( expDiff < 0 ) goto bExpBigger;
  5923. if ( aExp = $7FFF ) begin
  5924. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5925. result := propagateFloatx80NaN( a, b );
  5926. end;
  5927. float_raise( float_flag_invalid );
  5928. z.low := floatx80_default_nan_low;
  5929. z.high := floatx80_default_nan_high;
  5930. result := z;
  5931. end;
  5932. if ( aExp = 0 ) begin
  5933. aExp := 1;
  5934. bExp := 1;
  5935. end;
  5936. zSig1 := 0;
  5937. if ( bSig < aSig ) goto aBigger;
  5938. if ( aSig < bSig ) goto bBigger;
  5939. result := packFloatx80( float_rounding_mode = float_round_down, 0, 0 );
  5940. bExpBigger:
  5941. if ( bExp = $7FFF ) begin
  5942. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5943. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5944. end;
  5945. if ( aExp = 0 ) ++expDiff;
  5946. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5947. bBigger:
  5948. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  5949. zExp := bExp;
  5950. zSign xor = 1;
  5951. goto normalizeRoundAndPack;
  5952. aExpBigger:
  5953. if ( aExp = $7FFF ) begin
  5954. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5955. result := a;
  5956. end;
  5957. if ( bExp = 0 ) --expDiff;
  5958. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5959. aBigger:
  5960. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  5961. zExp := aExp;
  5962. normalizeRoundAndPack:
  5963. result :=
  5964. normalizeRoundAndPackFloatx80(
  5965. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5966. end;
  5967. {*----------------------------------------------------------------------------
  5968. | Returns the result of adding the extended double-precision floating-point
  5969. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  5970. | Standard for Binary Floating-Point Arithmetic.
  5971. *----------------------------------------------------------------------------*}
  5972. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  5973. var
  5974. aSign, bSign: flag;
  5975. begin
  5976. aSign := extractFloatx80Sign( a );
  5977. bSign := extractFloatx80Sign( b );
  5978. if ( aSign = bSign ) begin
  5979. result := addFloatx80Sigs( a, b, aSign );
  5980. end;
  5981. else begin
  5982. result := subFloatx80Sigs( a, b, aSign );
  5983. end;
  5984. end;
  5985. {*----------------------------------------------------------------------------
  5986. | Returns the result of subtracting the extended double-precision floating-
  5987. | point values `a' and `b'. The operation is performed according to the
  5988. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5989. *----------------------------------------------------------------------------*}
  5990. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  5991. var
  5992. aSign, bSign: flag;
  5993. begin
  5994. aSign := extractFloatx80Sign( a );
  5995. bSign := extractFloatx80Sign( b );
  5996. if ( aSign = bSign ) begin
  5997. result := subFloatx80Sigs( a, b, aSign );
  5998. end;
  5999. else begin
  6000. result := addFloatx80Sigs( a, b, aSign );
  6001. end;
  6002. end;
  6003. {*----------------------------------------------------------------------------
  6004. | Returns the result of multiplying the extended double-precision floating-
  6005. | point values `a' and `b'. The operation is performed according to the
  6006. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6007. *----------------------------------------------------------------------------*}
  6008. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6009. var
  6010. aSign, bSign, zSign: flag;
  6011. aExp, bExp, zExp: int32;
  6012. aSig, bSig, zSig0, zSig1: bits64;
  6013. z: floatx80;
  6014. begin
  6015. aSig := extractFloatx80Frac( a );
  6016. aExp := extractFloatx80Exp( a );
  6017. aSign := extractFloatx80Sign( a );
  6018. bSig := extractFloatx80Frac( b );
  6019. bExp := extractFloatx80Exp( b );
  6020. bSign := extractFloatx80Sign( b );
  6021. zSign := aSign xor bSign;
  6022. if ( aExp = $7FFF ) begin
  6023. if ( (bits64) ( aSig shl 1 )
  6024. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6025. result := propagateFloatx80NaN( a, b );
  6026. end;
  6027. if ( ( bExp or bSig ) = 0 ) goto invalid;
  6028. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6029. end;
  6030. if ( bExp = $7FFF ) begin
  6031. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6032. if ( ( aExp or aSig ) = 0 ) begin
  6033. invalid:
  6034. float_raise( float_flag_invalid );
  6035. z.low := floatx80_default_nan_low;
  6036. z.high := floatx80_default_nan_high;
  6037. result := z;
  6038. end;
  6039. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6040. end;
  6041. if ( aExp = 0 ) begin
  6042. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6043. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6044. end;
  6045. if ( bExp = 0 ) begin
  6046. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6047. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6048. end;
  6049. zExp := aExp + bExp - $3FFE;
  6050. mul64To128( aSig, bSig, zSig0, zSig1 );
  6051. if ( 0 < (sbits64) zSig0 ) begin
  6052. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6053. --zExp;
  6054. end;
  6055. result :=
  6056. roundAndPackFloatx80(
  6057. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6058. end;
  6059. {*----------------------------------------------------------------------------
  6060. | Returns the result of dividing the extended double-precision floating-point
  6061. | value `a' by the corresponding value `b'. The operation is performed
  6062. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6063. *----------------------------------------------------------------------------*}
  6064. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6065. var
  6066. aSign, bSign, zSign: flag;
  6067. aExp, bExp, zExp: int32;
  6068. aSig, bSig, zSig0, zSig1: bits64;
  6069. rem0, rem1, rem2, term0, term1, term2: bits64;
  6070. z: floatx80;
  6071. begin
  6072. aSig := extractFloatx80Frac( a );
  6073. aExp := extractFloatx80Exp( a );
  6074. aSign := extractFloatx80Sign( a );
  6075. bSig := extractFloatx80Frac( b );
  6076. bExp := extractFloatx80Exp( b );
  6077. bSign := extractFloatx80Sign( b );
  6078. zSign := aSign xor bSign;
  6079. if ( aExp = $7FFF ) begin
  6080. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6081. if ( bExp = $7FFF ) begin
  6082. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6083. goto invalid;
  6084. end;
  6085. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6086. end;
  6087. if ( bExp = $7FFF ) begin
  6088. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6089. result := packFloatx80( zSign, 0, 0 );
  6090. end;
  6091. if ( bExp = 0 ) begin
  6092. if ( bSig = 0 ) begin
  6093. if ( ( aExp or aSig ) = 0 ) begin
  6094. invalid:
  6095. float_raise( float_flag_invalid );
  6096. z.low := floatx80_default_nan_low;
  6097. z.high := floatx80_default_nan_high;
  6098. result := z;
  6099. end;
  6100. float_raise( float_flag_divbyzero );
  6101. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6102. end;
  6103. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6104. end;
  6105. if ( aExp = 0 ) begin
  6106. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6107. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6108. end;
  6109. zExp := aExp - bExp + $3FFE;
  6110. rem1 := 0;
  6111. if ( bSig <= aSig ) begin
  6112. shift128Right( aSig, 0, 1, aSig, rem1 );
  6113. ++zExp;
  6114. end;
  6115. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6116. mul64To128( bSig, zSig0, term0, term1 );
  6117. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6118. while ( (sbits64) rem0 < 0 ) begin
  6119. --zSig0;
  6120. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6121. end;
  6122. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6123. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6124. mul64To128( bSig, zSig1, term1, term2 );
  6125. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6126. while ( (sbits64) rem1 < 0 ) begin
  6127. --zSig1;
  6128. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6129. end;
  6130. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6131. end;
  6132. result :=
  6133. roundAndPackFloatx80(
  6134. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6135. end;
  6136. {*----------------------------------------------------------------------------
  6137. | Returns the remainder of the extended double-precision floating-point value
  6138. | `a' with respect to the corresponding value `b'. The operation is performed
  6139. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6140. *----------------------------------------------------------------------------*}
  6141. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6142. var
  6143. aSign, bSign, zSign: flag;
  6144. aExp, bExp, expDiff: int32;
  6145. aSig0, aSig1, bSig: bits64;
  6146. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6147. z: floatx80;
  6148. begin
  6149. aSig0 := extractFloatx80Frac( a );
  6150. aExp := extractFloatx80Exp( a );
  6151. aSign := extractFloatx80Sign( a );
  6152. bSig := extractFloatx80Frac( b );
  6153. bExp := extractFloatx80Exp( b );
  6154. bSign := extractFloatx80Sign( b );
  6155. if ( aExp = $7FFF ) begin
  6156. if ( (bits64) ( aSig0 shl 1 )
  6157. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6158. result := propagateFloatx80NaN( a, b );
  6159. end;
  6160. goto invalid;
  6161. end;
  6162. if ( bExp = $7FFF ) begin
  6163. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6164. result := a;
  6165. end;
  6166. if ( bExp = 0 ) begin
  6167. if ( bSig = 0 ) begin
  6168. invalid:
  6169. float_raise( float_flag_invalid );
  6170. z.low := floatx80_default_nan_low;
  6171. z.high := floatx80_default_nan_high;
  6172. result := z;
  6173. end;
  6174. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6175. end;
  6176. if ( aExp = 0 ) begin
  6177. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6178. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6179. end;
  6180. bSig or= LIT64( $8000000000000000 );
  6181. zSign := aSign;
  6182. expDiff := aExp - bExp;
  6183. aSig1 := 0;
  6184. if ( expDiff < 0 ) begin
  6185. if ( expDiff < -1 ) result := a;
  6186. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6187. expDiff := 0;
  6188. end;
  6189. q := ( bSig <= aSig0 );
  6190. if ( q ) aSig0 -= bSig;
  6191. expDiff -= 64;
  6192. while ( 0 < expDiff ) begin
  6193. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6194. q := ( 2 < q ) ? q - 2 : 0;
  6195. mul64To128( bSig, q, term0, term1 );
  6196. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6197. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6198. expDiff -= 62;
  6199. end;
  6200. expDiff += 64;
  6201. if ( 0 < expDiff ) begin
  6202. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6203. q := ( 2 < q ) ? q - 2 : 0;
  6204. q >>= 64 - expDiff;
  6205. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6206. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6207. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6208. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6209. ++q;
  6210. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6211. end;
  6212. end;
  6213. else begin
  6214. term1 := 0;
  6215. term0 := bSig;
  6216. end;
  6217. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6218. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6219. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6220. and ( q and 1 ) )
  6221. ) begin
  6222. aSig0 := alternateASig0;
  6223. aSig1 := alternateASig1;
  6224. zSign := ! zSign;
  6225. end;
  6226. result :=
  6227. normalizeRoundAndPackFloatx80(
  6228. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6229. end;
  6230. {*----------------------------------------------------------------------------
  6231. | Returns the square root of the extended double-precision floating-point
  6232. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6233. | for Binary Floating-Point Arithmetic.
  6234. *----------------------------------------------------------------------------*}
  6235. function floatx80_sqrt(a: floatx80): floatx80;
  6236. var
  6237. aSign: flag;
  6238. aExp, zExp: int32;
  6239. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6240. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6241. z: floatx80;
  6242. label
  6243. invalid;
  6244. begin
  6245. aSig0 := extractFloatx80Frac( a );
  6246. aExp := extractFloatx80Exp( a );
  6247. aSign := extractFloatx80Sign( a );
  6248. if ( aExp = $7FFF ) begin
  6249. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6250. if ( ! aSign ) result := a;
  6251. goto invalid;
  6252. end;
  6253. if ( aSign ) begin
  6254. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6255. invalid:
  6256. float_raise( float_flag_invalid );
  6257. z.low := floatx80_default_nan_low;
  6258. z.high := floatx80_default_nan_high;
  6259. result := z;
  6260. end;
  6261. if ( aExp = 0 ) begin
  6262. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6263. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6264. end;
  6265. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6266. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6267. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6268. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6269. doubleZSig0 := zSig0 shl 1;
  6270. mul64To128( zSig0, zSig0, term0, term1 );
  6271. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6272. while ( (sbits64) rem0 < 0 ) begin
  6273. --zSig0;
  6274. doubleZSig0 -= 2;
  6275. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6276. end;
  6277. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6278. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6279. if ( zSig1 = 0 ) zSig1 := 1;
  6280. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6281. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6282. mul64To128( zSig1, zSig1, term2, term3 );
  6283. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6284. while ( (sbits64) rem1 < 0 ) begin
  6285. --zSig1;
  6286. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6287. term3 or= 1;
  6288. term2 or= doubleZSig0;
  6289. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6290. end;
  6291. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6292. end;
  6293. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6294. zSig0 or= doubleZSig0;
  6295. result :=
  6296. roundAndPackFloatx80(
  6297. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6298. end;
  6299. {*----------------------------------------------------------------------------
  6300. | Returns 1 if the extended double-precision floating-point value `a' is
  6301. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6302. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6303. | Arithmetic.
  6304. *----------------------------------------------------------------------------*}
  6305. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6306. begin
  6307. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6308. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6309. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6310. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6311. ) begin
  6312. if ( floatx80_is_signaling_nan( a )
  6313. or floatx80_is_signaling_nan( b ) ) begin
  6314. float_raise( float_flag_invalid );
  6315. end;
  6316. result := 0;
  6317. end;
  6318. result :=
  6319. ( a.low = b.low )
  6320. and ( ( a.high = b.high )
  6321. or ( ( a.low = 0 )
  6322. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6323. );
  6324. end;
  6325. {*----------------------------------------------------------------------------
  6326. | Returns 1 if the extended double-precision floating-point value `a' is
  6327. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6328. | comparison is performed according to the IEC/IEEE Standard for Binary
  6329. | Floating-Point Arithmetic.
  6330. *----------------------------------------------------------------------------*}
  6331. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6332. var
  6333. aSign, bSign: flag;
  6334. begin
  6335. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6336. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6337. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6338. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6339. ) begin
  6340. float_raise( float_flag_invalid );
  6341. result := 0;
  6342. end;
  6343. aSign := extractFloatx80Sign( a );
  6344. bSign := extractFloatx80Sign( b );
  6345. if ( aSign <> bSign ) begin
  6346. result :=
  6347. aSign
  6348. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6349. = 0 );
  6350. end;
  6351. result :=
  6352. aSign ? le128( b.high, b.low, a.high, a.low )
  6353. : le128( a.high, a.low, b.high, b.low );
  6354. end;
  6355. {*----------------------------------------------------------------------------
  6356. | Returns 1 if the extended double-precision floating-point value `a' is
  6357. | less than the corresponding value `b', and 0 otherwise. The comparison
  6358. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6359. | Arithmetic.
  6360. *----------------------------------------------------------------------------*}
  6361. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6362. var
  6363. aSign, bSign: flag;
  6364. begin
  6365. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6366. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6367. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6368. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6369. ) begin
  6370. float_raise( float_flag_invalid );
  6371. result := 0;
  6372. end;
  6373. aSign := extractFloatx80Sign( a );
  6374. bSign := extractFloatx80Sign( b );
  6375. if ( aSign <> bSign ) begin
  6376. result :=
  6377. aSign
  6378. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6379. <> 0 );
  6380. end;
  6381. result :=
  6382. aSign ? lt128( b.high, b.low, a.high, a.low )
  6383. : lt128( a.high, a.low, b.high, b.low );
  6384. end;
  6385. {*----------------------------------------------------------------------------
  6386. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6387. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6388. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6389. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6390. *----------------------------------------------------------------------------*}
  6391. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6392. begin
  6393. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6394. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6395. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6396. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6397. ) begin
  6398. float_raise( float_flag_invalid );
  6399. result := 0;
  6400. end;
  6401. result :=
  6402. ( a.low = b.low )
  6403. and ( ( a.high = b.high )
  6404. or ( ( a.low = 0 )
  6405. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6406. );
  6407. end;
  6408. {*----------------------------------------------------------------------------
  6409. | Returns 1 if the extended double-precision floating-point value `a' is less
  6410. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6411. | do not cause an exception. Otherwise, the comparison is performed according
  6412. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6413. *----------------------------------------------------------------------------*}
  6414. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6415. var
  6416. aSign, bSign: flag;
  6417. begin
  6418. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6419. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6420. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6421. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6422. ) begin
  6423. if ( floatx80_is_signaling_nan( a )
  6424. or floatx80_is_signaling_nan( b ) ) begin
  6425. float_raise( float_flag_invalid );
  6426. end;
  6427. result := 0;
  6428. end;
  6429. aSign := extractFloatx80Sign( a );
  6430. bSign := extractFloatx80Sign( b );
  6431. if ( aSign <> bSign ) begin
  6432. result :=
  6433. aSign
  6434. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6435. = 0 );
  6436. end;
  6437. result :=
  6438. aSign ? le128( b.high, b.low, a.high, a.low )
  6439. : le128( a.high, a.low, b.high, b.low );
  6440. end;
  6441. {*----------------------------------------------------------------------------
  6442. | Returns 1 if the extended double-precision floating-point value `a' is less
  6443. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6444. | an exception. Otherwise, the comparison is performed according to the
  6445. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6446. *----------------------------------------------------------------------------*}
  6447. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6448. var
  6449. aSign, bSign: flag;
  6450. begin
  6451. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6452. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6453. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6454. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6455. ) begin
  6456. if ( floatx80_is_signaling_nan( a )
  6457. or floatx80_is_signaling_nan( b ) ) begin
  6458. float_raise( float_flag_invalid );
  6459. end;
  6460. result := 0;
  6461. end;
  6462. aSign := extractFloatx80Sign( a );
  6463. bSign := extractFloatx80Sign( b );
  6464. if ( aSign <> bSign ) begin
  6465. result :=
  6466. aSign
  6467. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6468. <> 0 );
  6469. end;
  6470. result :=
  6471. aSign ? lt128( b.high, b.low, a.high, a.low )
  6472. : lt128( a.high, a.low, b.high, b.low );
  6473. end;
  6474. {$endif FPC_SOFTFLOAT_FLOATX80}
  6475. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6476. {*----------------------------------------------------------------------------
  6477. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6478. | floating-point value `a'.
  6479. *----------------------------------------------------------------------------*}
  6480. function extractFloat128Frac1(a : float128): bits64;
  6481. begin
  6482. result:=a.low;
  6483. end;
  6484. {*----------------------------------------------------------------------------
  6485. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6486. | floating-point value `a'.
  6487. *----------------------------------------------------------------------------*}
  6488. function extractFloat128Frac0(a : float128): bits64;
  6489. begin
  6490. result:=a.high and int64($0000FFFFFFFFFFFF);
  6491. end;
  6492. {*----------------------------------------------------------------------------
  6493. | Returns the exponent bits of the quadruple-precision floating-point value
  6494. | `a'.
  6495. *----------------------------------------------------------------------------*}
  6496. function extractFloat128Exp(a : float128): int32;
  6497. begin
  6498. result:=( a.high shr 48 ) and $7FFF;
  6499. end;
  6500. {*----------------------------------------------------------------------------
  6501. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6502. *----------------------------------------------------------------------------*}
  6503. function extractFloat128Sign(a : float128): flag;
  6504. begin
  6505. result:=a.high shr 63;
  6506. end;
  6507. {*----------------------------------------------------------------------------
  6508. | Normalizes the subnormal quadruple-precision floating-point value
  6509. | represented by the denormalized significand formed by the concatenation of
  6510. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6511. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6512. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6513. | least significant 64 bits of the normalized significand are stored at the
  6514. | location pointed to by `zSig1Ptr'.
  6515. *----------------------------------------------------------------------------*}
  6516. procedure normalizeFloat128Subnormal(
  6517. aSig0: bits64;
  6518. aSig1: bits64;
  6519. var zExpPtr: int32;
  6520. var zSig0Ptr: bits64;
  6521. var zSig1Ptr: bits64);
  6522. var
  6523. shiftCount: int8;
  6524. begin
  6525. if ( aSig0 = 0 ) then
  6526. begin
  6527. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6528. if ( shiftCount < 0 ) then
  6529. begin
  6530. zSig0Ptr := aSig1 shr ( - shiftCount );
  6531. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6532. end
  6533. else begin
  6534. zSig0Ptr := aSig1 shl shiftCount;
  6535. zSig1Ptr := 0;
  6536. end;
  6537. zExpPtr := - shiftCount - 63;
  6538. end
  6539. else begin
  6540. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6541. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6542. zExpPtr := 1 - shiftCount;
  6543. end;
  6544. end;
  6545. {*----------------------------------------------------------------------------
  6546. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6547. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6548. | floating-point value, returning the result. After being shifted into the
  6549. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6550. | added together to form the most significant 32 bits of the result. This
  6551. | means that any integer portion of `zSig0' will be added into the exponent.
  6552. | Since a properly normalized significand will have an integer portion equal
  6553. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6554. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6555. | significand.
  6556. *----------------------------------------------------------------------------*}
  6557. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6558. var
  6559. z: float128;
  6560. begin
  6561. z.low := zSig1;
  6562. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6563. result:=z;
  6564. end;
  6565. {*----------------------------------------------------------------------------
  6566. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6567. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6568. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6569. | corresponding to the abstract input. Ordinarily, the abstract value is
  6570. | simply rounded and packed into the quadruple-precision format, with the
  6571. | inexact exception raised if the abstract input cannot be represented
  6572. | exactly. However, if the abstract value is too large, the overflow and
  6573. | inexact exceptions are raised and an infinity or maximal finite value is
  6574. | returned. If the abstract value is too small, the input value is rounded to
  6575. | a subnormal number, and the underflow and inexact exceptions are raised if
  6576. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6577. | precision floating-point number.
  6578. | The input significand must be normalized or smaller. If the input
  6579. | significand is not normalized, `zExp' must be 0; in that case, the result
  6580. | returned is a subnormal number, and it must not require rounding. In the
  6581. | usual case that the input significand is normalized, `zExp' must be 1 less
  6582. | than the ``true'' floating-point exponent. The handling of underflow and
  6583. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6584. *----------------------------------------------------------------------------*}
  6585. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6586. var
  6587. roundingMode: int8;
  6588. roundNearestEven, increment, isTiny: flag;
  6589. begin
  6590. roundingMode := float_rounding_mode;
  6591. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6592. increment := ord( sbits64(zSig2) < 0 );
  6593. if ( roundNearestEven=0 ) then
  6594. begin
  6595. if ( roundingMode = float_round_to_zero ) then
  6596. begin
  6597. increment := 0;
  6598. end
  6599. else begin
  6600. if ( zSign<>0 ) then
  6601. begin
  6602. increment := ord( roundingMode = float_round_down ) and zSig2;
  6603. end
  6604. else begin
  6605. increment := ord( roundingMode = float_round_up ) and zSig2;
  6606. end;
  6607. end;
  6608. end;
  6609. if ( $7FFD <= bits32(zExp) ) then
  6610. begin
  6611. if ( ord( $7FFD < zExp )
  6612. or ( ord( zExp = $7FFD )
  6613. and eq128(
  6614. int64( $0001FFFFFFFFFFFF ),
  6615. int64( $FFFFFFFFFFFFFFFF ),
  6616. zSig0,
  6617. zSig1
  6618. )
  6619. and increment
  6620. )
  6621. )<>0 then
  6622. begin
  6623. float_raise( float_flag_overflow or float_flag_inexact );
  6624. if ( ord( roundingMode = float_round_to_zero )
  6625. or ( zSign and ord( roundingMode = float_round_up ) )
  6626. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6627. )<>0 then
  6628. begin
  6629. result :=
  6630. packFloat128(
  6631. zSign,
  6632. $7FFE,
  6633. int64( $0000FFFFFFFFFFFF ),
  6634. int64( $FFFFFFFFFFFFFFFF )
  6635. );
  6636. end;
  6637. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6638. end;
  6639. if ( zExp < 0 ) then
  6640. begin
  6641. isTiny :=
  6642. ord(( float_detect_tininess = float_tininess_before_rounding )
  6643. or ( zExp < -1 )
  6644. or not( increment<>0 )
  6645. or boolean(lt128(
  6646. zSig0,
  6647. zSig1,
  6648. int64( $0001FFFFFFFFFFFF ),
  6649. int64( $FFFFFFFFFFFFFFFF )
  6650. )));
  6651. shift128ExtraRightJamming(
  6652. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6653. zExp := 0;
  6654. if ( isTiny and zSig2 )<>0 then
  6655. float_raise( float_flag_underflow );
  6656. if ( roundNearestEven<>0 ) then
  6657. begin
  6658. increment := ord( sbits64(zSig2) < 0 );
  6659. end
  6660. else begin
  6661. if ( zSign<>0 ) then
  6662. begin
  6663. increment := ord( roundingMode = float_round_down ) and zSig2;
  6664. end
  6665. else begin
  6666. increment := ord( roundingMode = float_round_up ) and zSig2;
  6667. end;
  6668. end;
  6669. end;
  6670. end;
  6671. if ( zSig2<>0 ) then
  6672. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6673. if ( increment<>0 ) then
  6674. begin
  6675. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6676. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6677. end
  6678. else begin
  6679. if ( ( zSig0 or zSig1 ) = 0 ) then
  6680. zExp := 0;
  6681. end;
  6682. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6683. end;
  6684. {*----------------------------------------------------------------------------
  6685. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6686. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6687. | returns the proper quadruple-precision floating-point value corresponding
  6688. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6689. | except that the input significand has fewer bits and does not have to be
  6690. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6691. | point exponent.
  6692. *----------------------------------------------------------------------------*}
  6693. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6694. var
  6695. shiftCount: int8;
  6696. zSig2: bits64;
  6697. begin
  6698. if ( zSig0 = 0 ) then
  6699. begin
  6700. zSig0 := zSig1;
  6701. zSig1 := 0;
  6702. dec(zExp, 64);
  6703. end;
  6704. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6705. if ( 0 <= shiftCount ) then
  6706. begin
  6707. zSig2 := 0;
  6708. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6709. end
  6710. else begin
  6711. shift128ExtraRightJamming(
  6712. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6713. end;
  6714. dec(zExp, shiftCount);
  6715. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6716. end;
  6717. {*----------------------------------------------------------------------------
  6718. | Returns the result of converting the quadruple-precision floating-point
  6719. | value `a' to the 32-bit two's complement integer format. The conversion
  6720. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6721. | Arithmetic---which means in particular that the conversion is rounded
  6722. | according to the current rounding mode. If `a' is a NaN, the largest
  6723. | positive integer is returned. Otherwise, if the conversion overflows, the
  6724. | largest integer with the same sign as `a' is returned.
  6725. *----------------------------------------------------------------------------*}
  6726. function float128_to_int32(a: float128): int32;
  6727. var
  6728. aSign: flag;
  6729. aExp, shiftCount: int32;
  6730. aSig0, aSig1: bits64;
  6731. begin
  6732. aSig1 := extractFloat128Frac1( a );
  6733. aSig0 := extractFloat128Frac0( a );
  6734. aExp := extractFloat128Exp( a );
  6735. aSign := extractFloat128Sign( a );
  6736. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6737. aSign := 0;
  6738. if ( aExp<>0 ) then
  6739. aSig0 := aSig0 or int64( $0001000000000000 );
  6740. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6741. shiftCount := $4028 - aExp;
  6742. if ( 0 < shiftCount ) then
  6743. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6744. result := roundAndPackInt32( aSign, aSig0 );
  6745. end;
  6746. {*----------------------------------------------------------------------------
  6747. | Returns the result of converting the quadruple-precision floating-point
  6748. | value `a' to the 32-bit two's complement integer format. The conversion
  6749. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6750. | Arithmetic, except that the conversion is always rounded toward zero. If
  6751. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6752. | conversion overflows, the largest integer with the same sign as `a' is
  6753. | returned.
  6754. *----------------------------------------------------------------------------*}
  6755. function float128_to_int32_round_to_zero(a: float128): int32;
  6756. var
  6757. aSign: flag;
  6758. aExp, shiftCount: int32;
  6759. aSig0, aSig1, savedASig: bits64;
  6760. z: int32;
  6761. label
  6762. invalid;
  6763. begin
  6764. aSig1 := extractFloat128Frac1( a );
  6765. aSig0 := extractFloat128Frac0( a );
  6766. aExp := extractFloat128Exp( a );
  6767. aSign := extractFloat128Sign( a );
  6768. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6769. if ( $401E < aExp ) then
  6770. begin
  6771. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6772. aSign := 0;
  6773. goto invalid;
  6774. end
  6775. else if ( aExp < $3FFF ) then
  6776. begin
  6777. if ( aExp or aSig0 )<>0 then
  6778. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6779. result := 0;
  6780. exit;
  6781. end;
  6782. aSig0 := aSig0 or int64( $0001000000000000 );
  6783. shiftCount := $402F - aExp;
  6784. savedASig := aSig0;
  6785. aSig0 := aSig0 shr shiftCount;
  6786. z := aSig0;
  6787. if ( aSign )<>0 then
  6788. z := - z;
  6789. if ( ord( z < 0 ) xor aSign )<>0 then
  6790. begin
  6791. invalid:
  6792. float_raise( float_flag_invalid );
  6793. if aSign<>0 then
  6794. result:=$80000000
  6795. else
  6796. result:=$7FFFFFFF;
  6797. exit;
  6798. end;
  6799. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6800. begin
  6801. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6802. end;
  6803. result := z;
  6804. end;
  6805. {*----------------------------------------------------------------------------
  6806. | Returns the result of converting the quadruple-precision floating-point
  6807. | value `a' to the 64-bit two's complement integer format. The conversion
  6808. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6809. | Arithmetic---which means in particular that the conversion is rounded
  6810. | according to the current rounding mode. If `a' is a NaN, the largest
  6811. | positive integer is returned. Otherwise, if the conversion overflows, the
  6812. | largest integer with the same sign as `a' is returned.
  6813. *----------------------------------------------------------------------------*}
  6814. function float128_to_int64(a: float128): int64;
  6815. var
  6816. aSign: flag;
  6817. aExp, shiftCount: int32;
  6818. aSig0, aSig1: bits64;
  6819. begin
  6820. aSig1 := extractFloat128Frac1( a );
  6821. aSig0 := extractFloat128Frac0( a );
  6822. aExp := extractFloat128Exp( a );
  6823. aSign := extractFloat128Sign( a );
  6824. if ( aExp<>0 ) then
  6825. aSig0 := aSig0 or int64( $0001000000000000 );
  6826. shiftCount := $402F - aExp;
  6827. if ( shiftCount <= 0 ) then
  6828. begin
  6829. if ( $403E < aExp ) then
  6830. begin
  6831. float_raise( float_flag_invalid );
  6832. if ( (aSign=0)
  6833. or ( ( aExp = $7FFF )
  6834. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6835. )
  6836. ) then
  6837. begin
  6838. result := int64( $7FFFFFFFFFFFFFFF );
  6839. end;
  6840. result := int64( $8000000000000000 );
  6841. end;
  6842. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6843. end
  6844. else begin
  6845. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6846. end;
  6847. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6848. end;
  6849. {*----------------------------------------------------------------------------
  6850. | Returns the result of converting the quadruple-precision floating-point
  6851. | value `a' to the 64-bit two's complement integer format. The conversion
  6852. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6853. | Arithmetic, except that the conversion is always rounded toward zero.
  6854. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6855. | the conversion overflows, the largest integer with the same sign as `a' is
  6856. | returned.
  6857. *----------------------------------------------------------------------------*}
  6858. function float128_to_int64_round_to_zero(a: float128): int64;
  6859. var
  6860. aSign: flag;
  6861. aExp, shiftCount: int32;
  6862. aSig0, aSig1: bits64;
  6863. z: int64;
  6864. begin
  6865. aSig1 := extractFloat128Frac1( a );
  6866. aSig0 := extractFloat128Frac0( a );
  6867. aExp := extractFloat128Exp( a );
  6868. aSign := extractFloat128Sign( a );
  6869. if ( aExp<>0 ) then
  6870. aSig0 := aSig0 or int64( $0001000000000000 );
  6871. shiftCount := aExp - $402F;
  6872. if ( 0 < shiftCount ) then
  6873. begin
  6874. if ( $403E <= aExp ) then
  6875. begin
  6876. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6877. if ( ( a.high = int64( $C03E000000000000 ) )
  6878. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6879. begin
  6880. if ( aSig1<>0 ) then
  6881. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6882. end
  6883. else begin
  6884. float_raise( float_flag_invalid );
  6885. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6886. begin
  6887. result := int64( $7FFFFFFFFFFFFFFF );
  6888. exit;
  6889. end;
  6890. end;
  6891. result := int64( $8000000000000000 );
  6892. exit;
  6893. end;
  6894. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6895. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6896. begin
  6897. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6898. end;
  6899. end
  6900. else begin
  6901. if ( aExp < $3FFF ) then
  6902. begin
  6903. if ( aExp or aSig0 or aSig1 )<>0 then
  6904. begin
  6905. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6906. end;
  6907. result := 0;
  6908. exit;
  6909. end;
  6910. z := aSig0 shr ( - shiftCount );
  6911. if ( (aSig1<>0)
  6912. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6913. begin
  6914. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6915. end;
  6916. end;
  6917. if ( aSign<>0 ) then
  6918. z := - z;
  6919. result := z;
  6920. end;
  6921. {*----------------------------------------------------------------------------
  6922. | Returns the result of converting the quadruple-precision floating-point
  6923. | value `a' to the single-precision floating-point format. The conversion
  6924. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6925. | Arithmetic.
  6926. *----------------------------------------------------------------------------*}
  6927. function float128_to_float32(a: float128): float32;
  6928. var
  6929. aSign: flag;
  6930. aExp: int32;
  6931. aSig0, aSig1: bits64;
  6932. zSig: bits32;
  6933. begin
  6934. aSig1 := extractFloat128Frac1( a );
  6935. aSig0 := extractFloat128Frac0( a );
  6936. aExp := extractFloat128Exp( a );
  6937. aSign := extractFloat128Sign( a );
  6938. if ( aExp = $7FFF ) then
  6939. begin
  6940. if ( aSig0 or aSig1 )<>0 then
  6941. begin
  6942. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6943. exit;
  6944. end;
  6945. result := packFloat32( aSign, $FF, 0 );
  6946. exit;
  6947. end;
  6948. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6949. shift64RightJamming( aSig0, 18, aSig0 );
  6950. zSig := aSig0;
  6951. if ( aExp or zSig )<>0 then
  6952. begin
  6953. zSig := zSig or $40000000;
  6954. dec(aExp,$3F81);
  6955. end;
  6956. result := roundAndPackFloat32( aSign, aExp, zSig );
  6957. end;
  6958. {*----------------------------------------------------------------------------
  6959. | Returns the result of converting the quadruple-precision floating-point
  6960. | value `a' to the double-precision floating-point format. The conversion
  6961. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6962. | Arithmetic.
  6963. *----------------------------------------------------------------------------*}
  6964. function float128_to_float64(a: float128): float64;
  6965. var
  6966. aSign: flag;
  6967. aExp: int32;
  6968. aSig0, aSig1: bits64;
  6969. begin
  6970. aSig1 := extractFloat128Frac1( a );
  6971. aSig0 := extractFloat128Frac0( a );
  6972. aExp := extractFloat128Exp( a );
  6973. aSign := extractFloat128Sign( a );
  6974. if ( aExp = $7FFF ) then
  6975. begin
  6976. if ( aSig0 or aSig1 )<>0 then
  6977. begin
  6978. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  6979. exit;
  6980. end;
  6981. result:=packFloat64( aSign, $7FF, 0);
  6982. exit;
  6983. end;
  6984. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  6985. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6986. if ( aExp or aSig0 )<>0 then
  6987. begin
  6988. aSig0 := aSig0 or int64( $4000000000000000 );
  6989. dec(aExp,$3C01);
  6990. end;
  6991. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  6992. end;
  6993. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  6994. {*----------------------------------------------------------------------------
  6995. | Returns the result of converting the quadruple-precision floating-point
  6996. | value `a' to the extended double-precision floating-point format. The
  6997. | conversion is performed according to the IEC/IEEE Standard for Binary
  6998. | Floating-Point Arithmetic.
  6999. *----------------------------------------------------------------------------*}
  7000. function float128_to_floatx80(a: float128): floatx80;
  7001. var
  7002. aSign: flag;
  7003. aExp: int32;
  7004. aSig0, aSig1: bits64;
  7005. begin
  7006. aSig1 := extractFloat128Frac1( a );
  7007. aSig0 := extractFloat128Frac0( a );
  7008. aExp := extractFloat128Exp( a );
  7009. aSign := extractFloat128Sign( a );
  7010. if ( aExp = $7FFF ) begin
  7011. if ( aSig0 or aSig1 ) begin
  7012. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7013. exit;
  7014. end;
  7015. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  7016. exit;
  7017. end;
  7018. if ( aExp = 0 ) begin
  7019. if ( ( aSig0 or aSig1 ) = 0 ) then
  7020. begin
  7021. result := packFloatx80( aSign, 0, 0 );
  7022. exit;
  7023. end;
  7024. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7025. end;
  7026. else begin
  7027. aSig0 or= int64( $0001000000000000 );
  7028. end;
  7029. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7030. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7031. end;
  7032. {$endif FPC_SOFTFLOAT_FLOATX80}
  7033. {*----------------------------------------------------------------------------
  7034. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7035. | Returns the result as a quadruple-precision floating-point value. The
  7036. | operation is performed according to the IEC/IEEE Standard for Binary
  7037. | Floating-Point Arithmetic.
  7038. *----------------------------------------------------------------------------*}
  7039. function float128_round_to_int(a: float128): float128;
  7040. var
  7041. aSign: flag;
  7042. aExp: int32;
  7043. lastBitMask, roundBitsMask: bits64;
  7044. roundingMode: int8;
  7045. z: float128;
  7046. begin
  7047. aExp := extractFloat128Exp( a );
  7048. if ( $402F <= aExp ) then
  7049. begin
  7050. if ( $406F <= aExp ) then
  7051. begin
  7052. if ( ( aExp = $7FFF )
  7053. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7054. ) then
  7055. begin
  7056. result := propagateFloat128NaN( a, a );
  7057. exit;
  7058. end;
  7059. result := a;
  7060. exit;
  7061. end;
  7062. lastBitMask := 1;
  7063. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7064. roundBitsMask := lastBitMask - 1;
  7065. z := a;
  7066. roundingMode := float_rounding_mode;
  7067. if ( roundingMode = float_round_nearest_even ) then
  7068. begin
  7069. if ( lastBitMask )<>0 then
  7070. begin
  7071. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7072. if ( ( z.low and roundBitsMask ) = 0 ) then
  7073. z.low := z.low and not(lastBitMask);
  7074. end
  7075. else begin
  7076. if ( sbits64(z.low) < 0 ) then
  7077. begin
  7078. inc(z.high);
  7079. if ( bits64( z.low shl 1 ) = 0 ) then
  7080. z.high := z.high and not(1);
  7081. end;
  7082. end;
  7083. end
  7084. else if ( roundingMode <> float_round_to_zero ) then
  7085. begin
  7086. if ( extractFloat128Sign( z )
  7087. xor ord( roundingMode = float_round_up ) )<>0 then
  7088. begin
  7089. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7090. end;
  7091. end;
  7092. z.low := z.low and not(roundBitsMask);
  7093. end
  7094. else begin
  7095. if ( aExp < $3FFF ) then
  7096. begin
  7097. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7098. begin
  7099. result := a;
  7100. exit;
  7101. end;
  7102. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7103. aSign := extractFloat128Sign( a );
  7104. case float_rounding_mode of
  7105. float_round_nearest_even:
  7106. if ( ( aExp = $3FFE )
  7107. and ( (extractFloat128Frac0( a )<>0)
  7108. or (extractFloat128Frac1( a )<>0) )
  7109. ) then begin
  7110. begin
  7111. result := packFloat128( aSign, $3FFF, 0, 0 );
  7112. exit;
  7113. end;
  7114. end;
  7115. float_round_down:
  7116. begin
  7117. if aSign<>0 then
  7118. result:=packFloat128( 1, $3FFF, 0, 0 )
  7119. else
  7120. result:=packFloat128( 0, 0, 0, 0 );
  7121. exit;
  7122. end;
  7123. float_round_up:
  7124. begin
  7125. if aSign<>0 then
  7126. result := packFloat128( 1, 0, 0, 0 )
  7127. else
  7128. result:=packFloat128( 0, $3FFF, 0, 0 );
  7129. exit;
  7130. end;
  7131. end;
  7132. result := packFloat128( aSign, 0, 0, 0 );
  7133. exit;
  7134. end;
  7135. lastBitMask := 1;
  7136. lastBitMask := lastBitMask shl ($402F - aExp);
  7137. roundBitsMask := lastBitMask - 1;
  7138. z.low := 0;
  7139. z.high := a.high;
  7140. roundingMode := float_rounding_mode;
  7141. if ( roundingMode = float_round_nearest_even ) then begin
  7142. inc(z.high,lastBitMask shr 1);
  7143. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7144. z.high := z.high and not(lastBitMask);
  7145. end;
  7146. end
  7147. else if ( roundingMode <> float_round_to_zero ) then begin
  7148. if ( (extractFloat128Sign( z )<>0)
  7149. xor ( roundingMode = float_round_up ) ) then begin
  7150. z.high := z.high or ord( a.low <> 0 );
  7151. z.high := z.high+roundBitsMask;
  7152. end;
  7153. end;
  7154. z.high := z.high and not(roundBitsMask);
  7155. end;
  7156. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7157. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7158. end;
  7159. result := z;
  7160. end;
  7161. {*----------------------------------------------------------------------------
  7162. | Returns the result of adding the absolute values of the quadruple-precision
  7163. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7164. | before being returned. `zSign' is ignored if the result is a NaN.
  7165. | The addition is performed according to the IEC/IEEE Standard for Binary
  7166. | Floating-Point Arithmetic.
  7167. *----------------------------------------------------------------------------*}
  7168. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7169. var
  7170. aExp, bExp, zExp: int32;
  7171. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7172. expDiff: int32;
  7173. label
  7174. shiftRight1,roundAndPack;
  7175. begin
  7176. aSig1 := extractFloat128Frac1( a );
  7177. aSig0 := extractFloat128Frac0( a );
  7178. aExp := extractFloat128Exp( a );
  7179. bSig1 := extractFloat128Frac1( b );
  7180. bSig0 := extractFloat128Frac0( b );
  7181. bExp := extractFloat128Exp( b );
  7182. expDiff := aExp - bExp;
  7183. if ( 0 < expDiff ) then begin
  7184. if ( aExp = $7FFF ) then begin
  7185. if ( aSig0 or aSig1 )<>0 then
  7186. begin
  7187. result := propagateFloat128NaN( a, b );
  7188. exit;
  7189. end;
  7190. result := a;
  7191. exit;
  7192. end;
  7193. if ( bExp = 0 ) then begin
  7194. dec(expDiff);
  7195. end
  7196. else begin
  7197. bSig0 := bSig0 or int64( $0001000000000000 );
  7198. end;
  7199. shift128ExtraRightJamming(
  7200. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7201. zExp := aExp;
  7202. end
  7203. else if ( expDiff < 0 ) then begin
  7204. if ( bExp = $7FFF ) then begin
  7205. if ( bSig0 or bSig1 )<>0 then
  7206. begin
  7207. result := propagateFloat128NaN( a, b );
  7208. exit;
  7209. end;
  7210. result := packFloat128( zSign, $7FFF, 0, 0 );
  7211. exit;
  7212. end;
  7213. if ( aExp = 0 ) then begin
  7214. inc(expDiff);
  7215. end
  7216. else begin
  7217. aSig0 := aSig0 or int64( $0001000000000000 );
  7218. end;
  7219. shift128ExtraRightJamming(
  7220. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7221. zExp := bExp;
  7222. end
  7223. else begin
  7224. if ( aExp = $7FFF ) then begin
  7225. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7226. result := propagateFloat128NaN( a, b );
  7227. exit;
  7228. end;
  7229. result := a;
  7230. exit;
  7231. end;
  7232. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7233. if ( aExp = 0 ) then
  7234. begin
  7235. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7236. exit;
  7237. end;
  7238. zSig2 := 0;
  7239. zSig0 := zSig0 or int64( $0002000000000000 );
  7240. zExp := aExp;
  7241. goto shiftRight1;
  7242. end;
  7243. aSig0 := aSig0 or int64( $0001000000000000 );
  7244. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7245. dec(zExp);
  7246. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7247. inc(zExp);
  7248. shiftRight1:
  7249. shift128ExtraRightJamming(
  7250. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7251. roundAndPack:
  7252. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7253. end;
  7254. {*----------------------------------------------------------------------------
  7255. | Returns the result of subtracting the absolute values of the quadruple-
  7256. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7257. | difference is negated before being returned. `zSign' is ignored if the
  7258. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7259. | Standard for Binary Floating-Point Arithmetic.
  7260. *----------------------------------------------------------------------------*}
  7261. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7262. var
  7263. aExp, bExp, zExp: int32;
  7264. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7265. expDiff: int32;
  7266. z: float128;
  7267. label
  7268. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7269. begin
  7270. aSig1 := extractFloat128Frac1( a );
  7271. aSig0 := extractFloat128Frac0( a );
  7272. aExp := extractFloat128Exp( a );
  7273. bSig1 := extractFloat128Frac1( b );
  7274. bSig0 := extractFloat128Frac0( b );
  7275. bExp := extractFloat128Exp( b );
  7276. expDiff := aExp - bExp;
  7277. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7278. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7279. if ( 0 < expDiff ) then goto aExpBigger;
  7280. if ( expDiff < 0 ) then goto bExpBigger;
  7281. if ( aExp = $7FFF ) then begin
  7282. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7283. result := propagateFloat128NaN( a, b );
  7284. exit;
  7285. end;
  7286. float_raise( float_flag_invalid );
  7287. z.low := float128_default_nan_low;
  7288. z.high := float128_default_nan_high;
  7289. result := z;
  7290. exit;
  7291. end;
  7292. if ( aExp = 0 ) then begin
  7293. aExp := 1;
  7294. bExp := 1;
  7295. end;
  7296. if ( bSig0 < aSig0 ) then goto aBigger;
  7297. if ( aSig0 < bSig0 ) then goto bBigger;
  7298. if ( bSig1 < aSig1 ) then goto aBigger;
  7299. if ( aSig1 < bSig1 ) then goto bBigger;
  7300. result := packFloat128( ord(float_rounding_mode = float_round_down), 0, 0, 0 );
  7301. exit;
  7302. bExpBigger:
  7303. if ( bExp = $7FFF ) then begin
  7304. if ( bSig0 or bSig1 )<>0 then
  7305. begin
  7306. result := propagateFloat128NaN( a, b );
  7307. exit;
  7308. end;
  7309. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7310. exit;
  7311. end;
  7312. if ( aExp = 0 ) then begin
  7313. inc(expDiff);
  7314. end
  7315. else begin
  7316. aSig0 := aSig0 or int64( $4000000000000000 );
  7317. end;
  7318. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7319. bSig0 := bSig0 or int64( $4000000000000000 );
  7320. bBigger:
  7321. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7322. zExp := bExp;
  7323. zSign := zSign xor 1;
  7324. goto normalizeRoundAndPack;
  7325. aExpBigger:
  7326. if ( aExp = $7FFF ) then begin
  7327. if ( aSig0 or aSig1 )<>0 then
  7328. begin
  7329. result := propagateFloat128NaN( a, b );
  7330. exit;
  7331. end;
  7332. result := a;
  7333. exit;
  7334. end;
  7335. if ( bExp = 0 ) then begin
  7336. dec(expDiff);
  7337. end
  7338. else begin
  7339. bSig0 := bSig0 or int64( $4000000000000000 );
  7340. end;
  7341. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7342. aSig0 := aSig0 or int64( $4000000000000000 );
  7343. aBigger:
  7344. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7345. zExp := aExp;
  7346. normalizeRoundAndPack:
  7347. dec(zExp);
  7348. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7349. end;
  7350. {*----------------------------------------------------------------------------
  7351. | Returns the result of adding the quadruple-precision floating-point values
  7352. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7353. | for Binary Floating-Point Arithmetic.
  7354. *----------------------------------------------------------------------------*}
  7355. function float128_add(a: float128; b: float128): float128;
  7356. var
  7357. aSign, bSign: flag;
  7358. begin
  7359. aSign := extractFloat128Sign( a );
  7360. bSign := extractFloat128Sign( b );
  7361. if ( aSign = bSign ) then begin
  7362. result := addFloat128Sigs( a, b, aSign );
  7363. end
  7364. else begin
  7365. result := subFloat128Sigs( a, b, aSign );
  7366. end;
  7367. end;
  7368. {*----------------------------------------------------------------------------
  7369. | Returns the result of subtracting the quadruple-precision floating-point
  7370. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7371. | Standard for Binary Floating-Point Arithmetic.
  7372. *----------------------------------------------------------------------------*}
  7373. function float128_sub(a: float128; b: float128): float128;
  7374. var
  7375. aSign, bSign: flag;
  7376. begin
  7377. aSign := extractFloat128Sign( a );
  7378. bSign := extractFloat128Sign( b );
  7379. if ( aSign = bSign ) then begin
  7380. result := subFloat128Sigs( a, b, aSign );
  7381. end
  7382. else begin
  7383. result := addFloat128Sigs( a, b, aSign );
  7384. end;
  7385. end;
  7386. {*----------------------------------------------------------------------------
  7387. | Returns the result of multiplying the quadruple-precision floating-point
  7388. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7389. | Standard for Binary Floating-Point Arithmetic.
  7390. *----------------------------------------------------------------------------*}
  7391. function float128_mul(a: float128; b: float128): float128;
  7392. var
  7393. aSign, bSign, zSign: flag;
  7394. aExp, bExp, zExp: int32;
  7395. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7396. z: float128;
  7397. label
  7398. invalid;
  7399. begin
  7400. aSig1 := extractFloat128Frac1( a );
  7401. aSig0 := extractFloat128Frac0( a );
  7402. aExp := extractFloat128Exp( a );
  7403. aSign := extractFloat128Sign( a );
  7404. bSig1 := extractFloat128Frac1( b );
  7405. bSig0 := extractFloat128Frac0( b );
  7406. bExp := extractFloat128Exp( b );
  7407. bSign := extractFloat128Sign( b );
  7408. zSign := aSign xor bSign;
  7409. if ( aExp = $7FFF ) then begin
  7410. if ( (( aSig0 or aSig1 )<>0)
  7411. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7412. result := propagateFloat128NaN( a, b );
  7413. exit;
  7414. end;
  7415. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7416. result := packFloat128( zSign, $7FFF, 0, 0 );
  7417. exit;
  7418. end;
  7419. if ( bExp = $7FFF ) then begin
  7420. if ( bSig0 or bSig1 )<>0 then
  7421. begin
  7422. result := propagateFloat128NaN( a, b );
  7423. exit;
  7424. end;
  7425. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7426. invalid:
  7427. float_raise( float_flag_invalid );
  7428. z.low := float128_default_nan_low;
  7429. z.high := float128_default_nan_high;
  7430. result := z;
  7431. exit;
  7432. end;
  7433. result := packFloat128( zSign, $7FFF, 0, 0 );
  7434. exit;
  7435. end;
  7436. if ( aExp = 0 ) then begin
  7437. if ( ( aSig0 or aSig1 ) = 0 ) then
  7438. begin
  7439. result := packFloat128( zSign, 0, 0, 0 );
  7440. exit;
  7441. end;
  7442. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7443. end;
  7444. if ( bExp = 0 ) then begin
  7445. if ( ( bSig0 or bSig1 ) = 0 ) then
  7446. begin
  7447. result := packFloat128( zSign, 0, 0, 0 );
  7448. exit;
  7449. end;
  7450. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7451. end;
  7452. zExp := aExp + bExp - $4000;
  7453. aSig0 := aSig0 or int64( $0001000000000000 );
  7454. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7455. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7456. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7457. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7458. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7459. shift128ExtraRightJamming(
  7460. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7461. inc(zExp);
  7462. end;
  7463. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7464. end;
  7465. {*----------------------------------------------------------------------------
  7466. | Returns the result of dividing the quadruple-precision floating-point value
  7467. | `a' by the corresponding value `b'. The operation is performed according to
  7468. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7469. *----------------------------------------------------------------------------*}
  7470. function float128_div(a: float128; b: float128): float128;
  7471. var
  7472. aSign, bSign, zSign: flag;
  7473. aExp, bExp, zExp: int32;
  7474. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7475. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7476. z: float128;
  7477. label
  7478. invalid;
  7479. begin
  7480. aSig1 := extractFloat128Frac1( a );
  7481. aSig0 := extractFloat128Frac0( a );
  7482. aExp := extractFloat128Exp( a );
  7483. aSign := extractFloat128Sign( a );
  7484. bSig1 := extractFloat128Frac1( b );
  7485. bSig0 := extractFloat128Frac0( b );
  7486. bExp := extractFloat128Exp( b );
  7487. bSign := extractFloat128Sign( b );
  7488. zSign := aSign xor bSign;
  7489. if ( aExp = $7FFF ) then begin
  7490. if ( aSig0 or aSig1 )<>0 then
  7491. begin
  7492. result := propagateFloat128NaN( a, b );
  7493. exit;
  7494. end;
  7495. if ( bExp = $7FFF ) then begin
  7496. if ( bSig0 or bSig1 )<>0 then
  7497. begin
  7498. result := propagateFloat128NaN( a, b );
  7499. exit;
  7500. end;
  7501. goto invalid;
  7502. end;
  7503. result := packFloat128( zSign, $7FFF, 0, 0 );
  7504. exit;
  7505. end;
  7506. if ( bExp = $7FFF ) then begin
  7507. if ( bSig0 or bSig1 )<>0 then
  7508. begin
  7509. result := propagateFloat128NaN( a, b );
  7510. exit;
  7511. end;
  7512. result := packFloat128( zSign, 0, 0, 0 );
  7513. exit;
  7514. end;
  7515. if ( bExp = 0 ) then begin
  7516. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7517. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7518. invalid:
  7519. float_raise( float_flag_invalid );
  7520. z.low := float128_default_nan_low;
  7521. z.high := float128_default_nan_high;
  7522. result := z;
  7523. exit;
  7524. end;
  7525. float_raise( float_flag_divbyzero );
  7526. result := packFloat128( zSign, $7FFF, 0, 0 );
  7527. exit;
  7528. end;
  7529. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7530. end;
  7531. if ( aExp = 0 ) then begin
  7532. if ( ( aSig0 or aSig1 ) = 0 ) then
  7533. begin
  7534. result := packFloat128( zSign, 0, 0, 0 );
  7535. exit;
  7536. end;
  7537. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7538. end;
  7539. zExp := aExp - bExp + $3FFD;
  7540. shortShift128Left(
  7541. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7542. shortShift128Left(
  7543. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7544. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7545. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7546. inc(zExp);
  7547. end;
  7548. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7549. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7550. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7551. while ( sbits64(rem0) < 0 ) do begin
  7552. dec(zSig0);
  7553. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7554. end;
  7555. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7556. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7557. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7558. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7559. while ( sbits64(rem1) < 0 ) do begin
  7560. dec(zSig1);
  7561. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7562. end;
  7563. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7564. end;
  7565. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7566. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7567. end;
  7568. {*----------------------------------------------------------------------------
  7569. | Returns the remainder of the quadruple-precision floating-point value `a'
  7570. | with respect to the corresponding value `b'. The operation is performed
  7571. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7572. *----------------------------------------------------------------------------*}
  7573. function float128_rem(a: float128; b: float128): float128;
  7574. var
  7575. aSign, bSign, zSign: flag;
  7576. aExp, bExp, expDiff: int32;
  7577. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7578. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7579. sigMean0: sbits64;
  7580. z: float128;
  7581. label
  7582. invalid;
  7583. begin
  7584. aSig1 := extractFloat128Frac1( a );
  7585. aSig0 := extractFloat128Frac0( a );
  7586. aExp := extractFloat128Exp( a );
  7587. aSign := extractFloat128Sign( a );
  7588. bSig1 := extractFloat128Frac1( b );
  7589. bSig0 := extractFloat128Frac0( b );
  7590. bExp := extractFloat128Exp( b );
  7591. bSign := extractFloat128Sign( b );
  7592. if ( aExp = $7FFF ) then begin
  7593. if ( (( aSig0 or aSig1 )<>0)
  7594. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7595. result := propagateFloat128NaN( a, b );
  7596. exit;
  7597. end;
  7598. goto invalid;
  7599. end;
  7600. if ( bExp = $7FFF ) then begin
  7601. if ( bSig0 or bSig1 )<>0 then
  7602. begin
  7603. result := propagateFloat128NaN( a, b );
  7604. exit;
  7605. end;
  7606. result := a;
  7607. exit;
  7608. end;
  7609. if ( bExp = 0 ) then begin
  7610. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7611. invalid:
  7612. float_raise( float_flag_invalid );
  7613. z.low := float128_default_nan_low;
  7614. z.high := float128_default_nan_high;
  7615. result := z;
  7616. exit;
  7617. end;
  7618. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7619. end;
  7620. if ( aExp = 0 ) then begin
  7621. if ( ( aSig0 or aSig1 ) = 0 ) then
  7622. begin
  7623. result := a;
  7624. exit;
  7625. end;
  7626. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7627. end;
  7628. expDiff := aExp - bExp;
  7629. if ( expDiff < -1 ) then
  7630. begin
  7631. result := a;
  7632. exit;
  7633. end;
  7634. shortShift128Left(
  7635. aSig0 or int64( $0001000000000000 ),
  7636. aSig1,
  7637. 15 - ord( expDiff < 0 ),
  7638. aSig0,
  7639. aSig1
  7640. );
  7641. shortShift128Left(
  7642. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7643. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7644. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7645. dec(expDiff,64);
  7646. while ( 0 < expDiff ) do begin
  7647. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7648. if ( 4 < q ) then
  7649. q := q - 4
  7650. else
  7651. q := 0;
  7652. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7653. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7654. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7655. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7656. dec(expDiff,61);
  7657. end;
  7658. if ( -64 < expDiff ) then begin
  7659. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7660. if ( 4 < q ) then
  7661. q := q - 4
  7662. else
  7663. q := 0;
  7664. q := q shr (- expDiff);
  7665. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7666. inc(expDiff,52);
  7667. if ( expDiff < 0 ) then begin
  7668. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7669. end
  7670. else begin
  7671. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7672. end;
  7673. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7674. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7675. end
  7676. else begin
  7677. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7678. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7679. end;
  7680. repeat
  7681. alternateASig0 := aSig0;
  7682. alternateASig1 := aSig1;
  7683. inc(q);
  7684. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7685. until not( 0 <= sbits64(aSig0) );
  7686. add128(
  7687. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7688. if ( ( sigMean0 < 0 )
  7689. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7690. aSig0 := alternateASig0;
  7691. aSig1 := alternateASig1;
  7692. end;
  7693. zSign := ord( sbits64(aSig0) < 0 );
  7694. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7695. result :=
  7696. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7697. end;
  7698. {*----------------------------------------------------------------------------
  7699. | Returns the square root of the quadruple-precision floating-point value `a'.
  7700. | The operation is performed according to the IEC/IEEE Standard for Binary
  7701. | Floating-Point Arithmetic.
  7702. *----------------------------------------------------------------------------*}
  7703. function float128_sqrt(a: float128): float128;
  7704. var
  7705. aSign: flag;
  7706. aExp, zExp: int32;
  7707. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7708. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7709. z: float128;
  7710. label
  7711. invalid;
  7712. begin
  7713. aSig1 := extractFloat128Frac1( a );
  7714. aSig0 := extractFloat128Frac0( a );
  7715. aExp := extractFloat128Exp( a );
  7716. aSign := extractFloat128Sign( a );
  7717. if ( aExp = $7FFF ) then begin
  7718. if ( aSig0 or aSig1 )<>0 then
  7719. begin
  7720. result := propagateFloat128NaN( a, a );
  7721. exit;
  7722. end;
  7723. if ( aSign=0 ) then
  7724. begin
  7725. result := a;
  7726. exit;
  7727. end;
  7728. goto invalid;
  7729. end;
  7730. if ( aSign<>0 ) then begin
  7731. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7732. begin
  7733. result := a;
  7734. exit;
  7735. end;
  7736. invalid:
  7737. float_raise( float_flag_invalid );
  7738. z.low := float128_default_nan_low;
  7739. z.high := float128_default_nan_high;
  7740. result := z;
  7741. exit;
  7742. end;
  7743. if ( aExp = 0 ) then begin
  7744. if ( ( aSig0 or aSig1 ) = 0 ) then
  7745. begin
  7746. result := packFloat128( 0, 0, 0, 0 );
  7747. exit;
  7748. end;
  7749. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7750. end;
  7751. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7752. aSig0 := aSig0 or int64( $0001000000000000 );
  7753. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7754. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7755. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7756. doubleZSig0 := zSig0 shl 1;
  7757. mul64To128( zSig0, zSig0, term0, term1 );
  7758. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7759. while ( sbits64(rem0) < 0 ) do begin
  7760. dec(zSig0);
  7761. dec(doubleZSig0,2);
  7762. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7763. end;
  7764. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7765. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7766. if ( zSig1 = 0 ) then zSig1 := 1;
  7767. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7768. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7769. mul64To128( zSig1, zSig1, term2, term3 );
  7770. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7771. while ( sbits64(rem1) < 0 ) do begin
  7772. dec(zSig1);
  7773. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7774. term3 := term3 or 1;
  7775. term2 := term2 or doubleZSig0;
  7776. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7777. end;
  7778. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7779. end;
  7780. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7781. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7782. end;
  7783. {*----------------------------------------------------------------------------
  7784. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7785. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7786. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7787. *----------------------------------------------------------------------------*}
  7788. function float128_eq(a: float128; b: float128): flag;
  7789. begin
  7790. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7791. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7792. or ( ( extractFloat128Exp( b ) = $7FFF )
  7793. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7794. ) then begin
  7795. if ( (float128_is_signaling_nan( a )<>0)
  7796. or (float128_is_signaling_nan( b )<>0) ) then begin
  7797. float_raise( float_flag_invalid );
  7798. end;
  7799. result := 0;
  7800. exit;
  7801. end;
  7802. result := ord(
  7803. ( a.low = b.low )
  7804. and ( ( a.high = b.high )
  7805. or ( ( a.low = 0 )
  7806. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7807. ));
  7808. end;
  7809. {*----------------------------------------------------------------------------
  7810. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7811. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7812. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7813. | Arithmetic.
  7814. *----------------------------------------------------------------------------*}
  7815. function float128_le(a: float128; b: float128): flag;
  7816. var
  7817. aSign, bSign: flag;
  7818. begin
  7819. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7820. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7821. or ( ( extractFloat128Exp( b ) = $7FFF )
  7822. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7823. ) then begin
  7824. float_raise( float_flag_invalid );
  7825. result := 0;
  7826. exit;
  7827. end;
  7828. aSign := extractFloat128Sign( a );
  7829. bSign := extractFloat128Sign( b );
  7830. if ( aSign <> bSign ) then begin
  7831. result := ord(
  7832. (aSign<>0)
  7833. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7834. = 0 ));
  7835. exit;
  7836. end;
  7837. if aSign<>0 then
  7838. result := le128( b.high, b.low, a.high, a.low )
  7839. else
  7840. result := le128( a.high, a.low, b.high, b.low );
  7841. end;
  7842. {*----------------------------------------------------------------------------
  7843. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7844. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7845. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7846. *----------------------------------------------------------------------------*}
  7847. function float128_lt(a: float128; b: float128): flag;
  7848. var
  7849. aSign, bSign: flag;
  7850. begin
  7851. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7852. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7853. or ( ( extractFloat128Exp( b ) = $7FFF )
  7854. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7855. ) then begin
  7856. float_raise( float_flag_invalid );
  7857. result := 0;
  7858. exit;
  7859. end;
  7860. aSign := extractFloat128Sign( a );
  7861. bSign := extractFloat128Sign( b );
  7862. if ( aSign <> bSign ) then begin
  7863. result := ord(
  7864. (aSign<>0)
  7865. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7866. <> 0 ));
  7867. exit;
  7868. end;
  7869. if aSign<>0 then
  7870. result := lt128( b.high, b.low, a.high, a.low )
  7871. else
  7872. result := lt128( a.high, a.low, b.high, b.low );
  7873. end;
  7874. {*----------------------------------------------------------------------------
  7875. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7876. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7877. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7878. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7879. *----------------------------------------------------------------------------*}
  7880. function float128_eq_signaling(a: float128; b: float128): flag;
  7881. begin
  7882. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7883. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7884. or ( ( extractFloat128Exp( b ) = $7FFF )
  7885. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7886. ) then begin
  7887. float_raise( float_flag_invalid );
  7888. result := 0;
  7889. exit;
  7890. end;
  7891. result := ord(
  7892. ( a.low = b.low )
  7893. and ( ( a.high = b.high )
  7894. or ( ( a.low = 0 )
  7895. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7896. ));
  7897. end;
  7898. {*----------------------------------------------------------------------------
  7899. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7900. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7901. | cause an exception. Otherwise, the comparison is performed according to the
  7902. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7903. *----------------------------------------------------------------------------*}
  7904. function float128_le_quiet(a: float128; b: float128): flag;
  7905. var
  7906. aSign, bSign: flag;
  7907. begin
  7908. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7909. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7910. or ( ( extractFloat128Exp( b ) = $7FFF )
  7911. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7912. ) then begin
  7913. if ( (float128_is_signaling_nan( a )<>0)
  7914. or (float128_is_signaling_nan( b )<>0) ) then begin
  7915. float_raise( float_flag_invalid );
  7916. end;
  7917. result := 0;
  7918. exit;
  7919. end;
  7920. aSign := extractFloat128Sign( a );
  7921. bSign := extractFloat128Sign( b );
  7922. if ( aSign <> bSign ) then begin
  7923. result := ord(
  7924. (aSign<>0)
  7925. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7926. = 0 ));
  7927. exit;
  7928. end;
  7929. if aSign<>0 then
  7930. result := le128( b.high, b.low, a.high, a.low )
  7931. else
  7932. result := le128( a.high, a.low, b.high, b.low );
  7933. end;
  7934. {*----------------------------------------------------------------------------
  7935. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7936. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7937. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7938. | Standard for Binary Floating-Point Arithmetic.
  7939. *----------------------------------------------------------------------------*}
  7940. function float128_lt_quiet(a: float128; b: float128): flag;
  7941. var
  7942. aSign, bSign: flag;
  7943. begin
  7944. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7945. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7946. or ( ( extractFloat128Exp( b ) = $7FFF )
  7947. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7948. ) then begin
  7949. if ( (float128_is_signaling_nan( a )<>0)
  7950. or (float128_is_signaling_nan( b )<>0) ) then begin
  7951. float_raise( float_flag_invalid );
  7952. end;
  7953. result := 0;
  7954. exit;
  7955. end;
  7956. aSign := extractFloat128Sign( a );
  7957. bSign := extractFloat128Sign( b );
  7958. if ( aSign <> bSign ) then begin
  7959. result := ord(
  7960. (aSign<>0)
  7961. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7962. <> 0 ));
  7963. exit;
  7964. end;
  7965. if aSign<>0 then
  7966. result:=lt128( b.high, b.low, a.high, a.low )
  7967. else
  7968. result:=lt128( a.high, a.low, b.high, b.low );
  7969. end;
  7970. {----------------------------------------------------------------------------
  7971. | Returns the result of converting the double-precision floating-point value
  7972. | `a' to the quadruple-precision floating-point format. The conversion is
  7973. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7974. | Arithmetic.
  7975. *----------------------------------------------------------------------------}
  7976. function float64_to_float128( a : float64) : float128;
  7977. var
  7978. aSign : flag;
  7979. aExp : int16;
  7980. aSig, zSig0, zSig1 : bits64;
  7981. begin
  7982. aSig := extractFloat64Frac( a );
  7983. aExp := extractFloat64Exp( a );
  7984. aSign := extractFloat64Sign( a );
  7985. if ( aExp = $7FF ) then begin
  7986. if ( aSig<>0 ) then
  7987. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  7988. result:=packFloat128( aSign, $7FFF, 0, 0 );
  7989. exit;
  7990. end;
  7991. if ( aExp = 0 ) then begin
  7992. if ( aSig = 0 ) then
  7993. begin
  7994. result:=packFloat128( aSign, 0, 0, 0 );
  7995. exit;
  7996. end;
  7997. normalizeFloat64Subnormal( aSig, aExp, aSig );
  7998. dec(aExp);
  7999. end;
  8000. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8001. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8002. end;
  8003. {$endif FPC_SOFTFLOAT_FLOAT128}
  8004. {$endif not(defined(fpc_softfpu_interface))}
  8005. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8006. end.
  8007. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}