softfpu.pp 290 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. { we use here a record in the function header because
  79. the record allows bitwise conversion to single }
  80. float32rec = record
  81. float32 : float32;
  82. end;
  83. flag = byte;
  84. uint8 = byte;
  85. int8 = shortint;
  86. uint16 = word;
  87. int16 = smallint;
  88. uint32 = longword;
  89. int32 = longint;
  90. bits8 = byte;
  91. sbits8 = shortint;
  92. bits16 = word;
  93. sbits16 = smallint;
  94. sbits32 = longint;
  95. bits32 = longword;
  96. {$ifndef fpc}
  97. qword = int64;
  98. {$endif}
  99. { now part of the system unit
  100. uint64 = qword;
  101. }
  102. bits64 = qword;
  103. sbits64 = int64;
  104. {$ifdef ENDIAN_LITTLE}
  105. float64 = packed record
  106. low: bits32;
  107. high: bits32;
  108. end;
  109. int64rec = packed record
  110. low: bits32;
  111. high: bits32;
  112. end;
  113. floatx80 = packed record
  114. low : qword;
  115. high : word;
  116. end;
  117. float128 = packed record
  118. low : qword;
  119. high : qword;
  120. end;
  121. {$else}
  122. float64 = packed record
  123. high,low : bits32;
  124. end;
  125. int64rec = packed record
  126. high,low : bits32;
  127. end;
  128. floatx80 = packed record
  129. high : word;
  130. low : qword;
  131. end;
  132. float128 = packed record
  133. high : qword;
  134. low : qword;
  135. end;
  136. {$endif}
  137. {*
  138. -------------------------------------------------------------------------------
  139. Returns 1 if the double-precision floating-point value `a' is less than
  140. the corresponding value `b', and 0 otherwise. The comparison is performed
  141. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  142. -------------------------------------------------------------------------------
  143. *}
  144. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  145. {*
  146. -------------------------------------------------------------------------------
  147. Returns 1 if the double-precision floating-point value `a' is less than
  148. or equal to the corresponding value `b', and 0 otherwise. The comparison
  149. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  150. Arithmetic.
  151. -------------------------------------------------------------------------------
  152. *}
  153. Function float64_le(a: float64;b: float64): flag; compilerproc;
  154. {*
  155. -------------------------------------------------------------------------------
  156. Returns 1 if the double-precision floating-point value `a' is equal to
  157. the corresponding value `b', and 0 otherwise. The comparison is performed
  158. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  159. -------------------------------------------------------------------------------
  160. *}
  161. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  162. {*
  163. -------------------------------------------------------------------------------
  164. Returns the square root of the double-precision floating-point value `a'.
  165. The operation is performed according to the IEC/IEEE Standard for Binary
  166. Floating-Point Arithmetic.
  167. -------------------------------------------------------------------------------
  168. *}
  169. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  170. {*
  171. -------------------------------------------------------------------------------
  172. Returns the remainder of the double-precision floating-point value `a'
  173. with respect to the corresponding value `b'. The operation is performed
  174. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  175. -------------------------------------------------------------------------------
  176. *}
  177. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  178. {*
  179. -------------------------------------------------------------------------------
  180. Returns the result of dividing the double-precision floating-point value `a'
  181. by the corresponding value `b'. The operation is performed according to the
  182. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  183. -------------------------------------------------------------------------------
  184. *}
  185. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  186. {*
  187. -------------------------------------------------------------------------------
  188. Returns the result of multiplying the double-precision floating-point values
  189. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  190. for Binary Floating-Point Arithmetic.
  191. -------------------------------------------------------------------------------
  192. *}
  193. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  194. {*
  195. -------------------------------------------------------------------------------
  196. Returns the result of subtracting the double-precision floating-point values
  197. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  198. for Binary Floating-Point Arithmetic.
  199. -------------------------------------------------------------------------------
  200. *}
  201. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  202. {*
  203. -------------------------------------------------------------------------------
  204. Returns the result of adding the double-precision floating-point values `a'
  205. and `b'. The operation is performed according to the IEC/IEEE Standard for
  206. Binary Floating-Point Arithmetic.
  207. -------------------------------------------------------------------------------
  208. *}
  209. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  210. {*
  211. -------------------------------------------------------------------------------
  212. Rounds the double-precision floating-point value `a' to an integer,
  213. and returns the result as a double-precision floating-point value. The
  214. operation is performed according to the IEC/IEEE Standard for Binary
  215. Floating-Point Arithmetic.
  216. -------------------------------------------------------------------------------
  217. *}
  218. Function float64_round_to_int(a: float64) : float64; compilerproc;
  219. {*
  220. -------------------------------------------------------------------------------
  221. Returns the result of converting the double-precision floating-point value
  222. `a' to the single-precision floating-point format. The conversion is
  223. performed according to the IEC/IEEE Standard for Binary Floating-Point
  224. Arithmetic.
  225. -------------------------------------------------------------------------------
  226. *}
  227. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  228. {*
  229. -------------------------------------------------------------------------------
  230. Returns the result of converting the double-precision floating-point value
  231. `a' to the 32-bit two's complement integer format. The conversion is
  232. performed according to the IEC/IEEE Standard for Binary Floating-Point
  233. Arithmetic, except that the conversion is always rounded toward zero.
  234. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  235. the conversion overflows, the largest integer with the same sign as `a' is
  236. returned.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the 32-bit two's complement integer format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic---which means in particular that the conversion is rounded
  246. according to the current rounding mode. If `a' is a NaN, the largest
  247. positive integer is returned. Otherwise, if the conversion overflows, the
  248. largest integer with the same sign as `a' is returned.
  249. -------------------------------------------------------------------------------
  250. *}
  251. Function float64_to_int32(a: float64): int32; compilerproc;
  252. {*
  253. -------------------------------------------------------------------------------
  254. Returns 1 if the single-precision floating-point value `a' is less than
  255. the corresponding value `b', and 0 otherwise. The comparison is performed
  256. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  257. -------------------------------------------------------------------------------
  258. *}
  259. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  260. {*
  261. -------------------------------------------------------------------------------
  262. Returns 1 if the single-precision floating-point value `a' is less than
  263. or equal to the corresponding value `b', and 0 otherwise. The comparison
  264. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  265. Arithmetic.
  266. -------------------------------------------------------------------------------
  267. *}
  268. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  269. {*
  270. -------------------------------------------------------------------------------
  271. Returns 1 if the single-precision floating-point value `a' is equal to
  272. the corresponding value `b', and 0 otherwise. The comparison is performed
  273. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  274. -------------------------------------------------------------------------------
  275. *}
  276. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  277. {*
  278. -------------------------------------------------------------------------------
  279. Returns the square root of the single-precision floating-point value `a'.
  280. The operation is performed according to the IEC/IEEE Standard for Binary
  281. Floating-Point Arithmetic.
  282. -------------------------------------------------------------------------------
  283. *}
  284. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  285. {*
  286. -------------------------------------------------------------------------------
  287. Returns the remainder of the single-precision floating-point value `a'
  288. with respect to the corresponding value `b'. The operation is performed
  289. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  290. -------------------------------------------------------------------------------
  291. *}
  292. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  293. {*
  294. -------------------------------------------------------------------------------
  295. Returns the result of dividing the single-precision floating-point value `a'
  296. by the corresponding value `b'. The operation is performed according to the
  297. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  298. -------------------------------------------------------------------------------
  299. *}
  300. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  301. {*
  302. -------------------------------------------------------------------------------
  303. Returns the result of multiplying the single-precision floating-point values
  304. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  305. for Binary Floating-Point Arithmetic.
  306. -------------------------------------------------------------------------------
  307. *}
  308. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  309. {*
  310. -------------------------------------------------------------------------------
  311. Returns the result of subtracting the single-precision floating-point values
  312. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  313. for Binary Floating-Point Arithmetic.
  314. -------------------------------------------------------------------------------
  315. *}
  316. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  317. {*
  318. -------------------------------------------------------------------------------
  319. Returns the result of adding the single-precision floating-point values `a'
  320. and `b'. The operation is performed according to the IEC/IEEE Standard for
  321. Binary Floating-Point Arithmetic.
  322. -------------------------------------------------------------------------------
  323. *}
  324. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  325. {*
  326. -------------------------------------------------------------------------------
  327. Rounds the single-precision floating-point value `a' to an integer,
  328. and returns the result as a single-precision floating-point value. The
  329. operation is performed according to the IEC/IEEE Standard for Binary
  330. Floating-Point Arithmetic.
  331. -------------------------------------------------------------------------------
  332. *}
  333. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  334. {*
  335. -------------------------------------------------------------------------------
  336. Returns the result of converting the single-precision floating-point value
  337. `a' to the double-precision floating-point format. The conversion is
  338. performed according to the IEC/IEEE Standard for Binary Floating-Point
  339. Arithmetic.
  340. -------------------------------------------------------------------------------
  341. *}
  342. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  343. {*
  344. -------------------------------------------------------------------------------
  345. Returns the result of converting the single-precision floating-point value
  346. `a' to the 32-bit two's complement integer format. The conversion is
  347. performed according to the IEC/IEEE Standard for Binary Floating-Point
  348. Arithmetic, except that the conversion is always rounded toward zero.
  349. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  350. the conversion overflows, the largest integer with the same sign as `a' is
  351. returned.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the 32-bit two's complement integer format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic---which means in particular that the conversion is rounded
  361. according to the current rounding mode. If `a' is a NaN, the largest
  362. positive integer is returned. Otherwise, if the conversion overflows, the
  363. largest integer with the same sign as `a' is returned.
  364. -------------------------------------------------------------------------------
  365. *}
  366. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  367. {*
  368. -------------------------------------------------------------------------------
  369. Returns the result of converting the 32-bit two's complement integer `a' to
  370. the double-precision floating-point format. The conversion is performed
  371. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  372. -------------------------------------------------------------------------------
  373. *}
  374. Function int32_to_float64( a: int32) : float64; compilerproc;
  375. {*
  376. -------------------------------------------------------------------------------
  377. Returns the result of converting the 32-bit two's complement integer `a' to
  378. the single-precision floating-point format. The conversion is performed
  379. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  380. -------------------------------------------------------------------------------
  381. *}
  382. Function int32_to_float32( a: int32): float32rec; compilerproc;
  383. {*----------------------------------------------------------------------------
  384. | Returns the result of converting the 64-bit two's complement integer `a'
  385. | to the double-precision floating-point format. The conversion is performed
  386. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  387. *----------------------------------------------------------------------------*}
  388. Function int64_to_float64( a: int64 ): float64; compilerproc;
  389. {*----------------------------------------------------------------------------
  390. | Returns the result of converting the 64-bit two's complement integer `a'
  391. | to the single-precision floating-point format. The conversion is performed
  392. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. *----------------------------------------------------------------------------*}
  394. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  395. CONST
  396. {-------------------------------------------------------------------------------
  397. Software IEC/IEEE floating-point underflow tininess-detection mode.
  398. -------------------------------------------------------------------------------
  399. *}
  400. float_tininess_after_rounding = 0;
  401. float_tininess_before_rounding = 1;
  402. {*
  403. -------------------------------------------------------------------------------
  404. Software IEC/IEEE floating-point rounding mode.
  405. -------------------------------------------------------------------------------
  406. *}
  407. {
  408. Round to nearest.
  409. This is the default mode. It should be used unless there is a specific
  410. need for one of the others. In this mode results are rounded to the
  411. nearest representable value. If the result is midway between two
  412. representable values, the even representable is chosen. Even here
  413. means the lowest-order bit is zero. This rounding mode prevents
  414. statistical bias and guarantees numeric stability: round-off errors
  415. in a lengthy calculation will remain smaller than half of FLT_EPSILON.
  416. Round toward plus Infinity.
  417. All results are rounded to the smallest representable value which is
  418. greater than the result.
  419. Round toward minus Infinity.
  420. All results are rounded to the largest representable value which is
  421. less than the result.
  422. Round toward zero.
  423. All results are rounded to the largest representable value whose
  424. magnitude is less than that of the result. In other words, if the
  425. result is negative it is rounded up; if it is positive, it is
  426. rounded down.
  427. }
  428. float_round_nearest_even = 0;
  429. float_round_down = 1;
  430. float_round_up = 2;
  431. float_round_to_zero = 3;
  432. {*
  433. -------------------------------------------------------------------------------
  434. Floating-point rounding mode and exception flags.
  435. -------------------------------------------------------------------------------
  436. *}
  437. const
  438. float_rounding_mode : Byte = float_round_nearest_even;
  439. {*
  440. -------------------------------------------------------------------------------
  441. Underflow tininess-detection mode, statically initialized to default value.
  442. (The declaration in `softfloat.h' must match the `int8' type here.)
  443. -------------------------------------------------------------------------------
  444. *}
  445. const float_detect_tininess: int8 = float_tininess_after_rounding;
  446. {$endif not(defined(fpc_softfpu_implementation))}
  447. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  448. implementation
  449. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  450. {$if not(defined(fpc_softfpu_interface))}
  451. (*****************************************************************************)
  452. (*----------------------------------------------------------------------------*)
  453. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  454. (* division and square root approximations. (Can be specialized to target if *)
  455. (* desired.) *)
  456. (* ---------------------------------------------------------------------------*)
  457. (*****************************************************************************)
  458. {*----------------------------------------------------------------------------
  459. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  460. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  461. | input. If `zSign' is 1, the input is negated before being converted to an
  462. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  463. | is simply rounded to an integer, with the inexact exception raised if the
  464. | input cannot be represented exactly as an integer. However, if the fixed-
  465. | point input is too large, the invalid exception is raised and the largest
  466. | positive or negative integer is returned.
  467. *----------------------------------------------------------------------------*}
  468. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  469. var
  470. roundingMode: int8;
  471. roundNearestEven: flag;
  472. roundIncrement, roundBits: int8;
  473. z: int32;
  474. begin
  475. roundingMode := float_rounding_mode;
  476. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  477. roundIncrement := $40;
  478. if ( roundNearestEven=0 ) then
  479. begin
  480. if ( roundingMode = float_round_to_zero ) then
  481. begin
  482. roundIncrement := 0;
  483. end
  484. else begin
  485. roundIncrement := $7F;
  486. if ( zSign<>0 ) then
  487. begin
  488. if ( roundingMode = float_round_up ) then
  489. roundIncrement := 0;
  490. end
  491. else begin
  492. if ( roundingMode = float_round_down ) then
  493. roundIncrement := 0;
  494. end;
  495. end;
  496. end;
  497. roundBits := absZ and $7F;
  498. absZ := ( absZ + roundIncrement ) shr 7;
  499. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  500. z := absZ;
  501. if ( zSign<>0 ) then
  502. z := - z;
  503. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  504. begin
  505. float_raise( float_flag_invalid );
  506. if zSign<>0 then
  507. result:=sbits32($80000000)
  508. else
  509. result:=$7FFFFFFF;
  510. exit;
  511. end;
  512. if ( roundBits<>0 ) then
  513. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  514. result:=z;
  515. end;
  516. {*----------------------------------------------------------------------------
  517. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  518. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  519. | and returns the properly rounded 64-bit integer corresponding to the input.
  520. | If `zSign' is 1, the input is negated before being converted to an integer.
  521. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  522. | the inexact exception raised if the input cannot be represented exactly as
  523. | an integer. However, if the fixed-point input is too large, the invalid
  524. | exception is raised and the largest positive or negative integer is
  525. | returned.
  526. *----------------------------------------------------------------------------*}
  527. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  528. var
  529. roundingMode: int8;
  530. roundNearestEven, increment: flag;
  531. z: int64;
  532. label
  533. overflow;
  534. begin
  535. roundingMode := float_rounding_mode;
  536. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  537. increment := ord( sbits64(absZ1) < 0 );
  538. if ( roundNearestEven=0 ) then
  539. begin
  540. if ( roundingMode = float_round_to_zero ) then
  541. begin
  542. increment := 0;
  543. end
  544. else begin
  545. if ( zSign<>0 ) then
  546. begin
  547. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  548. end
  549. else begin
  550. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  551. end;
  552. end;
  553. end;
  554. if ( increment<>0 ) then
  555. begin
  556. inc(absZ0);
  557. if ( absZ0 = 0 ) then
  558. goto overflow;
  559. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  560. end;
  561. z := absZ0;
  562. if ( zSign<>0 ) then
  563. z := - z;
  564. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  565. begin
  566. overflow:
  567. float_raise( float_flag_invalid );
  568. if zSign<>0 then
  569. result:=int64($8000000000000000)
  570. else
  571. result:=int64($7FFFFFFFFFFFFFFF);
  572. end;
  573. if ( absZ1<>0 ) then
  574. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  575. result:=z;
  576. end;
  577. {*
  578. -------------------------------------------------------------------------------
  579. Shifts `a' right by the number of bits given in `count'. If any nonzero
  580. bits are shifted off, they are ``jammed'' into the least significant bit of
  581. the result by setting the least significant bit to 1. The value of `count'
  582. can be arbitrarily large; in particular, if `count' is greater than 32, the
  583. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  584. The result is stored in the location pointed to by `zPtr'.
  585. -------------------------------------------------------------------------------
  586. *}
  587. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  588. var
  589. z: Bits32;
  590. Begin
  591. if ( count = 0 ) then
  592. z := a
  593. else
  594. if ( count < 32 ) then
  595. Begin
  596. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  597. End
  598. else
  599. Begin
  600. z := bits32( a <> 0 );
  601. End;
  602. zPtr := z;
  603. End;
  604. {*----------------------------------------------------------------------------
  605. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  606. | number of bits given in `count'. Any bits shifted off are lost. The value
  607. | of `count' can be arbitrarily large; in particular, if `count' is greater
  608. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  609. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  610. *----------------------------------------------------------------------------*}
  611. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  612. var
  613. z0, z1: bits64;
  614. negCount: int8;
  615. begin
  616. negCount := ( - count ) and 63;
  617. if ( count = 0 ) then
  618. begin
  619. z1 := a1;
  620. z0 := a0;
  621. end
  622. else if ( count < 64 ) then
  623. begin
  624. z1 := ( a0 shl negCount ) or ( a1 shr count );
  625. z0 := a0 shr count;
  626. end
  627. else
  628. begin
  629. if ( count shl 64 )<>0 then
  630. z1 := a0 shr ( count and 63 )
  631. else
  632. z1 := 0;
  633. z0 := 0;
  634. end;
  635. z1Ptr := z1;
  636. z0Ptr := z0;
  637. end;
  638. {*----------------------------------------------------------------------------
  639. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  640. | number of bits given in `count'. If any nonzero bits are shifted off, they
  641. | are ``jammed'' into the least significant bit of the result by setting the
  642. | least significant bit to 1. The value of `count' can be arbitrarily large;
  643. | in particular, if `count' is greater than 128, the result will be either
  644. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  645. | nonzero. The result is broken into two 64-bit pieces which are stored at
  646. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  647. *----------------------------------------------------------------------------*}
  648. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  649. var
  650. z0,z1 : bits64;
  651. negCount : int8;
  652. begin
  653. negCount := ( - count ) and 63;
  654. if ( count = 0 ) then begin
  655. z1 := a1;
  656. z0 := a0;
  657. end
  658. else if ( count < 64 ) then begin
  659. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  660. z0 := a0>>count;
  661. end
  662. else begin
  663. if ( count = 64 ) then begin
  664. z1 := a0 or ord( a1 <> 0 );
  665. end
  666. else if ( count < 128 ) then begin
  667. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  668. end
  669. else begin
  670. z1 := ord( ( a0 or a1 ) <> 0 );
  671. end;
  672. z0 := 0;
  673. end;
  674. z1Ptr := z1;
  675. z0Ptr := z0;
  676. end;
  677. {*
  678. -------------------------------------------------------------------------------
  679. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  680. number of bits given in `count'. Any bits shifted off are lost. The value
  681. of `count' can be arbitrarily large; in particular, if `count' is greater
  682. than 64, the result will be 0. The result is broken into two 32-bit pieces
  683. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  684. -------------------------------------------------------------------------------
  685. *}
  686. Procedure
  687. shift64Right(
  688. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  689. Var
  690. z0, z1: bits32;
  691. negCount : int8;
  692. Begin
  693. negCount := ( - count ) AND 31;
  694. if ( count = 0 ) then
  695. Begin
  696. z1 := a1;
  697. z0 := a0;
  698. End
  699. else if ( count < 32 ) then
  700. Begin
  701. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  702. z0 := a0 shr count;
  703. End
  704. else
  705. Begin
  706. if (count < 64) then
  707. z1 := ( a0 shr ( count AND 31 ) )
  708. else
  709. z1 := 0;
  710. z0 := 0;
  711. End;
  712. z1Ptr := z1;
  713. z0Ptr := z0;
  714. End;
  715. {*
  716. -------------------------------------------------------------------------------
  717. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  718. number of bits given in `count'. If any nonzero bits are shifted off, they
  719. are ``jammed'' into the least significant bit of the result by setting the
  720. least significant bit to 1. The value of `count' can be arbitrarily large;
  721. in particular, if `count' is greater than 64, the result will be either 0
  722. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  723. nonzero. The result is broken into two 32-bit pieces which are stored at
  724. the locations pointed to by `z0Ptr' and `z1Ptr'.
  725. -------------------------------------------------------------------------------
  726. *}
  727. Procedure
  728. shift64RightJamming(
  729. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  730. VAR
  731. z0, z1 : bits32;
  732. negCount : int8;
  733. Begin
  734. negCount := ( - count ) AND 31;
  735. if ( count = 0 ) then
  736. Begin
  737. z1 := a1;
  738. z0 := a0;
  739. End
  740. else
  741. if ( count < 32 ) then
  742. Begin
  743. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  744. z0 := a0 shr count;
  745. End
  746. else
  747. Begin
  748. if ( count = 32 ) then
  749. Begin
  750. z1 := a0 OR bits32( a1 <> 0 );
  751. End
  752. else
  753. if ( count < 64 ) Then
  754. Begin
  755. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  756. End
  757. else
  758. Begin
  759. z1 := bits32( ( a0 OR a1 ) <> 0 );
  760. End;
  761. z0 := 0;
  762. End;
  763. z1Ptr := z1;
  764. z0Ptr := z0;
  765. End;
  766. {*----------------------------------------------------------------------------
  767. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  768. | bits are shifted off, they are ``jammed'' into the least significant bit of
  769. | the result by setting the least significant bit to 1. The value of `count'
  770. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  771. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  772. | The result is stored in the location pointed to by `zPtr'.
  773. *----------------------------------------------------------------------------*}
  774. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  775. var
  776. z: bits64;
  777. begin
  778. if ( count = 0 ) then
  779. begin
  780. z := a;
  781. end
  782. else if ( count < 64 ) then
  783. begin
  784. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  785. end
  786. else
  787. begin
  788. z := ord( a <> 0 );
  789. end;
  790. zPtr := z;
  791. end;
  792. {*
  793. -------------------------------------------------------------------------------
  794. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  795. by 32 _plus_ the number of bits given in `count'. The shifted result is
  796. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  797. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  798. off form a third 32-bit result as follows: The _last_ bit shifted off is
  799. the most-significant bit of the extra result, and the other 31 bits of the
  800. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  801. were all zero. This extra result is stored in the location pointed to by
  802. `z2Ptr'. The value of `count' can be arbitrarily large.
  803. (This routine makes more sense if `a0', `a1', and `a2' are considered
  804. to form a fixed-point value with binary point between `a1' and `a2'. This
  805. fixed-point value is shifted right by the number of bits given in `count',
  806. and the integer part of the result is returned at the locations pointed to
  807. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  808. corrupted as described above, and is returned at the location pointed to by
  809. `z2Ptr'.)
  810. -------------------------------------------------------------------------------
  811. }
  812. Procedure
  813. shift64ExtraRightJamming(
  814. a0: bits32;
  815. a1: bits32;
  816. a2: bits32;
  817. count: int16;
  818. VAR z0Ptr: bits32;
  819. VAR z1Ptr: bits32;
  820. VAR z2Ptr: bits32
  821. );
  822. Var
  823. z0, z1, z2: bits32;
  824. negCount : int8;
  825. Begin
  826. negCount := ( - count ) AND 31;
  827. if ( count = 0 ) then
  828. Begin
  829. z2 := a2;
  830. z1 := a1;
  831. z0 := a0;
  832. End
  833. else
  834. Begin
  835. if ( count < 32 ) Then
  836. Begin
  837. z2 := a1 shl negCount;
  838. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  839. z0 := a0 shr count;
  840. End
  841. else
  842. Begin
  843. if ( count = 32 ) then
  844. Begin
  845. z2 := a1;
  846. z1 := a0;
  847. End
  848. else
  849. Begin
  850. a2 := a2 or a1;
  851. if ( count < 64 ) then
  852. Begin
  853. z2 := a0 shl negCount;
  854. z1 := a0 shr ( count AND 31 );
  855. End
  856. else
  857. Begin
  858. if count = 64 then
  859. z2 := a0
  860. else
  861. z2 := bits32(a0 <> 0);
  862. z1 := 0;
  863. End;
  864. End;
  865. z0 := 0;
  866. End;
  867. z2 := z2 or bits32( a2 <> 0 );
  868. End;
  869. z2Ptr := z2;
  870. z1Ptr := z1;
  871. z0Ptr := z0;
  872. End;
  873. {*
  874. -------------------------------------------------------------------------------
  875. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  876. number of bits given in `count'. Any bits shifted off are lost. The value
  877. of `count' must be less than 32. The result is broken into two 32-bit
  878. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  879. -------------------------------------------------------------------------------
  880. *}
  881. Procedure
  882. shortShift64Left(
  883. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  884. Begin
  885. z1Ptr := a1 shl count;
  886. if count = 0 then
  887. z0Ptr := a0
  888. else
  889. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  890. End;
  891. {*
  892. -------------------------------------------------------------------------------
  893. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  894. by the number of bits given in `count'. Any bits shifted off are lost.
  895. The value of `count' must be less than 32. The result is broken into three
  896. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  897. `z1Ptr', and `z2Ptr'.
  898. -------------------------------------------------------------------------------
  899. *}
  900. Procedure
  901. shortShift96Left(
  902. a0: bits32;
  903. a1: bits32;
  904. a2: bits32;
  905. count: int16;
  906. VAR z0Ptr: bits32;
  907. VAR z1Ptr: bits32;
  908. VAR z2Ptr: bits32
  909. );
  910. Var
  911. z0, z1, z2: bits32;
  912. negCount: int8;
  913. Begin
  914. z2 := a2 shl count;
  915. z1 := a1 shl count;
  916. z0 := a0 shl count;
  917. if ( 0 < count ) then
  918. Begin
  919. negCount := ( ( - count ) AND 31 );
  920. z1 := z1 or (a2 shr negCount);
  921. z0 := z0 or (a1 shr negCount);
  922. End;
  923. z2Ptr := z2;
  924. z1Ptr := z1;
  925. z0Ptr := z0;
  926. End;
  927. {*----------------------------------------------------------------------------
  928. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  929. | number of bits given in `count'. Any bits shifted off are lost. The value
  930. | of `count' must be less than 64. The result is broken into two 64-bit
  931. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  932. *----------------------------------------------------------------------------*}
  933. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  934. begin
  935. z1Ptr := a1 shl count;
  936. if count=0 then
  937. z0Ptr:=a0
  938. else
  939. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  940. end;
  941. {*
  942. -------------------------------------------------------------------------------
  943. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  944. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  945. any carry out is lost. The result is broken into two 32-bit pieces which
  946. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  947. -------------------------------------------------------------------------------
  948. *}
  949. Procedure
  950. add64(
  951. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  952. Var
  953. z1: bits32;
  954. Begin
  955. z1 := a1 + b1;
  956. z1Ptr := z1;
  957. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  958. End;
  959. {*
  960. -------------------------------------------------------------------------------
  961. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  962. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  963. modulo 2^96, so any carry out is lost. The result is broken into three
  964. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  965. `z1Ptr', and `z2Ptr'.
  966. -------------------------------------------------------------------------------
  967. *}
  968. Procedure
  969. add96(
  970. a0: bits32;
  971. a1: bits32;
  972. a2: bits32;
  973. b0: bits32;
  974. b1: bits32;
  975. b2: bits32;
  976. VAR z0Ptr: bits32;
  977. VAR z1Ptr: bits32;
  978. VAR z2Ptr: bits32
  979. );
  980. var
  981. z0, z1, z2: bits32;
  982. carry0, carry1: int8;
  983. Begin
  984. z2 := a2 + b2;
  985. carry1 := int8( z2 < a2 );
  986. z1 := a1 + b1;
  987. carry0 := int8( z1 < a1 );
  988. z0 := a0 + b0;
  989. z1 := z1 + carry1;
  990. z0 := z0 + bits32( z1 < carry1 );
  991. z0 := z0 + carry0;
  992. z2Ptr := z2;
  993. z1Ptr := z1;
  994. z0Ptr := z0;
  995. End;
  996. {*----------------------------------------------------------------------------
  997. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  998. | by the number of bits given in `count'. Any bits shifted off are lost.
  999. | The value of `count' must be less than 64. The result is broken into three
  1000. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1001. | `z1Ptr', and `z2Ptr'.
  1002. *----------------------------------------------------------------------------*}
  1003. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1004. var
  1005. z0, z1, z2 : bits64;
  1006. negCount : int8;
  1007. begin
  1008. z2 := a2 shl count;
  1009. z1 := a1 shl count;
  1010. z0 := a0 shl count;
  1011. if ( 0 < count ) then
  1012. begin
  1013. negCount := ( ( - count ) and 63 );
  1014. z1 := z1 or (a2 shr negCount);
  1015. z0 := z0 or (a1 shr negCount);
  1016. end;
  1017. z2Ptr := z2;
  1018. z1Ptr := z1;
  1019. z0Ptr := z0;
  1020. end;
  1021. {*----------------------------------------------------------------------------
  1022. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1023. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1024. | any carry out is lost. The result is broken into two 64-bit pieces which
  1025. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1026. *----------------------------------------------------------------------------*}
  1027. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1028. var
  1029. z1 : bits64;
  1030. begin
  1031. z1 := a1 + b1;
  1032. z1Ptr := z1;
  1033. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1034. end;
  1035. {*----------------------------------------------------------------------------
  1036. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1037. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1038. | modulo 2^192, so any carry out is lost. The result is broken into three
  1039. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1040. | `z1Ptr', and `z2Ptr'.
  1041. *----------------------------------------------------------------------------*}
  1042. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1043. var
  1044. z0, z1, z2 : bits64;
  1045. carry0, carry1 : int8;
  1046. begin
  1047. z2 := a2 + b2;
  1048. carry1 := ord( z2 < a2 );
  1049. z1 := a1 + b1;
  1050. carry0 := ord( z1 < a1 );
  1051. z0 := a0 + b0;
  1052. inc(z1, carry1);
  1053. inc(z0, ord( z1 < carry1 ));
  1054. inc(z0, carry0);
  1055. z2Ptr := z2;
  1056. z1Ptr := z1;
  1057. z0Ptr := z0;
  1058. end;
  1059. {*
  1060. -------------------------------------------------------------------------------
  1061. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1062. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1063. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1064. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1065. `z1Ptr'.
  1066. -------------------------------------------------------------------------------
  1067. *}
  1068. Procedure
  1069. sub64(
  1070. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1071. Begin
  1072. z1Ptr := a1 - b1;
  1073. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1074. End;
  1075. {*
  1076. -------------------------------------------------------------------------------
  1077. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1078. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1079. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1080. into three 32-bit pieces which are stored at the locations pointed to by
  1081. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1082. -------------------------------------------------------------------------------
  1083. *}
  1084. Procedure
  1085. sub96(
  1086. a0:bits32;
  1087. a1:bits32;
  1088. a2:bits32;
  1089. b0:bits32;
  1090. b1:bits32;
  1091. b2:bits32;
  1092. VAR z0Ptr:bits32;
  1093. VAR z1Ptr:bits32;
  1094. VAR z2Ptr:bits32
  1095. );
  1096. Var
  1097. z0, z1, z2: bits32;
  1098. borrow0, borrow1: int8;
  1099. Begin
  1100. z2 := a2 - b2;
  1101. borrow1 := int8( a2 < b2 );
  1102. z1 := a1 - b1;
  1103. borrow0 := int8( a1 < b1 );
  1104. z0 := a0 - b0;
  1105. z0 := z0 - bits32( z1 < borrow1 );
  1106. z1 := z1 - borrow1;
  1107. z0 := z0 -borrow0;
  1108. z2Ptr := z2;
  1109. z1Ptr := z1;
  1110. z0Ptr := z0;
  1111. End;
  1112. {*----------------------------------------------------------------------------
  1113. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1114. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1115. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1116. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1117. | `z1Ptr'.
  1118. *----------------------------------------------------------------------------*}
  1119. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1120. begin
  1121. z1Ptr := a1 - b1;
  1122. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1123. end;
  1124. {*----------------------------------------------------------------------------
  1125. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1126. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1127. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1128. | result is broken into three 64-bit pieces which are stored at the locations
  1129. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1130. *----------------------------------------------------------------------------*}
  1131. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1132. var
  1133. z0, z1, z2 : bits64;
  1134. borrow0, borrow1 : int8;
  1135. begin
  1136. z2 := a2 - b2;
  1137. borrow1 := ord( a2 < b2 );
  1138. z1 := a1 - b1;
  1139. borrow0 := ord( a1 < b1 );
  1140. z0 := a0 - b0;
  1141. dec(z0, ord( z1 < borrow1 ));
  1142. dec(z1, borrow1);
  1143. dec(z0, borrow0);
  1144. z2Ptr := z2;
  1145. z1Ptr := z1;
  1146. z0Ptr := z0;
  1147. end;
  1148. {*
  1149. -------------------------------------------------------------------------------
  1150. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1151. into two 32-bit pieces which are stored at the locations pointed to by
  1152. `z0Ptr' and `z1Ptr'.
  1153. -------------------------------------------------------------------------------
  1154. *}
  1155. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1156. :bits32 );
  1157. Var
  1158. aHigh, aLow, bHigh, bLow: bits16;
  1159. z0, zMiddleA, zMiddleB, z1: bits32;
  1160. Begin
  1161. aLow := a and $ffff;
  1162. aHigh := a shr 16;
  1163. bLow := b and $ffff;
  1164. bHigh := b shr 16;
  1165. z1 := ( bits32( aLow) ) * bLow;
  1166. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1167. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1168. z0 := ( bits32 (aHigh) ) * bHigh;
  1169. zMiddleA := zMiddleA + zMiddleB;
  1170. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1171. zMiddleA := zmiddleA shl 16;
  1172. z1 := z1 + zMiddleA;
  1173. z0 := z0 + bits32( z1 < zMiddleA );
  1174. z1Ptr := z1;
  1175. z0Ptr := z0;
  1176. End;
  1177. {*
  1178. -------------------------------------------------------------------------------
  1179. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1180. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1181. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1182. `z2Ptr'.
  1183. -------------------------------------------------------------------------------
  1184. *}
  1185. Procedure
  1186. mul64By32To96(
  1187. a0:bits32;
  1188. a1:bits32;
  1189. b:bits32;
  1190. VAR z0Ptr:bits32;
  1191. VAR z1Ptr:bits32;
  1192. VAR z2Ptr:bits32
  1193. );
  1194. Var
  1195. z0, z1, z2, more1: bits32;
  1196. Begin
  1197. mul32To64( a1, b, z1, z2 );
  1198. mul32To64( a0, b, z0, more1 );
  1199. add64( z0, more1, 0, z1, z0, z1 );
  1200. z2Ptr := z2;
  1201. z1Ptr := z1;
  1202. z0Ptr := z0;
  1203. End;
  1204. {*
  1205. -------------------------------------------------------------------------------
  1206. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1207. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1208. product. The product is broken into four 32-bit pieces which are stored at
  1209. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1210. -------------------------------------------------------------------------------
  1211. *}
  1212. Procedure
  1213. mul64To128(
  1214. a0:bits32;
  1215. a1:bits32;
  1216. b0:bits32;
  1217. b1:bits32;
  1218. VAR z0Ptr:bits32;
  1219. VAR z1Ptr:bits32;
  1220. VAR z2Ptr:bits32;
  1221. VAR z3Ptr:bits32
  1222. );
  1223. Var
  1224. z0, z1, z2, z3: bits32;
  1225. more1, more2: bits32;
  1226. Begin
  1227. mul32To64( a1, b1, z2, z3 );
  1228. mul32To64( a1, b0, z1, more2 );
  1229. add64( z1, more2, 0, z2, z1, z2 );
  1230. mul32To64( a0, b0, z0, more1 );
  1231. add64( z0, more1, 0, z1, z0, z1 );
  1232. mul32To64( a0, b1, more1, more2 );
  1233. add64( more1, more2, 0, z2, more1, z2 );
  1234. add64( z0, z1, 0, more1, z0, z1 );
  1235. z3Ptr := z3;
  1236. z2Ptr := z2;
  1237. z1Ptr := z1;
  1238. z0Ptr := z0;
  1239. End;
  1240. {*----------------------------------------------------------------------------
  1241. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1242. | into two 64-bit pieces which are stored at the locations pointed to by
  1243. | `z0Ptr' and `z1Ptr'.
  1244. *----------------------------------------------------------------------------*}
  1245. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1246. var
  1247. aHigh, aLow, bHigh, bLow : bits32;
  1248. z0, zMiddleA, zMiddleB, z1 : bits64;
  1249. begin
  1250. aLow := a;
  1251. aHigh := a shr 32;
  1252. bLow := b;
  1253. bHigh := b shr 32;
  1254. z1 := ( bits64(aLow) ) * bLow;
  1255. zMiddleA := ( bits64( aLow )) * bHigh;
  1256. zMiddleB := ( bits64( aHigh )) * bLow;
  1257. z0 := ( bits64(aHigh) ) * bHigh;
  1258. inc(zMiddleA, zMiddleB);
  1259. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1260. zMiddleA := zMiddleA shl 32;
  1261. inc(z1, zMiddleA);
  1262. inc(z0, ord( z1 < zMiddleA ));
  1263. z1Ptr := z1;
  1264. z0Ptr := z0;
  1265. end;
  1266. {*----------------------------------------------------------------------------
  1267. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1268. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1269. | product. The product is broken into four 64-bit pieces which are stored at
  1270. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1271. *----------------------------------------------------------------------------*}
  1272. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1273. var
  1274. z0,z1,z2,z3,more1,more2 : bits64;
  1275. begin
  1276. mul64To128( a1, b1, z2, z3 );
  1277. mul64To128( a1, b0, z1, more2 );
  1278. add128( z1, more2, 0, z2, z1, z2 );
  1279. mul64To128( a0, b0, z0, more1 );
  1280. add128( z0, more1, 0, z1, z0, z1 );
  1281. mul64To128( a0, b1, more1, more2 );
  1282. add128( more1, more2, 0, z2, more1, z2 );
  1283. add128( z0, z1, 0, more1, z0, z1 );
  1284. z3Ptr := z3;
  1285. z2Ptr := z2;
  1286. z1Ptr := z1;
  1287. z0Ptr := z0;
  1288. end;
  1289. {*----------------------------------------------------------------------------
  1290. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1291. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1292. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1293. | `z2Ptr'.
  1294. *----------------------------------------------------------------------------*}
  1295. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1296. var
  1297. z0, z1, z2, more1 : bits64;
  1298. begin
  1299. mul64To128( a1, b, z1, z2 );
  1300. mul64To128( a0, b, z0, more1 );
  1301. add128( z0, more1, 0, z1, z0, z1 );
  1302. z2Ptr := z2;
  1303. z1Ptr := z1;
  1304. z0Ptr := z0;
  1305. end;
  1306. {*----------------------------------------------------------------------------
  1307. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1308. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1309. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1310. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1311. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1312. | unsigned integer is returned.
  1313. *----------------------------------------------------------------------------*}
  1314. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1315. var
  1316. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1317. begin
  1318. if ( b <= a0 ) then
  1319. begin
  1320. result:=qword( $FFFFFFFFFFFFFFFF );
  1321. exit;
  1322. end;
  1323. b0 := b shr 32;
  1324. if ( b0 shl 32 <= a0 ) then
  1325. z:=qword( $FFFFFFFF00000000 )
  1326. else
  1327. z:=( a0 div b0 ) shl 32;
  1328. mul64To128( b, z, term0, term1 );
  1329. sub128( a0, a1, term0, term1, rem0, rem1 );
  1330. while ( ( sbits64(rem0) ) < 0 ) do begin
  1331. dec(z,qword( $100000000 ));
  1332. b1 := b shl 32;
  1333. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1334. end;
  1335. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1336. if ( b0 shl 32 <= rem0 ) then
  1337. z:=z or $FFFFFFFF
  1338. else
  1339. z:=z or rem0 div b0;
  1340. result:=z;
  1341. end;
  1342. {*
  1343. -------------------------------------------------------------------------------
  1344. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1345. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1346. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1347. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1348. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1349. unsigned integer is returned.
  1350. -------------------------------------------------------------------------------
  1351. *}
  1352. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1353. Var
  1354. b0, b1: bits32;
  1355. rem0, rem1, term0, term1: bits32;
  1356. z: bits32;
  1357. Begin
  1358. if ( b <= a0 ) then
  1359. Begin
  1360. estimateDiv64To32 := $FFFFFFFF;
  1361. exit;
  1362. End;
  1363. b0 := b shr 16;
  1364. if ( b0 shl 16 <= a0 ) then
  1365. z:= $FFFF0000
  1366. else
  1367. z:= ( a0 div b0 ) shl 16;
  1368. mul32To64( b, z, term0, term1 );
  1369. sub64( a0, a1, term0, term1, rem0, rem1 );
  1370. while ( ( sbits32 (rem0) ) < 0 ) do
  1371. Begin
  1372. z := z - $10000;
  1373. b1 := b shl 16;
  1374. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1375. End;
  1376. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1377. if ( b0 shl 16 <= rem0 ) then
  1378. z := z or $FFFF
  1379. else
  1380. z := z or (rem0 div b0);
  1381. estimateDiv64To32 := z;
  1382. End;
  1383. {*
  1384. -------------------------------------------------------------------------------
  1385. Returns an approximation to the square root of the 32-bit significand given
  1386. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1387. `aExp' (the least significant bit) is 1, the integer returned approximates
  1388. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1389. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1390. case, the approximation returned lies strictly within +/-2 of the exact
  1391. value.
  1392. -------------------------------------------------------------------------------
  1393. *}
  1394. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1395. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1396. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1397. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1398. );
  1399. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1400. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1401. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1402. );
  1403. Var
  1404. index: int8;
  1405. z: bits32;
  1406. Begin
  1407. index := ( a shr 27 ) AND 15;
  1408. if ( aExp AND 1 ) <> 0 then
  1409. Begin
  1410. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1411. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1412. a := a shr 1;
  1413. End
  1414. else
  1415. Begin
  1416. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1417. z := a div z + z;
  1418. if ( $20000 <= z ) then
  1419. z := $FFFF8000
  1420. else
  1421. z := ( z shl 15 );
  1422. if ( z <= a ) then
  1423. Begin
  1424. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1425. exit;
  1426. End;
  1427. End;
  1428. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1429. End;
  1430. {*
  1431. -------------------------------------------------------------------------------
  1432. Returns the number of leading 0 bits before the most-significant 1 bit of
  1433. `a'. If `a' is zero, 32 is returned.
  1434. -------------------------------------------------------------------------------
  1435. *}
  1436. Function countLeadingZeros32( a:bits32 ): int8;
  1437. const countLeadingZerosHigh:array[0..255] of int8 = (
  1438. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1439. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1440. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1441. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1442. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1443. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1444. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1445. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1446. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1447. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1448. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1449. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1450. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1451. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1452. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1453. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1454. );
  1455. Var
  1456. shiftCount: int8;
  1457. Begin
  1458. shiftCount := 0;
  1459. if ( a < $10000 ) then
  1460. Begin
  1461. shiftCount := shiftcount + 16;
  1462. a := a shl 16;
  1463. End;
  1464. if ( a < $1000000 ) then
  1465. Begin
  1466. shiftCount := shiftcount + 8;
  1467. a := a shl 8;
  1468. end;
  1469. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1470. countLeadingZeros32:= shiftCount;
  1471. End;
  1472. {*----------------------------------------------------------------------------
  1473. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1474. | `a'. If `a' is zero, 64 is returned.
  1475. *----------------------------------------------------------------------------*}
  1476. function countLeadingZeros64( a : bits64): int8;
  1477. var
  1478. shiftcount : int8;
  1479. Begin
  1480. shiftCount := 0;
  1481. if ( a < (bits64(1) shl 32 )) then
  1482. shiftCount := shiftcount + 32
  1483. else
  1484. a := a shr 32;
  1485. shiftCount := shiftCount + countLeadingZeros32( a );
  1486. countLeadingZeros64:= shiftCount;
  1487. End;
  1488. {*
  1489. -------------------------------------------------------------------------------
  1490. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1491. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1492. returns 0.
  1493. -------------------------------------------------------------------------------
  1494. *}
  1495. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1496. Begin
  1497. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1498. End;
  1499. {*
  1500. -------------------------------------------------------------------------------
  1501. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1502. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1503. Otherwise, returns 0.
  1504. -------------------------------------------------------------------------------
  1505. *}
  1506. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1507. Begin
  1508. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1509. End;
  1510. {*
  1511. -------------------------------------------------------------------------------
  1512. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1513. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1514. returns 0.
  1515. -------------------------------------------------------------------------------
  1516. *}
  1517. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1518. Begin
  1519. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1520. End;
  1521. {*
  1522. -------------------------------------------------------------------------------
  1523. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1524. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1525. returns 0.
  1526. -------------------------------------------------------------------------------
  1527. *}
  1528. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1529. Begin
  1530. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1531. End;
  1532. const
  1533. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1534. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1535. (*****************************************************************************)
  1536. (* End Low-Level arithmetic *)
  1537. (*****************************************************************************)
  1538. {*
  1539. -------------------------------------------------------------------------------
  1540. Functions and definitions to determine: (1) whether tininess for underflow
  1541. is detected before or after rounding by default, (2) what (if anything)
  1542. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1543. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1544. are propagated from function inputs to output. These details are ENDIAN
  1545. specific
  1546. -------------------------------------------------------------------------------
  1547. *}
  1548. {$IFDEF ENDIAN_LITTLE}
  1549. {*
  1550. -------------------------------------------------------------------------------
  1551. Internal canonical NaN format.
  1552. -------------------------------------------------------------------------------
  1553. *}
  1554. TYPE
  1555. commonNaNT = packed record
  1556. sign: flag;
  1557. high, low : bits32;
  1558. end;
  1559. {*
  1560. -------------------------------------------------------------------------------
  1561. The pattern for a default generated single-precision NaN.
  1562. -------------------------------------------------------------------------------
  1563. *}
  1564. const float32_default_nan = $FFC00000;
  1565. {*
  1566. -------------------------------------------------------------------------------
  1567. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1568. otherwise returns 0.
  1569. -------------------------------------------------------------------------------
  1570. *}
  1571. Function float32_is_nan( a : float32 ): flag;
  1572. Begin
  1573. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1574. End;
  1575. {*
  1576. -------------------------------------------------------------------------------
  1577. Returns 1 if the single-precision floating-point value `a' is a signaling
  1578. NaN; otherwise returns 0.
  1579. -------------------------------------------------------------------------------
  1580. *}
  1581. Function float32_is_signaling_nan( a : float32 ): flag;
  1582. Begin
  1583. float32_is_signaling_nan := flag
  1584. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1585. End;
  1586. {*
  1587. -------------------------------------------------------------------------------
  1588. Returns the result of converting the single-precision floating-point NaN
  1589. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1590. exception is raised.
  1591. -------------------------------------------------------------------------------
  1592. *}
  1593. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1594. var
  1595. z : commonNaNT ;
  1596. Begin
  1597. if ( float32_is_signaling_nan( a ) <> 0) then
  1598. float_raise( float_flag_invalid );
  1599. z.sign := a shr 31;
  1600. z.low := 0;
  1601. z.high := a shl 9;
  1602. c := z;
  1603. End;
  1604. {*
  1605. -------------------------------------------------------------------------------
  1606. Returns the result of converting the canonical NaN `a' to the single-
  1607. precision floating-point format.
  1608. -------------------------------------------------------------------------------
  1609. *}
  1610. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1611. Begin
  1612. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1613. End;
  1614. {*
  1615. -------------------------------------------------------------------------------
  1616. Takes two single-precision floating-point values `a' and `b', one of which
  1617. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1618. signaling NaN, the invalid exception is raised.
  1619. -------------------------------------------------------------------------------
  1620. *}
  1621. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1622. Var
  1623. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1624. label returnLargerSignificand;
  1625. Begin
  1626. aIsNaN := float32_is_nan( a );
  1627. aIsSignalingNaN := float32_is_signaling_nan( a );
  1628. bIsNaN := float32_is_nan( b );
  1629. bIsSignalingNaN := float32_is_signaling_nan( b );
  1630. a := a or $00400000;
  1631. b := b or $00400000;
  1632. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1633. float_raise( float_flag_invalid );
  1634. if ( aIsSignalingNaN )<> 0 then
  1635. Begin
  1636. if ( bIsSignalingNaN ) <> 0 then
  1637. goto returnLargerSignificand;
  1638. if bIsNan <> 0 then
  1639. propagateFloat32NaN := b
  1640. else
  1641. propagateFloat32NaN := a;
  1642. exit;
  1643. End
  1644. else if ( aIsNaN <> 0) then
  1645. Begin
  1646. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1647. Begin
  1648. propagateFloat32NaN := a;
  1649. exit;
  1650. End;
  1651. returnLargerSignificand:
  1652. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1653. Begin
  1654. propagateFloat32NaN := b;
  1655. exit;
  1656. End;
  1657. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1658. Begin
  1659. propagateFloat32NaN := a;
  1660. End;
  1661. if a < b then
  1662. propagateFloat32NaN := a
  1663. else
  1664. propagateFloat32NaN := b;
  1665. exit;
  1666. End
  1667. else
  1668. Begin
  1669. propagateFloat32NaN := b;
  1670. exit;
  1671. End;
  1672. End;
  1673. {*
  1674. -------------------------------------------------------------------------------
  1675. The pattern for a default generated double-precision NaN. The `high' and
  1676. `low' values hold the most- and least-significant bits, respectively.
  1677. -------------------------------------------------------------------------------
  1678. *}
  1679. const
  1680. float64_default_nan_high = $FFF80000;
  1681. float64_default_nan_low = $00000000;
  1682. {*
  1683. -------------------------------------------------------------------------------
  1684. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1685. otherwise returns 0.
  1686. -------------------------------------------------------------------------------
  1687. *}
  1688. Function float64_is_nan( a : float64 ) : flag;
  1689. Begin
  1690. float64_is_nan :=
  1691. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1692. and ( a.low or ( a.high and $000FFFFF ) );
  1693. End;
  1694. {*
  1695. -------------------------------------------------------------------------------
  1696. Returns 1 if the double-precision floating-point value `a' is a signaling
  1697. NaN; otherwise returns 0.
  1698. -------------------------------------------------------------------------------
  1699. *}
  1700. Function float64_is_signaling_nan( a : float64 ): flag;
  1701. Begin
  1702. float64_is_signaling_nan :=
  1703. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1704. and ( a.low or ( a.high and $0007FFFF ) );
  1705. End;
  1706. {*
  1707. -------------------------------------------------------------------------------
  1708. Returns the result of converting the double-precision floating-point NaN
  1709. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1710. exception is raised.
  1711. -------------------------------------------------------------------------------
  1712. *}
  1713. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1714. Var
  1715. z : commonNaNT;
  1716. Begin
  1717. if ( float64_is_signaling_nan( a )<>0 ) then
  1718. float_raise( float_flag_invalid );
  1719. z.sign := a.high shr 31;
  1720. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1721. c := z;
  1722. End;
  1723. {*
  1724. -------------------------------------------------------------------------------
  1725. Returns the result of converting the canonical NaN `a' to the double-
  1726. precision floating-point format.
  1727. -------------------------------------------------------------------------------
  1728. *}
  1729. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1730. Var
  1731. z: float64;
  1732. Begin
  1733. shift64Right( a.high, a.low, 12, z.high, z.low );
  1734. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1735. c := z;
  1736. End;
  1737. {*
  1738. -------------------------------------------------------------------------------
  1739. Takes two double-precision floating-point values `a' and `b', one of which
  1740. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1741. signaling NaN, the invalid exception is raised.
  1742. -------------------------------------------------------------------------------
  1743. *}
  1744. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1745. Var
  1746. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1747. label returnLargerSignificand;
  1748. Begin
  1749. aIsNaN := float64_is_nan( a );
  1750. aIsSignalingNaN := float64_is_signaling_nan( a );
  1751. bIsNaN := float64_is_nan( b );
  1752. bIsSignalingNaN := float64_is_signaling_nan( b );
  1753. a.high := a.high or $00080000;
  1754. b.high := b.high or $00080000;
  1755. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1756. float_raise( float_flag_invalid );
  1757. if ( aIsSignalingNaN )<>0 then
  1758. Begin
  1759. if ( bIsSignalingNaN )<>0 then
  1760. goto returnLargerSignificand;
  1761. if bIsNan <> 0 then
  1762. c := b
  1763. else
  1764. c := a;
  1765. exit;
  1766. End
  1767. else if ( aIsNaN )<> 0 then
  1768. Begin
  1769. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1770. Begin
  1771. c := a;
  1772. exit;
  1773. End;
  1774. returnLargerSignificand:
  1775. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1776. Begin
  1777. c := b;
  1778. exit;
  1779. End;
  1780. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1781. Begin
  1782. c := a;
  1783. exit;
  1784. End;
  1785. if a.high < b.high then
  1786. c := a
  1787. else
  1788. c := b;
  1789. exit;
  1790. End
  1791. else
  1792. Begin
  1793. c := b;
  1794. exit;
  1795. End;
  1796. End;
  1797. {*----------------------------------------------------------------------------
  1798. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1799. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1800. | returns 0.
  1801. *----------------------------------------------------------------------------*}
  1802. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1803. begin
  1804. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1805. end;
  1806. {*----------------------------------------------------------------------------
  1807. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1808. | otherwise returns 0.
  1809. *----------------------------------------------------------------------------*}
  1810. function float128_is_nan( a : float128): flag;
  1811. begin
  1812. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1813. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1814. end;
  1815. {*----------------------------------------------------------------------------
  1816. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1817. | signaling NaN; otherwise returns 0.
  1818. *----------------------------------------------------------------------------*}
  1819. function float128_is_signaling_nan( a : float128): flag;
  1820. begin
  1821. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1822. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1823. end;
  1824. {*----------------------------------------------------------------------------
  1825. | Returns the result of converting the quadruple-precision floating-point NaN
  1826. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1827. | exception is raised.
  1828. *----------------------------------------------------------------------------*}
  1829. function float128ToCommonNaN( a : float128): commonNaNT;
  1830. var
  1831. z: commonNaNT;
  1832. qhigh,qlow : qword;
  1833. begin
  1834. if ( float128_is_signaling_nan( a )<>0) then
  1835. float_raise( float_flag_invalid );
  1836. z.sign := a.high shr 63;
  1837. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1838. z.high:=qhigh shr 32;
  1839. z.low:=qhigh and $ffffffff;
  1840. result:=z;
  1841. end;
  1842. {*----------------------------------------------------------------------------
  1843. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1844. | precision floating-point format.
  1845. *----------------------------------------------------------------------------*}
  1846. function commonNaNToFloat128( a : commonNaNT): float128;
  1847. var
  1848. z: float128;
  1849. begin
  1850. shift128Right( a.high, a.low, 16, z.high, z.low );
  1851. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1852. result:=z;
  1853. end;
  1854. {*----------------------------------------------------------------------------
  1855. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1856. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1857. | `b' is a signaling NaN, the invalid exception is raised.
  1858. *----------------------------------------------------------------------------*}
  1859. function propagateFloat128NaN( a: float128; b : float128): float128;
  1860. var
  1861. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1862. label
  1863. returnLargerSignificand;
  1864. begin
  1865. aIsNaN := float128_is_nan( a );
  1866. aIsSignalingNaN := float128_is_signaling_nan( a );
  1867. bIsNaN := float128_is_nan( b );
  1868. bIsSignalingNaN := float128_is_signaling_nan( b );
  1869. a.high := a.high or int64( $0000800000000000 );
  1870. b.high := b.high or int64( $0000800000000000 );
  1871. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1872. float_raise( float_flag_invalid );
  1873. if ( aIsSignalingNaN )<>0 then
  1874. begin
  1875. if ( bIsSignalingNaN )<>0 then
  1876. goto returnLargerSignificand;
  1877. if bIsNaN<>0 then
  1878. result := b
  1879. else
  1880. result := a;
  1881. exit;
  1882. end
  1883. else if ( aIsNaN )<>0 then
  1884. begin
  1885. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1886. begin
  1887. result := a;
  1888. exit;
  1889. end;
  1890. returnLargerSignificand:
  1891. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1892. begin
  1893. result := b;
  1894. exit;
  1895. end;
  1896. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1897. begin
  1898. result := a;
  1899. exit
  1900. end;
  1901. if ( a.high < b.high ) then
  1902. result := a
  1903. else
  1904. result := b;
  1905. exit;
  1906. end
  1907. else
  1908. result:=b;
  1909. end;
  1910. {$ELSE}
  1911. { Big endian code }
  1912. (*----------------------------------------------------------------------------
  1913. | Internal canonical NaN format.
  1914. *----------------------------------------------------------------------------*)
  1915. type
  1916. commonNANT = packed record
  1917. sign : flag;
  1918. high, low : bits32;
  1919. end;
  1920. (*----------------------------------------------------------------------------
  1921. | The pattern for a default generated single-precision NaN.
  1922. *----------------------------------------------------------------------------*)
  1923. const float32_default_nan = $7FFFFFFF;
  1924. (*----------------------------------------------------------------------------
  1925. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1926. | otherwise returns 0.
  1927. *----------------------------------------------------------------------------*)
  1928. function float32_is_nan(a: float32): flag;
  1929. begin
  1930. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1931. end;
  1932. (*----------------------------------------------------------------------------
  1933. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1934. | NaN; otherwise returns 0.
  1935. *----------------------------------------------------------------------------*)
  1936. function float32_is_signaling_nan(a: float32):flag;
  1937. begin
  1938. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1939. end;
  1940. (*----------------------------------------------------------------------------
  1941. | Returns the result of converting the single-precision floating-point NaN
  1942. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1943. | exception is raised.
  1944. *----------------------------------------------------------------------------*)
  1945. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1946. var
  1947. z: commonNANT;
  1948. begin
  1949. if float32_is_signaling_nan(a)<>0 then
  1950. float_raise(float_flag_invalid);
  1951. z.sign := a shr 31;
  1952. z.low := 0;
  1953. z.high := a shl 9;
  1954. c:=z;
  1955. end;
  1956. (*----------------------------------------------------------------------------
  1957. | Returns the result of converting the canonical NaN `a' to the single-
  1958. | precision floating-point format.
  1959. *----------------------------------------------------------------------------*)
  1960. function CommonNanToFloat32(a : CommonNaNT): float32;
  1961. begin
  1962. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1963. end;
  1964. (*----------------------------------------------------------------------------
  1965. | Takes two single-precision floating-point values `a' and `b', one of which
  1966. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1967. | signaling NaN, the invalid exception is raised.
  1968. *----------------------------------------------------------------------------*)
  1969. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1970. var
  1971. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1972. begin
  1973. aIsNaN := float32_is_nan( a );
  1974. aIsSignalingNaN := float32_is_signaling_nan( a );
  1975. bIsNaN := float32_is_nan( b );
  1976. bIsSignalingNaN := float32_is_signaling_nan( b );
  1977. a := a or $00400000;
  1978. b := b or $00400000;
  1979. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1980. float_raise( float_flag_invalid );
  1981. if bIsSignalingNaN<>0 then
  1982. propagateFloat32Nan := b
  1983. else if aIsSignalingNan<>0 then
  1984. propagateFloat32Nan := a
  1985. else if bIsNan<>0 then
  1986. propagateFloat32Nan := b
  1987. else
  1988. propagateFloat32Nan := a;
  1989. end;
  1990. (*----------------------------------------------------------------------------
  1991. | The pattern for a default generated double-precision NaN. The `high' and
  1992. | `low' values hold the most- and least-significant bits, respectively.
  1993. *----------------------------------------------------------------------------*)
  1994. const
  1995. float64_default_nan_high = $7FFFFFFF;
  1996. float64_default_nan_low = $FFFFFFFF;
  1997. (*----------------------------------------------------------------------------
  1998. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  1999. | otherwise returns 0.
  2000. *----------------------------------------------------------------------------*)
  2001. function float64_is_nan(a: float64): flag;
  2002. begin
  2003. float64_is_nan := flag (
  2004. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2005. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2006. end;
  2007. (*----------------------------------------------------------------------------
  2008. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2009. | NaN; otherwise returns 0.
  2010. *----------------------------------------------------------------------------*)
  2011. function float64_is_signaling_nan( a:float64): flag;
  2012. begin
  2013. float64_is_signaling_nan := flag(
  2014. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2015. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2016. end;
  2017. (*----------------------------------------------------------------------------
  2018. | Returns the result of converting the double-precision floating-point NaN
  2019. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2020. | exception is raised.
  2021. *----------------------------------------------------------------------------*)
  2022. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2023. var
  2024. z : commonNaNT;
  2025. begin
  2026. if ( float64_is_signaling_nan( a )<>0 ) then
  2027. float_raise( float_flag_invalid );
  2028. z.sign := a.high shr 31;
  2029. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2030. c:=z;
  2031. end;
  2032. (*----------------------------------------------------------------------------
  2033. | Returns the result of converting the canonical NaN `a' to the double-
  2034. | precision floating-point format.
  2035. *----------------------------------------------------------------------------*)
  2036. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2037. var
  2038. z: float64;
  2039. begin
  2040. shift64Right( a.high, a.low, 12, z.high, z.low );
  2041. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2042. c:=z;
  2043. end;
  2044. (*----------------------------------------------------------------------------
  2045. | Takes two double-precision floating-point values `a' and `b', one of which
  2046. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2047. | signaling NaN, the invalid exception is raised.
  2048. *----------------------------------------------------------------------------*)
  2049. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2050. var
  2051. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2052. begin
  2053. aIsNaN := float64_is_nan( a );
  2054. aIsSignalingNaN := float64_is_signaling_nan( a );
  2055. bIsNaN := float64_is_nan( b );
  2056. bIsSignalingNaN := float64_is_signaling_nan( b );
  2057. a.high := a.high or $00080000;
  2058. b.high := b.high or $00080000;
  2059. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2060. float_raise( float_flag_invalid );
  2061. if bIsSignalingNaN<>0 then
  2062. c := b
  2063. else if aIsSignalingNan<>0 then
  2064. c := a
  2065. else if bIsNan<>0 then
  2066. c := b
  2067. else
  2068. c := a;
  2069. end;
  2070. {$ENDIF}
  2071. (****************************************************************************)
  2072. (* END ENDIAN SPECIFIC CODE *)
  2073. (****************************************************************************)
  2074. {*
  2075. -------------------------------------------------------------------------------
  2076. Returns the fraction bits of the single-precision floating-point value `a'.
  2077. -------------------------------------------------------------------------------
  2078. *}
  2079. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2080. Begin
  2081. ExtractFloat32Frac := A AND $007FFFFF;
  2082. End;
  2083. {*
  2084. -------------------------------------------------------------------------------
  2085. Returns the exponent bits of the single-precision floating-point value `a'.
  2086. -------------------------------------------------------------------------------
  2087. *}
  2088. Function extractFloat32Exp( a: float32 ): Int16;
  2089. Begin
  2090. extractFloat32Exp := (a shr 23) AND $FF;
  2091. End;
  2092. {*
  2093. -------------------------------------------------------------------------------
  2094. Returns the sign bit of the single-precision floating-point value `a'.
  2095. -------------------------------------------------------------------------------
  2096. *}
  2097. Function extractFloat32Sign( a: float32 ): Flag;
  2098. Begin
  2099. extractFloat32Sign := a shr 31;
  2100. End;
  2101. {*
  2102. -------------------------------------------------------------------------------
  2103. Normalizes the subnormal single-precision floating-point value represented
  2104. by the denormalized significand `aSig'. The normalized exponent and
  2105. significand are stored at the locations pointed to by `zExpPtr' and
  2106. `zSigPtr', respectively.
  2107. -------------------------------------------------------------------------------
  2108. *}
  2109. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2110. Var
  2111. ShiftCount : BYTE;
  2112. Begin
  2113. shiftCount := countLeadingZeros32( aSig ) - 8;
  2114. zSigPtr := aSig shl shiftCount;
  2115. zExpPtr := 1 - shiftCount;
  2116. End;
  2117. {*
  2118. -------------------------------------------------------------------------------
  2119. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2120. single-precision floating-point value, returning the result. After being
  2121. shifted into the proper positions, the three fields are simply added
  2122. together to form the result. This means that any integer portion of `zSig'
  2123. will be added into the exponent. Since a properly normalized significand
  2124. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2125. than the desired result exponent whenever `zSig' is a complete, normalized
  2126. significand.
  2127. -------------------------------------------------------------------------------
  2128. *}
  2129. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2130. Begin
  2131. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2132. + zSig;
  2133. End;
  2134. {*
  2135. -------------------------------------------------------------------------------
  2136. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2137. and significand `zSig', and returns the proper single-precision floating-
  2138. point value corresponding to the abstract input. Ordinarily, the abstract
  2139. value is simply rounded and packed into the single-precision format, with
  2140. the inexact exception raised if the abstract input cannot be represented
  2141. exactly. However, if the abstract value is too large, the overflow and
  2142. inexact exceptions are raised and an infinity or maximal finite value is
  2143. returned. If the abstract value is too small, the input value is rounded to
  2144. a subnormal number, and the underflow and inexact exceptions are raised if
  2145. the abstract input cannot be represented exactly as a subnormal single-
  2146. precision floating-point number.
  2147. The input significand `zSig' has its binary point between bits 30
  2148. and 29, which is 7 bits to the left of the usual location. This shifted
  2149. significand must be normalized or smaller. If `zSig' is not normalized,
  2150. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2151. and it must not require rounding. In the usual case that `zSig' is
  2152. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2153. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2154. Binary Floating-Point Arithmetic.
  2155. -------------------------------------------------------------------------------
  2156. *}
  2157. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2158. Var
  2159. roundingMode : BYTE;
  2160. roundNearestEven : Flag;
  2161. roundIncrement, roundBits : BYTE;
  2162. IsTiny : Flag;
  2163. Begin
  2164. roundingMode := float_rounding_mode;
  2165. if (roundingMode = float_round_nearest_even) then
  2166. Begin
  2167. roundNearestEven := Flag(TRUE);
  2168. end
  2169. else
  2170. roundNearestEven := Flag(FALSE);
  2171. roundIncrement := $40;
  2172. if ( Boolean(roundNearestEven) = FALSE) then
  2173. Begin
  2174. if ( roundingMode = float_round_to_zero ) Then
  2175. Begin
  2176. roundIncrement := 0;
  2177. End
  2178. else
  2179. Begin
  2180. roundIncrement := $7F;
  2181. if ( zSign <> 0 ) then
  2182. Begin
  2183. if roundingMode = float_round_up then roundIncrement := 0;
  2184. End
  2185. else
  2186. Begin
  2187. if roundingMode = float_round_down then roundIncrement := 0;
  2188. End;
  2189. End
  2190. End;
  2191. roundBits := zSig AND $7F;
  2192. if ($FD <= bits16 (zExp) ) then
  2193. Begin
  2194. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2195. Begin
  2196. float_raise( float_flag_overflow OR float_flag_inexact );
  2197. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2198. exit;
  2199. End;
  2200. if ( zExp < 0 ) then
  2201. Begin
  2202. isTiny :=
  2203. flag(( float_detect_tininess = float_tininess_before_rounding )
  2204. OR ( zExp < -1 )
  2205. OR ( (zSig + roundIncrement) < $80000000 ));
  2206. shift32RightJamming( zSig, - zExp, zSig );
  2207. zExp := 0;
  2208. roundBits := zSig AND $7F;
  2209. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2210. float_raise( float_flag_underflow );
  2211. End;
  2212. End;
  2213. if ( roundBits )<> 0 then
  2214. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2215. zSig := ( zSig + roundIncrement ) shr 7;
  2216. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2217. if ( zSig = 0 ) then zExp := 0;
  2218. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2219. exit;
  2220. End;
  2221. {*
  2222. -------------------------------------------------------------------------------
  2223. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2224. and significand `zSig', and returns the proper single-precision floating-
  2225. point value corresponding to the abstract input. This routine is just like
  2226. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2227. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2228. floating-point exponent.
  2229. -------------------------------------------------------------------------------
  2230. *}
  2231. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2232. Var
  2233. ShiftCount : int8;
  2234. Begin
  2235. shiftCount := countLeadingZeros32( zSig ) - 1;
  2236. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2237. End;
  2238. {*
  2239. -------------------------------------------------------------------------------
  2240. Returns the most-significant 20 fraction bits of the double-precision
  2241. floating-point value `a'.
  2242. -------------------------------------------------------------------------------
  2243. *}
  2244. Function extractFloat64Frac0(a: float64): bits32;
  2245. Begin
  2246. extractFloat64Frac0 := a.high and $000FFFFF;
  2247. End;
  2248. {*
  2249. -------------------------------------------------------------------------------
  2250. Returns the least-significant 32 fraction bits of the double-precision
  2251. floating-point value `a'.
  2252. -------------------------------------------------------------------------------
  2253. *}
  2254. Function extractFloat64Frac1(a: float64): bits32;
  2255. Begin
  2256. extractFloat64Frac1 := a.low;
  2257. End;
  2258. {*
  2259. -------------------------------------------------------------------------------
  2260. Returns the exponent bits of the double-precision floating-point value `a'.
  2261. -------------------------------------------------------------------------------
  2262. *}
  2263. Function extractFloat64Exp(a: float64): int16;
  2264. Begin
  2265. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2266. End;
  2267. {*
  2268. -------------------------------------------------------------------------------
  2269. Returns the sign bit of the double-precision floating-point value `a'.
  2270. -------------------------------------------------------------------------------
  2271. *}
  2272. Function extractFloat64Sign(a: float64) : flag;
  2273. Begin
  2274. extractFloat64Sign := a.high shr 31;
  2275. End;
  2276. {*
  2277. -------------------------------------------------------------------------------
  2278. Normalizes the subnormal double-precision floating-point value represented
  2279. by the denormalized significand formed by the concatenation of `aSig0' and
  2280. `aSig1'. The normalized exponent is stored at the location pointed to by
  2281. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2282. stored at the location pointed to by `zSig0Ptr', and the least significant
  2283. 32 bits of the normalized significand are stored at the location pointed to
  2284. by `zSig1Ptr'.
  2285. -------------------------------------------------------------------------------
  2286. *}
  2287. Procedure normalizeFloat64Subnormal(
  2288. aSig0: bits32;
  2289. aSig1: bits32;
  2290. VAR zExpPtr : Int16;
  2291. VAR zSig0Ptr : Bits32;
  2292. VAR zSig1Ptr : Bits32
  2293. );
  2294. Var
  2295. ShiftCount : Int8;
  2296. Begin
  2297. if ( aSig0 = 0 ) then
  2298. Begin
  2299. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2300. if ( shiftCount < 0 ) then
  2301. Begin
  2302. zSig0Ptr := aSig1 shr ( - shiftCount );
  2303. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2304. End
  2305. else
  2306. Begin
  2307. zSig0Ptr := aSig1 shl shiftCount;
  2308. zSig1Ptr := 0;
  2309. End;
  2310. zExpPtr := - shiftCount - 31;
  2311. End
  2312. else
  2313. Begin
  2314. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2315. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2316. zExpPtr := 1 - shiftCount;
  2317. End;
  2318. End;
  2319. {*
  2320. -------------------------------------------------------------------------------
  2321. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2322. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2323. point value, returning the result. After being shifted into the proper
  2324. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2325. together to form the most significant 32 bits of the result. This means
  2326. that any integer portion of `zSig0' will be added into the exponent. Since
  2327. a properly normalized significand will have an integer portion equal to 1,
  2328. the `zExp' input should be 1 less than the desired result exponent whenever
  2329. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2330. -------------------------------------------------------------------------------
  2331. *}
  2332. Procedure
  2333. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2334. var
  2335. z: Float64;
  2336. Begin
  2337. z.low := zSig1;
  2338. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2339. c := z;
  2340. End;
  2341. {*----------------------------------------------------------------------------
  2342. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2343. | double-precision floating-point value, returning the result. After being
  2344. | shifted into the proper positions, the three fields are simply added
  2345. | together to form the result. This means that any integer portion of `zSig'
  2346. | will be added into the exponent. Since a properly normalized significand
  2347. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2348. | than the desired result exponent whenever `zSig' is a complete, normalized
  2349. | significand.
  2350. *----------------------------------------------------------------------------*}
  2351. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2352. begin
  2353. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2354. end;
  2355. {*
  2356. -------------------------------------------------------------------------------
  2357. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2358. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2359. and `zSig2', and returns the proper double-precision floating-point value
  2360. corresponding to the abstract input. Ordinarily, the abstract value is
  2361. simply rounded and packed into the double-precision format, with the inexact
  2362. exception raised if the abstract input cannot be represented exactly.
  2363. However, if the abstract value is too large, the overflow and inexact
  2364. exceptions are raised and an infinity or maximal finite value is returned.
  2365. If the abstract value is too small, the input value is rounded to a
  2366. subnormal number, and the underflow and inexact exceptions are raised if the
  2367. abstract input cannot be represented exactly as a subnormal double-precision
  2368. floating-point number.
  2369. The input significand must be normalized or smaller. If the input
  2370. significand is not normalized, `zExp' must be 0; in that case, the result
  2371. returned is a subnormal number, and it must not require rounding. In the
  2372. usual case that the input significand is normalized, `zExp' must be 1 less
  2373. than the ``true'' floating-point exponent. The handling of underflow and
  2374. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2375. -------------------------------------------------------------------------------
  2376. *}
  2377. Procedure
  2378. roundAndPackFloat64(
  2379. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2380. Var
  2381. roundingMode : Int8;
  2382. roundNearestEven, increment, isTiny : Flag;
  2383. Begin
  2384. roundingMode := float_rounding_mode;
  2385. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2386. increment := flag( sbits32 (zSig2) < 0 );
  2387. if ( roundNearestEven = flag(FALSE) ) then
  2388. Begin
  2389. if ( roundingMode = float_round_to_zero ) then
  2390. increment := 0
  2391. else
  2392. Begin
  2393. if ( zSign )<> 0 then
  2394. Begin
  2395. increment := flag( roundingMode = float_round_down ) and zSig2;
  2396. End
  2397. else
  2398. Begin
  2399. increment := flag( roundingMode = float_round_up ) and zSig2;
  2400. End
  2401. End
  2402. End;
  2403. if ( $7FD <= bits16 (zExp) ) then
  2404. Begin
  2405. if (( $7FD < zExp )
  2406. or (( zExp = $7FD )
  2407. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2408. and (increment<>0)
  2409. )
  2410. ) then
  2411. Begin
  2412. float_raise( float_flag_overflow OR float_flag_inexact );
  2413. if (( roundingMode = float_round_to_zero )
  2414. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2415. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2416. ) then
  2417. Begin
  2418. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2419. exit;
  2420. End;
  2421. packFloat64( zSign, $7FF, 0, 0, c );
  2422. exit;
  2423. End;
  2424. if ( zExp < 0 ) then
  2425. Begin
  2426. isTiny :=
  2427. flag( float_detect_tininess = float_tininess_before_rounding )
  2428. or flag( zExp < -1 )
  2429. or flag(increment = 0)
  2430. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2431. shift64ExtraRightJamming(
  2432. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2433. zExp := 0;
  2434. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2435. if ( roundNearestEven )<>0 then
  2436. Begin
  2437. increment := flag( sbits32 (zSig2) < 0 );
  2438. End
  2439. else
  2440. Begin
  2441. if ( zSign )<>0 then
  2442. Begin
  2443. increment := flag( roundingMode = float_round_down ) and zSig2;
  2444. End
  2445. else
  2446. Begin
  2447. increment := flag( roundingMode = float_round_up ) and zSig2;
  2448. End
  2449. End;
  2450. End;
  2451. End;
  2452. if ( zSig2 )<>0 then
  2453. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2454. if ( increment )<>0 then
  2455. Begin
  2456. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2457. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2458. End
  2459. else
  2460. Begin
  2461. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2462. End;
  2463. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2464. End;
  2465. {*----------------------------------------------------------------------------
  2466. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2467. | and significand `zSig', and returns the proper double-precision floating-
  2468. | point value corresponding to the abstract input. Ordinarily, the abstract
  2469. | value is simply rounded and packed into the double-precision format, with
  2470. | the inexact exception raised if the abstract input cannot be represented
  2471. | exactly. However, if the abstract value is too large, the overflow and
  2472. | inexact exceptions are raised and an infinity or maximal finite value is
  2473. | returned. If the abstract value is too small, the input value is rounded
  2474. | to a subnormal number, and the underflow and inexact exceptions are raised
  2475. | if the abstract input cannot be represented exactly as a subnormal double-
  2476. | precision floating-point number.
  2477. | The input significand `zSig' has its binary point between bits 62
  2478. | and 61, which is 10 bits to the left of the usual location. This shifted
  2479. | significand must be normalized or smaller. If `zSig' is not normalized,
  2480. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2481. | and it must not require rounding. In the usual case that `zSig' is
  2482. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2483. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2484. | Binary Floating-Point Arithmetic.
  2485. *----------------------------------------------------------------------------*}
  2486. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2487. var
  2488. roundingMode: int8;
  2489. roundNearestEven: flag;
  2490. roundIncrement, roundBits: int16;
  2491. isTiny: flag;
  2492. begin
  2493. roundingMode := float_rounding_mode;
  2494. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2495. roundIncrement := $200;
  2496. if ( roundNearestEven=0 ) then
  2497. begin
  2498. if ( roundingMode = float_round_to_zero ) then
  2499. begin
  2500. roundIncrement := 0;
  2501. end
  2502. else begin
  2503. roundIncrement := $3FF;
  2504. if ( zSign<>0 ) then
  2505. begin
  2506. if ( roundingMode = float_round_up ) then
  2507. roundIncrement := 0;
  2508. end
  2509. else begin
  2510. if ( roundingMode = float_round_down ) then
  2511. roundIncrement := 0;
  2512. end
  2513. end
  2514. end;
  2515. roundBits := zSig and $3FF;
  2516. if ( $7FD <= bits16(zExp) ) then
  2517. begin
  2518. if ( ( $7FD < zExp )
  2519. or ( ( zExp = $7FD )
  2520. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2521. ) then
  2522. begin
  2523. float_raise( float_flag_overflow or float_flag_inexact );
  2524. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2525. exit;
  2526. end;
  2527. if ( zExp < 0 ) then
  2528. begin
  2529. isTiny := ord(
  2530. ( float_detect_tininess = float_tininess_before_rounding )
  2531. or ( zExp < -1 )
  2532. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2533. shift64RightJamming( zSig, - zExp, zSig );
  2534. zExp := 0;
  2535. roundBits := zSig and $3FF;
  2536. if ( isTiny and roundBits )<>0 then
  2537. float_raise( float_flag_underflow );
  2538. end
  2539. end;
  2540. if ( roundBits<>0 ) then
  2541. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2542. zSig := ( zSig + roundIncrement ) shr 10;
  2543. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2544. if ( zSig = 0 ) then
  2545. zExp := 0;
  2546. result:=packFloat64( zSign, zExp, zSig );
  2547. end;
  2548. {*
  2549. -------------------------------------------------------------------------------
  2550. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2551. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2552. returns the proper double-precision floating-point value corresponding
  2553. to the abstract input. This routine is just like `roundAndPackFloat64'
  2554. except that the input significand has fewer bits and does not have to be
  2555. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2556. point exponent.
  2557. -------------------------------------------------------------------------------
  2558. *}
  2559. Procedure
  2560. normalizeRoundAndPackFloat64(
  2561. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2562. Var
  2563. shiftCount : int8;
  2564. zSig2 : bits32;
  2565. Begin
  2566. if ( zSig0 = 0 ) then
  2567. Begin
  2568. zSig0 := zSig1;
  2569. zSig1 := 0;
  2570. zExp := zExp -32;
  2571. End;
  2572. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2573. if ( 0 <= shiftCount ) then
  2574. Begin
  2575. zSig2 := 0;
  2576. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2577. End
  2578. else
  2579. Begin
  2580. shift64ExtraRightJamming
  2581. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2582. End;
  2583. zExp := zExp - shiftCount;
  2584. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2585. End;
  2586. {*
  2587. -------------------------------------------------------------------------------
  2588. Returns the result of converting the 32-bit two's complement integer `a' to
  2589. the single-precision floating-point format. The conversion is performed
  2590. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2591. -------------------------------------------------------------------------------
  2592. *}
  2593. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2594. Var
  2595. zSign : Flag;
  2596. Begin
  2597. if ( a = 0 ) then
  2598. Begin
  2599. int32_to_float32.float32 := 0;
  2600. exit;
  2601. End;
  2602. if ( a = sbits32 ($80000000) ) then
  2603. Begin
  2604. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2605. exit;
  2606. end;
  2607. zSign := flag( a < 0 );
  2608. If zSign<>0 then
  2609. a := -a;
  2610. int32_to_float32.float32:=
  2611. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2612. End;
  2613. {*
  2614. -------------------------------------------------------------------------------
  2615. Returns the result of converting the 32-bit two's complement integer `a' to
  2616. the double-precision floating-point format. The conversion is performed
  2617. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2618. -------------------------------------------------------------------------------
  2619. *}
  2620. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2621. var
  2622. zSign : flag;
  2623. absA : bits32;
  2624. shiftCount : int8;
  2625. zSig0, zSig1 : bits32;
  2626. Begin
  2627. if ( a = 0 ) then
  2628. Begin
  2629. packFloat64( 0, 0, 0, 0, result );
  2630. exit;
  2631. end;
  2632. zSign := flag( a < 0 );
  2633. if ZSign<>0 then
  2634. AbsA := -a
  2635. else
  2636. AbsA := a;
  2637. shiftCount := countLeadingZeros32( absA ) - 11;
  2638. if ( 0 <= shiftCount ) then
  2639. Begin
  2640. zSig0 := absA shl shiftCount;
  2641. zSig1 := 0;
  2642. End
  2643. else
  2644. Begin
  2645. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2646. End;
  2647. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2648. End;
  2649. {*
  2650. -------------------------------------------------------------------------------
  2651. Returns the result of converting the single-precision floating-point value
  2652. `a' to the 32-bit two's complement integer format. The conversion is
  2653. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2654. Arithmetic---which means in particular that the conversion is rounded
  2655. according to the current rounding mode. If `a' is a NaN, the largest
  2656. positive integer is returned. Otherwise, if the conversion overflows, the
  2657. largest integer with the same sign as `a' is returned.
  2658. -------------------------------------------------------------------------------
  2659. *}
  2660. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2661. Var
  2662. aSign: flag;
  2663. aExp, shiftCount: int16;
  2664. aSig, aSigExtra: bits32;
  2665. z: int32;
  2666. roundingMode: int8;
  2667. Begin
  2668. aSig := extractFloat32Frac( a.float32 );
  2669. aExp := extractFloat32Exp( a.float32 );
  2670. aSign := extractFloat32Sign( a.float32 );
  2671. shiftCount := aExp - $96;
  2672. if ( 0 <= shiftCount ) then
  2673. Begin
  2674. if ( $9E <= aExp ) then
  2675. Begin
  2676. if ( a.float32 <> $CF000000 ) then
  2677. Begin
  2678. float_raise( float_flag_invalid );
  2679. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2680. Begin
  2681. float32_to_int32 := $7FFFFFFF;
  2682. exit;
  2683. End;
  2684. End;
  2685. float32_to_int32 := sbits32 ($80000000);
  2686. exit;
  2687. End;
  2688. z := ( aSig or $00800000 ) shl shiftCount;
  2689. if ( aSign<>0 ) then z := - z;
  2690. End
  2691. else
  2692. Begin
  2693. if ( aExp < $7E ) then
  2694. Begin
  2695. aSigExtra := aExp OR aSig;
  2696. z := 0;
  2697. End
  2698. else
  2699. Begin
  2700. aSig := aSig OR $00800000;
  2701. aSigExtra := aSig shl ( shiftCount and 31 );
  2702. z := aSig shr ( - shiftCount );
  2703. End;
  2704. if ( aSigExtra<>0 ) then
  2705. softfloat_exception_flags := softfloat_exception_flags
  2706. or float_flag_inexact;
  2707. roundingMode := float_rounding_mode;
  2708. if ( roundingMode = float_round_nearest_even ) then
  2709. Begin
  2710. if ( sbits32 (aSigExtra) < 0 ) then
  2711. Begin
  2712. Inc(z);
  2713. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2714. z := z and not 1;
  2715. End;
  2716. if ( aSign<>0 ) then
  2717. z := - z;
  2718. End
  2719. else
  2720. Begin
  2721. aSigExtra := flag( aSigExtra <> 0 );
  2722. if ( aSign<>0 ) then
  2723. Begin
  2724. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2725. z := - z;
  2726. End
  2727. else
  2728. Begin
  2729. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2730. End
  2731. End;
  2732. End;
  2733. float32_to_int32 := z;
  2734. End;
  2735. {*
  2736. -------------------------------------------------------------------------------
  2737. Returns the result of converting the single-precision floating-point value
  2738. `a' to the 32-bit two's complement integer format. The conversion is
  2739. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2740. Arithmetic, except that the conversion is always rounded toward zero.
  2741. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2742. the conversion overflows, the largest integer with the same sign as `a' is
  2743. returned.
  2744. -------------------------------------------------------------------------------
  2745. *}
  2746. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2747. Var
  2748. aSign : flag;
  2749. aExp, shiftCount : int16;
  2750. aSig : bits32;
  2751. z : int32;
  2752. Begin
  2753. aSig := extractFloat32Frac( a.float32 );
  2754. aExp := extractFloat32Exp( a.float32 );
  2755. aSign := extractFloat32Sign( a.float32 );
  2756. shiftCount := aExp - $9E;
  2757. if ( 0 <= shiftCount ) then
  2758. Begin
  2759. if ( a.float32 <> $CF000000 ) then
  2760. Begin
  2761. float_raise( float_flag_invalid );
  2762. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2763. Begin
  2764. float32_to_int32_round_to_zero := $7FFFFFFF;
  2765. exit;
  2766. end;
  2767. End;
  2768. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2769. exit;
  2770. End
  2771. else
  2772. if ( aExp <= $7E ) then
  2773. Begin
  2774. if ( aExp or aSig )<>0 then
  2775. softfloat_exception_flags :=
  2776. softfloat_exception_flags or float_flag_inexact;
  2777. float32_to_int32_round_to_zero := 0;
  2778. exit;
  2779. End;
  2780. aSig := ( aSig or $00800000 ) shl 8;
  2781. z := aSig shr ( - shiftCount );
  2782. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2783. Begin
  2784. softfloat_exception_flags :=
  2785. softfloat_exception_flags or float_flag_inexact;
  2786. End;
  2787. if ( aSign<>0 ) then z := - z;
  2788. float32_to_int32_round_to_zero := z;
  2789. End;
  2790. {*
  2791. -------------------------------------------------------------------------------
  2792. Returns the result of converting the single-precision floating-point value
  2793. `a' to the double-precision floating-point format. The conversion is
  2794. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2795. Arithmetic.
  2796. -------------------------------------------------------------------------------
  2797. *}
  2798. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2799. Var
  2800. aSign : flag;
  2801. aExp : int16;
  2802. aSig, zSig0, zSig1: bits32;
  2803. tmp : CommonNanT;
  2804. Begin
  2805. aSig := extractFloat32Frac( a.float32 );
  2806. aExp := extractFloat32Exp( a.float32 );
  2807. aSign := extractFloat32Sign( a.float32 );
  2808. if ( aExp = $FF ) then
  2809. Begin
  2810. if ( aSig<>0 ) then
  2811. Begin
  2812. float32ToCommonNaN(a.float32, tmp);
  2813. commonNaNToFloat64(tmp , result);
  2814. exit;
  2815. End;
  2816. packFloat64( aSign, $7FF, 0, 0, result);
  2817. exit;
  2818. End;
  2819. if ( aExp = 0 ) then
  2820. Begin
  2821. if ( aSig = 0 ) then
  2822. Begin
  2823. packFloat64( aSign, 0, 0, 0, result );
  2824. exit;
  2825. end;
  2826. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2827. Dec(aExp);
  2828. End;
  2829. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2830. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2831. End;
  2832. {*
  2833. -------------------------------------------------------------------------------
  2834. Rounds the single-precision floating-point value `a' to an integer,
  2835. and returns the result as a single-precision floating-point value. The
  2836. operation is performed according to the IEC/IEEE Standard for Binary
  2837. Floating-Point Arithmetic.
  2838. -------------------------------------------------------------------------------
  2839. *}
  2840. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2841. Var
  2842. aSign: flag;
  2843. aExp: int16;
  2844. lastBitMask, roundBitsMask: bits32;
  2845. roundingMode: int8;
  2846. z: float32;
  2847. Begin
  2848. aExp := extractFloat32Exp( a.float32 );
  2849. if ( $96 <= aExp ) then
  2850. Begin
  2851. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2852. Begin
  2853. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2854. exit;
  2855. End;
  2856. float32_round_to_int:=a;
  2857. exit;
  2858. End;
  2859. if ( aExp <= $7E ) then
  2860. Begin
  2861. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2862. Begin
  2863. float32_round_to_int:=a;
  2864. exit;
  2865. end;
  2866. softfloat_exception_flags
  2867. := softfloat_exception_flags OR float_flag_inexact;
  2868. aSign := extractFloat32Sign( a.float32 );
  2869. case ( float_rounding_mode ) of
  2870. float_round_nearest_even:
  2871. Begin
  2872. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2873. Begin
  2874. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2875. exit;
  2876. End;
  2877. End;
  2878. float_round_down:
  2879. Begin
  2880. if aSign <> 0 then
  2881. float32_round_to_int.float32 := $BF800000
  2882. else
  2883. float32_round_to_int.float32 := 0;
  2884. exit;
  2885. End;
  2886. float_round_up:
  2887. Begin
  2888. if aSign <> 0 then
  2889. float32_round_to_int.float32 := $80000000
  2890. else
  2891. float32_round_to_int.float32 := $3F800000;
  2892. exit;
  2893. End;
  2894. end;
  2895. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2896. End;
  2897. lastBitMask := 1;
  2898. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2899. lastBitMask := lastBitMask shl ($96 - aExp);
  2900. roundBitsMask := lastBitMask - 1;
  2901. z := a.float32;
  2902. roundingMode := float_rounding_mode;
  2903. if ( roundingMode = float_round_nearest_even ) then
  2904. Begin
  2905. z := z + (lastBitMask shr 1);
  2906. if ( ( z and roundBitsMask ) = 0 ) then
  2907. z := z and not lastBitMask;
  2908. End
  2909. else if ( roundingMode <> float_round_to_zero ) then
  2910. Begin
  2911. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2912. Begin
  2913. z := z + roundBitsMask;
  2914. End;
  2915. End;
  2916. z := z and not roundBitsMask;
  2917. if ( z <> a.float32 ) then
  2918. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2919. float32_round_to_int.float32 := z;
  2920. End;
  2921. {*
  2922. -------------------------------------------------------------------------------
  2923. Returns the result of adding the absolute values of the single-precision
  2924. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2925. before being returned. `zSign' is ignored if the result is a NaN.
  2926. The addition is performed according to the IEC/IEEE Standard for Binary
  2927. Floating-Point Arithmetic.
  2928. -------------------------------------------------------------------------------
  2929. *}
  2930. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2931. Var
  2932. aExp, bExp, zExp: int16;
  2933. aSig, bSig, zSig: bits32;
  2934. expDiff: int16;
  2935. label roundAndPack;
  2936. Begin
  2937. aSig:=extractFloat32Frac( a );
  2938. aExp:=extractFloat32Exp( a );
  2939. bSig:=extractFloat32Frac( b );
  2940. bExp := extractFloat32Exp( b );
  2941. expDiff := aExp - bExp;
  2942. aSig := aSig shl 6;
  2943. bSig := bSig shl 6;
  2944. if ( 0 < expDiff ) then
  2945. Begin
  2946. if ( aExp = $FF ) then
  2947. Begin
  2948. if ( aSig <> 0) then
  2949. Begin
  2950. addFloat32Sigs := propagateFloat32NaN( a, b );
  2951. exit;
  2952. End;
  2953. addFloat32Sigs := a;
  2954. exit;
  2955. End;
  2956. if ( bExp = 0 ) then
  2957. Begin
  2958. Dec(expDiff);
  2959. End
  2960. else
  2961. Begin
  2962. bSig := bSig or $20000000;
  2963. End;
  2964. shift32RightJamming( bSig, expDiff, bSig );
  2965. zExp := aExp;
  2966. End
  2967. else
  2968. If ( expDiff < 0 ) then
  2969. Begin
  2970. if ( bExp = $FF ) then
  2971. Begin
  2972. if ( bSig<>0 ) then
  2973. Begin
  2974. addFloat32Sigs := propagateFloat32NaN( a, b );
  2975. exit;
  2976. end;
  2977. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2978. exit;
  2979. End;
  2980. if ( aExp = 0 ) then
  2981. Begin
  2982. Inc(expDiff);
  2983. End
  2984. else
  2985. Begin
  2986. aSig := aSig OR $20000000;
  2987. End;
  2988. shift32RightJamming( aSig, - expDiff, aSig );
  2989. zExp := bExp;
  2990. End
  2991. else
  2992. Begin
  2993. if ( aExp = $FF ) then
  2994. Begin
  2995. if ( aSig OR bSig )<> 0 then
  2996. Begin
  2997. addFloat32Sigs := propagateFloat32NaN( a, b );
  2998. exit;
  2999. end;
  3000. addFloat32Sigs := a;
  3001. exit;
  3002. End;
  3003. if ( aExp = 0 ) then
  3004. Begin
  3005. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3006. exit;
  3007. end;
  3008. zSig := $40000000 + aSig + bSig;
  3009. zExp := aExp;
  3010. goto roundAndPack;
  3011. End;
  3012. aSig := aSig OR $20000000;
  3013. zSig := ( aSig + bSig ) shl 1;
  3014. Dec(zExp);
  3015. if ( sbits32 (zSig) < 0 ) then
  3016. Begin
  3017. zSig := aSig + bSig;
  3018. Inc(zExp);
  3019. End;
  3020. roundAndPack:
  3021. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3022. End;
  3023. {*
  3024. -------------------------------------------------------------------------------
  3025. Returns the result of subtracting the absolute values of the single-
  3026. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3027. difference is negated before being returned. `zSign' is ignored if the
  3028. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3029. Standard for Binary Floating-Point Arithmetic.
  3030. -------------------------------------------------------------------------------
  3031. *}
  3032. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3033. Var
  3034. aExp, bExp, zExp: int16;
  3035. aSig, bSig, zSig: bits32;
  3036. expDiff : int16;
  3037. label aExpBigger;
  3038. label bExpBigger;
  3039. label aBigger;
  3040. label bBigger;
  3041. label normalizeRoundAndPack;
  3042. Begin
  3043. aSig := extractFloat32Frac( a );
  3044. aExp := extractFloat32Exp( a );
  3045. bSig := extractFloat32Frac( b );
  3046. bExp := extractFloat32Exp( b );
  3047. expDiff := aExp - bExp;
  3048. aSig := aSig shl 7;
  3049. bSig := bSig shl 7;
  3050. if ( 0 < expDiff ) then goto aExpBigger;
  3051. if ( expDiff < 0 ) then goto bExpBigger;
  3052. if ( aExp = $FF ) then
  3053. Begin
  3054. if ( aSig OR bSig )<> 0 then
  3055. Begin
  3056. subFloat32Sigs := propagateFloat32NaN( a, b );
  3057. exit;
  3058. End;
  3059. float_raise( float_flag_invalid );
  3060. subFloat32Sigs := float32_default_nan;
  3061. exit;
  3062. End;
  3063. if ( aExp = 0 ) then
  3064. Begin
  3065. aExp := 1;
  3066. bExp := 1;
  3067. End;
  3068. if ( bSig < aSig ) Then goto aBigger;
  3069. if ( aSig < bSig ) Then goto bBigger;
  3070. subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
  3071. exit;
  3072. bExpBigger:
  3073. if ( bExp = $FF ) then
  3074. Begin
  3075. if ( bSig<>0 ) then
  3076. Begin
  3077. subFloat32Sigs := propagateFloat32NaN( a, b );
  3078. exit;
  3079. End;
  3080. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3081. exit;
  3082. End;
  3083. if ( aExp = 0 ) then
  3084. Begin
  3085. Inc(expDiff);
  3086. End
  3087. else
  3088. Begin
  3089. aSig := aSig OR $40000000;
  3090. End;
  3091. shift32RightJamming( aSig, - expDiff, aSig );
  3092. bSig := bSig OR $40000000;
  3093. bBigger:
  3094. zSig := bSig - aSig;
  3095. zExp := bExp;
  3096. zSign := zSign xor 1;
  3097. goto normalizeRoundAndPack;
  3098. aExpBigger:
  3099. if ( aExp = $FF ) then
  3100. Begin
  3101. if ( aSig <> 0) then
  3102. Begin
  3103. subFloat32Sigs := propagateFloat32NaN( a, b );
  3104. exit;
  3105. End;
  3106. subFloat32Sigs := a;
  3107. exit;
  3108. End;
  3109. if ( bExp = 0 ) then
  3110. Begin
  3111. Dec(expDiff);
  3112. End
  3113. else
  3114. Begin
  3115. bSig := bSig OR $40000000;
  3116. End;
  3117. shift32RightJamming( bSig, expDiff, bSig );
  3118. aSig := aSig OR $40000000;
  3119. aBigger:
  3120. zSig := aSig - bSig;
  3121. zExp := aExp;
  3122. normalizeRoundAndPack:
  3123. Dec(zExp);
  3124. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3125. End;
  3126. {*
  3127. -------------------------------------------------------------------------------
  3128. Returns the result of adding the single-precision floating-point values `a'
  3129. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3130. Binary Floating-Point Arithmetic.
  3131. -------------------------------------------------------------------------------
  3132. *}
  3133. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3134. Var
  3135. aSign, bSign: Flag;
  3136. Begin
  3137. aSign := extractFloat32Sign( a.float32 );
  3138. bSign := extractFloat32Sign( b.float32 );
  3139. if ( aSign = bSign ) then
  3140. Begin
  3141. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3142. End
  3143. else
  3144. Begin
  3145. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3146. End;
  3147. End;
  3148. {*
  3149. -------------------------------------------------------------------------------
  3150. Returns the result of subtracting the single-precision floating-point values
  3151. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3152. for Binary Floating-Point Arithmetic.
  3153. -------------------------------------------------------------------------------
  3154. *}
  3155. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3156. Var
  3157. aSign, bSign: flag;
  3158. Begin
  3159. aSign := extractFloat32Sign( a.float32 );
  3160. bSign := extractFloat32Sign( b.float32 );
  3161. if ( aSign = bSign ) then
  3162. Begin
  3163. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3164. End
  3165. else
  3166. Begin
  3167. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3168. End;
  3169. End;
  3170. {*
  3171. -------------------------------------------------------------------------------
  3172. Returns the result of multiplying the single-precision floating-point values
  3173. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3174. for Binary Floating-Point Arithmetic.
  3175. -------------------------------------------------------------------------------
  3176. *}
  3177. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3178. Var
  3179. aSign, bSign, zSign: flag;
  3180. aExp, bExp, zExp : int16;
  3181. aSig, bSig, zSig0, zSig1: bits32;
  3182. Begin
  3183. aSig := extractFloat32Frac( a.float32 );
  3184. aExp := extractFloat32Exp( a.float32 );
  3185. aSign := extractFloat32Sign( a.float32 );
  3186. bSig := extractFloat32Frac( b.float32 );
  3187. bExp := extractFloat32Exp( b.float32 );
  3188. bSign := extractFloat32Sign( b.float32 );
  3189. zSign := aSign xor bSign;
  3190. if ( aExp = $FF ) then
  3191. Begin
  3192. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3193. Begin
  3194. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3195. End;
  3196. if ( ( bExp OR bSig ) = 0 ) then
  3197. Begin
  3198. float_raise( float_flag_invalid );
  3199. float32_mul.float32 := float32_default_nan;
  3200. exit;
  3201. End;
  3202. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3203. exit;
  3204. End;
  3205. if ( bExp = $FF ) then
  3206. Begin
  3207. if ( bSig <> 0 ) then
  3208. Begin
  3209. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3210. exit;
  3211. End;
  3212. if ( ( aExp OR aSig ) = 0 ) then
  3213. Begin
  3214. float_raise( float_flag_invalid );
  3215. float32_mul.float32 := float32_default_nan;
  3216. exit;
  3217. End;
  3218. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3219. exit;
  3220. End;
  3221. if ( aExp = 0 ) then
  3222. Begin
  3223. if ( aSig = 0 ) then
  3224. Begin
  3225. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3226. exit;
  3227. End;
  3228. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3229. End;
  3230. if ( bExp = 0 ) then
  3231. Begin
  3232. if ( bSig = 0 ) then
  3233. Begin
  3234. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3235. exit;
  3236. End;
  3237. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3238. End;
  3239. zExp := aExp + bExp - $7F;
  3240. aSig := ( aSig OR $00800000 ) shl 7;
  3241. bSig := ( bSig OR $00800000 ) shl 8;
  3242. mul32To64( aSig, bSig, zSig0, zSig1 );
  3243. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3244. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3245. Begin
  3246. zSig0 := zSig0 shl 1;
  3247. Dec(zExp);
  3248. End;
  3249. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3250. End;
  3251. {*
  3252. -------------------------------------------------------------------------------
  3253. Returns the result of dividing the single-precision floating-point value `a'
  3254. by the corresponding value `b'. The operation is performed according to the
  3255. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3256. -------------------------------------------------------------------------------
  3257. *}
  3258. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3259. Var
  3260. aSign, bSign, zSign: flag;
  3261. aExp, bExp, zExp: int16;
  3262. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3263. Begin
  3264. aSig := extractFloat32Frac( a.float32 );
  3265. aExp := extractFloat32Exp( a.float32 );
  3266. aSign := extractFloat32Sign( a.float32 );
  3267. bSig := extractFloat32Frac( b.float32 );
  3268. bExp := extractFloat32Exp( b.float32 );
  3269. bSign := extractFloat32Sign( b.float32 );
  3270. zSign := aSign xor bSign;
  3271. if ( aExp = $FF ) then
  3272. Begin
  3273. if ( aSig <> 0 ) then
  3274. Begin
  3275. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3276. exit;
  3277. End;
  3278. if ( bExp = $FF ) then
  3279. Begin
  3280. if ( bSig <> 0) then
  3281. Begin
  3282. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3283. End;
  3284. float_raise( float_flag_invalid );
  3285. float32_div.float32 := float32_default_nan;
  3286. exit;
  3287. End;
  3288. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3289. exit;
  3290. End;
  3291. if ( bExp = $FF ) then
  3292. Begin
  3293. if ( bSig <> 0) then
  3294. Begin
  3295. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3296. exit;
  3297. End;
  3298. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3299. exit;
  3300. End;
  3301. if ( bExp = 0 ) Then
  3302. Begin
  3303. if ( bSig = 0 ) Then
  3304. Begin
  3305. if ( ( aExp OR aSig ) = 0 ) then
  3306. Begin
  3307. float_raise( float_flag_invalid );
  3308. float32_div.float32 := float32_default_nan;
  3309. exit;
  3310. End;
  3311. float_raise( float_flag_divbyzero );
  3312. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3313. exit;
  3314. End;
  3315. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3316. End;
  3317. if ( aExp = 0 ) Then
  3318. Begin
  3319. if ( aSig = 0 ) Then
  3320. Begin
  3321. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3322. exit;
  3323. End;
  3324. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3325. End;
  3326. zExp := aExp - bExp + $7D;
  3327. aSig := ( aSig OR $00800000 ) shl 7;
  3328. bSig := ( bSig OR $00800000 ) shl 8;
  3329. if ( bSig <= ( aSig + aSig ) ) then
  3330. Begin
  3331. aSig := aSig shr 1;
  3332. Inc(zExp);
  3333. End;
  3334. zSig := estimateDiv64To32( aSig, 0, bSig );
  3335. if ( ( zSig and $3F ) <= 2 ) then
  3336. Begin
  3337. mul32To64( bSig, zSig, term0, term1 );
  3338. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3339. while ( sbits32 (rem0) < 0 ) do
  3340. Begin
  3341. Dec(zSig);
  3342. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3343. End;
  3344. zSig := zSig or bits32( rem1 <> 0 );
  3345. End;
  3346. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3347. End;
  3348. {*
  3349. -------------------------------------------------------------------------------
  3350. Returns the remainder of the single-precision floating-point value `a'
  3351. with respect to the corresponding value `b'. The operation is performed
  3352. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3353. -------------------------------------------------------------------------------
  3354. *}
  3355. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3356. Var
  3357. aSign, bSign, zSign: flag;
  3358. aExp, bExp, expDiff: int16;
  3359. aSig, bSig, q, allZero, alternateASig: bits32;
  3360. sigMean: sbits32;
  3361. Begin
  3362. aSig := extractFloat32Frac( a.float32 );
  3363. aExp := extractFloat32Exp( a.float32 );
  3364. aSign := extractFloat32Sign( a.float32 );
  3365. bSig := extractFloat32Frac( b.float32 );
  3366. bExp := extractFloat32Exp( b.float32 );
  3367. bSign := extractFloat32Sign( b.float32 );
  3368. if ( aExp = $FF ) then
  3369. Begin
  3370. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3371. Begin
  3372. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3373. exit;
  3374. End;
  3375. float_raise( float_flag_invalid );
  3376. float32_rem.float32 := float32_default_nan;
  3377. exit;
  3378. End;
  3379. if ( bExp = $FF ) then
  3380. Begin
  3381. if ( bSig <> 0 ) then
  3382. Begin
  3383. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3384. exit;
  3385. End;
  3386. float32_rem := a;
  3387. exit;
  3388. End;
  3389. if ( bExp = 0 ) then
  3390. Begin
  3391. if ( bSig = 0 ) then
  3392. Begin
  3393. float_raise( float_flag_invalid );
  3394. float32_rem.float32 := float32_default_nan;
  3395. exit;
  3396. End;
  3397. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3398. End;
  3399. if ( aExp = 0 ) then
  3400. Begin
  3401. if ( aSig = 0 ) then
  3402. Begin
  3403. float32_rem := a;
  3404. exit;
  3405. End;
  3406. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3407. End;
  3408. expDiff := aExp - bExp;
  3409. aSig := ( aSig OR $00800000 ) shl 8;
  3410. bSig := ( bSig OR $00800000 ) shl 8;
  3411. if ( expDiff < 0 ) then
  3412. Begin
  3413. if ( expDiff < -1 ) then
  3414. Begin
  3415. float32_rem := a;
  3416. exit;
  3417. End;
  3418. aSig := aSig shr 1;
  3419. End;
  3420. q := bits32( bSig <= aSig );
  3421. if ( q <> 0) then
  3422. aSig := aSig - bSig;
  3423. expDiff := expDiff - 32;
  3424. while ( 0 < expDiff ) do
  3425. Begin
  3426. q := estimateDiv64To32( aSig, 0, bSig );
  3427. if (2 < q) then
  3428. q := q - 2
  3429. else
  3430. q := 0;
  3431. aSig := - ( ( bSig shr 2 ) * q );
  3432. expDiff := expDiff - 30;
  3433. End;
  3434. expDiff := expDiff + 32;
  3435. if ( 0 < expDiff ) then
  3436. Begin
  3437. q := estimateDiv64To32( aSig, 0, bSig );
  3438. if (2 < q) then
  3439. q := q - 2
  3440. else
  3441. q := 0;
  3442. q := q shr (32 - expDiff);
  3443. bSig := bSig shr 2;
  3444. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3445. End
  3446. else
  3447. Begin
  3448. aSig := aSig shr 2;
  3449. bSig := bSig shr 2;
  3450. End;
  3451. Repeat
  3452. alternateASig := aSig;
  3453. Inc(q);
  3454. aSig := aSig - bSig;
  3455. Until not ( 0 <= sbits32 (aSig) );
  3456. sigMean := aSig + alternateASig;
  3457. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3458. Begin
  3459. aSig := alternateASig;
  3460. End;
  3461. zSign := flag( sbits32 (aSig) < 0 );
  3462. if ( zSign<>0 ) then
  3463. aSig := - aSig;
  3464. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3465. End;
  3466. {*
  3467. -------------------------------------------------------------------------------
  3468. Returns the square root of the single-precision floating-point value `a'.
  3469. The operation is performed according to the IEC/IEEE Standard for Binary
  3470. Floating-Point Arithmetic.
  3471. -------------------------------------------------------------------------------
  3472. *}
  3473. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3474. Var
  3475. aSign : flag;
  3476. aExp, zExp : int16;
  3477. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3478. label roundAndPack;
  3479. Begin
  3480. aSig := extractFloat32Frac( a.float32 );
  3481. aExp := extractFloat32Exp( a.float32 );
  3482. aSign := extractFloat32Sign( a.float32 );
  3483. if ( aExp = $FF ) then
  3484. Begin
  3485. if ( aSig <> 0) then
  3486. Begin
  3487. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3488. exit;
  3489. End;
  3490. if ( aSign = 0) then
  3491. Begin
  3492. float32_sqrt := a;
  3493. exit;
  3494. End;
  3495. float_raise( float_flag_invalid );
  3496. float32_sqrt.float32 := float32_default_nan;
  3497. exit;
  3498. End;
  3499. if ( aSign <> 0) then
  3500. Begin
  3501. if ( ( aExp OR aSig ) = 0 ) then
  3502. Begin
  3503. float32_sqrt := a;
  3504. exit;
  3505. End;
  3506. float_raise( float_flag_invalid );
  3507. float32_sqrt.float32 := float32_default_nan;
  3508. exit;
  3509. End;
  3510. if ( aExp = 0 ) then
  3511. Begin
  3512. if ( aSig = 0 ) then
  3513. Begin
  3514. float32_sqrt.float32 := 0;
  3515. exit;
  3516. End;
  3517. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3518. End;
  3519. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3520. aSig := ( aSig OR $00800000 ) shl 8;
  3521. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3522. if ( ( zSig and $7F ) <= 5 ) then
  3523. Begin
  3524. if ( zSig < 2 ) then
  3525. Begin
  3526. zSig := $7FFFFFFF;
  3527. goto roundAndPack;
  3528. End
  3529. else
  3530. Begin
  3531. aSig := aSig shr (aExp and 1);
  3532. mul32To64( zSig, zSig, term0, term1 );
  3533. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3534. while ( sbits32 (rem0) < 0 ) do
  3535. Begin
  3536. Dec(zSig);
  3537. shortShift64Left( 0, zSig, 1, term0, term1 );
  3538. term1 := term1 or 1;
  3539. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3540. End;
  3541. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3542. End;
  3543. End;
  3544. shift32RightJamming( zSig, 1, zSig );
  3545. roundAndPack:
  3546. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3547. End;
  3548. {*
  3549. -------------------------------------------------------------------------------
  3550. Returns 1 if the single-precision floating-point value `a' is equal to
  3551. the corresponding value `b', and 0 otherwise. The comparison is performed
  3552. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3553. -------------------------------------------------------------------------------
  3554. *}
  3555. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3556. Begin
  3557. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3558. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3559. ) then
  3560. Begin
  3561. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3562. Begin
  3563. float_raise( float_flag_invalid );
  3564. End;
  3565. float32_eq := 0;
  3566. exit;
  3567. End;
  3568. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3569. End;
  3570. {*
  3571. -------------------------------------------------------------------------------
  3572. Returns 1 if the single-precision floating-point value `a' is less than
  3573. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3574. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3575. Arithmetic.
  3576. -------------------------------------------------------------------------------
  3577. *}
  3578. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3579. var
  3580. aSign, bSign: flag;
  3581. Begin
  3582. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3583. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3584. ) then
  3585. Begin
  3586. float_raise( float_flag_invalid );
  3587. float32_le := 0;
  3588. exit;
  3589. End;
  3590. aSign := extractFloat32Sign( a.float32 );
  3591. bSign := extractFloat32Sign( b.float32 );
  3592. if ( aSign <> bSign ) then
  3593. Begin
  3594. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3595. exit;
  3596. End;
  3597. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3598. End;
  3599. {*
  3600. -------------------------------------------------------------------------------
  3601. Returns 1 if the single-precision floating-point value `a' is less than
  3602. the corresponding value `b', and 0 otherwise. The comparison is performed
  3603. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3604. -------------------------------------------------------------------------------
  3605. *}
  3606. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3607. var
  3608. aSign, bSign: flag;
  3609. Begin
  3610. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3611. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3612. ) then
  3613. Begin
  3614. float_raise( float_flag_invalid );
  3615. float32_lt :=0;
  3616. exit;
  3617. End;
  3618. aSign := extractFloat32Sign( a.float32 );
  3619. bSign := extractFloat32Sign( b.float32 );
  3620. if ( aSign <> bSign ) then
  3621. Begin
  3622. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3623. exit;
  3624. End;
  3625. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3626. End;
  3627. {*
  3628. -------------------------------------------------------------------------------
  3629. Returns 1 if the single-precision floating-point value `a' is equal to
  3630. the corresponding value `b', and 0 otherwise. The invalid exception is
  3631. raised if either operand is a NaN. Otherwise, the comparison is performed
  3632. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3633. -------------------------------------------------------------------------------
  3634. *}
  3635. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3636. Begin
  3637. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3638. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3639. ) then
  3640. Begin
  3641. float_raise( float_flag_invalid );
  3642. float32_eq_signaling := 0;
  3643. exit;
  3644. End;
  3645. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3646. End;
  3647. {*
  3648. -------------------------------------------------------------------------------
  3649. Returns 1 if the single-precision floating-point value `a' is less than or
  3650. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3651. cause an exception. Otherwise, the comparison is performed according to the
  3652. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3653. -------------------------------------------------------------------------------
  3654. *}
  3655. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3656. Var
  3657. aSign, bSign: flag;
  3658. aExp, bExp: int16;
  3659. Begin
  3660. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3661. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3662. ) then
  3663. Begin
  3664. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3665. Begin
  3666. float_raise( float_flag_invalid );
  3667. End;
  3668. float32_le_quiet := 0;
  3669. exit;
  3670. End;
  3671. aSign := extractFloat32Sign( a );
  3672. bSign := extractFloat32Sign( b );
  3673. if ( aSign <> bSign ) then
  3674. Begin
  3675. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3676. exit;
  3677. End;
  3678. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3679. End;
  3680. {*
  3681. -------------------------------------------------------------------------------
  3682. Returns 1 if the single-precision floating-point value `a' is less than
  3683. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3684. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3685. Standard for Binary Floating-Point Arithmetic.
  3686. -------------------------------------------------------------------------------
  3687. *}
  3688. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3689. Var
  3690. aSign, bSign: flag;
  3691. Begin
  3692. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3693. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3694. ) then
  3695. Begin
  3696. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3697. Begin
  3698. float_raise( float_flag_invalid );
  3699. End;
  3700. float32_lt_quiet := 0;
  3701. exit;
  3702. End;
  3703. aSign := extractFloat32Sign( a );
  3704. bSign := extractFloat32Sign( b );
  3705. if ( aSign <> bSign ) then
  3706. Begin
  3707. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3708. exit;
  3709. End;
  3710. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3711. End;
  3712. {*
  3713. -------------------------------------------------------------------------------
  3714. Returns the result of converting the double-precision floating-point value
  3715. `a' to the 32-bit two's complement integer format. The conversion is
  3716. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3717. Arithmetic---which means in particular that the conversion is rounded
  3718. according to the current rounding mode. If `a' is a NaN, the largest
  3719. positive integer is returned. Otherwise, if the conversion overflows, the
  3720. largest integer with the same sign as `a' is returned.
  3721. -------------------------------------------------------------------------------
  3722. *}
  3723. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3724. var
  3725. aSign: flag;
  3726. aExp, shiftCount: int16;
  3727. aSig0, aSig1, absZ, aSigExtra: bits32;
  3728. z: int32;
  3729. roundingMode: int8;
  3730. label invalid;
  3731. Begin
  3732. aSig1 := extractFloat64Frac1( a );
  3733. aSig0 := extractFloat64Frac0( a );
  3734. aExp := extractFloat64Exp( a );
  3735. aSign := extractFloat64Sign( a );
  3736. shiftCount := aExp - $413;
  3737. if ( 0 <= shiftCount ) then
  3738. Begin
  3739. if ( $41E < aExp ) then
  3740. Begin
  3741. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3742. aSign := 0;
  3743. goto invalid;
  3744. End;
  3745. shortShift64Left(
  3746. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3747. if ( $80000000 < absZ ) then
  3748. goto invalid;
  3749. End
  3750. else
  3751. Begin
  3752. aSig1 := flag( aSig1 <> 0 );
  3753. if ( aExp < $3FE ) then
  3754. Begin
  3755. aSigExtra := aExp OR aSig0 OR aSig1;
  3756. absZ := 0;
  3757. End
  3758. else
  3759. Begin
  3760. aSig0 := aSig0 OR $00100000;
  3761. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3762. absZ := aSig0 shr ( - shiftCount );
  3763. End;
  3764. End;
  3765. roundingMode := float_rounding_mode;
  3766. if ( roundingMode = float_round_nearest_even ) then
  3767. Begin
  3768. if ( sbits32(aSigExtra) < 0 ) then
  3769. Begin
  3770. Inc(absZ);
  3771. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3772. absZ := absZ and not 1;
  3773. End;
  3774. if aSign <> 0 then
  3775. z := - absZ
  3776. else
  3777. z := absZ;
  3778. End
  3779. else
  3780. Begin
  3781. aSigExtra := bits32( aSigExtra <> 0 );
  3782. if ( aSign <> 0) then
  3783. Begin
  3784. z := - ( absZ
  3785. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3786. End
  3787. else
  3788. Begin
  3789. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3790. End
  3791. End;
  3792. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3793. Begin
  3794. invalid:
  3795. float_raise( float_flag_invalid );
  3796. if (aSign <> 0 ) then
  3797. float64_to_int32 := sbits32 ($80000000)
  3798. else
  3799. float64_to_int32 := $7FFFFFFF;
  3800. exit;
  3801. End;
  3802. if ( aSigExtra <> 0) then
  3803. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3804. float64_to_int32 := z;
  3805. End;
  3806. {*
  3807. -------------------------------------------------------------------------------
  3808. Returns the result of converting the double-precision floating-point value
  3809. `a' to the 32-bit two's complement integer format. The conversion is
  3810. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3811. Arithmetic, except that the conversion is always rounded toward zero.
  3812. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3813. the conversion overflows, the largest integer with the same sign as `a' is
  3814. returned.
  3815. -------------------------------------------------------------------------------
  3816. *}
  3817. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3818. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3819. Var
  3820. aSign: flag;
  3821. aExp, shiftCount: int16;
  3822. aSig0, aSig1, absZ, aSigExtra: bits32;
  3823. z: int32;
  3824. label invalid;
  3825. Begin
  3826. aSig1 := extractFloat64Frac1( a );
  3827. aSig0 := extractFloat64Frac0( a );
  3828. aExp := extractFloat64Exp( a );
  3829. aSign := extractFloat64Sign( a );
  3830. shiftCount := aExp - $413;
  3831. if ( 0 <= shiftCount ) then
  3832. Begin
  3833. if ( $41E < aExp ) then
  3834. Begin
  3835. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3836. aSign := 0;
  3837. goto invalid;
  3838. End;
  3839. shortShift64Left(
  3840. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3841. End
  3842. else
  3843. Begin
  3844. if ( aExp < $3FF ) then
  3845. Begin
  3846. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3847. Begin
  3848. softfloat_exception_flags :=
  3849. softfloat_exception_flags or float_flag_inexact;
  3850. End;
  3851. float64_to_int32_round_to_zero := 0;
  3852. exit;
  3853. End;
  3854. aSig0 := aSig0 or $00100000;
  3855. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3856. absZ := aSig0 shr ( - shiftCount );
  3857. End;
  3858. if aSign <> 0 then
  3859. z := - absZ
  3860. else
  3861. z := absZ;
  3862. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3863. Begin
  3864. invalid:
  3865. float_raise( float_flag_invalid );
  3866. if (aSign <> 0) then
  3867. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3868. else
  3869. float64_to_int32_round_to_zero := $7FFFFFFF;
  3870. exit;
  3871. End;
  3872. if ( aSigExtra <> 0) then
  3873. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3874. float64_to_int32_round_to_zero := z;
  3875. End;
  3876. {*
  3877. -------------------------------------------------------------------------------
  3878. Returns the result of converting the double-precision floating-point value
  3879. `a' to the single-precision floating-point format. The conversion is
  3880. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3881. Arithmetic.
  3882. -------------------------------------------------------------------------------
  3883. *}
  3884. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3885. Var
  3886. aSign: flag;
  3887. aExp: int16;
  3888. aSig0, aSig1, zSig: bits32;
  3889. allZero: bits32;
  3890. tmp : CommonNanT;
  3891. Begin
  3892. aSig1 := extractFloat64Frac1( a );
  3893. aSig0 := extractFloat64Frac0( a );
  3894. aExp := extractFloat64Exp( a );
  3895. aSign := extractFloat64Sign( a );
  3896. if ( aExp = $7FF ) then
  3897. Begin
  3898. if ( aSig0 OR aSig1 ) <> 0 then
  3899. Begin
  3900. float64ToCommonNaN( a, tmp );
  3901. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3902. exit;
  3903. End;
  3904. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3905. exit;
  3906. End;
  3907. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3908. if ( aExp <> 0) then
  3909. zSig := zSig OR $40000000;
  3910. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3911. End;
  3912. {*
  3913. -------------------------------------------------------------------------------
  3914. Rounds the double-precision floating-point value `a' to an integer,
  3915. and returns the result as a double-precision floating-point value. The
  3916. operation is performed according to the IEC/IEEE Standard for Binary
  3917. Floating-Point Arithmetic.
  3918. -------------------------------------------------------------------------------
  3919. *}
  3920. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3921. Var
  3922. aSign: flag;
  3923. aExp: int16;
  3924. lastBitMask, roundBitsMask: bits32;
  3925. roundingMode: int8;
  3926. z: float64;
  3927. Begin
  3928. aExp := extractFloat64Exp( a );
  3929. if ( $413 <= aExp ) then
  3930. Begin
  3931. if ( $433 <= aExp ) then
  3932. Begin
  3933. if ( ( aExp = $7FF )
  3934. AND
  3935. (
  3936. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3937. ) <>0)
  3938. ) then
  3939. Begin
  3940. propagateFloat64NaN( a, a, result );
  3941. exit;
  3942. End;
  3943. result := a;
  3944. exit;
  3945. End;
  3946. lastBitMask := 1;
  3947. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3948. roundBitsMask := lastBitMask - 1;
  3949. z := a;
  3950. roundingMode := float_rounding_mode;
  3951. if ( roundingMode = float_round_nearest_even ) then
  3952. Begin
  3953. if ( lastBitMask <> 0) then
  3954. Begin
  3955. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3956. if ( ( z.low and roundBitsMask ) = 0 ) then
  3957. z.low := z.low and not lastBitMask;
  3958. End
  3959. else
  3960. Begin
  3961. if ( sbits32 (z.low) < 0 ) then
  3962. Begin
  3963. Inc(z.high);
  3964. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3965. z.high := z.high and not 1;
  3966. End;
  3967. End;
  3968. End
  3969. else if ( roundingMode <> float_round_to_zero ) then
  3970. Begin
  3971. if ( extractFloat64Sign( z )
  3972. xor flag( roundingMode = float_round_up ) )<> 0 then
  3973. Begin
  3974. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3975. End;
  3976. End;
  3977. z.low := z.low and not roundBitsMask;
  3978. End
  3979. else
  3980. Begin
  3981. if ( aExp <= $3FE ) then
  3982. Begin
  3983. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  3984. Begin
  3985. result := a;
  3986. exit;
  3987. End;
  3988. softfloat_exception_flags := softfloat_exception_flags or
  3989. float_flag_inexact;
  3990. aSign := extractFloat64Sign( a );
  3991. case ( float_rounding_mode ) of
  3992. float_round_nearest_even:
  3993. Begin
  3994. if ( ( aExp = $3FE )
  3995. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  3996. ) then
  3997. Begin
  3998. packFloat64( aSign, $3FF, 0, 0, result );
  3999. exit;
  4000. End;
  4001. End;
  4002. float_round_down:
  4003. Begin
  4004. if aSign<>0 then
  4005. packFloat64( 1, $3FF, 0, 0, result )
  4006. else
  4007. packFloat64( 0, 0, 0, 0, result );
  4008. exit;
  4009. End;
  4010. float_round_up:
  4011. Begin
  4012. if aSign <> 0 then
  4013. packFloat64( 1, 0, 0, 0, result )
  4014. else
  4015. packFloat64( 0, $3FF, 0, 0, result );
  4016. exit;
  4017. End;
  4018. end;
  4019. packFloat64( aSign, 0, 0, 0, result );
  4020. exit;
  4021. End;
  4022. lastBitMask := 1;
  4023. lastBitMask := lastBitMask shl ($413 - aExp);
  4024. roundBitsMask := lastBitMask - 1;
  4025. z.low := 0;
  4026. z.high := a.high;
  4027. roundingMode := float_rounding_mode;
  4028. if ( roundingMode = float_round_nearest_even ) then
  4029. Begin
  4030. z.high := z.high + lastBitMask shr 1;
  4031. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4032. Begin
  4033. z.high := z.high and not lastBitMask;
  4034. End;
  4035. End
  4036. else if ( roundingMode <> float_round_to_zero ) then
  4037. Begin
  4038. if ( extractFloat64Sign( z )
  4039. xor flag( roundingMode = float_round_up ) )<> 0 then
  4040. Begin
  4041. z.high := z.high or bits32( a.low <> 0 );
  4042. z.high := z.high + roundBitsMask;
  4043. End;
  4044. End;
  4045. z.high := z.high and not roundBitsMask;
  4046. End;
  4047. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4048. Begin
  4049. softfloat_exception_flags :=
  4050. softfloat_exception_flags or float_flag_inexact;
  4051. End;
  4052. result := z;
  4053. End;
  4054. {*
  4055. -------------------------------------------------------------------------------
  4056. Returns the result of adding the absolute values of the double-precision
  4057. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4058. before being returned. `zSign' is ignored if the result is a NaN.
  4059. The addition is performed according to the IEC/IEEE Standard for Binary
  4060. Floating-Point Arithmetic.
  4061. -------------------------------------------------------------------------------
  4062. *}
  4063. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4064. Var
  4065. aExp, bExp, zExp: int16;
  4066. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4067. expDiff: int16;
  4068. label shiftRight1;
  4069. label roundAndPack;
  4070. Begin
  4071. aSig1 := extractFloat64Frac1( a );
  4072. aSig0 := extractFloat64Frac0( a );
  4073. aExp := extractFloat64Exp( a );
  4074. bSig1 := extractFloat64Frac1( b );
  4075. bSig0 := extractFloat64Frac0( b );
  4076. bExp := extractFloat64Exp( b );
  4077. expDiff := aExp - bExp;
  4078. if ( 0 < expDiff ) then
  4079. Begin
  4080. if ( aExp = $7FF ) then
  4081. Begin
  4082. if ( aSig0 OR aSig1 ) <> 0 then
  4083. Begin
  4084. propagateFloat64NaN( a, b, out );
  4085. exit;
  4086. end;
  4087. out := a;
  4088. exit;
  4089. End;
  4090. if ( bExp = 0 ) then
  4091. Begin
  4092. Dec(expDiff);
  4093. End
  4094. else
  4095. Begin
  4096. bSig0 := bSig0 or $00100000;
  4097. End;
  4098. shift64ExtraRightJamming(
  4099. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4100. zExp := aExp;
  4101. End
  4102. else if ( expDiff < 0 ) then
  4103. Begin
  4104. if ( bExp = $7FF ) then
  4105. Begin
  4106. if ( bSig0 OR bSig1 ) <> 0 then
  4107. Begin
  4108. propagateFloat64NaN( a, b, out );
  4109. exit;
  4110. End;
  4111. packFloat64( zSign, $7FF, 0, 0, out );
  4112. End;
  4113. if ( aExp = 0 ) then
  4114. Begin
  4115. Inc(expDiff);
  4116. End
  4117. else
  4118. Begin
  4119. aSig0 := aSig0 or $00100000;
  4120. End;
  4121. shift64ExtraRightJamming(
  4122. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4123. zExp := bExp;
  4124. End
  4125. else
  4126. Begin
  4127. if ( aExp = $7FF ) then
  4128. Begin
  4129. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4130. Begin
  4131. propagateFloat64NaN( a, b, out );
  4132. exit;
  4133. End;
  4134. out := a;
  4135. exit;
  4136. End;
  4137. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4138. if ( aExp = 0 ) then
  4139. Begin
  4140. packFloat64( zSign, 0, zSig0, zSig1, out );
  4141. exit;
  4142. End;
  4143. zSig2 := 0;
  4144. zSig0 := zSig0 or $00200000;
  4145. zExp := aExp;
  4146. goto shiftRight1;
  4147. End;
  4148. aSig0 := aSig0 or $00100000;
  4149. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4150. Dec(zExp);
  4151. if ( zSig0 < $00200000 ) then
  4152. goto roundAndPack;
  4153. Inc(zExp);
  4154. shiftRight1:
  4155. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4156. roundAndPack:
  4157. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4158. End;
  4159. {*
  4160. -------------------------------------------------------------------------------
  4161. Returns the result of subtracting the absolute values of the double-
  4162. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4163. difference is negated before being returned. `zSign' is ignored if the
  4164. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4165. Standard for Binary Floating-Point Arithmetic.
  4166. -------------------------------------------------------------------------------
  4167. *}
  4168. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4169. Var
  4170. aExp, bExp, zExp: int16;
  4171. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4172. expDiff: int16;
  4173. z: float64;
  4174. label aExpBigger;
  4175. label bExpBigger;
  4176. label aBigger;
  4177. label bBigger;
  4178. label normalizeRoundAndPack;
  4179. Begin
  4180. aSig1 := extractFloat64Frac1( a );
  4181. aSig0 := extractFloat64Frac0( a );
  4182. aExp := extractFloat64Exp( a );
  4183. bSig1 := extractFloat64Frac1( b );
  4184. bSig0 := extractFloat64Frac0( b );
  4185. bExp := extractFloat64Exp( b );
  4186. expDiff := aExp - bExp;
  4187. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4188. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4189. if ( 0 < expDiff ) then goto aExpBigger;
  4190. if ( expDiff < 0 ) then goto bExpBigger;
  4191. if ( aExp = $7FF ) then
  4192. Begin
  4193. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4194. Begin
  4195. propagateFloat64NaN( a, b, out );
  4196. exit;
  4197. End;
  4198. float_raise( float_flag_invalid );
  4199. z.low := float64_default_nan_low;
  4200. z.high := float64_default_nan_high;
  4201. out := z;
  4202. exit;
  4203. End;
  4204. if ( aExp = 0 ) then
  4205. Begin
  4206. aExp := 1;
  4207. bExp := 1;
  4208. End;
  4209. if ( bSig0 < aSig0 ) then goto aBigger;
  4210. if ( aSig0 < bSig0 ) then goto bBigger;
  4211. if ( bSig1 < aSig1 ) then goto aBigger;
  4212. if ( aSig1 < bSig1 ) then goto bBigger;
  4213. packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
  4214. exit;
  4215. bExpBigger:
  4216. if ( bExp = $7FF ) then
  4217. Begin
  4218. if ( bSig0 OR bSig1 ) <> 0 then
  4219. Begin
  4220. propagateFloat64NaN( a, b, out );
  4221. exit;
  4222. End;
  4223. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4224. exit;
  4225. End;
  4226. if ( aExp = 0 ) then
  4227. Begin
  4228. Inc(expDiff);
  4229. End
  4230. else
  4231. Begin
  4232. aSig0 := aSig0 or $40000000;
  4233. End;
  4234. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4235. bSig0 := bSig0 or $40000000;
  4236. bBigger:
  4237. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4238. zExp := bExp;
  4239. zSign := zSign xor 1;
  4240. goto normalizeRoundAndPack;
  4241. aExpBigger:
  4242. if ( aExp = $7FF ) then
  4243. Begin
  4244. if ( aSig0 OR aSig1 ) <> 0 then
  4245. Begin
  4246. propagateFloat64NaN( a, b, out );
  4247. exit;
  4248. End;
  4249. out := a;
  4250. exit;
  4251. End;
  4252. if ( bExp = 0 ) then
  4253. Begin
  4254. Dec(expDiff);
  4255. End
  4256. else
  4257. Begin
  4258. bSig0 := bSig0 or $40000000;
  4259. End;
  4260. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4261. aSig0 := aSig0 or $40000000;
  4262. aBigger:
  4263. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4264. zExp := aExp;
  4265. normalizeRoundAndPack:
  4266. Dec(zExp);
  4267. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4268. End;
  4269. {*
  4270. -------------------------------------------------------------------------------
  4271. Returns the result of adding the double-precision floating-point values `a'
  4272. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4273. Binary Floating-Point Arithmetic.
  4274. -------------------------------------------------------------------------------
  4275. *}
  4276. Function float64_add( a: float64; b : float64) : Float64;
  4277. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4278. Var
  4279. aSign, bSign: flag;
  4280. Begin
  4281. aSign := extractFloat64Sign( a );
  4282. bSign := extractFloat64Sign( b );
  4283. if ( aSign = bSign ) then
  4284. Begin
  4285. addFloat64Sigs( a, b, aSign, result );
  4286. End
  4287. else
  4288. Begin
  4289. subFloat64Sigs( a, b, aSign, result );
  4290. End;
  4291. End;
  4292. {*
  4293. -------------------------------------------------------------------------------
  4294. Returns the result of subtracting the double-precision floating-point values
  4295. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4296. for Binary Floating-Point Arithmetic.
  4297. -------------------------------------------------------------------------------
  4298. *}
  4299. Function float64_sub(a: float64; b : float64) : Float64;
  4300. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4301. Var
  4302. aSign, bSign: flag;
  4303. Begin
  4304. aSign := extractFloat64Sign( a );
  4305. bSign := extractFloat64Sign( b );
  4306. if ( aSign = bSign ) then
  4307. Begin
  4308. subFloat64Sigs( a, b, aSign, result );
  4309. End
  4310. else
  4311. Begin
  4312. addFloat64Sigs( a, b, aSign, result );
  4313. End;
  4314. End;
  4315. {*
  4316. -------------------------------------------------------------------------------
  4317. Returns the result of multiplying the double-precision floating-point values
  4318. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4319. for Binary Floating-Point Arithmetic.
  4320. -------------------------------------------------------------------------------
  4321. *}
  4322. Function float64_mul( a: float64; b:float64) : Float64;
  4323. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4324. Var
  4325. aSign, bSign, zSign: flag;
  4326. aExp, bExp, zExp: int16;
  4327. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4328. z: float64;
  4329. label invalid;
  4330. Begin
  4331. aSig1 := extractFloat64Frac1( a );
  4332. aSig0 := extractFloat64Frac0( a );
  4333. aExp := extractFloat64Exp( a );
  4334. aSign := extractFloat64Sign( a );
  4335. bSig1 := extractFloat64Frac1( b );
  4336. bSig0 := extractFloat64Frac0( b );
  4337. bExp := extractFloat64Exp( b );
  4338. bSign := extractFloat64Sign( b );
  4339. zSign := aSign xor bSign;
  4340. if ( aExp = $7FF ) then
  4341. Begin
  4342. if ( (( aSig0 OR aSig1 ) <>0)
  4343. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4344. Begin
  4345. propagateFloat64NaN( a, b, result );
  4346. exit;
  4347. End;
  4348. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4349. packFloat64( zSign, $7FF, 0, 0, result );
  4350. exit;
  4351. End;
  4352. if ( bExp = $7FF ) then
  4353. Begin
  4354. if ( bSig0 OR bSig1 )<> 0 then
  4355. Begin
  4356. propagateFloat64NaN( a, b, result );
  4357. exit;
  4358. End;
  4359. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4360. Begin
  4361. invalid:
  4362. float_raise( float_flag_invalid );
  4363. z.low := float64_default_nan_low;
  4364. z.high := float64_default_nan_high;
  4365. result := z;
  4366. exit;
  4367. End;
  4368. packFloat64( zSign, $7FF, 0, 0, result );
  4369. exit;
  4370. End;
  4371. if ( aExp = 0 ) then
  4372. Begin
  4373. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4374. Begin
  4375. packFloat64( zSign, 0, 0, 0, result );
  4376. exit;
  4377. End;
  4378. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4379. End;
  4380. if ( bExp = 0 ) then
  4381. Begin
  4382. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4383. Begin
  4384. packFloat64( zSign, 0, 0, 0, result );
  4385. exit;
  4386. End;
  4387. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4388. End;
  4389. zExp := aExp + bExp - $400;
  4390. aSig0 := aSig0 or $00100000;
  4391. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4392. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4393. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4394. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4395. if ( $00200000 <= zSig0 ) then
  4396. Begin
  4397. shift64ExtraRightJamming(
  4398. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4399. Inc(zExp);
  4400. End;
  4401. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4402. End;
  4403. {*
  4404. -------------------------------------------------------------------------------
  4405. Returns the result of dividing the double-precision floating-point value `a'
  4406. by the corresponding value `b'. The operation is performed according to the
  4407. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4408. -------------------------------------------------------------------------------
  4409. *}
  4410. Function float64_div(a: float64; b : float64) : Float64;
  4411. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4412. Var
  4413. aSign, bSign, zSign: flag;
  4414. aExp, bExp, zExp: int16;
  4415. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4416. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4417. z: float64;
  4418. label invalid;
  4419. Begin
  4420. aSig1 := extractFloat64Frac1( a );
  4421. aSig0 := extractFloat64Frac0( a );
  4422. aExp := extractFloat64Exp( a );
  4423. aSign := extractFloat64Sign( a );
  4424. bSig1 := extractFloat64Frac1( b );
  4425. bSig0 := extractFloat64Frac0( b );
  4426. bExp := extractFloat64Exp( b );
  4427. bSign := extractFloat64Sign( b );
  4428. zSign := aSign xor bSign;
  4429. if ( aExp = $7FF ) then
  4430. Begin
  4431. if ( aSig0 OR aSig1 )<> 0 then
  4432. Begin
  4433. propagateFloat64NaN( a, b, result );
  4434. exit;
  4435. end;
  4436. if ( bExp = $7FF ) then
  4437. Begin
  4438. if ( bSig0 OR bSig1 )<>0 then
  4439. Begin
  4440. propagateFloat64NaN( a, b, result );
  4441. exit;
  4442. End;
  4443. goto invalid;
  4444. End;
  4445. packFloat64( zSign, $7FF, 0, 0, result );
  4446. exit;
  4447. End;
  4448. if ( bExp = $7FF ) then
  4449. Begin
  4450. if ( bSig0 OR bSig1 )<> 0 then
  4451. Begin
  4452. propagateFloat64NaN( a, b, result );
  4453. exit;
  4454. End;
  4455. packFloat64( zSign, 0, 0, 0, result );
  4456. exit;
  4457. End;
  4458. if ( bExp = 0 ) then
  4459. Begin
  4460. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4461. Begin
  4462. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4463. Begin
  4464. invalid:
  4465. float_raise( float_flag_invalid );
  4466. z.low := float64_default_nan_low;
  4467. z.high := float64_default_nan_high;
  4468. result := z;
  4469. exit;
  4470. End;
  4471. float_raise( float_flag_divbyzero );
  4472. packFloat64( zSign, $7FF, 0, 0, result );
  4473. exit;
  4474. End;
  4475. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4476. End;
  4477. if ( aExp = 0 ) then
  4478. Begin
  4479. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4480. Begin
  4481. packFloat64( zSign, 0, 0, 0, result );
  4482. exit;
  4483. End;
  4484. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4485. End;
  4486. zExp := aExp - bExp + $3FD;
  4487. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4488. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4489. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4490. Begin
  4491. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4492. Inc(zExp);
  4493. End;
  4494. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4495. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4496. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4497. while ( sbits32 (rem0) < 0 ) do
  4498. Begin
  4499. Dec(zSig0);
  4500. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4501. End;
  4502. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4503. if ( ( zSig1 and $3FF ) <= 4 ) then
  4504. Begin
  4505. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4506. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4507. while ( sbits32 (rem1) < 0 ) do
  4508. Begin
  4509. Dec(zSig1);
  4510. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4511. End;
  4512. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4513. End;
  4514. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4515. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4516. End;
  4517. {*
  4518. -------------------------------------------------------------------------------
  4519. Returns the remainder of the double-precision floating-point value `a'
  4520. with respect to the corresponding value `b'. The operation is performed
  4521. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4522. -------------------------------------------------------------------------------
  4523. *}
  4524. Function float64_rem(a: float64; b : float64) : float64;
  4525. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4526. Var
  4527. aSign, bSign, zSign: flag;
  4528. aExp, bExp, expDiff: int16;
  4529. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4530. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4531. sigMean0: sbits32;
  4532. z: float64;
  4533. label invalid;
  4534. Begin
  4535. aSig1 := extractFloat64Frac1( a );
  4536. aSig0 := extractFloat64Frac0( a );
  4537. aExp := extractFloat64Exp( a );
  4538. aSign := extractFloat64Sign( a );
  4539. bSig1 := extractFloat64Frac1( b );
  4540. bSig0 := extractFloat64Frac0( b );
  4541. bExp := extractFloat64Exp( b );
  4542. bSign := extractFloat64Sign( b );
  4543. if ( aExp = $7FF ) then
  4544. Begin
  4545. if ((( aSig0 OR aSig1 )<>0)
  4546. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4547. Begin
  4548. propagateFloat64NaN( a, b, result );
  4549. exit;
  4550. End;
  4551. goto invalid;
  4552. End;
  4553. if ( bExp = $7FF ) then
  4554. Begin
  4555. if ( bSig0 OR bSig1 ) <> 0 then
  4556. Begin
  4557. propagateFloat64NaN( a, b, result );
  4558. exit;
  4559. End;
  4560. result := a;
  4561. exit;
  4562. End;
  4563. if ( bExp = 0 ) then
  4564. Begin
  4565. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4566. Begin
  4567. invalid:
  4568. float_raise( float_flag_invalid );
  4569. z.low := float64_default_nan_low;
  4570. z.high := float64_default_nan_high;
  4571. result := z;
  4572. exit;
  4573. End;
  4574. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4575. End;
  4576. if ( aExp = 0 ) then
  4577. Begin
  4578. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4579. Begin
  4580. result := a;
  4581. exit;
  4582. End;
  4583. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4584. End;
  4585. expDiff := aExp - bExp;
  4586. if ( expDiff < -1 ) then
  4587. Begin
  4588. result := a;
  4589. exit;
  4590. End;
  4591. shortShift64Left(
  4592. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4593. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4594. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4595. if ( q )<>0 then
  4596. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4597. expDiff := expDiff - 32;
  4598. while ( 0 < expDiff ) do
  4599. Begin
  4600. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4601. if 4 < q then
  4602. q:= q - 4
  4603. else
  4604. q := 0;
  4605. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4606. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4607. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4608. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4609. expDiff := expDiff - 29;
  4610. End;
  4611. if ( -32 < expDiff ) then
  4612. Begin
  4613. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4614. if 4 < q then
  4615. q := q - 4
  4616. else
  4617. q := 0;
  4618. q := q shr (- expDiff);
  4619. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4620. expDiff := expDiff + 24;
  4621. if ( expDiff < 0 ) then
  4622. Begin
  4623. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4624. End
  4625. else
  4626. Begin
  4627. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4628. End;
  4629. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4630. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4631. End
  4632. else
  4633. Begin
  4634. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4635. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4636. End;
  4637. Repeat
  4638. alternateASig0 := aSig0;
  4639. alternateASig1 := aSig1;
  4640. Inc(q);
  4641. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4642. Until not ( 0 <= sbits32 (aSig0) );
  4643. add64(
  4644. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4645. if ( ( sigMean0 < 0 )
  4646. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4647. Begin
  4648. aSig0 := alternateASig0;
  4649. aSig1 := alternateASig1;
  4650. End;
  4651. zSign := flag( sbits32 (aSig0) < 0 );
  4652. if ( zSign <> 0 ) then
  4653. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4654. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4655. End;
  4656. {*
  4657. -------------------------------------------------------------------------------
  4658. Returns the square root of the double-precision floating-point value `a'.
  4659. The operation is performed according to the IEC/IEEE Standard for Binary
  4660. Floating-Point Arithmetic.
  4661. -------------------------------------------------------------------------------
  4662. *}
  4663. Procedure float64_sqrt( a: float64; var out: float64 );
  4664. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4665. Var
  4666. aSign: flag;
  4667. aExp, zExp: int16;
  4668. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4669. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4670. z: float64;
  4671. label invalid;
  4672. Begin
  4673. aSig1 := extractFloat64Frac1( a );
  4674. aSig0 := extractFloat64Frac0( a );
  4675. aExp := extractFloat64Exp( a );
  4676. aSign := extractFloat64Sign( a );
  4677. if ( aExp = $7FF ) then
  4678. Begin
  4679. if ( aSig0 OR aSig1 ) <> 0 then
  4680. Begin
  4681. propagateFloat64NaN( a, a, out );
  4682. exit;
  4683. End;
  4684. if ( aSign = 0) then
  4685. Begin
  4686. out := a;
  4687. exit;
  4688. End;
  4689. goto invalid;
  4690. End;
  4691. if ( aSign <> 0 ) then
  4692. Begin
  4693. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4694. Begin
  4695. out := a;
  4696. exit;
  4697. End;
  4698. invalid:
  4699. float_raise( float_flag_invalid );
  4700. z.low := float64_default_nan_low;
  4701. z.high := float64_default_nan_high;
  4702. out := z;
  4703. exit;
  4704. End;
  4705. if ( aExp = 0 ) then
  4706. Begin
  4707. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4708. Begin
  4709. packFloat64( 0, 0, 0, 0, out );
  4710. exit;
  4711. End;
  4712. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4713. End;
  4714. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4715. aSig0 := aSig0 or $00100000;
  4716. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4717. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4718. if ( zSig0 = 0 ) then
  4719. zSig0 := $7FFFFFFF;
  4720. doubleZSig0 := zSig0 + zSig0;
  4721. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4722. mul32To64( zSig0, zSig0, term0, term1 );
  4723. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4724. while ( sbits32 (rem0) < 0 ) do
  4725. Begin
  4726. Dec(zSig0);
  4727. doubleZSig0 := doubleZSig0 - 2;
  4728. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4729. End;
  4730. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4731. if ( ( zSig1 and $1FF ) <= 5 ) then
  4732. Begin
  4733. if ( zSig1 = 0 ) then
  4734. zSig1 := 1;
  4735. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4736. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4737. mul32To64( zSig1, zSig1, term2, term3 );
  4738. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4739. while ( sbits32 (rem1) < 0 ) do
  4740. Begin
  4741. Dec(zSig1);
  4742. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4743. term3 := term3 or 1;
  4744. term2 := term2 or doubleZSig0;
  4745. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4746. End;
  4747. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4748. End;
  4749. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4750. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4751. End;
  4752. {*
  4753. -------------------------------------------------------------------------------
  4754. Returns 1 if the double-precision floating-point value `a' is equal to
  4755. the corresponding value `b', and 0 otherwise. The comparison is performed
  4756. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4757. -------------------------------------------------------------------------------
  4758. *}
  4759. Function float64_eq(a: float64; b: float64): flag;
  4760. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4761. Begin
  4762. if
  4763. (
  4764. ( extractFloat64Exp( a ) = $7FF )
  4765. AND
  4766. (
  4767. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4768. )
  4769. )
  4770. OR (
  4771. ( extractFloat64Exp( b ) = $7FF )
  4772. AND (
  4773. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4774. )
  4775. )
  4776. ) then
  4777. Begin
  4778. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4779. float_raise( float_flag_invalid );
  4780. float64_eq := 0;
  4781. exit;
  4782. End;
  4783. float64_eq := flag(
  4784. ( a.low = b.low )
  4785. AND ( ( a.high = b.high )
  4786. OR ( ( a.low = 0 )
  4787. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4788. ));
  4789. End;
  4790. {*
  4791. -------------------------------------------------------------------------------
  4792. Returns 1 if the double-precision floating-point value `a' is less than
  4793. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4794. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4795. Arithmetic.
  4796. -------------------------------------------------------------------------------
  4797. *}
  4798. Function float64_le(a: float64;b: float64): flag;
  4799. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4800. Var
  4801. aSign, bSign: flag;
  4802. Begin
  4803. if
  4804. (
  4805. ( extractFloat64Exp( a ) = $7FF )
  4806. AND
  4807. (
  4808. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4809. )
  4810. )
  4811. OR (
  4812. ( extractFloat64Exp( b ) = $7FF )
  4813. AND (
  4814. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4815. )
  4816. )
  4817. ) then
  4818. Begin
  4819. float_raise( float_flag_invalid );
  4820. float64_le := 0;
  4821. exit;
  4822. End;
  4823. aSign := extractFloat64Sign( a );
  4824. bSign := extractFloat64Sign( b );
  4825. if ( aSign <> bSign ) then
  4826. Begin
  4827. float64_le := flag(
  4828. (aSign <> 0)
  4829. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4830. = 0 ));
  4831. exit;
  4832. End;
  4833. if aSign <> 0 then
  4834. float64_le := le64( b.high, b.low, a.high, a.low )
  4835. else
  4836. float64_le := le64( a.high, a.low, b.high, b.low );
  4837. End;
  4838. {*
  4839. -------------------------------------------------------------------------------
  4840. Returns 1 if the double-precision floating-point value `a' is less than
  4841. the corresponding value `b', and 0 otherwise. The comparison is performed
  4842. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4843. -------------------------------------------------------------------------------
  4844. *}
  4845. Function float64_lt(a: float64;b: float64): flag;
  4846. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4847. Var
  4848. aSign, bSign: flag;
  4849. Begin
  4850. if
  4851. (
  4852. ( extractFloat64Exp( a ) = $7FF )
  4853. AND
  4854. (
  4855. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4856. )
  4857. )
  4858. OR (
  4859. ( extractFloat64Exp( b ) = $7FF )
  4860. AND (
  4861. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4862. )
  4863. )
  4864. ) then
  4865. Begin
  4866. float_raise( float_flag_invalid );
  4867. float64_lt := 0;
  4868. exit;
  4869. End;
  4870. aSign := extractFloat64Sign( a );
  4871. bSign := extractFloat64Sign( b );
  4872. if ( aSign <> bSign ) then
  4873. Begin
  4874. float64_lt := flag(
  4875. (aSign <> 0)
  4876. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4877. <> 0 ));
  4878. exit;
  4879. End;
  4880. if aSign <> 0 then
  4881. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4882. else
  4883. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4884. End;
  4885. {*
  4886. -------------------------------------------------------------------------------
  4887. Returns 1 if the double-precision floating-point value `a' is equal to
  4888. the corresponding value `b', and 0 otherwise. The invalid exception is
  4889. raised if either operand is a NaN. Otherwise, the comparison is performed
  4890. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4891. -------------------------------------------------------------------------------
  4892. *}
  4893. Function float64_eq_signaling( a: float64; b: float64): flag;
  4894. Begin
  4895. if
  4896. (
  4897. ( extractFloat64Exp( a ) = $7FF )
  4898. AND
  4899. (
  4900. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4901. )
  4902. )
  4903. OR (
  4904. ( extractFloat64Exp( b ) = $7FF )
  4905. AND (
  4906. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4907. )
  4908. )
  4909. ) then
  4910. Begin
  4911. float_raise( float_flag_invalid );
  4912. float64_eq_signaling := 0;
  4913. exit;
  4914. End;
  4915. float64_eq_signaling := flag(
  4916. ( a.low = b.low )
  4917. AND ( ( a.high = b.high )
  4918. OR ( ( a.low = 0 )
  4919. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4920. ));
  4921. End;
  4922. {*
  4923. -------------------------------------------------------------------------------
  4924. Returns 1 if the double-precision floating-point value `a' is less than or
  4925. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4926. cause an exception. Otherwise, the comparison is performed according to the
  4927. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4928. -------------------------------------------------------------------------------
  4929. *}
  4930. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4931. Var
  4932. aSign, bSign : flag;
  4933. Begin
  4934. if
  4935. (
  4936. ( extractFloat64Exp( a ) = $7FF )
  4937. AND
  4938. (
  4939. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4940. )
  4941. )
  4942. OR (
  4943. ( extractFloat64Exp( b ) = $7FF )
  4944. AND (
  4945. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4946. )
  4947. )
  4948. ) then
  4949. Begin
  4950. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4951. float_raise( float_flag_invalid );
  4952. float64_le_quiet := 0;
  4953. exit;
  4954. End;
  4955. aSign := extractFloat64Sign( a );
  4956. bSign := extractFloat64Sign( b );
  4957. if ( aSign <> bSign ) then
  4958. Begin
  4959. float64_le_quiet := flag
  4960. ((aSign <> 0)
  4961. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4962. = 0 ));
  4963. exit;
  4964. End;
  4965. if aSign <> 0 then
  4966. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4967. else
  4968. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4969. End;
  4970. {*
  4971. -------------------------------------------------------------------------------
  4972. Returns 1 if the double-precision floating-point value `a' is less than
  4973. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4974. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4975. Standard for Binary Floating-Point Arithmetic.
  4976. -------------------------------------------------------------------------------
  4977. *}
  4978. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  4979. Var
  4980. aSign, bSign: flag;
  4981. Begin
  4982. if
  4983. (
  4984. ( extractFloat64Exp( a ) = $7FF )
  4985. AND
  4986. (
  4987. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4988. )
  4989. )
  4990. OR (
  4991. ( extractFloat64Exp( b ) = $7FF )
  4992. AND (
  4993. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4994. )
  4995. )
  4996. ) then
  4997. Begin
  4998. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4999. float_raise( float_flag_invalid );
  5000. float64_lt_quiet := 0;
  5001. exit;
  5002. End;
  5003. aSign := extractFloat64Sign( a );
  5004. bSign := extractFloat64Sign( b );
  5005. if ( aSign <> bSign ) then
  5006. Begin
  5007. float64_lt_quiet := flag(
  5008. (aSign<>0)
  5009. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5010. <> 0 ));
  5011. exit;
  5012. End;
  5013. If aSign <> 0 then
  5014. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5015. else
  5016. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5017. End;
  5018. {*----------------------------------------------------------------------------
  5019. | Returns the result of converting the 64-bit two's complement integer `a'
  5020. | to the single-precision floating-point format. The conversion is performed
  5021. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5022. *----------------------------------------------------------------------------*}
  5023. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5024. var
  5025. zSign : flag;
  5026. absA : uint64;
  5027. shiftCount: int8;
  5028. zSig : bits32;
  5029. intval : int64rec;
  5030. Begin
  5031. if ( a = 0 ) then
  5032. begin
  5033. int64_to_float32.float32 := 0;
  5034. exit;
  5035. end;
  5036. if a < 0 then
  5037. zSign := flag(TRUE)
  5038. else
  5039. zSign := flag(FALSE);
  5040. if zSign<>0 then
  5041. absA := -a
  5042. else
  5043. absA := a;
  5044. shiftCount := countLeadingZeros64( absA ) - 40;
  5045. if ( 0 <= shiftCount ) then
  5046. begin
  5047. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5048. end
  5049. else
  5050. begin
  5051. shiftCount := shiftCount + 7;
  5052. if ( shiftCount < 0 ) then
  5053. begin
  5054. intval.low := int64rec(AbsA).low;
  5055. intval.high := int64rec(AbsA).high;
  5056. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5057. intval.low, intval.high);
  5058. int64rec(absA).low := intval.low;
  5059. int64rec(absA).high := intval.high;
  5060. end
  5061. else
  5062. absA := absA shl shiftCount;
  5063. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5064. end;
  5065. End;
  5066. {*----------------------------------------------------------------------------
  5067. | Returns the result of converting the 64-bit two's complement integer `a'
  5068. | to the double-precision floating-point format. The conversion is performed
  5069. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5070. *----------------------------------------------------------------------------*}
  5071. function int64_to_float64( a: int64 ): float64;
  5072. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5073. var
  5074. zSign : flag;
  5075. float_result : float64;
  5076. intval : int64rec;
  5077. AbsA : bits64;
  5078. shiftcount : int8;
  5079. zSig0, zSig1 : bits32;
  5080. Begin
  5081. if ( a = 0 ) then
  5082. Begin
  5083. packFloat64( 0, 0, 0, 0, result );
  5084. exit;
  5085. end;
  5086. zSign := flag( a < 0 );
  5087. if ZSign<>0 then
  5088. AbsA := -a
  5089. else
  5090. AbsA := a;
  5091. shiftCount := countLeadingZeros64( absA ) - 11;
  5092. if ( 0 <= shiftCount ) then
  5093. Begin
  5094. absA := absA shl shiftcount;
  5095. zSig0:=int64rec(absA).high;
  5096. zSig1:=int64rec(absA).low;
  5097. End
  5098. else
  5099. Begin
  5100. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  5101. End;
  5102. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5103. int64_to_float64:= float_result;
  5104. End;
  5105. {*----------------------------------------------------------------------------
  5106. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5107. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5108. | Otherwise, returns 0.
  5109. *----------------------------------------------------------------------------*}
  5110. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5111. begin
  5112. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5113. end;
  5114. {*----------------------------------------------------------------------------
  5115. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5116. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5117. | Otherwise, returns 0.
  5118. *----------------------------------------------------------------------------*}
  5119. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5120. begin
  5121. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5122. end;
  5123. {*----------------------------------------------------------------------------
  5124. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5125. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5126. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5127. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5128. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5129. | the most-significant bit of the extra result, and the other 63 bits of the
  5130. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5131. | were all zero. This extra result is stored in the location pointed to by
  5132. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5133. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5134. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5135. | fixed-point value is shifted right by the number of bits given in `count',
  5136. | and the integer part of the result is returned at the locations pointed to
  5137. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5138. | corrupted as described above, and is returned at the location pointed to by
  5139. | `z2Ptr'.)
  5140. *----------------------------------------------------------------------------*}
  5141. procedure shift128ExtraRightJamming(
  5142. a0: bits64;
  5143. a1: bits64;
  5144. a2: bits64;
  5145. count: int16;
  5146. var z0Ptr: bits64;
  5147. var z1Ptr: bits64;
  5148. var z2Ptr: bits64);
  5149. var
  5150. z0, z1, z2: bits64;
  5151. negCount: int8;
  5152. begin
  5153. negCount := ( - count ) and 63;
  5154. if ( count = 0 ) then
  5155. begin
  5156. z2 := a2;
  5157. z1 := a1;
  5158. z0 := a0;
  5159. end
  5160. else begin
  5161. if ( count < 64 ) then
  5162. begin
  5163. z2 := a1 shr negCount;
  5164. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5165. z0 := a0 shr count;
  5166. end
  5167. else begin
  5168. if ( count = 64 ) then
  5169. begin
  5170. z2 := a1;
  5171. z1 := a0;
  5172. end
  5173. else begin
  5174. a2 := a2 or a1;
  5175. if ( count < 128 ) then
  5176. begin
  5177. z2 := a0 shl negCount;
  5178. z1 := a0 shr ( count and 63 );
  5179. end
  5180. else begin
  5181. if ( count = 128 ) then
  5182. z2 := a0
  5183. else
  5184. z2 := ord( a0 <> 0 );
  5185. z1 := 0;
  5186. end;
  5187. end;
  5188. z0 := 0;
  5189. end;
  5190. z2 := z2 or ord( a2 <> 0 );
  5191. end;
  5192. z2Ptr := z2;
  5193. z1Ptr := z1;
  5194. z0Ptr := z0;
  5195. end;
  5196. {*----------------------------------------------------------------------------
  5197. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5198. | _plus_ the number of bits given in `count'. The shifted result is at most
  5199. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5200. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5201. | shifted off is the most-significant bit of the extra result, and the other
  5202. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5203. | bits shifted off were all zero. This extra result is stored in the location
  5204. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5205. | (This routine makes more sense if `a0' and `a1' are considered to form
  5206. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5207. | point value is shifted right by the number of bits given in `count', and
  5208. | the integer part of the result is returned at the location pointed to by
  5209. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5210. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5211. *----------------------------------------------------------------------------*}
  5212. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5213. var
  5214. z0, z1: bits64;
  5215. negCount: int8;
  5216. begin
  5217. negCount := ( - count ) and 63;
  5218. if ( count = 0 ) then
  5219. begin
  5220. z1 := a1;
  5221. z0 := a0;
  5222. end
  5223. else if ( count < 64 ) then
  5224. begin
  5225. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5226. z0 := a0 shr count;
  5227. end
  5228. else begin
  5229. if ( count = 64 ) then
  5230. begin
  5231. z1 := a0 or ord( a1 <> 0 );
  5232. end
  5233. else begin
  5234. z1 := ord( ( a0 or a1 ) <> 0 );
  5235. end;
  5236. z0 := 0;
  5237. end;
  5238. z1Ptr := z1;
  5239. z0Ptr := z0;
  5240. end;
  5241. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5242. {*----------------------------------------------------------------------------
  5243. | Returns the fraction bits of the extended double-precision floating-point
  5244. | value `a'.
  5245. *----------------------------------------------------------------------------*}
  5246. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5247. begin
  5248. result:=a.low;
  5249. end;
  5250. {*----------------------------------------------------------------------------
  5251. | Returns the exponent bits of the extended double-precision floating-point
  5252. | value `a'.
  5253. *----------------------------------------------------------------------------*}
  5254. function extractFloatx80Exp(a : floatx80): int32;inline;
  5255. begin
  5256. result:=a.high and $7FFF;
  5257. end;
  5258. {*----------------------------------------------------------------------------
  5259. | Returns the sign bit of the extended double-precision floating-point value
  5260. | `a'.
  5261. *----------------------------------------------------------------------------*}
  5262. function extractFloatx80Sign(a : floatx80): flag;inline;
  5263. begin
  5264. result:=a.high shr 15;
  5265. end;
  5266. {*----------------------------------------------------------------------------
  5267. | Normalizes the subnormal extended double-precision floating-point value
  5268. | represented by the denormalized significand `aSig'. The normalized exponent
  5269. | and significand are stored at the locations pointed to by `zExpPtr' and
  5270. | `zSigPtr', respectively.
  5271. *----------------------------------------------------------------------------*}
  5272. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5273. var
  5274. shiftCount: int8;
  5275. begin
  5276. shiftCount := countLeadingZeros64( aSig );
  5277. zSigPtr := aSig shl shiftCount;
  5278. zExpPtr := 1 - shiftCount;
  5279. end;
  5280. {*----------------------------------------------------------------------------
  5281. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5282. | extended double-precision floating-point value, returning the result.
  5283. *----------------------------------------------------------------------------*}
  5284. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5285. var
  5286. z: floatx80;
  5287. begin
  5288. z.low := zSig;
  5289. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5290. result:=z;
  5291. end;
  5292. {*----------------------------------------------------------------------------
  5293. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5294. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5295. | and returns the proper extended double-precision floating-point value
  5296. | corresponding to the abstract input. Ordinarily, the abstract value is
  5297. | rounded and packed into the extended double-precision format, with the
  5298. | inexact exception raised if the abstract input cannot be represented
  5299. | exactly. However, if the abstract value is too large, the overflow and
  5300. | inexact exceptions are raised and an infinity or maximal finite value is
  5301. | returned. If the abstract value is too small, the input value is rounded to
  5302. | a subnormal number, and the underflow and inexact exceptions are raised if
  5303. | the abstract input cannot be represented exactly as a subnormal extended
  5304. | double-precision floating-point number.
  5305. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5306. | number of bits as single or double precision, respectively. Otherwise, the
  5307. | result is rounded to the full precision of the extended double-precision
  5308. | format.
  5309. | The input significand must be normalized or smaller. If the input
  5310. | significand is not normalized, `zExp' must be 0; in that case, the result
  5311. | returned is a subnormal number, and it must not require rounding. The
  5312. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5313. | Floating-Point Arithmetic.
  5314. *----------------------------------------------------------------------------*}
  5315. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5316. var
  5317. roundingMode: int8;
  5318. roundNearestEven, increment, isTiny: flag;
  5319. roundIncrement, roundMask, roundBits: int64;
  5320. label
  5321. precision80;
  5322. begin
  5323. roundingMode := float_rounding_mode;
  5324. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5325. if ( roundingPrecision = 80 ) then
  5326. goto precision80;
  5327. if ( roundingPrecision = 64 ) then
  5328. begin
  5329. roundIncrement := int64( $0000000000000400 );
  5330. roundMask := int64( $00000000000007FF );
  5331. end
  5332. else if ( roundingPrecision = 32 ) then
  5333. begin
  5334. roundIncrement := int64( $0000008000000000 );
  5335. roundMask := int64( $000000FFFFFFFFFF );
  5336. end
  5337. else begin
  5338. goto precision80;
  5339. end;
  5340. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5341. if ( not (roundNearestEven<>0) ) then
  5342. begin
  5343. if ( roundingMode = float_round_to_zero ) then
  5344. begin
  5345. roundIncrement := 0;
  5346. end
  5347. else begin
  5348. roundIncrement := roundMask;
  5349. if ( zSign<>0 ) then
  5350. begin
  5351. if ( roundingMode = float_round_up ) then
  5352. roundIncrement := 0;
  5353. end
  5354. else begin
  5355. if ( roundingMode = float_round_down ) then
  5356. roundIncrement := 0;
  5357. end;
  5358. end;
  5359. end;
  5360. roundBits := zSig0 and roundMask;
  5361. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5362. if ( ( $7FFE < zExp )
  5363. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5364. ) begin
  5365. goto overflow;
  5366. end;
  5367. if ( zExp <= 0 ) begin
  5368. isTiny =
  5369. ( float_detect_tininess = float_tininess_before_rounding )
  5370. or ( zExp < 0 )
  5371. or ( zSig0 <= zSig0 + roundIncrement );
  5372. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5373. zExp := 0;
  5374. roundBits := zSig0 and roundMask;
  5375. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5376. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5377. zSig0 += roundIncrement;
  5378. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5379. roundIncrement := roundMask + 1;
  5380. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5381. roundMask |= roundIncrement;
  5382. end;
  5383. zSig0 = ~ roundMask;
  5384. result:=packFloatx80( zSign, zExp, zSig0 );
  5385. end;
  5386. end;
  5387. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5388. zSig0 += roundIncrement;
  5389. if ( zSig0 < roundIncrement ) begin
  5390. ++zExp;
  5391. zSig0 := LIT64( $8000000000000000 );
  5392. end;
  5393. roundIncrement := roundMask + 1;
  5394. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5395. roundMask |= roundIncrement;
  5396. end;
  5397. zSig0 = ~ roundMask;
  5398. if ( zSig0 = 0 ) zExp := 0;
  5399. result:=packFloatx80( zSign, zExp, zSig0 );
  5400. precision80:
  5401. increment := ( (sbits64) zSig1 < 0 );
  5402. if ( ! roundNearestEven ) begin
  5403. if ( roundingMode = float_round_to_zero ) begin
  5404. increment := 0;
  5405. end;
  5406. else begin
  5407. if ( zSign ) begin
  5408. increment := ( roundingMode = float_round_down ) and zSig1;
  5409. end;
  5410. else begin
  5411. increment := ( roundingMode = float_round_up ) and zSig1;
  5412. end;
  5413. end;
  5414. end;
  5415. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5416. if ( ( $7FFE < zExp )
  5417. or ( ( zExp = $7FFE )
  5418. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5419. and increment
  5420. )
  5421. ) begin
  5422. roundMask := 0;
  5423. overflow:
  5424. float_raise( float_flag_overflow or float_flag_inexact );
  5425. if ( ( roundingMode = float_round_to_zero )
  5426. or ( zSign and ( roundingMode = float_round_up ) )
  5427. or ( ! zSign and ( roundingMode = float_round_down ) )
  5428. ) begin
  5429. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5430. end;
  5431. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5432. end;
  5433. if ( zExp <= 0 ) begin
  5434. isTiny =
  5435. ( float_detect_tininess = float_tininess_before_rounding )
  5436. or ( zExp < 0 )
  5437. or ! increment
  5438. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5439. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5440. zExp := 0;
  5441. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5442. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5443. if ( roundNearestEven ) begin
  5444. increment := ( (sbits64) zSig1 < 0 );
  5445. end;
  5446. else begin
  5447. if ( zSign ) begin
  5448. increment := ( roundingMode = float_round_down ) and zSig1;
  5449. end;
  5450. else begin
  5451. increment := ( roundingMode = float_round_up ) and zSig1;
  5452. end;
  5453. end;
  5454. if ( increment ) begin
  5455. ++zSig0;
  5456. zSig0 =
  5457. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5458. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5459. end;
  5460. result:=packFloatx80( zSign, zExp, zSig0 );
  5461. end;
  5462. end;
  5463. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5464. if ( increment ) begin
  5465. ++zSig0;
  5466. if ( zSig0 = 0 ) begin
  5467. ++zExp;
  5468. zSig0 := LIT64( $8000000000000000 );
  5469. end;
  5470. else begin
  5471. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5472. end;
  5473. end;
  5474. else begin
  5475. if ( zSig0 = 0 ) zExp := 0;
  5476. end;
  5477. result:=packFloatx80( zSign, zExp, zSig0 );
  5478. end;
  5479. {*----------------------------------------------------------------------------
  5480. | Takes an abstract floating-point value having sign `zSign', exponent
  5481. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5482. | and returns the proper extended double-precision floating-point value
  5483. | corresponding to the abstract input. This routine is just like
  5484. | `roundAndPackFloatx80' except that the input significand does not have to be
  5485. | normalized.
  5486. *----------------------------------------------------------------------------*}
  5487. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5488. var
  5489. shiftCount: int8;
  5490. begin
  5491. if ( zSig0 = 0 ) begin
  5492. zSig0 := zSig1;
  5493. zSig1 := 0;
  5494. zExp -= 64;
  5495. end;
  5496. shiftCount := countLeadingZeros64( zSig0 );
  5497. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5498. zExp := eExp - shiftCount;
  5499. return
  5500. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5501. end;
  5502. {*----------------------------------------------------------------------------
  5503. | Returns the result of converting the extended double-precision floating-
  5504. | point value `a' to the 32-bit two's complement integer format. The
  5505. | conversion is performed according to the IEC/IEEE Standard for Binary
  5506. | Floating-Point Arithmetic---which means in particular that the conversion
  5507. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5508. | largest positive integer is returned. Otherwise, if the conversion
  5509. | overflows, the largest integer with the same sign as `a' is returned.
  5510. *----------------------------------------------------------------------------*}
  5511. function floatx80_to_int32(a: floatx80): int32;
  5512. var
  5513. aSign: flag;
  5514. aExp, shiftCount: int32;
  5515. aSig: bits64;
  5516. begin
  5517. aSig := extractFloatx80Frac( a );
  5518. aExp := extractFloatx80Exp( a );
  5519. aSign := extractFloatx80Sign( a );
  5520. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5521. shiftCount := $4037 - aExp;
  5522. if ( shiftCount <= 0 ) shiftCount := 1;
  5523. shift64RightJamming( aSig, shiftCount, aSig );
  5524. result := roundAndPackInt32( aSign, aSig );
  5525. end;
  5526. {*----------------------------------------------------------------------------
  5527. | Returns the result of converting the extended double-precision floating-
  5528. | point value `a' to the 32-bit two's complement integer format. The
  5529. | conversion is performed according to the IEC/IEEE Standard for Binary
  5530. | Floating-Point Arithmetic, except that the conversion is always rounded
  5531. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5532. | Otherwise, if the conversion overflows, the largest integer with the same
  5533. | sign as `a' is returned.
  5534. *----------------------------------------------------------------------------*}
  5535. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5536. var
  5537. aSign: flag;
  5538. aExp, shiftCount: int32;
  5539. aSig, savedASig: bits64;
  5540. z: int32;
  5541. begin
  5542. aSig := extractFloatx80Frac( a );
  5543. aExp := extractFloatx80Exp( a );
  5544. aSign := extractFloatx80Sign( a );
  5545. if ( $401E < aExp ) begin
  5546. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5547. goto invalid;
  5548. end;
  5549. else if ( aExp < $3FFF ) begin
  5550. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5551. result := 0;
  5552. end;
  5553. shiftCount := $403E - aExp;
  5554. savedASig := aSig;
  5555. aSig >>= shiftCount;
  5556. z := aSig;
  5557. if ( aSign ) z := - z;
  5558. if ( ( z < 0 ) xor aSign ) begin
  5559. invalid:
  5560. float_raise( float_flag_invalid );
  5561. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5562. end;
  5563. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5564. softfloat_exception_flags or= float_flag_inexact;
  5565. end;
  5566. result := z;
  5567. end;
  5568. {*----------------------------------------------------------------------------
  5569. | Returns the result of converting the extended double-precision floating-
  5570. | point value `a' to the 64-bit two's complement integer format. The
  5571. | conversion is performed according to the IEC/IEEE Standard for Binary
  5572. | Floating-Point Arithmetic---which means in particular that the conversion
  5573. | is rounded according to the current rounding mode. If `a' is a NaN,
  5574. | the largest positive integer is returned. Otherwise, if the conversion
  5575. | overflows, the largest integer with the same sign as `a' is returned.
  5576. *----------------------------------------------------------------------------*}
  5577. function floatx80_to_int64(a: floatx80): int64;
  5578. var
  5579. aSign: flag;
  5580. aExp, shiftCount: int32;
  5581. aSig, aSigExtra: bits64;
  5582. begin
  5583. aSig := extractFloatx80Frac( a );
  5584. aExp := extractFloatx80Exp( a );
  5585. aSign := extractFloatx80Sign( a );
  5586. shiftCount := $403E - aExp;
  5587. if ( shiftCount <= 0 ) begin
  5588. if ( shiftCount ) begin
  5589. float_raise( float_flag_invalid );
  5590. if ( ! aSign
  5591. or ( ( aExp = $7FFF )
  5592. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5593. ) begin
  5594. result := LIT64( $7FFFFFFFFFFFFFFF );
  5595. end;
  5596. result := (sbits64) LIT64( $8000000000000000 );
  5597. end;
  5598. aSigExtra := 0;
  5599. end;
  5600. else begin
  5601. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5602. end;
  5603. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5604. end;
  5605. {*----------------------------------------------------------------------------
  5606. | Returns the result of converting the extended double-precision floating-
  5607. | point value `a' to the 64-bit two's complement integer format. The
  5608. | conversion is performed according to the IEC/IEEE Standard for Binary
  5609. | Floating-Point Arithmetic, except that the conversion is always rounded
  5610. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5611. | Otherwise, if the conversion overflows, the largest integer with the same
  5612. | sign as `a' is returned.
  5613. *----------------------------------------------------------------------------*}
  5614. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5615. var
  5616. aSign: flag;
  5617. aExp, shiftCount: int32;
  5618. aSig: bits64;
  5619. z: int64;
  5620. begin
  5621. aSig := extractFloatx80Frac( a );
  5622. aExp := extractFloatx80Exp( a );
  5623. aSign := extractFloatx80Sign( a );
  5624. shiftCount := aExp - $403E;
  5625. if ( 0 <= shiftCount ) begin
  5626. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5627. if ( ( a.high <> $C03E ) or aSig ) begin
  5628. float_raise( float_flag_invalid );
  5629. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5630. result := LIT64( $7FFFFFFFFFFFFFFF );
  5631. end;
  5632. end;
  5633. result := (sbits64) LIT64( $8000000000000000 );
  5634. end;
  5635. else if ( aExp < $3FFF ) begin
  5636. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5637. result := 0;
  5638. end;
  5639. z := aSig>>( - shiftCount );
  5640. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5641. softfloat_exception_flags or= float_flag_inexact;
  5642. end;
  5643. if ( aSign ) z := - z;
  5644. result := z;
  5645. end;
  5646. {*----------------------------------------------------------------------------
  5647. | Returns the result of converting the extended double-precision floating-
  5648. | point value `a' to the single-precision floating-point format. The
  5649. | conversion is performed according to the IEC/IEEE Standard for Binary
  5650. | Floating-Point Arithmetic.
  5651. *----------------------------------------------------------------------------*}
  5652. function floatx80_to_float32(a: floatx80): float32;
  5653. var
  5654. aSign: flag;
  5655. aExp: int32;
  5656. aSig: bits64;
  5657. begin
  5658. aSig := extractFloatx80Frac( a );
  5659. aExp := extractFloatx80Exp( a );
  5660. aSign := extractFloatx80Sign( a );
  5661. if ( aExp = $7FFF ) begin
  5662. if ( (bits64) ( aSig shl 1 ) ) begin
  5663. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5664. end;
  5665. result := packFloat32( aSign, $FF, 0 );
  5666. end;
  5667. shift64RightJamming( aSig, 33, aSig );
  5668. if ( aExp or aSig ) aExp -= $3F81;
  5669. result := roundAndPackFloat32( aSign, aExp, aSig );
  5670. end;
  5671. {*----------------------------------------------------------------------------
  5672. | Returns the result of converting the extended double-precision floating-
  5673. | point value `a' to the double-precision floating-point format. The
  5674. | conversion is performed according to the IEC/IEEE Standard for Binary
  5675. | Floating-Point Arithmetic.
  5676. *----------------------------------------------------------------------------*}
  5677. function floatx80_to_float64(a: floatx80): float64;
  5678. var
  5679. aSign: flag;
  5680. aExp: int32;
  5681. aSig, zSig: bits64;
  5682. begin
  5683. aSig := extractFloatx80Frac( a );
  5684. aExp := extractFloatx80Exp( a );
  5685. aSign := extractFloatx80Sign( a );
  5686. if ( aExp = $7FFF ) begin
  5687. if ( (bits64) ( aSig shl 1 ) ) begin
  5688. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5689. end;
  5690. result := packFloat64( aSign, $7FF, 0 );
  5691. end;
  5692. shift64RightJamming( aSig, 1, zSig );
  5693. if ( aExp or aSig ) aExp -= $3C01;
  5694. result := roundAndPackFloat64( aSign, aExp, zSig );
  5695. end;
  5696. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5697. {*----------------------------------------------------------------------------
  5698. | Returns the result of converting the extended double-precision floating-
  5699. | point value `a' to the quadruple-precision floating-point format. The
  5700. | conversion is performed according to the IEC/IEEE Standard for Binary
  5701. | Floating-Point Arithmetic.
  5702. *----------------------------------------------------------------------------*}
  5703. function floatx80_to_float128(a: floatx80): float128;
  5704. var
  5705. aSign: flag;
  5706. aExp: int16;
  5707. aSig, zSig0, zSig1: bits64;
  5708. begin
  5709. aSig := extractFloatx80Frac( a );
  5710. aExp := extractFloatx80Exp( a );
  5711. aSign := extractFloatx80Sign( a );
  5712. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5713. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5714. end;
  5715. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5716. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5717. end;
  5718. {$endif FPC_SOFTFLOAT_FLOAT128}
  5719. {*----------------------------------------------------------------------------
  5720. | Rounds the extended double-precision floating-point value `a' to an integer,
  5721. | and Returns the result as an extended quadruple-precision floating-point
  5722. | value. The operation is performed according to the IEC/IEEE Standard for
  5723. | Binary Floating-Point Arithmetic.
  5724. *----------------------------------------------------------------------------*}
  5725. function floatx80_round_to_int(a: floatx80): floatx80;
  5726. var
  5727. aSign: flag;
  5728. aExp: int32;
  5729. lastBitMask, roundBitsMask: bits64;
  5730. roundingMode: int8;
  5731. z: floatx80;
  5732. begin
  5733. aExp := extractFloatx80Exp( a );
  5734. if ( $403E <= aExp ) begin
  5735. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5736. result := propagateFloatx80NaN( a, a );
  5737. end;
  5738. result := a;
  5739. end;
  5740. if ( aExp < $3FFF ) begin
  5741. if ( ( aExp = 0 )
  5742. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5743. result := a;
  5744. end;
  5745. softfloat_exception_flags or= float_flag_inexact;
  5746. aSign := extractFloatx80Sign( a );
  5747. switch ( float_rounding_mode ) begin
  5748. case float_round_nearest_even:
  5749. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5750. ) begin
  5751. result :=
  5752. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5753. end;
  5754. break;
  5755. case float_round_down:
  5756. result :=
  5757. aSign ?
  5758. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5759. : packFloatx80( 0, 0, 0 );
  5760. case float_round_up:
  5761. result :=
  5762. aSign ? packFloatx80( 1, 0, 0 )
  5763. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5764. end;
  5765. result := packFloatx80( aSign, 0, 0 );
  5766. end;
  5767. lastBitMask := 1;
  5768. lastBitMask shl = $403E - aExp;
  5769. roundBitsMask := lastBitMask - 1;
  5770. z := a;
  5771. roundingMode := float_rounding_mode;
  5772. if ( roundingMode = float_round_nearest_even ) begin
  5773. z.low += lastBitMask>>1;
  5774. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5775. end;
  5776. else if ( roundingMode <> float_round_to_zero ) begin
  5777. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5778. z.low += roundBitsMask;
  5779. end;
  5780. end;
  5781. z.low = ~ roundBitsMask;
  5782. if ( z.low = 0 ) begin
  5783. ++z.high;
  5784. z.low := LIT64( $8000000000000000 );
  5785. end;
  5786. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5787. result := z;
  5788. end;
  5789. {*----------------------------------------------------------------------------
  5790. | Returns the result of adding the absolute values of the extended double-
  5791. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5792. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5793. | The addition is performed according to the IEC/IEEE Standard for Binary
  5794. | Floating-Point Arithmetic.
  5795. *----------------------------------------------------------------------------*}
  5796. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5797. var
  5798. aExp, bExp, zExp: int32;
  5799. aSig, bSig, zSig0, zSig1: bits64;
  5800. expDiff: int32;
  5801. begin
  5802. aSig := extractFloatx80Frac( a );
  5803. aExp := extractFloatx80Exp( a );
  5804. bSig := extractFloatx80Frac( b );
  5805. bExp := extractFloatx80Exp( b );
  5806. expDiff := aExp - bExp;
  5807. if ( 0 < expDiff ) begin
  5808. if ( aExp = $7FFF ) begin
  5809. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5810. result := a;
  5811. end;
  5812. if ( bExp = 0 ) --expDiff;
  5813. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5814. zExp := aExp;
  5815. end;
  5816. else if ( expDiff < 0 ) begin
  5817. if ( bExp = $7FFF ) begin
  5818. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5819. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5820. end;
  5821. if ( aExp = 0 ) ++expDiff;
  5822. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5823. zExp := bExp;
  5824. end;
  5825. else begin
  5826. if ( aExp = $7FFF ) begin
  5827. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5828. result := propagateFloatx80NaN( a, b );
  5829. end;
  5830. result := a;
  5831. end;
  5832. zSig1 := 0;
  5833. zSig0 := aSig + bSig;
  5834. if ( aExp = 0 ) begin
  5835. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5836. goto roundAndPack;
  5837. end;
  5838. zExp := aExp;
  5839. goto shiftRight1;
  5840. end;
  5841. zSig0 := aSig + bSig;
  5842. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5843. shiftRight1:
  5844. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5845. zSig0 or= LIT64( $8000000000000000 );
  5846. ++zExp;
  5847. roundAndPack:
  5848. result :=
  5849. roundAndPackFloatx80(
  5850. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5851. end;
  5852. {*----------------------------------------------------------------------------
  5853. | Returns the result of subtracting the absolute values of the extended
  5854. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5855. | difference is negated before being returned. `zSign' is ignored if the
  5856. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5857. | Standard for Binary Floating-Point Arithmetic.
  5858. *----------------------------------------------------------------------------*}
  5859. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5860. var
  5861. aExp, bExp, zExp: int32;
  5862. aSig, bSig, zSig0, zSig1: bits64;
  5863. expDiff: int32;
  5864. z: floatx80;
  5865. begin
  5866. aSig := extractFloatx80Frac( a );
  5867. aExp := extractFloatx80Exp( a );
  5868. bSig := extractFloatx80Frac( b );
  5869. bExp := extractFloatx80Exp( b );
  5870. expDiff := aExp - bExp;
  5871. if ( 0 < expDiff ) goto aExpBigger;
  5872. if ( expDiff < 0 ) goto bExpBigger;
  5873. if ( aExp = $7FFF ) begin
  5874. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5875. result := propagateFloatx80NaN( a, b );
  5876. end;
  5877. float_raise( float_flag_invalid );
  5878. z.low := floatx80_default_nan_low;
  5879. z.high := floatx80_default_nan_high;
  5880. result := z;
  5881. end;
  5882. if ( aExp = 0 ) begin
  5883. aExp := 1;
  5884. bExp := 1;
  5885. end;
  5886. zSig1 := 0;
  5887. if ( bSig < aSig ) goto aBigger;
  5888. if ( aSig < bSig ) goto bBigger;
  5889. result := packFloatx80( float_rounding_mode = float_round_down, 0, 0 );
  5890. bExpBigger:
  5891. if ( bExp = $7FFF ) begin
  5892. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5893. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5894. end;
  5895. if ( aExp = 0 ) ++expDiff;
  5896. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5897. bBigger:
  5898. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  5899. zExp := bExp;
  5900. zSign xor = 1;
  5901. goto normalizeRoundAndPack;
  5902. aExpBigger:
  5903. if ( aExp = $7FFF ) begin
  5904. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5905. result := a;
  5906. end;
  5907. if ( bExp = 0 ) --expDiff;
  5908. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5909. aBigger:
  5910. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  5911. zExp := aExp;
  5912. normalizeRoundAndPack:
  5913. result :=
  5914. normalizeRoundAndPackFloatx80(
  5915. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5916. end;
  5917. {*----------------------------------------------------------------------------
  5918. | Returns the result of adding the extended double-precision floating-point
  5919. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  5920. | Standard for Binary Floating-Point Arithmetic.
  5921. *----------------------------------------------------------------------------*}
  5922. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  5923. var
  5924. aSign, bSign: flag;
  5925. begin
  5926. aSign := extractFloatx80Sign( a );
  5927. bSign := extractFloatx80Sign( b );
  5928. if ( aSign = bSign ) begin
  5929. result := addFloatx80Sigs( a, b, aSign );
  5930. end;
  5931. else begin
  5932. result := subFloatx80Sigs( a, b, aSign );
  5933. end;
  5934. end;
  5935. {*----------------------------------------------------------------------------
  5936. | Returns the result of subtracting the extended double-precision floating-
  5937. | point values `a' and `b'. The operation is performed according to the
  5938. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5939. *----------------------------------------------------------------------------*}
  5940. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  5941. var
  5942. aSign, bSign: flag;
  5943. begin
  5944. aSign := extractFloatx80Sign( a );
  5945. bSign := extractFloatx80Sign( b );
  5946. if ( aSign = bSign ) begin
  5947. result := subFloatx80Sigs( a, b, aSign );
  5948. end;
  5949. else begin
  5950. result := addFloatx80Sigs( a, b, aSign );
  5951. end;
  5952. end;
  5953. {*----------------------------------------------------------------------------
  5954. | Returns the result of multiplying the extended double-precision floating-
  5955. | point values `a' and `b'. The operation is performed according to the
  5956. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5957. *----------------------------------------------------------------------------*}
  5958. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  5959. var
  5960. aSign, bSign, zSign: flag;
  5961. aExp, bExp, zExp: int32;
  5962. aSig, bSig, zSig0, zSig1: bits64;
  5963. z: floatx80;
  5964. begin
  5965. aSig := extractFloatx80Frac( a );
  5966. aExp := extractFloatx80Exp( a );
  5967. aSign := extractFloatx80Sign( a );
  5968. bSig := extractFloatx80Frac( b );
  5969. bExp := extractFloatx80Exp( b );
  5970. bSign := extractFloatx80Sign( b );
  5971. zSign := aSign xor bSign;
  5972. if ( aExp = $7FFF ) begin
  5973. if ( (bits64) ( aSig shl 1 )
  5974. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  5975. result := propagateFloatx80NaN( a, b );
  5976. end;
  5977. if ( ( bExp or bSig ) = 0 ) goto invalid;
  5978. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5979. end;
  5980. if ( bExp = $7FFF ) begin
  5981. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5982. if ( ( aExp or aSig ) = 0 ) begin
  5983. invalid:
  5984. float_raise( float_flag_invalid );
  5985. z.low := floatx80_default_nan_low;
  5986. z.high := floatx80_default_nan_high;
  5987. result := z;
  5988. end;
  5989. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5990. end;
  5991. if ( aExp = 0 ) begin
  5992. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5993. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  5994. end;
  5995. if ( bExp = 0 ) begin
  5996. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  5997. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  5998. end;
  5999. zExp := aExp + bExp - $3FFE;
  6000. mul64To128( aSig, bSig, zSig0, zSig1 );
  6001. if ( 0 < (sbits64) zSig0 ) begin
  6002. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6003. --zExp;
  6004. end;
  6005. result :=
  6006. roundAndPackFloatx80(
  6007. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6008. end;
  6009. {*----------------------------------------------------------------------------
  6010. | Returns the result of dividing the extended double-precision floating-point
  6011. | value `a' by the corresponding value `b'. The operation is performed
  6012. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6013. *----------------------------------------------------------------------------*}
  6014. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6015. var
  6016. aSign, bSign, zSign: flag;
  6017. aExp, bExp, zExp: int32;
  6018. aSig, bSig, zSig0, zSig1: bits64;
  6019. rem0, rem1, rem2, term0, term1, term2: bits64;
  6020. z: floatx80;
  6021. begin
  6022. aSig := extractFloatx80Frac( a );
  6023. aExp := extractFloatx80Exp( a );
  6024. aSign := extractFloatx80Sign( a );
  6025. bSig := extractFloatx80Frac( b );
  6026. bExp := extractFloatx80Exp( b );
  6027. bSign := extractFloatx80Sign( b );
  6028. zSign := aSign xor bSign;
  6029. if ( aExp = $7FFF ) begin
  6030. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6031. if ( bExp = $7FFF ) begin
  6032. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6033. goto invalid;
  6034. end;
  6035. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6036. end;
  6037. if ( bExp = $7FFF ) begin
  6038. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6039. result := packFloatx80( zSign, 0, 0 );
  6040. end;
  6041. if ( bExp = 0 ) begin
  6042. if ( bSig = 0 ) begin
  6043. if ( ( aExp or aSig ) = 0 ) begin
  6044. invalid:
  6045. float_raise( float_flag_invalid );
  6046. z.low := floatx80_default_nan_low;
  6047. z.high := floatx80_default_nan_high;
  6048. result := z;
  6049. end;
  6050. float_raise( float_flag_divbyzero );
  6051. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6052. end;
  6053. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6054. end;
  6055. if ( aExp = 0 ) begin
  6056. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6057. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6058. end;
  6059. zExp := aExp - bExp + $3FFE;
  6060. rem1 := 0;
  6061. if ( bSig <= aSig ) begin
  6062. shift128Right( aSig, 0, 1, aSig, rem1 );
  6063. ++zExp;
  6064. end;
  6065. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6066. mul64To128( bSig, zSig0, term0, term1 );
  6067. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6068. while ( (sbits64) rem0 < 0 ) begin
  6069. --zSig0;
  6070. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6071. end;
  6072. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6073. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6074. mul64To128( bSig, zSig1, term1, term2 );
  6075. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6076. while ( (sbits64) rem1 < 0 ) begin
  6077. --zSig1;
  6078. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6079. end;
  6080. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6081. end;
  6082. result :=
  6083. roundAndPackFloatx80(
  6084. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6085. end;
  6086. {*----------------------------------------------------------------------------
  6087. | Returns the remainder of the extended double-precision floating-point value
  6088. | `a' with respect to the corresponding value `b'. The operation is performed
  6089. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6090. *----------------------------------------------------------------------------*}
  6091. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6092. var
  6093. aSign, bSign, zSign: flag;
  6094. aExp, bExp, expDiff: int32;
  6095. aSig0, aSig1, bSig: bits64;
  6096. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6097. z: floatx80;
  6098. begin
  6099. aSig0 := extractFloatx80Frac( a );
  6100. aExp := extractFloatx80Exp( a );
  6101. aSign := extractFloatx80Sign( a );
  6102. bSig := extractFloatx80Frac( b );
  6103. bExp := extractFloatx80Exp( b );
  6104. bSign := extractFloatx80Sign( b );
  6105. if ( aExp = $7FFF ) begin
  6106. if ( (bits64) ( aSig0 shl 1 )
  6107. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6108. result := propagateFloatx80NaN( a, b );
  6109. end;
  6110. goto invalid;
  6111. end;
  6112. if ( bExp = $7FFF ) begin
  6113. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6114. result := a;
  6115. end;
  6116. if ( bExp = 0 ) begin
  6117. if ( bSig = 0 ) begin
  6118. invalid:
  6119. float_raise( float_flag_invalid );
  6120. z.low := floatx80_default_nan_low;
  6121. z.high := floatx80_default_nan_high;
  6122. result := z;
  6123. end;
  6124. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6125. end;
  6126. if ( aExp = 0 ) begin
  6127. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6128. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6129. end;
  6130. bSig or= LIT64( $8000000000000000 );
  6131. zSign := aSign;
  6132. expDiff := aExp - bExp;
  6133. aSig1 := 0;
  6134. if ( expDiff < 0 ) begin
  6135. if ( expDiff < -1 ) result := a;
  6136. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6137. expDiff := 0;
  6138. end;
  6139. q := ( bSig <= aSig0 );
  6140. if ( q ) aSig0 -= bSig;
  6141. expDiff -= 64;
  6142. while ( 0 < expDiff ) begin
  6143. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6144. q := ( 2 < q ) ? q - 2 : 0;
  6145. mul64To128( bSig, q, term0, term1 );
  6146. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6147. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6148. expDiff -= 62;
  6149. end;
  6150. expDiff += 64;
  6151. if ( 0 < expDiff ) begin
  6152. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6153. q := ( 2 < q ) ? q - 2 : 0;
  6154. q >>= 64 - expDiff;
  6155. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6156. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6157. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6158. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6159. ++q;
  6160. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6161. end;
  6162. end;
  6163. else begin
  6164. term1 := 0;
  6165. term0 := bSig;
  6166. end;
  6167. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6168. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6169. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6170. and ( q and 1 ) )
  6171. ) begin
  6172. aSig0 := alternateASig0;
  6173. aSig1 := alternateASig1;
  6174. zSign := ! zSign;
  6175. end;
  6176. result :=
  6177. normalizeRoundAndPackFloatx80(
  6178. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6179. end;
  6180. {*----------------------------------------------------------------------------
  6181. | Returns the square root of the extended double-precision floating-point
  6182. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6183. | for Binary Floating-Point Arithmetic.
  6184. *----------------------------------------------------------------------------*}
  6185. function floatx80_sqrt(a: floatx80): floatx80;
  6186. var
  6187. aSign: flag;
  6188. aExp, zExp: int32;
  6189. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6190. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6191. z: floatx80;
  6192. label
  6193. invalid;
  6194. begin
  6195. aSig0 := extractFloatx80Frac( a );
  6196. aExp := extractFloatx80Exp( a );
  6197. aSign := extractFloatx80Sign( a );
  6198. if ( aExp = $7FFF ) begin
  6199. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6200. if ( ! aSign ) result := a;
  6201. goto invalid;
  6202. end;
  6203. if ( aSign ) begin
  6204. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6205. invalid:
  6206. float_raise( float_flag_invalid );
  6207. z.low := floatx80_default_nan_low;
  6208. z.high := floatx80_default_nan_high;
  6209. result := z;
  6210. end;
  6211. if ( aExp = 0 ) begin
  6212. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6213. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6214. end;
  6215. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6216. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6217. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6218. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6219. doubleZSig0 := zSig0 shl 1;
  6220. mul64To128( zSig0, zSig0, term0, term1 );
  6221. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6222. while ( (sbits64) rem0 < 0 ) begin
  6223. --zSig0;
  6224. doubleZSig0 -= 2;
  6225. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6226. end;
  6227. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6228. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6229. if ( zSig1 = 0 ) zSig1 := 1;
  6230. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6231. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6232. mul64To128( zSig1, zSig1, term2, term3 );
  6233. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6234. while ( (sbits64) rem1 < 0 ) begin
  6235. --zSig1;
  6236. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6237. term3 or= 1;
  6238. term2 or= doubleZSig0;
  6239. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6240. end;
  6241. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6242. end;
  6243. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6244. zSig0 or= doubleZSig0;
  6245. result :=
  6246. roundAndPackFloatx80(
  6247. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6248. end;
  6249. {*----------------------------------------------------------------------------
  6250. | Returns 1 if the extended double-precision floating-point value `a' is
  6251. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6252. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6253. | Arithmetic.
  6254. *----------------------------------------------------------------------------*}
  6255. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6256. begin
  6257. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6258. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6259. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6260. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6261. ) begin
  6262. if ( floatx80_is_signaling_nan( a )
  6263. or floatx80_is_signaling_nan( b ) ) begin
  6264. float_raise( float_flag_invalid );
  6265. end;
  6266. result := 0;
  6267. end;
  6268. result :=
  6269. ( a.low = b.low )
  6270. and ( ( a.high = b.high )
  6271. or ( ( a.low = 0 )
  6272. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6273. );
  6274. end;
  6275. {*----------------------------------------------------------------------------
  6276. | Returns 1 if the extended double-precision floating-point value `a' is
  6277. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6278. | comparison is performed according to the IEC/IEEE Standard for Binary
  6279. | Floating-Point Arithmetic.
  6280. *----------------------------------------------------------------------------*}
  6281. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6282. var
  6283. aSign, bSign: flag;
  6284. begin
  6285. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6286. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6287. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6288. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6289. ) begin
  6290. float_raise( float_flag_invalid );
  6291. result := 0;
  6292. end;
  6293. aSign := extractFloatx80Sign( a );
  6294. bSign := extractFloatx80Sign( b );
  6295. if ( aSign <> bSign ) begin
  6296. result :=
  6297. aSign
  6298. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6299. = 0 );
  6300. end;
  6301. result :=
  6302. aSign ? le128( b.high, b.low, a.high, a.low )
  6303. : le128( a.high, a.low, b.high, b.low );
  6304. end;
  6305. {*----------------------------------------------------------------------------
  6306. | Returns 1 if the extended double-precision floating-point value `a' is
  6307. | less than the corresponding value `b', and 0 otherwise. The comparison
  6308. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6309. | Arithmetic.
  6310. *----------------------------------------------------------------------------*}
  6311. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6312. var
  6313. aSign, bSign: flag;
  6314. begin
  6315. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6316. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6317. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6318. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6319. ) begin
  6320. float_raise( float_flag_invalid );
  6321. result := 0;
  6322. end;
  6323. aSign := extractFloatx80Sign( a );
  6324. bSign := extractFloatx80Sign( b );
  6325. if ( aSign <> bSign ) begin
  6326. result :=
  6327. aSign
  6328. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6329. <> 0 );
  6330. end;
  6331. result :=
  6332. aSign ? lt128( b.high, b.low, a.high, a.low )
  6333. : lt128( a.high, a.low, b.high, b.low );
  6334. end;
  6335. {*----------------------------------------------------------------------------
  6336. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6337. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6338. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6339. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6340. *----------------------------------------------------------------------------*}
  6341. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6342. begin
  6343. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6344. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6345. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6346. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6347. ) begin
  6348. float_raise( float_flag_invalid );
  6349. result := 0;
  6350. end;
  6351. result :=
  6352. ( a.low = b.low )
  6353. and ( ( a.high = b.high )
  6354. or ( ( a.low = 0 )
  6355. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6356. );
  6357. end;
  6358. {*----------------------------------------------------------------------------
  6359. | Returns 1 if the extended double-precision floating-point value `a' is less
  6360. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6361. | do not cause an exception. Otherwise, the comparison is performed according
  6362. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6363. *----------------------------------------------------------------------------*}
  6364. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6365. var
  6366. aSign, bSign: flag;
  6367. begin
  6368. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6369. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6370. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6371. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6372. ) begin
  6373. if ( floatx80_is_signaling_nan( a )
  6374. or floatx80_is_signaling_nan( b ) ) begin
  6375. float_raise( float_flag_invalid );
  6376. end;
  6377. result := 0;
  6378. end;
  6379. aSign := extractFloatx80Sign( a );
  6380. bSign := extractFloatx80Sign( b );
  6381. if ( aSign <> bSign ) begin
  6382. result :=
  6383. aSign
  6384. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6385. = 0 );
  6386. end;
  6387. result :=
  6388. aSign ? le128( b.high, b.low, a.high, a.low )
  6389. : le128( a.high, a.low, b.high, b.low );
  6390. end;
  6391. {*----------------------------------------------------------------------------
  6392. | Returns 1 if the extended double-precision floating-point value `a' is less
  6393. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6394. | an exception. Otherwise, the comparison is performed according to the
  6395. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6396. *----------------------------------------------------------------------------*}
  6397. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6398. var
  6399. aSign, bSign: flag;
  6400. begin
  6401. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6402. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6403. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6404. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6405. ) begin
  6406. if ( floatx80_is_signaling_nan( a )
  6407. or floatx80_is_signaling_nan( b ) ) begin
  6408. float_raise( float_flag_invalid );
  6409. end;
  6410. result := 0;
  6411. end;
  6412. aSign := extractFloatx80Sign( a );
  6413. bSign := extractFloatx80Sign( b );
  6414. if ( aSign <> bSign ) begin
  6415. result :=
  6416. aSign
  6417. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6418. <> 0 );
  6419. end;
  6420. result :=
  6421. aSign ? lt128( b.high, b.low, a.high, a.low )
  6422. : lt128( a.high, a.low, b.high, b.low );
  6423. end;
  6424. {$endif FPC_SOFTFLOAT_FLOATX80}
  6425. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6426. {*----------------------------------------------------------------------------
  6427. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6428. | floating-point value `a'.
  6429. *----------------------------------------------------------------------------*}
  6430. function extractFloat128Frac1(a : float128): bits64;
  6431. begin
  6432. result:=a.low;
  6433. end;
  6434. {*----------------------------------------------------------------------------
  6435. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6436. | floating-point value `a'.
  6437. *----------------------------------------------------------------------------*}
  6438. function extractFloat128Frac0(a : float128): bits64;
  6439. begin
  6440. result:=a.high and int64($0000FFFFFFFFFFFF);
  6441. end;
  6442. {*----------------------------------------------------------------------------
  6443. | Returns the exponent bits of the quadruple-precision floating-point value
  6444. | `a'.
  6445. *----------------------------------------------------------------------------*}
  6446. function extractFloat128Exp(a : float128): int32;
  6447. begin
  6448. result:=( a.high shr 48 ) and $7FFF;
  6449. end;
  6450. {*----------------------------------------------------------------------------
  6451. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6452. *----------------------------------------------------------------------------*}
  6453. function extractFloat128Sign(a : float128): flag;
  6454. begin
  6455. result:=a.high shr 63;
  6456. end;
  6457. {*----------------------------------------------------------------------------
  6458. | Normalizes the subnormal quadruple-precision floating-point value
  6459. | represented by the denormalized significand formed by the concatenation of
  6460. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6461. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6462. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6463. | least significant 64 bits of the normalized significand are stored at the
  6464. | location pointed to by `zSig1Ptr'.
  6465. *----------------------------------------------------------------------------*}
  6466. procedure normalizeFloat128Subnormal(
  6467. aSig0: bits64;
  6468. aSig1: bits64;
  6469. var zExpPtr: int32;
  6470. var zSig0Ptr: bits64;
  6471. var zSig1Ptr: bits64);
  6472. var
  6473. shiftCount: int8;
  6474. begin
  6475. if ( aSig0 = 0 ) then
  6476. begin
  6477. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6478. if ( shiftCount < 0 ) then
  6479. begin
  6480. zSig0Ptr := aSig1 shr ( - shiftCount );
  6481. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6482. end
  6483. else begin
  6484. zSig0Ptr := aSig1 shl shiftCount;
  6485. zSig1Ptr := 0;
  6486. end;
  6487. zExpPtr := - shiftCount - 63;
  6488. end
  6489. else begin
  6490. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6491. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6492. zExpPtr := 1 - shiftCount;
  6493. end;
  6494. end;
  6495. {*----------------------------------------------------------------------------
  6496. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6497. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6498. | floating-point value, returning the result. After being shifted into the
  6499. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6500. | added together to form the most significant 32 bits of the result. This
  6501. | means that any integer portion of `zSig0' will be added into the exponent.
  6502. | Since a properly normalized significand will have an integer portion equal
  6503. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6504. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6505. | significand.
  6506. *----------------------------------------------------------------------------*}
  6507. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6508. var
  6509. z: float128;
  6510. begin
  6511. z.low := zSig1;
  6512. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6513. result:=z;
  6514. end;
  6515. {*----------------------------------------------------------------------------
  6516. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6517. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6518. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6519. | corresponding to the abstract input. Ordinarily, the abstract value is
  6520. | simply rounded and packed into the quadruple-precision format, with the
  6521. | inexact exception raised if the abstract input cannot be represented
  6522. | exactly. However, if the abstract value is too large, the overflow and
  6523. | inexact exceptions are raised and an infinity or maximal finite value is
  6524. | returned. If the abstract value is too small, the input value is rounded to
  6525. | a subnormal number, and the underflow and inexact exceptions are raised if
  6526. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6527. | precision floating-point number.
  6528. | The input significand must be normalized or smaller. If the input
  6529. | significand is not normalized, `zExp' must be 0; in that case, the result
  6530. | returned is a subnormal number, and it must not require rounding. In the
  6531. | usual case that the input significand is normalized, `zExp' must be 1 less
  6532. | than the ``true'' floating-point exponent. The handling of underflow and
  6533. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6534. *----------------------------------------------------------------------------*}
  6535. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6536. var
  6537. roundingMode: int8;
  6538. roundNearestEven, increment, isTiny: flag;
  6539. begin
  6540. roundingMode := float_rounding_mode;
  6541. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6542. increment := ord( sbits64(zSig2) < 0 );
  6543. if ( roundNearestEven=0 ) then
  6544. begin
  6545. if ( roundingMode = float_round_to_zero ) then
  6546. begin
  6547. increment := 0;
  6548. end
  6549. else begin
  6550. if ( zSign<>0 ) then
  6551. begin
  6552. increment := ord( roundingMode = float_round_down ) and zSig2;
  6553. end
  6554. else begin
  6555. increment := ord( roundingMode = float_round_up ) and zSig2;
  6556. end;
  6557. end;
  6558. end;
  6559. if ( $7FFD <= bits32(zExp) ) then
  6560. begin
  6561. if ( ord( $7FFD < zExp )
  6562. or ( ord( zExp = $7FFD )
  6563. and eq128(
  6564. int64( $0001FFFFFFFFFFFF ),
  6565. int64( $FFFFFFFFFFFFFFFF ),
  6566. zSig0,
  6567. zSig1
  6568. )
  6569. and increment
  6570. )
  6571. )<>0 then
  6572. begin
  6573. float_raise( float_flag_overflow or float_flag_inexact );
  6574. if ( ord( roundingMode = float_round_to_zero )
  6575. or ( zSign and ord( roundingMode = float_round_up ) )
  6576. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6577. )<>0 then
  6578. begin
  6579. result :=
  6580. packFloat128(
  6581. zSign,
  6582. $7FFE,
  6583. int64( $0000FFFFFFFFFFFF ),
  6584. int64( $FFFFFFFFFFFFFFFF )
  6585. );
  6586. end;
  6587. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6588. end;
  6589. if ( zExp < 0 ) then
  6590. begin
  6591. isTiny :=
  6592. ord(( float_detect_tininess = float_tininess_before_rounding )
  6593. or ( zExp < -1 )
  6594. or not( increment<>0 )
  6595. or boolean(lt128(
  6596. zSig0,
  6597. zSig1,
  6598. int64( $0001FFFFFFFFFFFF ),
  6599. int64( $FFFFFFFFFFFFFFFF )
  6600. )));
  6601. shift128ExtraRightJamming(
  6602. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6603. zExp := 0;
  6604. if ( isTiny and zSig2 )<>0 then
  6605. float_raise( float_flag_underflow );
  6606. if ( roundNearestEven<>0 ) then
  6607. begin
  6608. increment := ord( sbits64(zSig2) < 0 );
  6609. end
  6610. else begin
  6611. if ( zSign<>0 ) then
  6612. begin
  6613. increment := ord( roundingMode = float_round_down ) and zSig2;
  6614. end
  6615. else begin
  6616. increment := ord( roundingMode = float_round_up ) and zSig2;
  6617. end;
  6618. end;
  6619. end;
  6620. end;
  6621. if ( zSig2<>0 ) then
  6622. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6623. if ( increment<>0 ) then
  6624. begin
  6625. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6626. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6627. end
  6628. else begin
  6629. if ( ( zSig0 or zSig1 ) = 0 ) then
  6630. zExp := 0;
  6631. end;
  6632. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6633. end;
  6634. {*----------------------------------------------------------------------------
  6635. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6636. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6637. | returns the proper quadruple-precision floating-point value corresponding
  6638. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6639. | except that the input significand has fewer bits and does not have to be
  6640. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6641. | point exponent.
  6642. *----------------------------------------------------------------------------*}
  6643. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6644. var
  6645. shiftCount: int8;
  6646. zSig2: bits64;
  6647. begin
  6648. if ( zSig0 = 0 ) then
  6649. begin
  6650. zSig0 := zSig1;
  6651. zSig1 := 0;
  6652. dec(zExp, 64);
  6653. end;
  6654. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6655. if ( 0 <= shiftCount ) then
  6656. begin
  6657. zSig2 := 0;
  6658. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6659. end
  6660. else begin
  6661. shift128ExtraRightJamming(
  6662. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6663. end;
  6664. dec(zExp, shiftCount);
  6665. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6666. end;
  6667. {*----------------------------------------------------------------------------
  6668. | Returns the result of converting the quadruple-precision floating-point
  6669. | value `a' to the 32-bit two's complement integer format. The conversion
  6670. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6671. | Arithmetic---which means in particular that the conversion is rounded
  6672. | according to the current rounding mode. If `a' is a NaN, the largest
  6673. | positive integer is returned. Otherwise, if the conversion overflows, the
  6674. | largest integer with the same sign as `a' is returned.
  6675. *----------------------------------------------------------------------------*}
  6676. function float128_to_int32(a: float128): int32;
  6677. var
  6678. aSign: flag;
  6679. aExp, shiftCount: int32;
  6680. aSig0, aSig1: bits64;
  6681. begin
  6682. aSig1 := extractFloat128Frac1( a );
  6683. aSig0 := extractFloat128Frac0( a );
  6684. aExp := extractFloat128Exp( a );
  6685. aSign := extractFloat128Sign( a );
  6686. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6687. aSign := 0;
  6688. if ( aExp<>0 ) then
  6689. aSig0 := aSig0 or int64( $0001000000000000 );
  6690. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6691. shiftCount := $4028 - aExp;
  6692. if ( 0 < shiftCount ) then
  6693. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6694. result := roundAndPackInt32( aSign, aSig0 );
  6695. end;
  6696. {*----------------------------------------------------------------------------
  6697. | Returns the result of converting the quadruple-precision floating-point
  6698. | value `a' to the 32-bit two's complement integer format. The conversion
  6699. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6700. | Arithmetic, except that the conversion is always rounded toward zero. If
  6701. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6702. | conversion overflows, the largest integer with the same sign as `a' is
  6703. | returned.
  6704. *----------------------------------------------------------------------------*}
  6705. function float128_to_int32_round_to_zero(a: float128): int32;
  6706. var
  6707. aSign: flag;
  6708. aExp, shiftCount: int32;
  6709. aSig0, aSig1, savedASig: bits64;
  6710. z: int32;
  6711. label
  6712. invalid;
  6713. begin
  6714. aSig1 := extractFloat128Frac1( a );
  6715. aSig0 := extractFloat128Frac0( a );
  6716. aExp := extractFloat128Exp( a );
  6717. aSign := extractFloat128Sign( a );
  6718. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6719. if ( $401E < aExp ) then
  6720. begin
  6721. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6722. aSign := 0;
  6723. goto invalid;
  6724. end
  6725. else if ( aExp < $3FFF ) then
  6726. begin
  6727. if ( aExp or aSig0 )<>0 then
  6728. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6729. result := 0;
  6730. exit;
  6731. end;
  6732. aSig0 := aSig0 or int64( $0001000000000000 );
  6733. shiftCount := $402F - aExp;
  6734. savedASig := aSig0;
  6735. aSig0 := aSig0 shr shiftCount;
  6736. z := aSig0;
  6737. if ( aSign )<>0 then
  6738. z := - z;
  6739. if ( ord( z < 0 ) xor aSign )<>0 then
  6740. begin
  6741. invalid:
  6742. float_raise( float_flag_invalid );
  6743. if aSign<>0 then
  6744. result:=$80000000
  6745. else
  6746. result:=$7FFFFFFF;
  6747. exit;
  6748. end;
  6749. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6750. begin
  6751. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6752. end;
  6753. result := z;
  6754. end;
  6755. {*----------------------------------------------------------------------------
  6756. | Returns the result of converting the quadruple-precision floating-point
  6757. | value `a' to the 64-bit two's complement integer format. The conversion
  6758. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6759. | Arithmetic---which means in particular that the conversion is rounded
  6760. | according to the current rounding mode. If `a' is a NaN, the largest
  6761. | positive integer is returned. Otherwise, if the conversion overflows, the
  6762. | largest integer with the same sign as `a' is returned.
  6763. *----------------------------------------------------------------------------*}
  6764. function float128_to_int64(a: float128): int64;
  6765. var
  6766. aSign: flag;
  6767. aExp, shiftCount: int32;
  6768. aSig0, aSig1: bits64;
  6769. begin
  6770. aSig1 := extractFloat128Frac1( a );
  6771. aSig0 := extractFloat128Frac0( a );
  6772. aExp := extractFloat128Exp( a );
  6773. aSign := extractFloat128Sign( a );
  6774. if ( aExp<>0 ) then
  6775. aSig0 := aSig0 or int64( $0001000000000000 );
  6776. shiftCount := $402F - aExp;
  6777. if ( shiftCount <= 0 ) then
  6778. begin
  6779. if ( $403E < aExp ) then
  6780. begin
  6781. float_raise( float_flag_invalid );
  6782. if ( (aSign=0)
  6783. or ( ( aExp = $7FFF )
  6784. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6785. )
  6786. ) then
  6787. begin
  6788. result := int64( $7FFFFFFFFFFFFFFF );
  6789. end;
  6790. result := int64( $8000000000000000 );
  6791. end;
  6792. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6793. end
  6794. else begin
  6795. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6796. end;
  6797. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6798. end;
  6799. {*----------------------------------------------------------------------------
  6800. | Returns the result of converting the quadruple-precision floating-point
  6801. | value `a' to the 64-bit two's complement integer format. The conversion
  6802. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6803. | Arithmetic, except that the conversion is always rounded toward zero.
  6804. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6805. | the conversion overflows, the largest integer with the same sign as `a' is
  6806. | returned.
  6807. *----------------------------------------------------------------------------*}
  6808. function float128_to_int64_round_to_zero(a: float128): int64;
  6809. var
  6810. aSign: flag;
  6811. aExp, shiftCount: int32;
  6812. aSig0, aSig1: bits64;
  6813. z: int64;
  6814. begin
  6815. aSig1 := extractFloat128Frac1( a );
  6816. aSig0 := extractFloat128Frac0( a );
  6817. aExp := extractFloat128Exp( a );
  6818. aSign := extractFloat128Sign( a );
  6819. if ( aExp<>0 ) then
  6820. aSig0 := aSig0 or int64( $0001000000000000 );
  6821. shiftCount := aExp - $402F;
  6822. if ( 0 < shiftCount ) then
  6823. begin
  6824. if ( $403E <= aExp ) then
  6825. begin
  6826. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6827. if ( ( a.high = int64( $C03E000000000000 ) )
  6828. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6829. begin
  6830. if ( aSig1<>0 ) then
  6831. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6832. end
  6833. else begin
  6834. float_raise( float_flag_invalid );
  6835. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6836. begin
  6837. result := int64( $7FFFFFFFFFFFFFFF );
  6838. exit;
  6839. end;
  6840. end;
  6841. result := int64( $8000000000000000 );
  6842. exit;
  6843. end;
  6844. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6845. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6846. begin
  6847. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6848. end;
  6849. end
  6850. else begin
  6851. if ( aExp < $3FFF ) then
  6852. begin
  6853. if ( aExp or aSig0 or aSig1 )<>0 then
  6854. begin
  6855. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6856. end;
  6857. result := 0;
  6858. exit;
  6859. end;
  6860. z := aSig0 shr ( - shiftCount );
  6861. if ( (aSig1<>0)
  6862. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6863. begin
  6864. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6865. end;
  6866. end;
  6867. if ( aSign<>0 ) then
  6868. z := - z;
  6869. result := z;
  6870. end;
  6871. {*----------------------------------------------------------------------------
  6872. | Returns the result of converting the quadruple-precision floating-point
  6873. | value `a' to the single-precision floating-point format. The conversion
  6874. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6875. | Arithmetic.
  6876. *----------------------------------------------------------------------------*}
  6877. function float128_to_float32(a: float128): float32;
  6878. var
  6879. aSign: flag;
  6880. aExp: int32;
  6881. aSig0, aSig1: bits64;
  6882. zSig: bits32;
  6883. begin
  6884. aSig1 := extractFloat128Frac1( a );
  6885. aSig0 := extractFloat128Frac0( a );
  6886. aExp := extractFloat128Exp( a );
  6887. aSign := extractFloat128Sign( a );
  6888. if ( aExp = $7FFF ) then
  6889. begin
  6890. if ( aSig0 or aSig1 )<>0 then
  6891. begin
  6892. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6893. exit;
  6894. end;
  6895. result := packFloat32( aSign, $FF, 0 );
  6896. exit;
  6897. end;
  6898. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6899. shift64RightJamming( aSig0, 18, aSig0 );
  6900. zSig := aSig0;
  6901. if ( aExp or zSig )<>0 then
  6902. begin
  6903. zSig := zSig or $40000000;
  6904. dec(aExp,$3F81);
  6905. end;
  6906. result := roundAndPackFloat32( aSign, aExp, zSig );
  6907. end;
  6908. {*----------------------------------------------------------------------------
  6909. | Returns the result of converting the quadruple-precision floating-point
  6910. | value `a' to the double-precision floating-point format. The conversion
  6911. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6912. | Arithmetic.
  6913. *----------------------------------------------------------------------------*}
  6914. function float128_to_float64(a: float128): float64;
  6915. var
  6916. aSign: flag;
  6917. aExp: int32;
  6918. aSig0, aSig1: bits64;
  6919. begin
  6920. aSig1 := extractFloat128Frac1( a );
  6921. aSig0 := extractFloat128Frac0( a );
  6922. aExp := extractFloat128Exp( a );
  6923. aSign := extractFloat128Sign( a );
  6924. if ( aExp = $7FFF ) then
  6925. begin
  6926. if ( aSig0 or aSig1 )<>0 then
  6927. begin
  6928. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  6929. exit;
  6930. end;
  6931. result:=packFloat64( aSign, $7FF, 0);
  6932. exit;
  6933. end;
  6934. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  6935. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6936. if ( aExp or aSig0 )<>0 then
  6937. begin
  6938. aSig0 := aSig0 or int64( $4000000000000000 );
  6939. dec(aExp,$3C01);
  6940. end;
  6941. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  6942. end;
  6943. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  6944. {*----------------------------------------------------------------------------
  6945. | Returns the result of converting the quadruple-precision floating-point
  6946. | value `a' to the extended double-precision floating-point format. The
  6947. | conversion is performed according to the IEC/IEEE Standard for Binary
  6948. | Floating-Point Arithmetic.
  6949. *----------------------------------------------------------------------------*}
  6950. function float128_to_floatx80(a: float128): floatx80;
  6951. var
  6952. aSign: flag;
  6953. aExp: int32;
  6954. aSig0, aSig1: bits64;
  6955. begin
  6956. aSig1 := extractFloat128Frac1( a );
  6957. aSig0 := extractFloat128Frac0( a );
  6958. aExp := extractFloat128Exp( a );
  6959. aSign := extractFloat128Sign( a );
  6960. if ( aExp = $7FFF ) begin
  6961. if ( aSig0 or aSig1 ) begin
  6962. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  6963. exit;
  6964. end;
  6965. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  6966. exit;
  6967. end;
  6968. if ( aExp = 0 ) begin
  6969. if ( ( aSig0 or aSig1 ) = 0 ) then
  6970. begin
  6971. result := packFloatx80( aSign, 0, 0 );
  6972. exit;
  6973. end;
  6974. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  6975. end;
  6976. else begin
  6977. aSig0 or= int64( $0001000000000000 );
  6978. end;
  6979. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  6980. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  6981. end;
  6982. {$endif FPC_SOFTFLOAT_FLOATX80}
  6983. {*----------------------------------------------------------------------------
  6984. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  6985. | Returns the result as a quadruple-precision floating-point value. The
  6986. | operation is performed according to the IEC/IEEE Standard for Binary
  6987. | Floating-Point Arithmetic.
  6988. *----------------------------------------------------------------------------*}
  6989. function float128_round_to_int(a: float128): float128;
  6990. var
  6991. aSign: flag;
  6992. aExp: int32;
  6993. lastBitMask, roundBitsMask: bits64;
  6994. roundingMode: int8;
  6995. z: float128;
  6996. begin
  6997. aExp := extractFloat128Exp( a );
  6998. if ( $402F <= aExp ) then
  6999. begin
  7000. if ( $406F <= aExp ) then
  7001. begin
  7002. if ( ( aExp = $7FFF )
  7003. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7004. ) then
  7005. begin
  7006. result := propagateFloat128NaN( a, a );
  7007. exit;
  7008. end;
  7009. result := a;
  7010. exit;
  7011. end;
  7012. lastBitMask := 1;
  7013. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7014. roundBitsMask := lastBitMask - 1;
  7015. z := a;
  7016. roundingMode := float_rounding_mode;
  7017. if ( roundingMode = float_round_nearest_even ) then
  7018. begin
  7019. if ( lastBitMask )<>0 then
  7020. begin
  7021. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7022. if ( ( z.low and roundBitsMask ) = 0 ) then
  7023. z.low := z.low and not(lastBitMask);
  7024. end
  7025. else begin
  7026. if ( sbits64(z.low) < 0 ) then
  7027. begin
  7028. inc(z.high);
  7029. if ( bits64( z.low shl 1 ) = 0 ) then
  7030. z.high := z.high and not(1);
  7031. end;
  7032. end;
  7033. end
  7034. else if ( roundingMode <> float_round_to_zero ) then
  7035. begin
  7036. if ( extractFloat128Sign( z )
  7037. xor ord( roundingMode = float_round_up ) )<>0 then
  7038. begin
  7039. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7040. end;
  7041. end;
  7042. z.low := z.low and not(roundBitsMask);
  7043. end
  7044. else begin
  7045. if ( aExp < $3FFF ) then
  7046. begin
  7047. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7048. begin
  7049. result := a;
  7050. exit;
  7051. end;
  7052. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7053. aSign := extractFloat128Sign( a );
  7054. case float_rounding_mode of
  7055. float_round_nearest_even:
  7056. if ( ( aExp = $3FFE )
  7057. and ( (extractFloat128Frac0( a )<>0)
  7058. or (extractFloat128Frac1( a )<>0) )
  7059. ) then begin
  7060. begin
  7061. result := packFloat128( aSign, $3FFF, 0, 0 );
  7062. exit;
  7063. end;
  7064. end;
  7065. float_round_down:
  7066. begin
  7067. if aSign<>0 then
  7068. result:=packFloat128( 1, $3FFF, 0, 0 )
  7069. else
  7070. result:=packFloat128( 0, 0, 0, 0 );
  7071. exit;
  7072. end;
  7073. float_round_up:
  7074. begin
  7075. if aSign<>0 then
  7076. result := packFloat128( 1, 0, 0, 0 )
  7077. else
  7078. result:=packFloat128( 0, $3FFF, 0, 0 );
  7079. exit;
  7080. end;
  7081. end;
  7082. result := packFloat128( aSign, 0, 0, 0 );
  7083. exit;
  7084. end;
  7085. lastBitMask := 1;
  7086. lastBitMask := lastBitMask shl ($402F - aExp);
  7087. roundBitsMask := lastBitMask - 1;
  7088. z.low := 0;
  7089. z.high := a.high;
  7090. roundingMode := float_rounding_mode;
  7091. if ( roundingMode = float_round_nearest_even ) then begin
  7092. inc(z.high,lastBitMask shr 1);
  7093. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7094. z.high := z.high and not(lastBitMask);
  7095. end;
  7096. end
  7097. else if ( roundingMode <> float_round_to_zero ) then begin
  7098. if ( (extractFloat128Sign( z )<>0)
  7099. xor ( roundingMode = float_round_up ) ) then begin
  7100. z.high := z.high or ord( a.low <> 0 );
  7101. z.high := z.high+roundBitsMask;
  7102. end;
  7103. end;
  7104. z.high := z.high and not(roundBitsMask);
  7105. end;
  7106. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7107. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7108. end;
  7109. result := z;
  7110. end;
  7111. {*----------------------------------------------------------------------------
  7112. | Returns the result of adding the absolute values of the quadruple-precision
  7113. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7114. | before being returned. `zSign' is ignored if the result is a NaN.
  7115. | The addition is performed according to the IEC/IEEE Standard for Binary
  7116. | Floating-Point Arithmetic.
  7117. *----------------------------------------------------------------------------*}
  7118. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7119. var
  7120. aExp, bExp, zExp: int32;
  7121. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7122. expDiff: int32;
  7123. label
  7124. shiftRight1,roundAndPack;
  7125. begin
  7126. aSig1 := extractFloat128Frac1( a );
  7127. aSig0 := extractFloat128Frac0( a );
  7128. aExp := extractFloat128Exp( a );
  7129. bSig1 := extractFloat128Frac1( b );
  7130. bSig0 := extractFloat128Frac0( b );
  7131. bExp := extractFloat128Exp( b );
  7132. expDiff := aExp - bExp;
  7133. if ( 0 < expDiff ) then begin
  7134. if ( aExp = $7FFF ) then begin
  7135. if ( aSig0 or aSig1 )<>0 then
  7136. begin
  7137. result := propagateFloat128NaN( a, b );
  7138. exit;
  7139. end;
  7140. result := a;
  7141. exit;
  7142. end;
  7143. if ( bExp = 0 ) then begin
  7144. dec(expDiff);
  7145. end
  7146. else begin
  7147. bSig0 := bSig0 or int64( $0001000000000000 );
  7148. end;
  7149. shift128ExtraRightJamming(
  7150. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7151. zExp := aExp;
  7152. end
  7153. else if ( expDiff < 0 ) then begin
  7154. if ( bExp = $7FFF ) then begin
  7155. if ( bSig0 or bSig1 )<>0 then
  7156. begin
  7157. result := propagateFloat128NaN( a, b );
  7158. exit;
  7159. end;
  7160. result := packFloat128( zSign, $7FFF, 0, 0 );
  7161. exit;
  7162. end;
  7163. if ( aExp = 0 ) then begin
  7164. inc(expDiff);
  7165. end
  7166. else begin
  7167. aSig0 := aSig0 or int64( $0001000000000000 );
  7168. end;
  7169. shift128ExtraRightJamming(
  7170. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7171. zExp := bExp;
  7172. end
  7173. else begin
  7174. if ( aExp = $7FFF ) then begin
  7175. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7176. result := propagateFloat128NaN( a, b );
  7177. exit;
  7178. end;
  7179. result := a;
  7180. exit;
  7181. end;
  7182. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7183. if ( aExp = 0 ) then
  7184. begin
  7185. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7186. exit;
  7187. end;
  7188. zSig2 := 0;
  7189. zSig0 := zSig0 or int64( $0002000000000000 );
  7190. zExp := aExp;
  7191. goto shiftRight1;
  7192. end;
  7193. aSig0 := aSig0 or int64( $0001000000000000 );
  7194. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7195. dec(zExp);
  7196. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7197. inc(zExp);
  7198. shiftRight1:
  7199. shift128ExtraRightJamming(
  7200. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7201. roundAndPack:
  7202. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7203. end;
  7204. {*----------------------------------------------------------------------------
  7205. | Returns the result of subtracting the absolute values of the quadruple-
  7206. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7207. | difference is negated before being returned. `zSign' is ignored if the
  7208. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7209. | Standard for Binary Floating-Point Arithmetic.
  7210. *----------------------------------------------------------------------------*}
  7211. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7212. var
  7213. aExp, bExp, zExp: int32;
  7214. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7215. expDiff: int32;
  7216. z: float128;
  7217. label
  7218. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7219. begin
  7220. aSig1 := extractFloat128Frac1( a );
  7221. aSig0 := extractFloat128Frac0( a );
  7222. aExp := extractFloat128Exp( a );
  7223. bSig1 := extractFloat128Frac1( b );
  7224. bSig0 := extractFloat128Frac0( b );
  7225. bExp := extractFloat128Exp( b );
  7226. expDiff := aExp - bExp;
  7227. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7228. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7229. if ( 0 < expDiff ) then goto aExpBigger;
  7230. if ( expDiff < 0 ) then goto bExpBigger;
  7231. if ( aExp = $7FFF ) then begin
  7232. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7233. result := propagateFloat128NaN( a, b );
  7234. exit;
  7235. end;
  7236. float_raise( float_flag_invalid );
  7237. z.low := float128_default_nan_low;
  7238. z.high := float128_default_nan_high;
  7239. result := z;
  7240. exit;
  7241. end;
  7242. if ( aExp = 0 ) then begin
  7243. aExp := 1;
  7244. bExp := 1;
  7245. end;
  7246. if ( bSig0 < aSig0 ) then goto aBigger;
  7247. if ( aSig0 < bSig0 ) then goto bBigger;
  7248. if ( bSig1 < aSig1 ) then goto aBigger;
  7249. if ( aSig1 < bSig1 ) then goto bBigger;
  7250. result := packFloat128( ord(float_rounding_mode = float_round_down), 0, 0, 0 );
  7251. exit;
  7252. bExpBigger:
  7253. if ( bExp = $7FFF ) then begin
  7254. if ( bSig0 or bSig1 )<>0 then
  7255. begin
  7256. result := propagateFloat128NaN( a, b );
  7257. exit;
  7258. end;
  7259. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7260. exit;
  7261. end;
  7262. if ( aExp = 0 ) then begin
  7263. inc(expDiff);
  7264. end
  7265. else begin
  7266. aSig0 := aSig0 or int64( $4000000000000000 );
  7267. end;
  7268. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7269. bSig0 := bSig0 or int64( $4000000000000000 );
  7270. bBigger:
  7271. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7272. zExp := bExp;
  7273. zSign := zSign xor 1;
  7274. goto normalizeRoundAndPack;
  7275. aExpBigger:
  7276. if ( aExp = $7FFF ) then begin
  7277. if ( aSig0 or aSig1 )<>0 then
  7278. begin
  7279. result := propagateFloat128NaN( a, b );
  7280. exit;
  7281. end;
  7282. result := a;
  7283. exit;
  7284. end;
  7285. if ( bExp = 0 ) then begin
  7286. dec(expDiff);
  7287. end
  7288. else begin
  7289. bSig0 := bSig0 or int64( $4000000000000000 );
  7290. end;
  7291. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7292. aSig0 := aSig0 or int64( $4000000000000000 );
  7293. aBigger:
  7294. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7295. zExp := aExp;
  7296. normalizeRoundAndPack:
  7297. dec(zExp);
  7298. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7299. end;
  7300. {*----------------------------------------------------------------------------
  7301. | Returns the result of adding the quadruple-precision floating-point values
  7302. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7303. | for Binary Floating-Point Arithmetic.
  7304. *----------------------------------------------------------------------------*}
  7305. function float128_add(a: float128; b: float128): float128;
  7306. var
  7307. aSign, bSign: flag;
  7308. begin
  7309. aSign := extractFloat128Sign( a );
  7310. bSign := extractFloat128Sign( b );
  7311. if ( aSign = bSign ) then begin
  7312. result := addFloat128Sigs( a, b, aSign );
  7313. end
  7314. else begin
  7315. result := subFloat128Sigs( a, b, aSign );
  7316. end;
  7317. end;
  7318. {*----------------------------------------------------------------------------
  7319. | Returns the result of subtracting the quadruple-precision floating-point
  7320. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7321. | Standard for Binary Floating-Point Arithmetic.
  7322. *----------------------------------------------------------------------------*}
  7323. function float128_sub(a: float128; b: float128): float128;
  7324. var
  7325. aSign, bSign: flag;
  7326. begin
  7327. aSign := extractFloat128Sign( a );
  7328. bSign := extractFloat128Sign( b );
  7329. if ( aSign = bSign ) then begin
  7330. result := subFloat128Sigs( a, b, aSign );
  7331. end
  7332. else begin
  7333. result := addFloat128Sigs( a, b, aSign );
  7334. end;
  7335. end;
  7336. {*----------------------------------------------------------------------------
  7337. | Returns the result of multiplying the quadruple-precision floating-point
  7338. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7339. | Standard for Binary Floating-Point Arithmetic.
  7340. *----------------------------------------------------------------------------*}
  7341. function float128_mul(a: float128; b: float128): float128;
  7342. var
  7343. aSign, bSign, zSign: flag;
  7344. aExp, bExp, zExp: int32;
  7345. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7346. z: float128;
  7347. label
  7348. invalid;
  7349. begin
  7350. aSig1 := extractFloat128Frac1( a );
  7351. aSig0 := extractFloat128Frac0( a );
  7352. aExp := extractFloat128Exp( a );
  7353. aSign := extractFloat128Sign( a );
  7354. bSig1 := extractFloat128Frac1( b );
  7355. bSig0 := extractFloat128Frac0( b );
  7356. bExp := extractFloat128Exp( b );
  7357. bSign := extractFloat128Sign( b );
  7358. zSign := aSign xor bSign;
  7359. if ( aExp = $7FFF ) then begin
  7360. if ( (( aSig0 or aSig1 )<>0)
  7361. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7362. result := propagateFloat128NaN( a, b );
  7363. exit;
  7364. end;
  7365. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7366. result := packFloat128( zSign, $7FFF, 0, 0 );
  7367. exit;
  7368. end;
  7369. if ( bExp = $7FFF ) then begin
  7370. if ( bSig0 or bSig1 )<>0 then
  7371. begin
  7372. result := propagateFloat128NaN( a, b );
  7373. exit;
  7374. end;
  7375. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7376. invalid:
  7377. float_raise( float_flag_invalid );
  7378. z.low := float128_default_nan_low;
  7379. z.high := float128_default_nan_high;
  7380. result := z;
  7381. exit;
  7382. end;
  7383. result := packFloat128( zSign, $7FFF, 0, 0 );
  7384. exit;
  7385. end;
  7386. if ( aExp = 0 ) then begin
  7387. if ( ( aSig0 or aSig1 ) = 0 ) then
  7388. begin
  7389. result := packFloat128( zSign, 0, 0, 0 );
  7390. exit;
  7391. end;
  7392. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7393. end;
  7394. if ( bExp = 0 ) then begin
  7395. if ( ( bSig0 or bSig1 ) = 0 ) then
  7396. begin
  7397. result := packFloat128( zSign, 0, 0, 0 );
  7398. exit;
  7399. end;
  7400. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7401. end;
  7402. zExp := aExp + bExp - $4000;
  7403. aSig0 := aSig0 or int64( $0001000000000000 );
  7404. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7405. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7406. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7407. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7408. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7409. shift128ExtraRightJamming(
  7410. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7411. inc(zExp);
  7412. end;
  7413. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7414. end;
  7415. {*----------------------------------------------------------------------------
  7416. | Returns the result of dividing the quadruple-precision floating-point value
  7417. | `a' by the corresponding value `b'. The operation is performed according to
  7418. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7419. *----------------------------------------------------------------------------*}
  7420. function float128_div(a: float128; b: float128): float128;
  7421. var
  7422. aSign, bSign, zSign: flag;
  7423. aExp, bExp, zExp: int32;
  7424. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7425. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7426. z: float128;
  7427. label
  7428. invalid;
  7429. begin
  7430. aSig1 := extractFloat128Frac1( a );
  7431. aSig0 := extractFloat128Frac0( a );
  7432. aExp := extractFloat128Exp( a );
  7433. aSign := extractFloat128Sign( a );
  7434. bSig1 := extractFloat128Frac1( b );
  7435. bSig0 := extractFloat128Frac0( b );
  7436. bExp := extractFloat128Exp( b );
  7437. bSign := extractFloat128Sign( b );
  7438. zSign := aSign xor bSign;
  7439. if ( aExp = $7FFF ) then begin
  7440. if ( aSig0 or aSig1 )<>0 then
  7441. begin
  7442. result := propagateFloat128NaN( a, b );
  7443. exit;
  7444. end;
  7445. if ( bExp = $7FFF ) then begin
  7446. if ( bSig0 or bSig1 )<>0 then
  7447. begin
  7448. result := propagateFloat128NaN( a, b );
  7449. exit;
  7450. end;
  7451. goto invalid;
  7452. end;
  7453. result := packFloat128( zSign, $7FFF, 0, 0 );
  7454. exit;
  7455. end;
  7456. if ( bExp = $7FFF ) then begin
  7457. if ( bSig0 or bSig1 )<>0 then
  7458. begin
  7459. result := propagateFloat128NaN( a, b );
  7460. exit;
  7461. end;
  7462. result := packFloat128( zSign, 0, 0, 0 );
  7463. exit;
  7464. end;
  7465. if ( bExp = 0 ) then begin
  7466. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7467. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7468. invalid:
  7469. float_raise( float_flag_invalid );
  7470. z.low := float128_default_nan_low;
  7471. z.high := float128_default_nan_high;
  7472. result := z;
  7473. exit;
  7474. end;
  7475. float_raise( float_flag_divbyzero );
  7476. result := packFloat128( zSign, $7FFF, 0, 0 );
  7477. exit;
  7478. end;
  7479. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7480. end;
  7481. if ( aExp = 0 ) then begin
  7482. if ( ( aSig0 or aSig1 ) = 0 ) then
  7483. begin
  7484. result := packFloat128( zSign, 0, 0, 0 );
  7485. exit;
  7486. end;
  7487. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7488. end;
  7489. zExp := aExp - bExp + $3FFD;
  7490. shortShift128Left(
  7491. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7492. shortShift128Left(
  7493. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7494. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7495. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7496. inc(zExp);
  7497. end;
  7498. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7499. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7500. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7501. while ( sbits64(rem0) < 0 ) do begin
  7502. dec(zSig0);
  7503. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7504. end;
  7505. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7506. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7507. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7508. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7509. while ( sbits64(rem1) < 0 ) do begin
  7510. dec(zSig1);
  7511. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7512. end;
  7513. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7514. end;
  7515. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7516. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7517. end;
  7518. {*----------------------------------------------------------------------------
  7519. | Returns the remainder of the quadruple-precision floating-point value `a'
  7520. | with respect to the corresponding value `b'. The operation is performed
  7521. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7522. *----------------------------------------------------------------------------*}
  7523. function float128_rem(a: float128; b: float128): float128;
  7524. var
  7525. aSign, bSign, zSign: flag;
  7526. aExp, bExp, expDiff: int32;
  7527. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7528. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7529. sigMean0: sbits64;
  7530. z: float128;
  7531. label
  7532. invalid;
  7533. begin
  7534. aSig1 := extractFloat128Frac1( a );
  7535. aSig0 := extractFloat128Frac0( a );
  7536. aExp := extractFloat128Exp( a );
  7537. aSign := extractFloat128Sign( a );
  7538. bSig1 := extractFloat128Frac1( b );
  7539. bSig0 := extractFloat128Frac0( b );
  7540. bExp := extractFloat128Exp( b );
  7541. bSign := extractFloat128Sign( b );
  7542. if ( aExp = $7FFF ) then begin
  7543. if ( (( aSig0 or aSig1 )<>0)
  7544. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7545. result := propagateFloat128NaN( a, b );
  7546. exit;
  7547. end;
  7548. goto invalid;
  7549. end;
  7550. if ( bExp = $7FFF ) then begin
  7551. if ( bSig0 or bSig1 )<>0 then
  7552. begin
  7553. result := propagateFloat128NaN( a, b );
  7554. exit;
  7555. end;
  7556. result := a;
  7557. exit;
  7558. end;
  7559. if ( bExp = 0 ) then begin
  7560. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7561. invalid:
  7562. float_raise( float_flag_invalid );
  7563. z.low := float128_default_nan_low;
  7564. z.high := float128_default_nan_high;
  7565. result := z;
  7566. exit;
  7567. end;
  7568. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7569. end;
  7570. if ( aExp = 0 ) then begin
  7571. if ( ( aSig0 or aSig1 ) = 0 ) then
  7572. begin
  7573. result := a;
  7574. exit;
  7575. end;
  7576. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7577. end;
  7578. expDiff := aExp - bExp;
  7579. if ( expDiff < -1 ) then
  7580. begin
  7581. result := a;
  7582. exit;
  7583. end;
  7584. shortShift128Left(
  7585. aSig0 or int64( $0001000000000000 ),
  7586. aSig1,
  7587. 15 - ord( expDiff < 0 ),
  7588. aSig0,
  7589. aSig1
  7590. );
  7591. shortShift128Left(
  7592. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7593. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7594. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7595. dec(expDiff,64);
  7596. while ( 0 < expDiff ) do begin
  7597. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7598. if ( 4 < q ) then
  7599. q := q - 4
  7600. else
  7601. q := 0;
  7602. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7603. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7604. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7605. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7606. dec(expDiff,61);
  7607. end;
  7608. if ( -64 < expDiff ) then begin
  7609. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7610. if ( 4 < q ) then
  7611. q := q - 4
  7612. else
  7613. q := 0;
  7614. q := q shr (- expDiff);
  7615. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7616. inc(expDiff,52);
  7617. if ( expDiff < 0 ) then begin
  7618. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7619. end
  7620. else begin
  7621. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7622. end;
  7623. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7624. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7625. end
  7626. else begin
  7627. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7628. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7629. end;
  7630. repeat
  7631. alternateASig0 := aSig0;
  7632. alternateASig1 := aSig1;
  7633. inc(q);
  7634. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7635. until not( 0 <= sbits64(aSig0) );
  7636. add128(
  7637. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7638. if ( ( sigMean0 < 0 )
  7639. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7640. aSig0 := alternateASig0;
  7641. aSig1 := alternateASig1;
  7642. end;
  7643. zSign := ord( sbits64(aSig0) < 0 );
  7644. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7645. result :=
  7646. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7647. end;
  7648. {*----------------------------------------------------------------------------
  7649. | Returns the square root of the quadruple-precision floating-point value `a'.
  7650. | The operation is performed according to the IEC/IEEE Standard for Binary
  7651. | Floating-Point Arithmetic.
  7652. *----------------------------------------------------------------------------*}
  7653. function float128_sqrt(a: float128): float128;
  7654. var
  7655. aSign: flag;
  7656. aExp, zExp: int32;
  7657. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7658. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7659. z: float128;
  7660. label
  7661. invalid;
  7662. begin
  7663. aSig1 := extractFloat128Frac1( a );
  7664. aSig0 := extractFloat128Frac0( a );
  7665. aExp := extractFloat128Exp( a );
  7666. aSign := extractFloat128Sign( a );
  7667. if ( aExp = $7FFF ) then begin
  7668. if ( aSig0 or aSig1 )<>0 then
  7669. begin
  7670. result := propagateFloat128NaN( a, a );
  7671. exit;
  7672. end;
  7673. if ( aSign=0 ) then
  7674. begin
  7675. result := a;
  7676. exit;
  7677. end;
  7678. goto invalid;
  7679. end;
  7680. if ( aSign<>0 ) then begin
  7681. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7682. begin
  7683. result := a;
  7684. exit;
  7685. end;
  7686. invalid:
  7687. float_raise( float_flag_invalid );
  7688. z.low := float128_default_nan_low;
  7689. z.high := float128_default_nan_high;
  7690. result := z;
  7691. exit;
  7692. end;
  7693. if ( aExp = 0 ) then begin
  7694. if ( ( aSig0 or aSig1 ) = 0 ) then
  7695. begin
  7696. result := packFloat128( 0, 0, 0, 0 );
  7697. exit;
  7698. end;
  7699. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7700. end;
  7701. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7702. aSig0 := aSig0 or int64( $0001000000000000 );
  7703. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7704. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7705. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7706. doubleZSig0 := zSig0 shl 1;
  7707. mul64To128( zSig0, zSig0, term0, term1 );
  7708. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7709. while ( sbits64(rem0) < 0 ) do begin
  7710. dec(zSig0);
  7711. dec(doubleZSig0,2);
  7712. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7713. end;
  7714. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7715. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7716. if ( zSig1 = 0 ) then zSig1 := 1;
  7717. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7718. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7719. mul64To128( zSig1, zSig1, term2, term3 );
  7720. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7721. while ( sbits64(rem1) < 0 ) do begin
  7722. dec(zSig1);
  7723. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7724. term3 := term3 or 1;
  7725. term2 := term2 or doubleZSig0;
  7726. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7727. end;
  7728. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7729. end;
  7730. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7731. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7732. end;
  7733. {*----------------------------------------------------------------------------
  7734. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7735. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7736. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7737. *----------------------------------------------------------------------------*}
  7738. function float128_eq(a: float128; b: float128): flag;
  7739. begin
  7740. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7741. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7742. or ( ( extractFloat128Exp( b ) = $7FFF )
  7743. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7744. ) then begin
  7745. if ( (float128_is_signaling_nan( a )<>0)
  7746. or (float128_is_signaling_nan( b )<>0) ) then begin
  7747. float_raise( float_flag_invalid );
  7748. end;
  7749. result := 0;
  7750. exit;
  7751. end;
  7752. result := ord(
  7753. ( a.low = b.low )
  7754. and ( ( a.high = b.high )
  7755. or ( ( a.low = 0 )
  7756. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7757. ));
  7758. end;
  7759. {*----------------------------------------------------------------------------
  7760. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7761. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7762. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7763. | Arithmetic.
  7764. *----------------------------------------------------------------------------*}
  7765. function float128_le(a: float128; b: float128): flag;
  7766. var
  7767. aSign, bSign: flag;
  7768. begin
  7769. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7770. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7771. or ( ( extractFloat128Exp( b ) = $7FFF )
  7772. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7773. ) then begin
  7774. float_raise( float_flag_invalid );
  7775. result := 0;
  7776. exit;
  7777. end;
  7778. aSign := extractFloat128Sign( a );
  7779. bSign := extractFloat128Sign( b );
  7780. if ( aSign <> bSign ) then begin
  7781. result := ord(
  7782. (aSign<>0)
  7783. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7784. = 0 ));
  7785. exit;
  7786. end;
  7787. if aSign<>0 then
  7788. result := le128( b.high, b.low, a.high, a.low )
  7789. else
  7790. result := le128( a.high, a.low, b.high, b.low );
  7791. end;
  7792. {*----------------------------------------------------------------------------
  7793. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7794. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7795. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7796. *----------------------------------------------------------------------------*}
  7797. function float128_lt(a: float128; b: float128): flag;
  7798. var
  7799. aSign, bSign: flag;
  7800. begin
  7801. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7802. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7803. or ( ( extractFloat128Exp( b ) = $7FFF )
  7804. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7805. ) then begin
  7806. float_raise( float_flag_invalid );
  7807. result := 0;
  7808. exit;
  7809. end;
  7810. aSign := extractFloat128Sign( a );
  7811. bSign := extractFloat128Sign( b );
  7812. if ( aSign <> bSign ) then begin
  7813. result := ord(
  7814. (aSign<>0)
  7815. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7816. <> 0 ));
  7817. exit;
  7818. end;
  7819. if aSign<>0 then
  7820. result := lt128( b.high, b.low, a.high, a.low )
  7821. else
  7822. result := lt128( a.high, a.low, b.high, b.low );
  7823. end;
  7824. {*----------------------------------------------------------------------------
  7825. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7826. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7827. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7828. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7829. *----------------------------------------------------------------------------*}
  7830. function float128_eq_signaling(a: float128; b: float128): flag;
  7831. begin
  7832. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7833. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7834. or ( ( extractFloat128Exp( b ) = $7FFF )
  7835. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7836. ) then begin
  7837. float_raise( float_flag_invalid );
  7838. result := 0;
  7839. exit;
  7840. end;
  7841. result := ord(
  7842. ( a.low = b.low )
  7843. and ( ( a.high = b.high )
  7844. or ( ( a.low = 0 )
  7845. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7846. ));
  7847. end;
  7848. {*----------------------------------------------------------------------------
  7849. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7850. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7851. | cause an exception. Otherwise, the comparison is performed according to the
  7852. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7853. *----------------------------------------------------------------------------*}
  7854. function float128_le_quiet(a: float128; b: float128): flag;
  7855. var
  7856. aSign, bSign: flag;
  7857. begin
  7858. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7859. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7860. or ( ( extractFloat128Exp( b ) = $7FFF )
  7861. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7862. ) then begin
  7863. if ( (float128_is_signaling_nan( a )<>0)
  7864. or (float128_is_signaling_nan( b )<>0) ) then begin
  7865. float_raise( float_flag_invalid );
  7866. end;
  7867. result := 0;
  7868. exit;
  7869. end;
  7870. aSign := extractFloat128Sign( a );
  7871. bSign := extractFloat128Sign( b );
  7872. if ( aSign <> bSign ) then begin
  7873. result := ord(
  7874. (aSign<>0)
  7875. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7876. = 0 ));
  7877. exit;
  7878. end;
  7879. if aSign<>0 then
  7880. result := le128( b.high, b.low, a.high, a.low )
  7881. else
  7882. result := le128( a.high, a.low, b.high, b.low );
  7883. end;
  7884. {*----------------------------------------------------------------------------
  7885. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7886. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7887. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7888. | Standard for Binary Floating-Point Arithmetic.
  7889. *----------------------------------------------------------------------------*}
  7890. function float128_lt_quiet(a: float128; b: float128): flag;
  7891. var
  7892. aSign, bSign: flag;
  7893. begin
  7894. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7895. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7896. or ( ( extractFloat128Exp( b ) = $7FFF )
  7897. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7898. ) then begin
  7899. if ( (float128_is_signaling_nan( a )<>0)
  7900. or (float128_is_signaling_nan( b )<>0) ) then begin
  7901. float_raise( float_flag_invalid );
  7902. end;
  7903. result := 0;
  7904. exit;
  7905. end;
  7906. aSign := extractFloat128Sign( a );
  7907. bSign := extractFloat128Sign( b );
  7908. if ( aSign <> bSign ) then begin
  7909. result := ord(
  7910. (aSign<>0)
  7911. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7912. <> 0 ));
  7913. exit;
  7914. end;
  7915. if aSign<>0 then
  7916. result:=lt128( b.high, b.low, a.high, a.low )
  7917. else
  7918. result:=lt128( a.high, a.low, b.high, b.low );
  7919. end;
  7920. {$endif FPC_SOFTFLOAT_FLOAT128}
  7921. {$endif not(defined(fpc_softfpu_interface))}
  7922. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  7923. end.
  7924. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}