softfpu.pp 326 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558855985608561856285638564856585668567856885698570857185728573857485758576857785788579858085818582858385848585858685878588858985908591859285938594859585968597859885998600860186028603860486058606860786088609861086118612861386148615861686178618861986208621862286238624862586268627862886298630863186328633863486358636863786388639864086418642864386448645864686478648864986508651865286538654865586568657865886598660866186628663866486658666866786688669867086718672867386748675867686778678867986808681868286838684868586868687868886898690869186928693869486958696869786988699870087018702870387048705870687078708870987108711871287138714871587168717871887198720872187228723872487258726872787288729873087318732873387348735873687378738873987408741874287438744874587468747874887498750875187528753875487558756875787588759876087618762876387648765876687678768876987708771877287738774877587768777877887798780878187828783878487858786878787888789879087918792879387948795879687978798879988008801880288038804880588068807880888098810881188128813881488158816881788188819882088218822882388248825882688278828882988308831883288338834883588368837883888398840884188428843884488458846884788488849885088518852885388548855885688578858885988608861886288638864886588668867886888698870887188728873887488758876887788788879888088818882888388848885888688878888888988908891889288938894889588968897889888998900890189028903890489058906890789088909891089118912891389148915891689178918891989208921892289238924892589268927892889298930893189328933893489358936893789388939894089418942894389448945894689478948894989508951895289538954895589568957895889598960896189628963896489658966896789688969897089718972897389748975897689778978897989808981898289838984898589868987898889898990899189928993899489958996899789988999900090019002900390049005900690079008900990109011901290139014901590169017901890199020902190229023902490259026902790289029903090319032903390349035903690379038903990409041904290439044904590469047904890499050905190529053905490559056905790589059906090619062906390649065906690679068906990709071907290739074907590769077907890799080908190829083908490859086908790889089909090919092909390949095909690979098909991009101910291039104910591069107910891099110911191129113911491159116911791189119912091219122912391249125912691279128912991309131913291339134913591369137913891399140914191429143914491459146914791489149915091519152915391549155915691579158915991609161916291639164916591669167916891699170917191729173917491759176917791789179918091819182918391849185918691879188918991909191919291939194919591969197919891999200920192029203920492059206920792089209921092119212921392149215921692179218921992209221922292239224922592269227922892299230923192329233923492359236923792389239924092419242924392449245924692479248924992509251925292539254925592569257925892599260926192629263926492659266926792689269927092719272927392749275927692779278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. {$IFNDEF FPC_DOTTEDUNITS}
  63. unit softfpu;
  64. {$ENDIF FPC_DOTTEDUNITS}
  65. { Overflow checking must be disabled,
  66. since some operations expect overflow!
  67. }
  68. {$Q-}
  69. {$goto on}
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. {$ifndef FPC_SYSTEM_HAS_float32}
  80. float32 = longword;
  81. {$define FPC_SYSTEM_HAS_float32}
  82. {$endif ndef FPC_SYSTEM_HAS_float32}
  83. { we use here a record in the function header because
  84. the record allows bitwise conversion to single }
  85. float32rec = record
  86. float32 : float32;
  87. end;
  88. flag = byte;
  89. bits8 = byte;
  90. sbits8 = shortint;
  91. bits16 = word;
  92. sbits16 = smallint;
  93. sbits32 = longint;
  94. bits32 = longword;
  95. {$ifndef fpc}
  96. qword = int64;
  97. {$endif}
  98. { now part of the system unit
  99. uint64 = qword;
  100. }
  101. bits64 = qword;
  102. sbits64 = int64;
  103. {$ifdef ENDIAN_LITTLE}
  104. {$ifndef FPC_SYSTEM_HAS_float64}
  105. float64 = record
  106. case byte of
  107. // force the record to be aligned like a double
  108. // else *_to_double will fail for cpus like sparc
  109. // and avoid expensive unpacking/packing operations
  110. 1: (dummy : double);
  111. 2: (low,high : bits32);
  112. end;
  113. {$endif ndef FPC_SYSTEM_HAS_float64}
  114. floatx80 = record
  115. case byte of
  116. // force the record to be aligned like a double
  117. // else *_to_double will fail for cpus like sparc
  118. // and avoid expensive unpacking/packing operations
  119. 1: (dummy : extended);
  120. 2: (low : qword;high : word);
  121. end;
  122. float128 = record
  123. case byte of
  124. // force the record to be aligned like a double
  125. // else *_to_double will fail for cpus like sparc
  126. // and avoid expensive unpacking/packing operations
  127. 1: (dummy : qword);
  128. 2: (low,high : qword);
  129. end;
  130. {$else}
  131. {$ifndef FPC_SYSTEM_HAS_float64}
  132. float64 = record
  133. case byte of
  134. // force the record to be aligned like a double
  135. // else *_to_double will fail for cpus like sparc
  136. 1: (dummy : double);
  137. 2: (high,low : bits32);
  138. end;
  139. {$endif ndef FPC_SYSTEM_HAS_float64}
  140. floatx80 = record
  141. case byte of
  142. // force the record to be aligned like a double
  143. // else *_to_double will fail for cpus like sparc
  144. // and avoid expensive unpacking/packing operations
  145. 1: (dummy : qword);
  146. 2: (high : word;low : qword);
  147. end;
  148. float128 = record
  149. case byte of
  150. // force the record to be aligned like a double
  151. // else *_to_double will fail for cpus like sparc
  152. // and avoid expensive unpacking/packing operations
  153. 1: (dummy : qword);
  154. 2: (high : qword;low : qword);
  155. end;
  156. {$endif}
  157. {$define FPC_SYSTEM_HAS_float64}
  158. {*
  159. -------------------------------------------------------------------------------
  160. Returns 1 if the double-precision floating-point value `a' is less than
  161. the corresponding value `b', and 0 otherwise. The comparison is performed
  162. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  163. -------------------------------------------------------------------------------
  164. *}
  165. Function float64_lt(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  166. {*
  167. -------------------------------------------------------------------------------
  168. Returns 1 if the double-precision floating-point value `a' is less than
  169. or equal to the corresponding value `b', and 0 otherwise. The comparison
  170. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  171. Arithmetic.
  172. -------------------------------------------------------------------------------
  173. *}
  174. Function float64_le(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  175. {*
  176. -------------------------------------------------------------------------------
  177. Returns 1 if the double-precision floating-point value `a' is equal to
  178. the corresponding value `b', and 0 otherwise. The comparison is performed
  179. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  180. -------------------------------------------------------------------------------
  181. *}
  182. Function float64_eq(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  183. {*
  184. -------------------------------------------------------------------------------
  185. Returns the square root of the double-precision floating-point value `a'.
  186. The operation is performed according to the IEC/IEEE Standard for Binary
  187. Floating-Point Arithmetic.
  188. -------------------------------------------------------------------------------
  189. *}
  190. function float64_sqrt( a: float64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  191. {*
  192. -------------------------------------------------------------------------------
  193. Returns the remainder of the double-precision floating-point value `a'
  194. with respect to the corresponding value `b'. The operation is performed
  195. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  196. -------------------------------------------------------------------------------
  197. *}
  198. Function float64_rem(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  199. {*
  200. -------------------------------------------------------------------------------
  201. Returns the result of dividing the double-precision floating-point value `a'
  202. by the corresponding value `b'. The operation is performed according to the
  203. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  204. -------------------------------------------------------------------------------
  205. *}
  206. Function float64_div(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  207. {*
  208. -------------------------------------------------------------------------------
  209. Returns the result of multiplying the double-precision floating-point values
  210. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  211. for Binary Floating-Point Arithmetic.
  212. -------------------------------------------------------------------------------
  213. *}
  214. Function float64_mul( a: float64; b:float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  215. {*
  216. -------------------------------------------------------------------------------
  217. Returns the result of subtracting the double-precision floating-point values
  218. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  219. for Binary Floating-Point Arithmetic.
  220. -------------------------------------------------------------------------------
  221. *}
  222. Function float64_sub(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  223. {*
  224. -------------------------------------------------------------------------------
  225. Returns the result of adding the double-precision floating-point values `a'
  226. and `b'. The operation is performed according to the IEC/IEEE Standard for
  227. Binary Floating-Point Arithmetic.
  228. -------------------------------------------------------------------------------
  229. *}
  230. Function float64_add( a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  231. {*
  232. -------------------------------------------------------------------------------
  233. Rounds the double-precision floating-point value `a' to an integer,
  234. and returns the result as a double-precision floating-point value. The
  235. operation is performed according to the IEC/IEEE Standard for Binary
  236. Floating-Point Arithmetic.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_round_to_int(a: float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the single-precision floating-point format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic.
  246. -------------------------------------------------------------------------------
  247. *}
  248. Function float64_to_float32(a: float64) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  249. {*
  250. -------------------------------------------------------------------------------
  251. Returns the result of converting the double-precision floating-point value
  252. `a' to the 32-bit two's complement integer format. The conversion is
  253. performed according to the IEC/IEEE Standard for Binary Floating-Point
  254. Arithmetic, except that the conversion is always rounded toward zero.
  255. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  256. the conversion overflows, the largest integer with the same sign as `a' is
  257. returned.
  258. -------------------------------------------------------------------------------
  259. *}
  260. Function float64_to_int32_round_to_zero(a: float64 ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  261. {*
  262. -------------------------------------------------------------------------------
  263. Returns the result of converting the double-precision floating-point value
  264. `a' to the 32-bit two's complement integer format. The conversion is
  265. performed according to the IEC/IEEE Standard for Binary Floating-Point
  266. Arithmetic---which means in particular that the conversion is rounded
  267. according to the current rounding mode. If `a' is a NaN, the largest
  268. positive integer is returned. Otherwise, if the conversion overflows, the
  269. largest integer with the same sign as `a' is returned.
  270. -------------------------------------------------------------------------------
  271. *}
  272. Function float64_to_int32(a: float64): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  273. {*
  274. -------------------------------------------------------------------------------
  275. Returns 1 if the single-precision floating-point value `a' is less than
  276. the corresponding value `b', and 0 otherwise. The comparison is performed
  277. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  278. -------------------------------------------------------------------------------
  279. *}
  280. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  281. {*
  282. -------------------------------------------------------------------------------
  283. Returns 1 if the single-precision floating-point value `a' is less than
  284. or equal to the corresponding value `b', and 0 otherwise. The comparison
  285. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  286. Arithmetic.
  287. -------------------------------------------------------------------------------
  288. *}
  289. Function float32_le( a: float32rec; b : float32rec ):flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  290. {*
  291. -------------------------------------------------------------------------------
  292. Returns 1 if the single-precision floating-point value `a' is equal to
  293. the corresponding value `b', and 0 otherwise. The comparison is performed
  294. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  295. -------------------------------------------------------------------------------
  296. *}
  297. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  298. {*
  299. -------------------------------------------------------------------------------
  300. Returns the square root of the single-precision floating-point value `a'.
  301. The operation is performed according to the IEC/IEEE Standard for Binary
  302. Floating-Point Arithmetic.
  303. -------------------------------------------------------------------------------
  304. *}
  305. Function float32_sqrt(a: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  306. {*
  307. -------------------------------------------------------------------------------
  308. Returns the remainder of the single-precision floating-point value `a'
  309. with respect to the corresponding value `b'. The operation is performed
  310. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  311. -------------------------------------------------------------------------------
  312. *}
  313. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  314. {*
  315. -------------------------------------------------------------------------------
  316. Returns the result of dividing the single-precision floating-point value `a'
  317. by the corresponding value `b'. The operation is performed according to the
  318. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  319. -------------------------------------------------------------------------------
  320. *}
  321. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  322. {*
  323. -------------------------------------------------------------------------------
  324. Returns the result of multiplying the single-precision floating-point values
  325. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  326. for Binary Floating-Point Arithmetic.
  327. -------------------------------------------------------------------------------
  328. *}
  329. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  330. {*
  331. -------------------------------------------------------------------------------
  332. Returns the result of subtracting the single-precision floating-point values
  333. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  334. for Binary Floating-Point Arithmetic.
  335. -------------------------------------------------------------------------------
  336. *}
  337. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  338. {*
  339. -------------------------------------------------------------------------------
  340. Returns the result of adding the single-precision floating-point values `a'
  341. and `b'. The operation is performed according to the IEC/IEEE Standard for
  342. Binary Floating-Point Arithmetic.
  343. -------------------------------------------------------------------------------
  344. *}
  345. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  346. {*
  347. -------------------------------------------------------------------------------
  348. Rounds the single-precision floating-point value `a' to an integer,
  349. and returns the result as a single-precision floating-point value. The
  350. operation is performed according to the IEC/IEEE Standard for Binary
  351. Floating-Point Arithmetic.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_round_to_int( a: float32rec): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the double-precision floating-point format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic.
  361. -------------------------------------------------------------------------------
  362. *}
  363. Function float32_to_float64( a : float32rec) : Float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  364. {*
  365. -------------------------------------------------------------------------------
  366. Returns the result of converting the single-precision floating-point value
  367. `a' to the 32-bit two's complement integer format. The conversion is
  368. performed according to the IEC/IEEE Standard for Binary Floating-Point
  369. Arithmetic, except that the conversion is always rounded toward zero.
  370. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  371. the conversion overflows, the largest integer with the same sign as `a' is
  372. returned.
  373. -------------------------------------------------------------------------------
  374. *}
  375. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  376. {*
  377. -------------------------------------------------------------------------------
  378. Returns the result of converting the single-precision floating-point value
  379. `a' to the 32-bit two's complement integer format. The conversion is
  380. performed according to the IEC/IEEE Standard for Binary Floating-Point
  381. Arithmetic---which means in particular that the conversion is rounded
  382. according to the current rounding mode. If `a' is a NaN, the largest
  383. positive integer is returned. Otherwise, if the conversion overflows, the
  384. largest integer with the same sign as `a' is returned.
  385. -------------------------------------------------------------------------------
  386. *}
  387. Function float32_to_int32( a : float32rec) : int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  388. {*
  389. -------------------------------------------------------------------------------
  390. Returns the result of converting the 32-bit two's complement integer `a' to
  391. the double-precision floating-point format. The conversion is performed
  392. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. -------------------------------------------------------------------------------
  394. *}
  395. Function int32_to_float64( a: int32) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  396. {*
  397. -------------------------------------------------------------------------------
  398. Returns the result of converting the 32-bit two's complement integer `a' to
  399. the single-precision floating-point format. The conversion is performed
  400. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  401. -------------------------------------------------------------------------------
  402. *}
  403. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  404. {*----------------------------------------------------------------------------
  405. | Returns the result of converting the 64-bit two's complement integer `a'
  406. | to the double-precision floating-point format. The conversion is performed
  407. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  408. *----------------------------------------------------------------------------*}
  409. Function int64_to_float64( a: int64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  410. Function qword_to_float64( a: qword ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  411. {*----------------------------------------------------------------------------
  412. | Returns the result of converting the 64-bit two's complement integer `a'
  413. | to the single-precision floating-point format. The conversion is performed
  414. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  415. *----------------------------------------------------------------------------*}
  416. Function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  417. Function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  418. // +++
  419. function float32_to_int64( a: float32 ): int64;
  420. function float32_to_int64_round_to_zero( a: float32 ): int64;
  421. function float32_eq_signaling( a: float32; b: float32) : flag;
  422. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  423. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  424. function float32_is_signaling_nan( a : float32 ): flag;
  425. function float32_is_nan( a : float32 ): flag;
  426. function float64_to_int64( a: float64 ): int64;
  427. function float64_to_int64_round_to_zero( a: float64 ): int64;
  428. function float64_eq_signaling( a: float64; b: float64): flag;
  429. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  430. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  431. function float64_is_signaling_nan( a : float64 ): flag;
  432. function float64_is_nan( a : float64 ): flag;
  433. // ===
  434. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  435. {*----------------------------------------------------------------------------
  436. | Extended double-precision rounding precision
  437. *----------------------------------------------------------------------------*}
  438. var // threadvar!?
  439. floatx80_rounding_precision : int8 = 80;
  440. function int32_to_floatx80( a: int32 ): floatx80;
  441. function int64_to_floatx80( a: int64 ): floatx80;
  442. function qword_to_floatx80( a: qword ): floatx80;
  443. function float32_to_floatx80( a: float32 ): floatx80;
  444. function float64_to_floatx80( a: float64 ): floatx80;
  445. function floatx80_to_int32( a: floatx80 ): int32;
  446. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  447. function floatx80_to_int64( a: floatx80 ): int64;
  448. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  449. function floatx80_to_float32( a: floatx80 ): float32;
  450. function floatx80_to_float64( a: floatx80 ): float64;
  451. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  452. function floatx80_to_float128( a: floatx80 ): float128;
  453. {$endif FPC_SOFTFLOAT_FLOAT128}
  454. function floatx80_round_to_int( a: floatx80 ): floatx80;
  455. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  456. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  457. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  458. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  459. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  460. function floatx80_sqrt( a: floatx80 ): floatx80;
  461. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  462. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  463. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  464. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  465. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  466. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  467. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  468. function floatx80_is_nan(a : floatx80 ): flag;
  469. {$endif FPC_SOFTFLOAT_FLOATX80}
  470. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  471. function int32_to_float128( a: int32 ): float128;
  472. function int64_to_float128( a: int64 ): float128;
  473. function qword_to_float128( a: qword ): float128;
  474. function float32_to_float128( a: float32 ): float128;
  475. function float128_is_nan( a : float128): flag;
  476. function float128_is_signaling_nan( a : float128): flag;
  477. function float128_to_int32(a: float128): int32;
  478. function float128_to_int32_round_to_zero(a: float128): int32;
  479. function float128_to_int64(a: float128): int64;
  480. function float128_to_int64_round_to_zero(a: float128): int64;
  481. function float128_to_float32(a: float128): float32;
  482. function float128_to_float64(a: float128): float64;
  483. function float64_to_float128( a : float64) : float128;
  484. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  485. function float128_to_floatx80(a: float128): floatx80;
  486. {$endif FPC_SOFTFLOAT_FLOATX80}
  487. function float128_round_to_int(a: float128): float128;
  488. function float128_add(a: float128; b: float128): float128;
  489. function float128_sub(a: float128; b: float128): float128;
  490. function float128_mul(a: float128; b: float128): float128;
  491. function float128_div(a: float128; b: float128): float128;
  492. function float128_rem(a: float128; b: float128): float128;
  493. function float128_sqrt(a: float128): float128;
  494. function float128_eq(a: float128; b: float128): flag;
  495. function float128_le(a: float128; b: float128): flag;
  496. function float128_lt(a: float128; b: float128): flag;
  497. function float128_eq_signaling(a: float128; b: float128): flag;
  498. function float128_le_quiet(a: float128; b: float128): flag;
  499. function float128_lt_quiet(a: float128; b: float128): flag;
  500. {$endif FPC_SOFTFLOAT_FLOAT128}
  501. CONST
  502. {-------------------------------------------------------------------------------
  503. Software IEC/IEEE floating-point underflow tininess-detection mode.
  504. -------------------------------------------------------------------------------
  505. *}
  506. float_tininess_after_rounding = 0;
  507. float_tininess_before_rounding = 1;
  508. {*
  509. -------------------------------------------------------------------------------
  510. Underflow tininess-detection mode, statically initialized to default value.
  511. (The declaration in `softfloat.h' must match the `int8' type here.)
  512. -------------------------------------------------------------------------------
  513. *}
  514. var // threadvar!?
  515. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  516. {$endif not(defined(fpc_softfpu_implementation))}
  517. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  518. implementation
  519. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  520. {$if not(defined(fpc_softfpu_interface))}
  521. {$ifdef FPC}
  522. { disable range and overflow checking explicitly }
  523. { This might be more essential for x80 and 128-bit
  524. floating point types and could, maybe be
  525. restricted to code handle floatx80 and float128 }
  526. {$push}
  527. {$R-}
  528. {$Q-}
  529. {$endif FPC}
  530. (*****************************************************************************)
  531. (*----------------------------------------------------------------------------*)
  532. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  533. (* division and square root approximations. (Can be specialized to target if *)
  534. (* desired.) *)
  535. (* ---------------------------------------------------------------------------*)
  536. (*****************************************************************************)
  537. { This procedure serves as a single access point to softfloat_exception_flags.
  538. It also helps to reduce code size a bit because softfloat_exception_flags is
  539. a threadvar. }
  540. procedure set_inexact_flag;
  541. begin
  542. include(softfloat_exception_flags,float_flag_inexact);
  543. end;
  544. {*----------------------------------------------------------------------------
  545. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  546. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  547. | input. If `zSign' is 1, the input is negated before being converted to an
  548. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  549. | is simply rounded to an integer, with the inexact exception raised if the
  550. | input cannot be represented exactly as an integer. However, if the fixed-
  551. | point input is too large, the invalid exception is raised and the largest
  552. | positive or negative integer is returned.
  553. *----------------------------------------------------------------------------*}
  554. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  555. var
  556. roundingMode: TFPURoundingMode;
  557. roundNearestEven: boolean;
  558. roundIncrement, roundBits: int8;
  559. z: int32;
  560. begin
  561. roundingMode := softfloat_rounding_mode;
  562. roundNearestEven := (roundingMode = float_round_nearest_even);
  563. roundIncrement := $40;
  564. if not roundNearestEven then
  565. begin
  566. if ( roundingMode = float_round_to_zero ) then
  567. begin
  568. roundIncrement := 0;
  569. end
  570. else begin
  571. roundIncrement := $7F;
  572. if ( zSign<>0 ) then
  573. begin
  574. if ( roundingMode = float_round_up ) then
  575. roundIncrement := 0;
  576. end
  577. else begin
  578. if ( roundingMode = float_round_down ) then
  579. roundIncrement := 0;
  580. end;
  581. end;
  582. end;
  583. roundBits := lo(absZ) and $7F;
  584. absZ := ( absZ + roundIncrement ) shr 7;
  585. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  586. z := absZ;
  587. if ( zSign<>0 ) then
  588. z := - z;
  589. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  590. begin
  591. float_raise( float_flag_invalid );
  592. if zSign<>0 then
  593. result:=sbits32($80000000)
  594. else
  595. result:=$7FFFFFFF;
  596. exit;
  597. end;
  598. if ( roundBits<>0 ) then
  599. set_inexact_flag;
  600. result:=z;
  601. end;
  602. {*----------------------------------------------------------------------------
  603. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  604. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  605. | and returns the properly rounded 64-bit integer corresponding to the input.
  606. | If `zSign' is 1, the input is negated before being converted to an integer.
  607. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  608. | the inexact exception raised if the input cannot be represented exactly as
  609. | an integer. However, if the fixed-point input is too large, the invalid
  610. | exception is raised and the largest positive or negative integer is
  611. | returned.
  612. *----------------------------------------------------------------------------*}
  613. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  614. var
  615. roundingMode: TFPURoundingMode;
  616. roundNearestEven, increment: flag;
  617. z: int64;
  618. label
  619. overflow;
  620. begin
  621. roundingMode := softfloat_rounding_mode;
  622. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  623. increment := ord( sbits64(absZ1) < 0 );
  624. if ( roundNearestEven=0 ) then
  625. begin
  626. if ( roundingMode = float_round_to_zero ) then
  627. begin
  628. increment := 0;
  629. end
  630. else begin
  631. if ( zSign<>0 ) then
  632. begin
  633. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  634. end
  635. else begin
  636. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  637. end;
  638. end;
  639. end;
  640. if ( increment<>0 ) then
  641. begin
  642. inc(absZ0);
  643. if ( absZ0 = 0 ) then
  644. goto overflow;
  645. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  646. end;
  647. z := absZ0;
  648. if ( zSign<>0 ) then
  649. z := - z;
  650. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  651. begin
  652. overflow:
  653. float_raise( float_flag_invalid );
  654. if zSign<>0 then
  655. result:=int64($8000000000000000)
  656. else
  657. result:=int64($7FFFFFFFFFFFFFFF);
  658. exit;
  659. end;
  660. if ( absZ1<>0 ) then
  661. set_inexact_flag;
  662. result:=z;
  663. end;
  664. {*
  665. -------------------------------------------------------------------------------
  666. Shifts `a' right by the number of bits given in `count'. If any nonzero
  667. bits are shifted off, they are ``jammed'' into the least significant bit of
  668. the result by setting the least significant bit to 1. The value of `count'
  669. can be arbitrarily large; in particular, if `count' is greater than 32, the
  670. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  671. The result is stored in the location pointed to by `zPtr'.
  672. -------------------------------------------------------------------------------
  673. *}
  674. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  675. var
  676. z: Bits32;
  677. Begin
  678. if ( count = 0 ) then
  679. z := a
  680. else
  681. if ( count < 32 ) then
  682. Begin
  683. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  684. End
  685. else
  686. Begin
  687. z := bits32( a <> 0 );
  688. End;
  689. zPtr := z;
  690. End;
  691. {*----------------------------------------------------------------------------
  692. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  693. | number of bits given in `count'. Any bits shifted off are lost. The value
  694. | of `count' can be arbitrarily large; in particular, if `count' is greater
  695. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  696. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  697. *----------------------------------------------------------------------------*}
  698. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  699. var
  700. z0, z1: bits64;
  701. negCount: int8;
  702. begin
  703. negCount := ( - count ) and 63;
  704. if ( count = 0 ) then
  705. begin
  706. z1 := a1;
  707. z0 := a0;
  708. end
  709. else if ( count < 64 ) then
  710. begin
  711. z1 := ( a0 shl negCount ) or ( a1 shr count );
  712. z0 := a0 shr count;
  713. end
  714. else
  715. begin
  716. if ( count < 128 ) then
  717. z1 := a0 shr ( count and 63 )
  718. else
  719. z1 := 0;
  720. z0 := 0;
  721. end;
  722. z1Ptr := z1;
  723. z0Ptr := z0;
  724. end;
  725. {*----------------------------------------------------------------------------
  726. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  727. | number of bits given in `count'. If any nonzero bits are shifted off, they
  728. | are ``jammed'' into the least significant bit of the result by setting the
  729. | least significant bit to 1. The value of `count' can be arbitrarily large;
  730. | in particular, if `count' is greater than 128, the result will be either
  731. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  732. | nonzero. The result is broken into two 64-bit pieces which are stored at
  733. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  734. *----------------------------------------------------------------------------*}
  735. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  736. var
  737. z0,z1 : bits64;
  738. negCount : int8;
  739. begin
  740. negCount := ( - count ) and 63;
  741. if ( count = 0 ) then begin
  742. z1 := a1;
  743. z0 := a0;
  744. end
  745. else if ( count < 64 ) then begin
  746. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  747. z0 := a0 shr count;
  748. end
  749. else begin
  750. if ( count = 64 ) then begin
  751. z1 := a0 or ord( a1 <> 0 );
  752. end
  753. else if ( count < 128 ) then begin
  754. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  755. end
  756. else begin
  757. z1 := ord( ( a0 or a1 ) <> 0 );
  758. end;
  759. z0 := 0;
  760. end;
  761. z1Ptr := z1;
  762. z0Ptr := z0;
  763. end;
  764. {*
  765. -------------------------------------------------------------------------------
  766. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  767. number of bits given in `count'. Any bits shifted off are lost. The value
  768. of `count' can be arbitrarily large; in particular, if `count' is greater
  769. than 64, the result will be 0. The result is broken into two 32-bit pieces
  770. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  771. -------------------------------------------------------------------------------
  772. *}
  773. Procedure
  774. shift64Right(
  775. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  776. Var
  777. z0, z1: bits32;
  778. negCount : int8;
  779. Begin
  780. negCount := ( - count ) AND 31;
  781. if ( count = 0 ) then
  782. Begin
  783. z1 := a1;
  784. z0 := a0;
  785. End
  786. else if ( count < 32 ) then
  787. Begin
  788. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  789. z0 := a0 shr count;
  790. End
  791. else
  792. Begin
  793. if (count < 64) then
  794. z1 := ( a0 shr ( count AND 31 ) )
  795. else
  796. z1 := 0;
  797. z0 := 0;
  798. End;
  799. z1Ptr := z1;
  800. z0Ptr := z0;
  801. End;
  802. {*
  803. -------------------------------------------------------------------------------
  804. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  805. number of bits given in `count'. If any nonzero bits are shifted off, they
  806. are ``jammed'' into the least significant bit of the result by setting the
  807. least significant bit to 1. The value of `count' can be arbitrarily large;
  808. in particular, if `count' is greater than 64, the result will be either 0
  809. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  810. nonzero. The result is broken into two 32-bit pieces which are stored at
  811. the locations pointed to by `z0Ptr' and `z1Ptr'.
  812. -------------------------------------------------------------------------------
  813. *}
  814. Procedure
  815. shift64RightJamming(
  816. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  817. VAR
  818. z0, z1 : bits32;
  819. negCount : int8;
  820. Begin
  821. negCount := ( - count ) AND 31;
  822. if ( count = 0 ) then
  823. Begin
  824. z1 := a1;
  825. z0 := a0;
  826. End
  827. else
  828. if ( count < 32 ) then
  829. Begin
  830. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  831. z0 := a0 shr count;
  832. End
  833. else
  834. Begin
  835. if ( count = 32 ) then
  836. Begin
  837. z1 := a0 OR bits32( a1 <> 0 );
  838. End
  839. else
  840. if ( count < 64 ) Then
  841. Begin
  842. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  843. End
  844. else
  845. Begin
  846. z1 := bits32( ( a0 OR a1 ) <> 0 );
  847. End;
  848. z0 := 0;
  849. End;
  850. z1Ptr := z1;
  851. z0Ptr := z0;
  852. End;
  853. {*----------------------------------------------------------------------------
  854. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  855. | bits are shifted off, they are ``jammed'' into the least significant bit of
  856. | the result by setting the least significant bit to 1. The value of `count'
  857. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  858. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  859. | The result is stored in the location pointed to by `zPtr'.
  860. *----------------------------------------------------------------------------*}
  861. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  862. var
  863. z: bits64;
  864. begin
  865. if ( count = 0 ) then
  866. begin
  867. z := a;
  868. end
  869. else if ( count < 64 ) then
  870. begin
  871. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  872. end
  873. else
  874. begin
  875. z := ord( a <> 0 );
  876. end;
  877. zPtr := z;
  878. end;
  879. {$if not defined(shift64ExtraRightJamming)}
  880. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  881. overload;
  882. forward;
  883. {$endif}
  884. {*
  885. -------------------------------------------------------------------------------
  886. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  887. by 32 _plus_ the number of bits given in `count'. The shifted result is
  888. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  889. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  890. off form a third 32-bit result as follows: The _last_ bit shifted off is
  891. the most-significant bit of the extra result, and the other 31 bits of the
  892. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  893. were all zero. This extra result is stored in the location pointed to by
  894. `z2Ptr'. The value of `count' can be arbitrarily large.
  895. (This routine makes more sense if `a0', `a1', and `a2' are considered
  896. to form a fixed-point value with binary point between `a1' and `a2'. This
  897. fixed-point value is shifted right by the number of bits given in `count',
  898. and the integer part of the result is returned at the locations pointed to
  899. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  900. corrupted as described above, and is returned at the location pointed to by
  901. `z2Ptr'.)
  902. -------------------------------------------------------------------------------
  903. }
  904. Procedure
  905. shift64ExtraRightJamming(
  906. a0: bits32;
  907. a1: bits32;
  908. a2: bits32;
  909. count: int16;
  910. VAR z0Ptr: bits32;
  911. VAR z1Ptr: bits32;
  912. VAR z2Ptr: bits32
  913. ); overload;
  914. Var
  915. z0, z1, z2: bits32;
  916. negCount : int8;
  917. Begin
  918. negCount := ( - count ) AND 31;
  919. if ( count = 0 ) then
  920. Begin
  921. z2 := a2;
  922. z1 := a1;
  923. z0 := a0;
  924. End
  925. else
  926. Begin
  927. if ( count < 32 ) Then
  928. Begin
  929. z2 := a1 shl negCount;
  930. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  931. z0 := a0 shr count;
  932. End
  933. else
  934. Begin
  935. if ( count = 32 ) then
  936. Begin
  937. z2 := a1;
  938. z1 := a0;
  939. End
  940. else
  941. Begin
  942. a2 := a2 or a1;
  943. if ( count < 64 ) then
  944. Begin
  945. z2 := a0 shl negCount;
  946. z1 := a0 shr ( count AND 31 );
  947. End
  948. else
  949. Begin
  950. if count = 64 then
  951. z2 := a0
  952. else
  953. z2 := bits32(a0 <> 0);
  954. z1 := 0;
  955. End;
  956. End;
  957. z0 := 0;
  958. End;
  959. z2 := z2 or bits32( a2 <> 0 );
  960. End;
  961. z2Ptr := z2;
  962. z1Ptr := z1;
  963. z0Ptr := z0;
  964. End;
  965. {*
  966. -------------------------------------------------------------------------------
  967. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  968. number of bits given in `count'. Any bits shifted off are lost. The value
  969. of `count' must be less than 32. The result is broken into two 32-bit
  970. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  971. -------------------------------------------------------------------------------
  972. *}
  973. Procedure
  974. shortShift64Left(
  975. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  976. Begin
  977. z1Ptr := a1 shl count;
  978. if count = 0 then
  979. z0Ptr := a0
  980. else
  981. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  982. End;
  983. {*
  984. -------------------------------------------------------------------------------
  985. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  986. by the number of bits given in `count'. Any bits shifted off are lost.
  987. The value of `count' must be less than 32. The result is broken into three
  988. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  989. `z1Ptr', and `z2Ptr'.
  990. -------------------------------------------------------------------------------
  991. *}
  992. Procedure
  993. shortShift96Left(
  994. a0: bits32;
  995. a1: bits32;
  996. a2: bits32;
  997. count: int16;
  998. VAR z0Ptr: bits32;
  999. VAR z1Ptr: bits32;
  1000. VAR z2Ptr: bits32
  1001. );
  1002. Var
  1003. z0, z1, z2: bits32;
  1004. negCount: int8;
  1005. Begin
  1006. z2 := a2 shl count;
  1007. z1 := a1 shl count;
  1008. z0 := a0 shl count;
  1009. if ( 0 < count ) then
  1010. Begin
  1011. negCount := ( ( - count ) AND 31 );
  1012. z1 := z1 or (a2 shr negCount);
  1013. z0 := z0 or (a1 shr negCount);
  1014. End;
  1015. z2Ptr := z2;
  1016. z1Ptr := z1;
  1017. z0Ptr := z0;
  1018. End;
  1019. {*----------------------------------------------------------------------------
  1020. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1021. | number of bits given in `count'. Any bits shifted off are lost. The value
  1022. | of `count' must be less than 64. The result is broken into two 64-bit
  1023. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1024. *----------------------------------------------------------------------------*}
  1025. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1026. begin
  1027. z1Ptr := a1 shl count;
  1028. if count=0 then
  1029. z0Ptr:=a0
  1030. else
  1031. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1032. end;
  1033. {*
  1034. -------------------------------------------------------------------------------
  1035. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1036. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1037. any carry out is lost. The result is broken into two 32-bit pieces which
  1038. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1039. -------------------------------------------------------------------------------
  1040. *}
  1041. Procedure
  1042. add64(
  1043. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1044. Var
  1045. z1: bits32;
  1046. Begin
  1047. z1 := a1 + b1;
  1048. z1Ptr := z1;
  1049. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1050. End;
  1051. {*
  1052. -------------------------------------------------------------------------------
  1053. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1054. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1055. modulo 2^96, so any carry out is lost. The result is broken into three
  1056. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1057. `z1Ptr', and `z2Ptr'.
  1058. -------------------------------------------------------------------------------
  1059. *}
  1060. Procedure
  1061. add96(
  1062. a0: bits32;
  1063. a1: bits32;
  1064. a2: bits32;
  1065. b0: bits32;
  1066. b1: bits32;
  1067. b2: bits32;
  1068. VAR z0Ptr: bits32;
  1069. VAR z1Ptr: bits32;
  1070. VAR z2Ptr: bits32
  1071. );
  1072. var
  1073. z0, z1, z2: bits32;
  1074. carry0, carry1: int8;
  1075. Begin
  1076. z2 := a2 + b2;
  1077. carry1 := int8( z2 < a2 );
  1078. z1 := a1 + b1;
  1079. carry0 := int8( z1 < a1 );
  1080. z0 := a0 + b0;
  1081. z1 := z1 + carry1;
  1082. z0 := z0 + bits32( z1 < carry1 );
  1083. z0 := z0 + carry0;
  1084. z2Ptr := z2;
  1085. z1Ptr := z1;
  1086. z0Ptr := z0;
  1087. End;
  1088. {*----------------------------------------------------------------------------
  1089. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1090. | by the number of bits given in `count'. Any bits shifted off are lost.
  1091. | The value of `count' must be less than 64. The result is broken into three
  1092. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1093. | `z1Ptr', and `z2Ptr'.
  1094. *----------------------------------------------------------------------------*}
  1095. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1096. var
  1097. z0, z1, z2 : bits64;
  1098. negCount : int8;
  1099. begin
  1100. z2 := a2 shl count;
  1101. z1 := a1 shl count;
  1102. z0 := a0 shl count;
  1103. if ( 0 < count ) then
  1104. begin
  1105. negCount := ( ( - count ) and 63 );
  1106. z1 := z1 or (a2 shr negCount);
  1107. z0 := z0 or (a1 shr negCount);
  1108. end;
  1109. z2Ptr := z2;
  1110. z1Ptr := z1;
  1111. z0Ptr := z0;
  1112. end;
  1113. {*----------------------------------------------------------------------------
  1114. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1115. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1116. | any carry out is lost. The result is broken into two 64-bit pieces which
  1117. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1118. *----------------------------------------------------------------------------*}
  1119. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1120. var
  1121. z1 : bits64;
  1122. begin
  1123. z1 := a1 + b1;
  1124. z1Ptr := z1;
  1125. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1126. end;
  1127. {*----------------------------------------------------------------------------
  1128. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1129. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1130. | modulo 2^192, so any carry out is lost. The result is broken into three
  1131. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1132. | `z1Ptr', and `z2Ptr'.
  1133. *----------------------------------------------------------------------------*}
  1134. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1135. var
  1136. z0, z1, z2 : bits64;
  1137. carry0, carry1 : int8;
  1138. begin
  1139. z2 := a2 + b2;
  1140. carry1 := ord( z2 < a2 );
  1141. z1 := a1 + b1;
  1142. carry0 := ord( z1 < a1 );
  1143. z0 := a0 + b0;
  1144. inc(z1, carry1);
  1145. inc(z0, ord( z1 < carry1 ));
  1146. inc(z0, carry0);
  1147. z2Ptr := z2;
  1148. z1Ptr := z1;
  1149. z0Ptr := z0;
  1150. end;
  1151. {*
  1152. -------------------------------------------------------------------------------
  1153. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1154. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1155. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1156. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1157. `z1Ptr'.
  1158. -------------------------------------------------------------------------------
  1159. *}
  1160. Procedure
  1161. sub64(
  1162. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1163. Begin
  1164. z1Ptr := a1 - b1;
  1165. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1166. End;
  1167. {*
  1168. -------------------------------------------------------------------------------
  1169. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1170. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1171. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1172. into three 32-bit pieces which are stored at the locations pointed to by
  1173. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1174. -------------------------------------------------------------------------------
  1175. *}
  1176. Procedure
  1177. sub96(
  1178. a0:bits32;
  1179. a1:bits32;
  1180. a2:bits32;
  1181. b0:bits32;
  1182. b1:bits32;
  1183. b2:bits32;
  1184. VAR z0Ptr:bits32;
  1185. VAR z1Ptr:bits32;
  1186. VAR z2Ptr:bits32
  1187. );
  1188. Var
  1189. z0, z1, z2: bits32;
  1190. borrow0, borrow1: int8;
  1191. Begin
  1192. z2 := a2 - b2;
  1193. borrow1 := int8( a2 < b2 );
  1194. z1 := a1 - b1;
  1195. borrow0 := int8( a1 < b1 );
  1196. z0 := a0 - b0;
  1197. z0 := z0 - bits32( z1 < borrow1 );
  1198. z1 := z1 - borrow1;
  1199. z0 := z0 -borrow0;
  1200. z2Ptr := z2;
  1201. z1Ptr := z1;
  1202. z0Ptr := z0;
  1203. End;
  1204. {*----------------------------------------------------------------------------
  1205. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1206. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1207. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1208. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1209. | `z1Ptr'.
  1210. *----------------------------------------------------------------------------*}
  1211. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1212. begin
  1213. z1Ptr := a1 - b1;
  1214. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1215. end;
  1216. {*----------------------------------------------------------------------------
  1217. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1218. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1219. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1220. | result is broken into three 64-bit pieces which are stored at the locations
  1221. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1222. *----------------------------------------------------------------------------*}
  1223. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1224. var
  1225. z0, z1, z2 : bits64;
  1226. borrow0, borrow1 : int8;
  1227. begin
  1228. z2 := a2 - b2;
  1229. borrow1 := ord( a2 < b2 );
  1230. z1 := a1 - b1;
  1231. borrow0 := ord( a1 < b1 );
  1232. z0 := a0 - b0;
  1233. dec(z0, ord( z1 < borrow1 ));
  1234. dec(z1, borrow1);
  1235. dec(z0, borrow0);
  1236. z2Ptr := z2;
  1237. z1Ptr := z1;
  1238. z0Ptr := z0;
  1239. end;
  1240. {*
  1241. -------------------------------------------------------------------------------
  1242. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1243. into two 32-bit pieces which are stored at the locations pointed to by
  1244. `z0Ptr' and `z1Ptr'.
  1245. -------------------------------------------------------------------------------
  1246. *}
  1247. {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
  1248. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1249. var
  1250. tmp: qword;
  1251. begin
  1252. tmp:=qword(a) * b;
  1253. z0ptr:=hi(tmp);
  1254. z1ptr:=lo(tmp);
  1255. end;
  1256. {$ELSE}
  1257. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1258. :bits32 );
  1259. Var
  1260. aHigh, aLow, bHigh, bLow: bits16;
  1261. z0, zMiddleA, zMiddleB, z1: bits32;
  1262. Begin
  1263. aLow := bits16(a);
  1264. aHigh := a shr 16;
  1265. bLow := bits16(b);
  1266. bHigh := b shr 16;
  1267. z1 := ( bits32( aLow) ) * bLow;
  1268. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1269. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1270. z0 := ( bits32 (aHigh) ) * bHigh;
  1271. zMiddleA := zMiddleA + zMiddleB;
  1272. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1273. zMiddleA := zmiddleA shl 16;
  1274. z1 := z1 + zMiddleA;
  1275. z0 := z0 + bits32( z1 < zMiddleA );
  1276. z1Ptr := z1;
  1277. z0Ptr := z0;
  1278. End;
  1279. {$ENDIF}
  1280. {*
  1281. -------------------------------------------------------------------------------
  1282. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1283. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1284. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1285. `z2Ptr'.
  1286. -------------------------------------------------------------------------------
  1287. *}
  1288. Procedure
  1289. mul64By32To96(
  1290. a0:bits32;
  1291. a1:bits32;
  1292. b:bits32;
  1293. VAR z0Ptr:bits32;
  1294. VAR z1Ptr:bits32;
  1295. VAR z2Ptr:bits32
  1296. );
  1297. Var
  1298. z0, z1, z2, more1: bits32;
  1299. Begin
  1300. mul32To64( a1, b, z1, z2 );
  1301. mul32To64( a0, b, z0, more1 );
  1302. add64( z0, more1, 0, z1, z0, z1 );
  1303. z2Ptr := z2;
  1304. z1Ptr := z1;
  1305. z0Ptr := z0;
  1306. End;
  1307. {*
  1308. -------------------------------------------------------------------------------
  1309. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1310. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1311. product. The product is broken into four 32-bit pieces which are stored at
  1312. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1313. -------------------------------------------------------------------------------
  1314. *}
  1315. Procedure
  1316. mul64To128(
  1317. a0:bits32;
  1318. a1:bits32;
  1319. b0:bits32;
  1320. b1:bits32;
  1321. VAR z0Ptr:bits32;
  1322. VAR z1Ptr:bits32;
  1323. VAR z2Ptr:bits32;
  1324. VAR z3Ptr:bits32
  1325. );
  1326. Var
  1327. z0, z1, z2, z3: bits32;
  1328. more1, more2: bits32;
  1329. Begin
  1330. mul32To64( a1, b1, z2, z3 );
  1331. mul32To64( a1, b0, z1, more2 );
  1332. add64( z1, more2, 0, z2, z1, z2 );
  1333. mul32To64( a0, b0, z0, more1 );
  1334. add64( z0, more1, 0, z1, z0, z1 );
  1335. mul32To64( a0, b1, more1, more2 );
  1336. add64( more1, more2, 0, z2, more1, z2 );
  1337. add64( z0, z1, 0, more1, z0, z1 );
  1338. z3Ptr := z3;
  1339. z2Ptr := z2;
  1340. z1Ptr := z1;
  1341. z0Ptr := z0;
  1342. End;
  1343. {*----------------------------------------------------------------------------
  1344. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1345. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1346. | product. The product is broken into four 64-bit pieces which are stored at
  1347. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1348. *----------------------------------------------------------------------------*}
  1349. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1350. var
  1351. z0,z1,z2,z3,more1,more2 : bits64;
  1352. begin
  1353. z3 := UMul64x64_128( a1, b1, z2 );
  1354. more2 := UMul64x64_128( a1, b0, z1 );
  1355. add128( z1, more2, 0, z2, z1, z2 );
  1356. more1 := UMul64x64_128( a0, b0, z0 );
  1357. add128( z0, more1, 0, z1, z0, z1 );
  1358. more2 := UMul64x64_128( a0, b1, more1 );
  1359. add128( more1, more2, 0, z2, more1, z2 );
  1360. add128( z0, z1, 0, more1, z0, z1 );
  1361. z3Ptr := z3;
  1362. z2Ptr := z2;
  1363. z1Ptr := z1;
  1364. z0Ptr := z0;
  1365. end;
  1366. {*----------------------------------------------------------------------------
  1367. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1368. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1369. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1370. | `z2Ptr'.
  1371. *----------------------------------------------------------------------------*}
  1372. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1373. var
  1374. z0, z1, z2, more1 : bits64;
  1375. begin
  1376. z2 := UMul64x64_128( a1, b, z1 );
  1377. more1 := UMul64x64_128( a0, b, z0 );
  1378. add128( z0, more1, 0, z1, z0, z1 );
  1379. z2Ptr := z2;
  1380. z1Ptr := z1;
  1381. z0Ptr := z0;
  1382. end;
  1383. {*----------------------------------------------------------------------------
  1384. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1385. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1386. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1387. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1388. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1389. | unsigned integer is returned.
  1390. *----------------------------------------------------------------------------*}
  1391. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1392. var
  1393. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1394. begin
  1395. if ( b <= a0 ) then
  1396. begin
  1397. result:=qword( $FFFFFFFFFFFFFFFF );
  1398. exit;
  1399. end;
  1400. b0 := b shr 32;
  1401. if ( b0 shl 32 <= a0 ) then
  1402. z:=qword( $FFFFFFFF00000000 )
  1403. else
  1404. z:=( a0 div b0 ) shl 32;
  1405. term1 := UMul64x64_128( b, z, term0 );
  1406. sub128( a0, a1, term0, term1, rem0, rem1 );
  1407. while ( ( sbits64(rem0) ) < 0 ) do begin
  1408. dec(z,qword( $100000000 ));
  1409. b1 := b shl 32;
  1410. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1411. end;
  1412. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1413. if ( b0 shl 32 <= rem0 ) then
  1414. z:=z or $FFFFFFFF
  1415. else
  1416. z:=z or rem0 div b0;
  1417. result:=z;
  1418. end;
  1419. {*
  1420. -------------------------------------------------------------------------------
  1421. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1422. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1423. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1424. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1425. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1426. unsigned integer is returned.
  1427. -------------------------------------------------------------------------------
  1428. *}
  1429. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1430. Var
  1431. b0, b1: bits32;
  1432. rem0, rem1, term0, term1: bits32;
  1433. z: bits32;
  1434. Begin
  1435. if ( b <= a0 ) then
  1436. Begin
  1437. estimateDiv64To32 := $FFFFFFFF;
  1438. exit;
  1439. End;
  1440. b0 := b shr 16;
  1441. if ( b0 shl 16 <= a0 ) then
  1442. z:= $FFFF0000
  1443. else
  1444. z:= ( a0 div b0 ) shl 16;
  1445. mul32To64( b, z, term0, term1 );
  1446. sub64( a0, a1, term0, term1, rem0, rem1 );
  1447. while ( ( sbits32 (rem0) ) < 0 ) do
  1448. Begin
  1449. z := z - $10000;
  1450. b1 := b shl 16;
  1451. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1452. End;
  1453. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1454. if ( b0 shl 16 <= rem0 ) then
  1455. z := z or $FFFF
  1456. else
  1457. z := z or (rem0 div b0);
  1458. estimateDiv64To32 := z;
  1459. End;
  1460. {*
  1461. -------------------------------------------------------------------------------
  1462. Returns an approximation to the square root of the 32-bit significand given
  1463. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1464. `aExp' (the least significant bit) is 1, the integer returned approximates
  1465. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1466. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1467. case, the approximation returned lies strictly within +/-2 of the exact
  1468. value.
  1469. -------------------------------------------------------------------------------
  1470. *}
  1471. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1472. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1473. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1474. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1475. );
  1476. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1477. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1478. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1479. );
  1480. Var
  1481. index: int8;
  1482. z: bits32;
  1483. Begin
  1484. index := ( a shr 27 ) AND 15;
  1485. if ( aExp AND 1 ) <> 0 then
  1486. Begin
  1487. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1488. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1489. a := a shr 1;
  1490. End
  1491. else
  1492. Begin
  1493. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1494. z := a div z + z;
  1495. if ( $20000 <= z ) then
  1496. z := $FFFF8000
  1497. else
  1498. z := ( z shl 15 );
  1499. if ( z <= a ) then
  1500. Begin
  1501. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1502. exit;
  1503. End;
  1504. End;
  1505. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1506. End;
  1507. {*
  1508. -------------------------------------------------------------------------------
  1509. Returns the number of leading 0 bits before the most-significant 1 bit of
  1510. `a'. If `a' is zero, 32 is returned.
  1511. -------------------------------------------------------------------------------
  1512. *}
  1513. Function countLeadingZeros32( a:bits32 ): int8;
  1514. const countLeadingZerosHigh:array[0..255] of int8 = (
  1515. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1516. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1517. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1518. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1519. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1520. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1521. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1522. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1523. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1524. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1525. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1526. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1527. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1528. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1529. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1530. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1531. );
  1532. Var
  1533. shiftCount: int8;
  1534. Begin
  1535. shiftCount := 0;
  1536. if ( a < $10000 ) then
  1537. Begin
  1538. shiftCount := shiftcount + 16;
  1539. a := a shl 16;
  1540. End;
  1541. if ( a < $1000000 ) then
  1542. Begin
  1543. shiftCount := shiftcount + 8;
  1544. a := a shl 8;
  1545. end;
  1546. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1547. countLeadingZeros32:= shiftCount;
  1548. End;
  1549. {*----------------------------------------------------------------------------
  1550. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1551. | `a'. If `a' is zero, 64 is returned.
  1552. *----------------------------------------------------------------------------*}
  1553. function countLeadingZeros64( a : bits64): int8;
  1554. var
  1555. shiftcount : int8;
  1556. Begin
  1557. shiftCount := 0;
  1558. if ( a < bits64(bits64(1) shl 32 )) then
  1559. shiftCount := shiftcount + 32
  1560. else
  1561. a := a shr 32;
  1562. shiftCount := shiftCount + countLeadingZeros32( a );
  1563. countLeadingZeros64:= shiftCount;
  1564. End;
  1565. {*
  1566. -------------------------------------------------------------------------------
  1567. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1568. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1569. Otherwise, returns 0.
  1570. -------------------------------------------------------------------------------
  1571. *}
  1572. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1573. Begin
  1574. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1575. End;
  1576. {*
  1577. -------------------------------------------------------------------------------
  1578. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1579. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1580. returns 0.
  1581. -------------------------------------------------------------------------------
  1582. *}
  1583. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1584. Begin
  1585. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1586. End;
  1587. const
  1588. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1589. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1590. (*****************************************************************************)
  1591. (* End Low-Level arithmetic *)
  1592. (*****************************************************************************)
  1593. {*----------------------------------------------------------------------------
  1594. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1595. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1596. | returns 0.
  1597. *----------------------------------------------------------------------------*}
  1598. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1599. begin
  1600. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1601. end;
  1602. {*
  1603. -------------------------------------------------------------------------------
  1604. Functions and definitions to determine: (1) whether tininess for underflow
  1605. is detected before or after rounding by default, (2) what (if anything)
  1606. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1607. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1608. are propagated from function inputs to output. These details are ENDIAN
  1609. specific
  1610. -------------------------------------------------------------------------------
  1611. *}
  1612. {$IFDEF ENDIAN_LITTLE}
  1613. {*
  1614. -------------------------------------------------------------------------------
  1615. Internal canonical NaN format.
  1616. -------------------------------------------------------------------------------
  1617. *}
  1618. TYPE
  1619. commonNaNT = record
  1620. high, low : bits32;
  1621. sign: flag;
  1622. end;
  1623. {*
  1624. -------------------------------------------------------------------------------
  1625. The pattern for a default generated single-precision NaN.
  1626. -------------------------------------------------------------------------------
  1627. *}
  1628. const float32_default_nan = $FFC00000;
  1629. {*
  1630. -------------------------------------------------------------------------------
  1631. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1632. otherwise returns 0.
  1633. -------------------------------------------------------------------------------
  1634. *}
  1635. Function float32_is_nan( a : float32 ): flag;
  1636. Begin
  1637. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1638. End;
  1639. {*
  1640. -------------------------------------------------------------------------------
  1641. Returns 1 if the single-precision floating-point value `a' is a signaling
  1642. NaN; otherwise returns 0.
  1643. -------------------------------------------------------------------------------
  1644. *}
  1645. Function float32_is_signaling_nan( a : float32 ): flag;
  1646. Begin
  1647. float32_is_signaling_nan := flag
  1648. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1649. End;
  1650. {*
  1651. -------------------------------------------------------------------------------
  1652. Returns the result of converting the single-precision floating-point NaN
  1653. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1654. exception is raised.
  1655. -------------------------------------------------------------------------------
  1656. *}
  1657. function float32ToCommonNaN(a: float32) : commonNaNT;
  1658. var
  1659. z : commonNaNT ;
  1660. Begin
  1661. if ( float32_is_signaling_nan( a ) <> 0) then
  1662. float_raise( float_flag_invalid );
  1663. z.sign := a shr 31;
  1664. z.low := 0;
  1665. z.high := a shl 9;
  1666. result := z;
  1667. End;
  1668. {*
  1669. -------------------------------------------------------------------------------
  1670. Returns the result of converting the canonical NaN `a' to the single-
  1671. precision floating-point format.
  1672. -------------------------------------------------------------------------------
  1673. *}
  1674. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1675. Begin
  1676. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1677. End;
  1678. {*
  1679. -------------------------------------------------------------------------------
  1680. Takes two single-precision floating-point values `a' and `b', one of which
  1681. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1682. signaling NaN, the invalid exception is raised.
  1683. -------------------------------------------------------------------------------
  1684. *}
  1685. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1686. Var
  1687. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1688. label returnLargerSignificand;
  1689. Begin
  1690. aIsNaN := float32_is_nan( a );
  1691. aIsSignalingNaN := float32_is_signaling_nan( a );
  1692. bIsNaN := float32_is_nan( b );
  1693. bIsSignalingNaN := float32_is_signaling_nan( b );
  1694. a := a or $00400000;
  1695. b := b or $00400000;
  1696. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1697. float_raise( float_flag_invalid );
  1698. if ( aIsSignalingNaN )<> 0 then
  1699. Begin
  1700. if ( bIsSignalingNaN ) <> 0 then
  1701. goto returnLargerSignificand;
  1702. if bIsNan <> 0 then
  1703. propagateFloat32NaN := b
  1704. else
  1705. propagateFloat32NaN := a;
  1706. exit;
  1707. End
  1708. else if ( aIsNaN <> 0) then
  1709. Begin
  1710. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1711. Begin
  1712. propagateFloat32NaN := a;
  1713. exit;
  1714. End;
  1715. returnLargerSignificand:
  1716. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1717. Begin
  1718. propagateFloat32NaN := b;
  1719. exit;
  1720. End;
  1721. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1722. Begin
  1723. propagateFloat32NaN := a;
  1724. End;
  1725. if a < b then
  1726. propagateFloat32NaN := a
  1727. else
  1728. propagateFloat32NaN := b;
  1729. exit;
  1730. End
  1731. else
  1732. Begin
  1733. propagateFloat32NaN := b;
  1734. exit;
  1735. End;
  1736. End;
  1737. {*
  1738. -------------------------------------------------------------------------------
  1739. The pattern for a default generated double-precision NaN. The `high' and
  1740. `low' values hold the most- and least-significant bits, respectively.
  1741. -------------------------------------------------------------------------------
  1742. *}
  1743. const
  1744. float64_default_nan_high = $FFF80000;
  1745. float64_default_nan_low = $00000000;
  1746. {*
  1747. -------------------------------------------------------------------------------
  1748. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1749. otherwise returns 0.
  1750. -------------------------------------------------------------------------------
  1751. *}
  1752. Function float64_is_nan( a : float64 ) : flag;
  1753. Begin
  1754. float64_is_nan :=
  1755. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1756. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1757. End;
  1758. {*
  1759. -------------------------------------------------------------------------------
  1760. Returns 1 if the double-precision floating-point value `a' is a signaling
  1761. NaN; otherwise returns 0.
  1762. -------------------------------------------------------------------------------
  1763. *}
  1764. Function float64_is_signaling_nan( a : float64 ): flag;
  1765. Begin
  1766. float64_is_signaling_nan :=
  1767. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1768. and ( a.low or ( a.high and $0007FFFF ) );
  1769. End;
  1770. {*
  1771. -------------------------------------------------------------------------------
  1772. Returns the result of converting the double-precision floating-point NaN
  1773. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1774. exception is raised.
  1775. -------------------------------------------------------------------------------
  1776. *}
  1777. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1778. Var
  1779. z : commonNaNT;
  1780. Begin
  1781. if ( float64_is_signaling_nan( a )<>0 ) then
  1782. float_raise( float_flag_invalid );
  1783. z.sign := a.high shr 31;
  1784. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1785. result := z;
  1786. End;
  1787. {*
  1788. -------------------------------------------------------------------------------
  1789. Returns the result of converting the canonical NaN `a' to the double-
  1790. precision floating-point format.
  1791. -------------------------------------------------------------------------------
  1792. *}
  1793. function commonNaNToFloat64( a : commonNaNT) : float64;
  1794. Var
  1795. z: float64;
  1796. Begin
  1797. shift64Right( a.high, a.low, 12, z.high, z.low );
  1798. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1799. result := z;
  1800. End;
  1801. {*
  1802. -------------------------------------------------------------------------------
  1803. Takes two double-precision floating-point values `a' and `b', one of which
  1804. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1805. signaling NaN, the invalid exception is raised.
  1806. -------------------------------------------------------------------------------
  1807. *}
  1808. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1809. Var
  1810. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1811. label returnLargerSignificand;
  1812. Begin
  1813. aIsNaN := float64_is_nan( a );
  1814. aIsSignalingNaN := float64_is_signaling_nan( a );
  1815. bIsNaN := float64_is_nan( b );
  1816. bIsSignalingNaN := float64_is_signaling_nan( b );
  1817. a.high := a.high or $00080000;
  1818. b.high := b.high or $00080000;
  1819. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1820. float_raise( float_flag_invalid );
  1821. if ( aIsSignalingNaN )<>0 then
  1822. Begin
  1823. if ( bIsSignalingNaN )<>0 then
  1824. goto returnLargerSignificand;
  1825. if bIsNan <> 0 then
  1826. c := b
  1827. else
  1828. c := a;
  1829. exit;
  1830. End
  1831. else if ( aIsNaN )<> 0 then
  1832. Begin
  1833. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1834. Begin
  1835. c := a;
  1836. exit;
  1837. End;
  1838. returnLargerSignificand:
  1839. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1840. Begin
  1841. c := b;
  1842. exit;
  1843. End;
  1844. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1845. Begin
  1846. c := a;
  1847. exit;
  1848. End;
  1849. if a.high < b.high then
  1850. c := a
  1851. else
  1852. c := b;
  1853. exit;
  1854. End
  1855. else
  1856. Begin
  1857. c := b;
  1858. exit;
  1859. End;
  1860. End;
  1861. {*----------------------------------------------------------------------------
  1862. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1863. | otherwise returns 0.
  1864. *----------------------------------------------------------------------------*}
  1865. function float128_is_nan( a : float128): flag;
  1866. begin
  1867. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1868. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1869. end;
  1870. {*----------------------------------------------------------------------------
  1871. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1872. | signaling NaN; otherwise returns 0.
  1873. *----------------------------------------------------------------------------*}
  1874. function float128_is_signaling_nan( a : float128): flag;
  1875. begin
  1876. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1877. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1878. end;
  1879. {*----------------------------------------------------------------------------
  1880. | Returns the result of converting the quadruple-precision floating-point NaN
  1881. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1882. | exception is raised.
  1883. *----------------------------------------------------------------------------*}
  1884. function float128ToCommonNaN( a : float128): commonNaNT;
  1885. var
  1886. z: commonNaNT;
  1887. qhigh,qlow : qword;
  1888. begin
  1889. if ( float128_is_signaling_nan( a )<>0) then
  1890. float_raise( float_flag_invalid );
  1891. z.sign := a.high shr 63;
  1892. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1893. z.high:=qhigh shr 32;
  1894. z.low:=qhigh and $ffffffff;
  1895. result:=z;
  1896. end;
  1897. {*----------------------------------------------------------------------------
  1898. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1899. | precision floating-point format.
  1900. *----------------------------------------------------------------------------*}
  1901. function commonNaNToFloat128( a : commonNaNT): float128;
  1902. var
  1903. z: float128;
  1904. begin
  1905. shift128Right( a.high, a.low, 16, z.high, z.low );
  1906. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1907. result:=z;
  1908. end;
  1909. {*----------------------------------------------------------------------------
  1910. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1911. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1912. | `b' is a signaling NaN, the invalid exception is raised.
  1913. *----------------------------------------------------------------------------*}
  1914. function propagateFloat128NaN( a: float128; b : float128): float128;
  1915. var
  1916. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1917. label
  1918. returnLargerSignificand;
  1919. begin
  1920. aIsNaN := float128_is_nan( a );
  1921. aIsSignalingNaN := float128_is_signaling_nan( a );
  1922. bIsNaN := float128_is_nan( b );
  1923. bIsSignalingNaN := float128_is_signaling_nan( b );
  1924. a.high := a.high or int64( $0000800000000000 );
  1925. b.high := b.high or int64( $0000800000000000 );
  1926. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1927. float_raise( float_flag_invalid );
  1928. if ( aIsSignalingNaN )<>0 then
  1929. begin
  1930. if ( bIsSignalingNaN )<>0 then
  1931. goto returnLargerSignificand;
  1932. if bIsNaN<>0 then
  1933. result := b
  1934. else
  1935. result := a;
  1936. exit;
  1937. end
  1938. else if ( aIsNaN )<>0 then
  1939. begin
  1940. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1941. begin
  1942. result := a;
  1943. exit;
  1944. end;
  1945. returnLargerSignificand:
  1946. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1947. begin
  1948. result := b;
  1949. exit;
  1950. end;
  1951. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1952. begin
  1953. result := a;
  1954. exit
  1955. end;
  1956. if ( a.high < b.high ) then
  1957. result := a
  1958. else
  1959. result := b;
  1960. exit;
  1961. end
  1962. else
  1963. result:=b;
  1964. end;
  1965. {$ELSE}
  1966. { Big endian code }
  1967. (*----------------------------------------------------------------------------
  1968. | Internal canonical NaN format.
  1969. *----------------------------------------------------------------------------*)
  1970. type
  1971. commonNANT = record
  1972. high, low : bits32;
  1973. sign : flag;
  1974. end;
  1975. (*----------------------------------------------------------------------------
  1976. | The pattern for a default generated single-precision NaN.
  1977. *----------------------------------------------------------------------------*)
  1978. const float32_default_nan = $7FFFFFFF;
  1979. (*----------------------------------------------------------------------------
  1980. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1981. | otherwise returns 0.
  1982. *----------------------------------------------------------------------------*)
  1983. function float32_is_nan(a: float32): flag;
  1984. begin
  1985. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1986. end;
  1987. (*----------------------------------------------------------------------------
  1988. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1989. | NaN; otherwise returns 0.
  1990. *----------------------------------------------------------------------------*)
  1991. function float32_is_signaling_nan(a: float32):flag;
  1992. begin
  1993. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1994. end;
  1995. (*----------------------------------------------------------------------------
  1996. | Returns the result of converting the single-precision floating-point NaN
  1997. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1998. | exception is raised.
  1999. *----------------------------------------------------------------------------*)
  2000. function float32ToCommonNaN( a: float32) : commonNaNT;
  2001. var
  2002. z: commonNANT;
  2003. begin
  2004. if float32_is_signaling_nan(a)<>0 then
  2005. float_raise(float_flag_invalid);
  2006. z.sign := a shr 31;
  2007. z.low := 0;
  2008. z.high := a shl 9;
  2009. result:=z;
  2010. end;
  2011. (*----------------------------------------------------------------------------
  2012. | Returns the result of converting the canonical NaN `a' to the single-
  2013. | precision floating-point format.
  2014. *----------------------------------------------------------------------------*)
  2015. function CommonNanToFloat32(a : CommonNaNT): float32;
  2016. begin
  2017. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2018. end;
  2019. (*----------------------------------------------------------------------------
  2020. | Takes two single-precision floating-point values `a' and `b', one of which
  2021. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2022. | signaling NaN, the invalid exception is raised.
  2023. *----------------------------------------------------------------------------*)
  2024. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2025. var
  2026. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2027. begin
  2028. aIsNaN := float32_is_nan( a );
  2029. aIsSignalingNaN := float32_is_signaling_nan( a );
  2030. bIsNaN := float32_is_nan( b );
  2031. bIsSignalingNaN := float32_is_signaling_nan( b );
  2032. a := a or $00400000;
  2033. b := b or $00400000;
  2034. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2035. float_raise( float_flag_invalid );
  2036. if bIsSignalingNaN<>0 then
  2037. propagateFloat32Nan := b
  2038. else if aIsSignalingNan<>0 then
  2039. propagateFloat32Nan := a
  2040. else if bIsNan<>0 then
  2041. propagateFloat32Nan := b
  2042. else
  2043. propagateFloat32Nan := a;
  2044. end;
  2045. (*----------------------------------------------------------------------------
  2046. | The pattern for a default generated double-precision NaN. The `high' and
  2047. | `low' values hold the most- and least-significant bits, respectively.
  2048. *----------------------------------------------------------------------------*)
  2049. const
  2050. float64_default_nan_high = $7FFFFFFF;
  2051. float64_default_nan_low = $FFFFFFFF;
  2052. (*----------------------------------------------------------------------------
  2053. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2054. | otherwise returns 0.
  2055. *----------------------------------------------------------------------------*)
  2056. function float64_is_nan(a: float64): flag;
  2057. begin
  2058. float64_is_nan := flag (
  2059. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2060. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2061. end;
  2062. (*----------------------------------------------------------------------------
  2063. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2064. | NaN; otherwise returns 0.
  2065. *----------------------------------------------------------------------------*)
  2066. function float64_is_signaling_nan( a:float64): flag;
  2067. begin
  2068. float64_is_signaling_nan := flag(
  2069. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2070. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2071. end;
  2072. (*----------------------------------------------------------------------------
  2073. | Returns the result of converting the double-precision floating-point NaN
  2074. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2075. | exception is raised.
  2076. *----------------------------------------------------------------------------*)
  2077. function float64ToCommonNaN( a : float64) : commonNaNT;
  2078. var
  2079. z : commonNaNT;
  2080. begin
  2081. if ( float64_is_signaling_nan( a )<>0 ) then
  2082. float_raise( float_flag_invalid );
  2083. z.sign := a.high shr 31;
  2084. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2085. result:=z;
  2086. end;
  2087. (*----------------------------------------------------------------------------
  2088. | Returns the result of converting the canonical NaN `a' to the double-
  2089. | precision floating-point format.
  2090. *----------------------------------------------------------------------------*)
  2091. function commonNaNToFloat64( a : commonNaNT): float64;
  2092. var
  2093. z: float64;
  2094. begin
  2095. shift64Right( a.high, a.low, 12, z.high, z.low );
  2096. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2097. result:=z;
  2098. end;
  2099. (*----------------------------------------------------------------------------
  2100. | Takes two double-precision floating-point values `a' and `b', one of which
  2101. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2102. | signaling NaN, the invalid exception is raised.
  2103. *----------------------------------------------------------------------------*)
  2104. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2105. var
  2106. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2107. begin
  2108. aIsNaN := float64_is_nan( a );
  2109. aIsSignalingNaN := float64_is_signaling_nan( a );
  2110. bIsNaN := float64_is_nan( b );
  2111. bIsSignalingNaN := float64_is_signaling_nan( b );
  2112. a.high := a.high or $00080000;
  2113. b.high := b.high or $00080000;
  2114. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2115. float_raise( float_flag_invalid );
  2116. if bIsSignalingNaN<>0 then
  2117. c := b
  2118. else if aIsSignalingNan<>0 then
  2119. c := a
  2120. else if bIsNan<>0 then
  2121. c := b
  2122. else
  2123. c := a;
  2124. end;
  2125. {*----------------------------------------------------------------------------
  2126. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  2127. | otherwise returns 0.
  2128. *----------------------------------------------------------------------------*}
  2129. function float128_is_nan( a : float128): flag;
  2130. begin
  2131. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  2132. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  2133. end;
  2134. {*----------------------------------------------------------------------------
  2135. | Returns 1 if the quadruple-precision floating-point value `a' is a
  2136. | signaling NaN; otherwise returns 0.
  2137. *----------------------------------------------------------------------------*}
  2138. function float128_is_signaling_nan( a : float128): flag;
  2139. begin
  2140. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  2141. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  2142. end;
  2143. {*----------------------------------------------------------------------------
  2144. | Returns the result of converting the quadruple-precision floating-point NaN
  2145. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2146. | exception is raised.
  2147. *----------------------------------------------------------------------------*}
  2148. function float128ToCommonNaN( a : float128): commonNaNT;
  2149. var
  2150. z: commonNaNT;
  2151. qhigh,qlow : qword;
  2152. begin
  2153. if ( float128_is_signaling_nan( a )<>0) then
  2154. float_raise( float_flag_invalid );
  2155. z.sign := a.high shr 63;
  2156. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  2157. z.high:=qhigh shr 32;
  2158. z.low:=qhigh and $ffffffff;
  2159. result:=z;
  2160. end;
  2161. {*----------------------------------------------------------------------------
  2162. | Returns the result of converting the canonical NaN `a' to the quadruple-
  2163. | precision floating-point format.
  2164. *----------------------------------------------------------------------------*}
  2165. function commonNaNToFloat128( a : commonNaNT): float128;
  2166. var
  2167. z: float128;
  2168. begin
  2169. shift128Right( a.high, a.low, 16, z.high, z.low );
  2170. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  2171. result:=z;
  2172. end;
  2173. {*----------------------------------------------------------------------------
  2174. | Takes two quadruple-precision floating-point values `a' and `b', one of
  2175. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  2176. | `b' is a signaling NaN, the invalid exception is raised.
  2177. *----------------------------------------------------------------------------*}
  2178. function propagateFloat128NaN( a: float128; b : float128): float128;
  2179. var
  2180. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2181. label
  2182. returnLargerSignificand;
  2183. begin
  2184. aIsNaN := float128_is_nan( a );
  2185. aIsSignalingNaN := float128_is_signaling_nan( a );
  2186. bIsNaN := float128_is_nan( b );
  2187. bIsSignalingNaN := float128_is_signaling_nan( b );
  2188. a.high := a.high or int64( $0000800000000000 );
  2189. b.high := b.high or int64( $0000800000000000 );
  2190. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2191. float_raise( float_flag_invalid );
  2192. if ( aIsSignalingNaN )<>0 then
  2193. begin
  2194. if ( bIsSignalingNaN )<>0 then
  2195. goto returnLargerSignificand;
  2196. if bIsNaN<>0 then
  2197. result := b
  2198. else
  2199. result := a;
  2200. exit;
  2201. end
  2202. else if ( aIsNaN )<>0 then
  2203. begin
  2204. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  2205. begin
  2206. result := a;
  2207. exit;
  2208. end;
  2209. returnLargerSignificand:
  2210. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  2211. begin
  2212. result := b;
  2213. exit;
  2214. end;
  2215. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  2216. begin
  2217. result := a;
  2218. exit
  2219. end;
  2220. if ( a.high < b.high ) then
  2221. result := a
  2222. else
  2223. result := b;
  2224. exit;
  2225. end
  2226. else
  2227. result:=b;
  2228. end;
  2229. {$ENDIF}
  2230. (****************************************************************************)
  2231. (* END ENDIAN SPECIFIC CODE *)
  2232. (****************************************************************************)
  2233. {*
  2234. -------------------------------------------------------------------------------
  2235. Returns the fraction bits of the single-precision floating-point value `a'.
  2236. -------------------------------------------------------------------------------
  2237. *}
  2238. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2239. Begin
  2240. ExtractFloat32Frac := A AND $007FFFFF;
  2241. End;
  2242. {*
  2243. -------------------------------------------------------------------------------
  2244. Returns the exponent bits of the single-precision floating-point value `a'.
  2245. -------------------------------------------------------------------------------
  2246. *}
  2247. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2248. Begin
  2249. extractFloat32Exp := (a shr 23) AND $FF;
  2250. End;
  2251. {*
  2252. -------------------------------------------------------------------------------
  2253. Returns the sign bit of the single-precision floating-point value `a'.
  2254. -------------------------------------------------------------------------------
  2255. *}
  2256. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2257. Begin
  2258. extractFloat32Sign := a shr 31;
  2259. End;
  2260. {*
  2261. -------------------------------------------------------------------------------
  2262. Normalizes the subnormal single-precision floating-point value represented
  2263. by the denormalized significand `aSig'. The normalized exponent and
  2264. significand are stored at the locations pointed to by `zExpPtr' and
  2265. `zSigPtr', respectively.
  2266. -------------------------------------------------------------------------------
  2267. *}
  2268. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2269. Var
  2270. ShiftCount : BYTE;
  2271. Begin
  2272. shiftCount := countLeadingZeros32( aSig ) - 8;
  2273. zSigPtr := aSig shl shiftCount;
  2274. zExpPtr := 1 - shiftCount;
  2275. End;
  2276. {*
  2277. -------------------------------------------------------------------------------
  2278. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2279. single-precision floating-point value, returning the result. After being
  2280. shifted into the proper positions, the three fields are simply added
  2281. together to form the result. This means that any integer portion of `zSig'
  2282. will be added into the exponent. Since a properly normalized significand
  2283. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2284. than the desired result exponent whenever `zSig' is a complete, normalized
  2285. significand.
  2286. -------------------------------------------------------------------------------
  2287. *}
  2288. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2289. Begin
  2290. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2291. + zSig;
  2292. End;
  2293. {*
  2294. -------------------------------------------------------------------------------
  2295. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2296. and significand `zSig', and returns the proper single-precision floating-
  2297. point value corresponding to the abstract input. Ordinarily, the abstract
  2298. value is simply rounded and packed into the single-precision format, with
  2299. the inexact exception raised if the abstract input cannot be represented
  2300. exactly. However, if the abstract value is too large, the overflow and
  2301. inexact exceptions are raised and an infinity or maximal finite value is
  2302. returned. If the abstract value is too small, the input value is rounded to
  2303. a subnormal number, and the underflow and inexact exceptions are raised if
  2304. the abstract input cannot be represented exactly as a subnormal single-
  2305. precision floating-point number.
  2306. The input significand `zSig' has its binary point between bits 30
  2307. and 29, which is 7 bits to the left of the usual location. This shifted
  2308. significand must be normalized or smaller. If `zSig' is not normalized,
  2309. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2310. and it must not require rounding. In the usual case that `zSig' is
  2311. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2312. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2313. Binary Floating-Point Arithmetic.
  2314. -------------------------------------------------------------------------------
  2315. *}
  2316. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2317. Var
  2318. roundingMode : TFPURoundingMode;
  2319. roundNearestEven : boolean;
  2320. roundIncrement, roundBits : BYTE;
  2321. IsTiny : boolean;
  2322. Begin
  2323. roundingMode := softfloat_rounding_mode;
  2324. roundNearestEven := (roundingMode = float_round_nearest_even);
  2325. roundIncrement := $40;
  2326. if not roundNearestEven then
  2327. Begin
  2328. if ( roundingMode = float_round_to_zero ) Then
  2329. Begin
  2330. roundIncrement := 0;
  2331. End
  2332. else
  2333. Begin
  2334. roundIncrement := $7F;
  2335. if ( zSign <> 0 ) then
  2336. Begin
  2337. if roundingMode = float_round_up then roundIncrement := 0;
  2338. End
  2339. else
  2340. Begin
  2341. if roundingMode = float_round_down then roundIncrement := 0;
  2342. End;
  2343. End
  2344. End;
  2345. roundBits := zSig AND $7F;
  2346. if ($FD <= bits16 (zExp) ) then
  2347. Begin
  2348. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2349. Begin
  2350. float_raise( [float_flag_overflow,float_flag_inexact] );
  2351. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2352. exit;
  2353. End;
  2354. if ( zExp < 0 ) then
  2355. Begin
  2356. isTiny :=
  2357. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2358. OR ( zExp < -1 )
  2359. OR ( (zSig + roundIncrement) < $80000000 );
  2360. shift32RightJamming( zSig, - zExp, zSig );
  2361. zExp := 0;
  2362. roundBits := zSig AND $7F;
  2363. if ( isTiny and (roundBits<>0) ) then
  2364. float_raise( float_flag_underflow );
  2365. End;
  2366. End;
  2367. if ( roundBits )<> 0 then
  2368. set_inexact_flag;
  2369. zSig := ( zSig + roundIncrement ) shr 7;
  2370. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2371. if ( zSig = 0 ) then zExp := 0;
  2372. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2373. End;
  2374. {*
  2375. -------------------------------------------------------------------------------
  2376. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2377. and significand `zSig', and returns the proper single-precision floating-
  2378. point value corresponding to the abstract input. This routine is just like
  2379. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2380. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2381. floating-point exponent.
  2382. -------------------------------------------------------------------------------
  2383. *}
  2384. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2385. Var
  2386. ShiftCount : int8;
  2387. Begin
  2388. shiftCount := countLeadingZeros32( zSig ) - 1;
  2389. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2390. End;
  2391. {*
  2392. -------------------------------------------------------------------------------
  2393. Returns the most-significant 20 fraction bits of the double-precision
  2394. floating-point value `a'.
  2395. -------------------------------------------------------------------------------
  2396. *}
  2397. Function extractFloat64Frac0(a: float64): bits32; inline;
  2398. Begin
  2399. extractFloat64Frac0 := a.high and $000FFFFF;
  2400. End;
  2401. {*
  2402. -------------------------------------------------------------------------------
  2403. Returns the least-significant 32 fraction bits of the double-precision
  2404. floating-point value `a'.
  2405. -------------------------------------------------------------------------------
  2406. *}
  2407. Function extractFloat64Frac1(a: float64): bits32; inline;
  2408. Begin
  2409. extractFloat64Frac1 := a.low;
  2410. End;
  2411. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2412. Function extractFloat64Frac(a: float64): bits64; inline;
  2413. Begin
  2414. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2415. End;
  2416. {*
  2417. -------------------------------------------------------------------------------
  2418. Returns the exponent bits of the double-precision floating-point value `a'.
  2419. -------------------------------------------------------------------------------
  2420. *}
  2421. Function extractFloat64Exp(a: float64): int16; inline;
  2422. Begin
  2423. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2424. End;
  2425. {*
  2426. -------------------------------------------------------------------------------
  2427. Returns the sign bit of the double-precision floating-point value `a'.
  2428. -------------------------------------------------------------------------------
  2429. *}
  2430. Function extractFloat64Sign(a: float64) : flag; inline;
  2431. Begin
  2432. extractFloat64Sign := a.high shr 31;
  2433. End;
  2434. {*
  2435. -------------------------------------------------------------------------------
  2436. Normalizes the subnormal double-precision floating-point value represented
  2437. by the denormalized significand formed by the concatenation of `aSig0' and
  2438. `aSig1'. The normalized exponent is stored at the location pointed to by
  2439. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2440. stored at the location pointed to by `zSig0Ptr', and the least significant
  2441. 32 bits of the normalized significand are stored at the location pointed to
  2442. by `zSig1Ptr'.
  2443. -------------------------------------------------------------------------------
  2444. *}
  2445. Procedure normalizeFloat64Subnormal(
  2446. aSig0: bits32;
  2447. aSig1: bits32;
  2448. VAR zExpPtr : Int16;
  2449. VAR zSig0Ptr : Bits32;
  2450. VAR zSig1Ptr : Bits32
  2451. );
  2452. Var
  2453. ShiftCount : Int8;
  2454. Begin
  2455. if ( aSig0 = 0 ) then
  2456. Begin
  2457. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2458. if ( shiftCount < 0 ) then
  2459. Begin
  2460. zSig0Ptr := aSig1 shr ( - shiftCount );
  2461. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2462. End
  2463. else
  2464. Begin
  2465. zSig0Ptr := aSig1 shl shiftCount;
  2466. zSig1Ptr := 0;
  2467. End;
  2468. zExpPtr := - shiftCount - 31;
  2469. End
  2470. else
  2471. Begin
  2472. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2473. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2474. zExpPtr := 1 - shiftCount;
  2475. End;
  2476. End;
  2477. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2478. var
  2479. shiftCount : int8;
  2480. begin
  2481. shiftCount := countLeadingZeros64( aSig ) - 11;
  2482. zSigPtr := aSig shl shiftCount;
  2483. zExpPtr := 1 - shiftCount;
  2484. end;
  2485. {*
  2486. -------------------------------------------------------------------------------
  2487. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2488. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2489. point value, returning the result. After being shifted into the proper
  2490. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2491. together to form the most significant 32 bits of the result. This means
  2492. that any integer portion of `zSig0' will be added into the exponent. Since
  2493. a properly normalized significand will have an integer portion equal to 1,
  2494. the `zExp' input should be 1 less than the desired result exponent whenever
  2495. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2496. -------------------------------------------------------------------------------
  2497. *}
  2498. Procedure
  2499. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2500. var
  2501. z: Float64;
  2502. Begin
  2503. z.low := zSig1;
  2504. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2505. c := z;
  2506. End;
  2507. {*----------------------------------------------------------------------------
  2508. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2509. | double-precision floating-point value, returning the result. After being
  2510. | shifted into the proper positions, the three fields are simply added
  2511. | together to form the result. This means that any integer portion of `zSig'
  2512. | will be added into the exponent. Since a properly normalized significand
  2513. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2514. | than the desired result exponent whenever `zSig' is a complete, normalized
  2515. | significand.
  2516. *----------------------------------------------------------------------------*}
  2517. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2518. begin
  2519. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2520. end;
  2521. {*
  2522. -------------------------------------------------------------------------------
  2523. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2524. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2525. and `zSig2', and returns the proper double-precision floating-point value
  2526. corresponding to the abstract input. Ordinarily, the abstract value is
  2527. simply rounded and packed into the double-precision format, with the inexact
  2528. exception raised if the abstract input cannot be represented exactly.
  2529. However, if the abstract value is too large, the overflow and inexact
  2530. exceptions are raised and an infinity or maximal finite value is returned.
  2531. If the abstract value is too small, the input value is rounded to a
  2532. subnormal number, and the underflow and inexact exceptions are raised if the
  2533. abstract input cannot be represented exactly as a subnormal double-precision
  2534. floating-point number.
  2535. The input significand must be normalized or smaller. If the input
  2536. significand is not normalized, `zExp' must be 0; in that case, the result
  2537. returned is a subnormal number, and it must not require rounding. In the
  2538. usual case that the input significand is normalized, `zExp' must be 1 less
  2539. than the ``true'' floating-point exponent. The handling of underflow and
  2540. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2541. -------------------------------------------------------------------------------
  2542. *}
  2543. Procedure
  2544. roundAndPackFloat64(
  2545. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2546. Var
  2547. roundingMode : TFPURoundingMode;
  2548. roundNearestEven, increment, isTiny : Flag;
  2549. Begin
  2550. roundingMode := softfloat_rounding_mode;
  2551. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2552. increment := flag( sbits32 (zSig2) < 0 );
  2553. if ( roundNearestEven = flag(FALSE) ) then
  2554. Begin
  2555. if ( roundingMode = float_round_to_zero ) then
  2556. increment := 0
  2557. else
  2558. Begin
  2559. if ( zSign )<> 0 then
  2560. Begin
  2561. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2562. End
  2563. else
  2564. Begin
  2565. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2566. End
  2567. End
  2568. End;
  2569. if ( $7FD <= bits16 (zExp) ) then
  2570. Begin
  2571. if (( $7FD < zExp )
  2572. or (( zExp = $7FD )
  2573. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2574. and (increment<>0)
  2575. )
  2576. ) then
  2577. Begin
  2578. float_raise( [float_flag_overflow,float_flag_inexact] );
  2579. if (( roundingMode = float_round_to_zero )
  2580. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2581. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2582. ) then
  2583. Begin
  2584. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2585. exit;
  2586. End;
  2587. packFloat64( zSign, $7FF, 0, 0, c );
  2588. exit;
  2589. End;
  2590. if ( zExp < 0 ) then
  2591. Begin
  2592. isTiny :=
  2593. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2594. or flag( zExp < -1 )
  2595. or flag(increment = 0)
  2596. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2597. shift64ExtraRightJamming(
  2598. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2599. zExp := 0;
  2600. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2601. if ( roundNearestEven )<>0 then
  2602. Begin
  2603. increment := flag( sbits32 (zSig2) < 0 );
  2604. End
  2605. else
  2606. Begin
  2607. if ( zSign )<>0 then
  2608. Begin
  2609. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2610. End
  2611. else
  2612. Begin
  2613. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2614. End
  2615. End;
  2616. End;
  2617. End;
  2618. if ( zSig2 )<>0 then
  2619. set_inexact_flag;
  2620. if ( increment )<>0 then
  2621. Begin
  2622. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2623. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2624. End
  2625. else
  2626. Begin
  2627. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2628. End;
  2629. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2630. End;
  2631. {*----------------------------------------------------------------------------
  2632. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2633. | and significand `zSig', and returns the proper double-precision floating-
  2634. | point value corresponding to the abstract input. Ordinarily, the abstract
  2635. | value is simply rounded and packed into the double-precision format, with
  2636. | the inexact exception raised if the abstract input cannot be represented
  2637. | exactly. However, if the abstract value is too large, the overflow and
  2638. | inexact exceptions are raised and an infinity or maximal finite value is
  2639. | returned. If the abstract value is too small, the input value is rounded
  2640. | to a subnormal number, and the underflow and inexact exceptions are raised
  2641. | if the abstract input cannot be represented exactly as a subnormal double-
  2642. | precision floating-point number.
  2643. | The input significand `zSig' has its binary point between bits 62
  2644. | and 61, which is 10 bits to the left of the usual location. This shifted
  2645. | significand must be normalized or smaller. If `zSig' is not normalized,
  2646. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2647. | and it must not require rounding. In the usual case that `zSig' is
  2648. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2649. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2650. | Binary Floating-Point Arithmetic.
  2651. *----------------------------------------------------------------------------*}
  2652. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2653. var
  2654. roundingMode: TFPURoundingMode;
  2655. roundNearestEven: flag;
  2656. roundIncrement, roundBits: int16;
  2657. isTiny: flag;
  2658. begin
  2659. roundingMode := softfloat_rounding_mode;
  2660. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2661. roundIncrement := $200;
  2662. if ( roundNearestEven=0 ) then
  2663. begin
  2664. if ( roundingMode = float_round_to_zero ) then
  2665. begin
  2666. roundIncrement := 0;
  2667. end
  2668. else begin
  2669. roundIncrement := $3FF;
  2670. if ( zSign<>0 ) then
  2671. begin
  2672. if ( roundingMode = float_round_up ) then
  2673. roundIncrement := 0;
  2674. end
  2675. else begin
  2676. if ( roundingMode = float_round_down ) then
  2677. roundIncrement := 0;
  2678. end
  2679. end
  2680. end;
  2681. roundBits := zSig and $3FF;
  2682. if ( $7FD <= bits16(zExp) ) then
  2683. begin
  2684. if ( ( $7FD < zExp )
  2685. or ( ( zExp = $7FD )
  2686. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2687. ) then
  2688. begin
  2689. float_raise( [float_flag_overflow,float_flag_inexact] );
  2690. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2691. exit;
  2692. end;
  2693. if ( zExp < 0 ) then
  2694. begin
  2695. isTiny := ord(
  2696. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2697. or ( zExp < -1 )
  2698. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2699. shift64RightJamming( zSig, - zExp, zSig );
  2700. zExp := 0;
  2701. roundBits := zSig and $3FF;
  2702. if ( isTiny and roundBits )<>0 then
  2703. float_raise( float_flag_underflow );
  2704. end
  2705. end;
  2706. if ( roundBits<>0 ) then
  2707. set_inexact_flag;
  2708. zSig := ( zSig + roundIncrement ) shr 10;
  2709. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2710. if ( zSig = 0 ) then
  2711. zExp := 0;
  2712. result:=packFloat64( zSign, zExp, zSig );
  2713. end;
  2714. {*
  2715. -------------------------------------------------------------------------------
  2716. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2717. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2718. returns the proper double-precision floating-point value corresponding
  2719. to the abstract input. This routine is just like `roundAndPackFloat64'
  2720. except that the input significand has fewer bits and does not have to be
  2721. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2722. point exponent.
  2723. -------------------------------------------------------------------------------
  2724. *}
  2725. Procedure
  2726. normalizeRoundAndPackFloat64(
  2727. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2728. Var
  2729. shiftCount : int8;
  2730. zSig2 : bits32;
  2731. Begin
  2732. if ( zSig0 = 0 ) then
  2733. Begin
  2734. zSig0 := zSig1;
  2735. zSig1 := 0;
  2736. zExp := zExp -32;
  2737. End;
  2738. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2739. if ( 0 <= shiftCount ) then
  2740. Begin
  2741. zSig2 := 0;
  2742. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2743. End
  2744. else
  2745. Begin
  2746. shift64ExtraRightJamming
  2747. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2748. End;
  2749. zExp := zExp - shiftCount;
  2750. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2751. End;
  2752. {*
  2753. ----------------------------------------------------------------------------
  2754. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2755. and significand `zSig', and returns the proper double-precision floating-
  2756. point value corresponding to the abstract input. This routine is just like
  2757. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2758. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2759. floating-point exponent.
  2760. ----------------------------------------------------------------------------
  2761. *}
  2762. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2763. var
  2764. shiftCount: int8;
  2765. begin
  2766. shiftCount := countLeadingZeros64( zSig ) - 1;
  2767. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2768. end;
  2769. {*
  2770. -------------------------------------------------------------------------------
  2771. Returns the result of converting the 32-bit two's complement integer `a' to
  2772. the single-precision floating-point format. The conversion is performed
  2773. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2774. -------------------------------------------------------------------------------
  2775. *}
  2776. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2777. Var
  2778. zSign : Flag;
  2779. Begin
  2780. if ( a = 0 ) then
  2781. Begin
  2782. int32_to_float32.float32 := 0;
  2783. exit;
  2784. End;
  2785. if ( a = sbits32 ($80000000) ) then
  2786. Begin
  2787. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2788. exit;
  2789. end;
  2790. zSign := flag( a < 0 );
  2791. If zSign<>0 then
  2792. a := -a;
  2793. int32_to_float32.float32:=
  2794. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2795. End;
  2796. {*
  2797. -------------------------------------------------------------------------------
  2798. Returns the result of converting the 32-bit two's complement integer `a' to
  2799. the double-precision floating-point format. The conversion is performed
  2800. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2801. -------------------------------------------------------------------------------
  2802. *}
  2803. Function int32_to_float64( a: int32) : float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2804. var
  2805. zSign : flag;
  2806. absA : bits32;
  2807. shiftCount : int8;
  2808. zSig0, zSig1 : bits32;
  2809. Begin
  2810. if ( a = 0 ) then
  2811. Begin
  2812. packFloat64( 0, 0, 0, 0, result );
  2813. exit;
  2814. end;
  2815. zSign := flag( a < 0 );
  2816. if ZSign<>0 then
  2817. AbsA := -a
  2818. else
  2819. AbsA := a;
  2820. shiftCount := countLeadingZeros32( absA ) - 11;
  2821. if ( 0 <= shiftCount ) then
  2822. Begin
  2823. zSig0 := absA shl shiftCount;
  2824. zSig1 := 0;
  2825. End
  2826. else
  2827. Begin
  2828. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2829. End;
  2830. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2831. End;
  2832. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2833. {$if not defined(packFloatx80)}
  2834. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2835. forward;
  2836. {$endif}
  2837. {*----------------------------------------------------------------------------
  2838. | Returns the result of converting the 32-bit two's complement integer `a'
  2839. | to the extended double-precision floating-point format. The conversion
  2840. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2841. | Arithmetic.
  2842. *----------------------------------------------------------------------------*}
  2843. function int32_to_floatx80( a: int32 ): floatx80;
  2844. var
  2845. zSign: flag;
  2846. absA: uint32;
  2847. shiftCount: int8;
  2848. zSig: bits64;
  2849. begin
  2850. if ( a = 0 ) then begin
  2851. result := packFloatx80( 0, 0, 0 );
  2852. exit;
  2853. end;
  2854. zSign := ord( a < 0 );
  2855. if zSign <> 0 then absA := - a else absA := a;
  2856. shiftCount := countLeadingZeros32( absA ) + 32;
  2857. zSig := absA;
  2858. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2859. end;
  2860. {$endif FPC_SOFTFLOAT_FLOATX80}
  2861. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2862. {$if not defined(packFloat128)}
  2863. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2864. forward;
  2865. {$endif}
  2866. {*----------------------------------------------------------------------------
  2867. | Returns the result of converting the 32-bit two's complement integer `a' to
  2868. | the quadruple-precision floating-point format. The conversion is performed
  2869. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2870. *----------------------------------------------------------------------------*}
  2871. function int32_to_float128( a: int32 ): float128;
  2872. var
  2873. zSign: flag;
  2874. absA: uint32;
  2875. shiftCount: int8;
  2876. zSig0: bits64;
  2877. begin
  2878. if ( a = 0 ) then begin
  2879. result := packFloat128( 0, 0, 0, 0 );
  2880. exit;
  2881. end;
  2882. zSign := ord( a < 0 );
  2883. if zSign <> 0 then absA := - a else absA := a;
  2884. shiftCount := countLeadingZeros32( absA ) + 17;
  2885. zSig0 := absA;
  2886. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2887. end;
  2888. {$endif FPC_SOFTFLOAT_FLOAT128}
  2889. {*
  2890. -------------------------------------------------------------------------------
  2891. Returns the result of converting the single-precision floating-point value
  2892. `a' to the 32-bit two's complement integer format. The conversion is
  2893. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2894. Arithmetic---which means in particular that the conversion is rounded
  2895. according to the current rounding mode. If `a' is a NaN, the largest
  2896. positive integer is returned. Otherwise, if the conversion overflows, the
  2897. largest integer with the same sign as `a' is returned.
  2898. -------------------------------------------------------------------------------
  2899. *}
  2900. Function float32_to_int32( a : float32rec) : int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2901. Var
  2902. aSign: flag;
  2903. aExp, shiftCount: int16;
  2904. aSig, aSigExtra: bits32;
  2905. z: int32;
  2906. roundingMode: TFPURoundingMode;
  2907. Begin
  2908. aSig := extractFloat32Frac( a.float32 );
  2909. aExp := extractFloat32Exp( a.float32 );
  2910. aSign := extractFloat32Sign( a.float32 );
  2911. shiftCount := aExp - $96;
  2912. if ( 0 <= shiftCount ) then
  2913. Begin
  2914. if ( $9E <= aExp ) then
  2915. Begin
  2916. if ( a.float32 <> $CF000000 ) then
  2917. Begin
  2918. float_raise( float_flag_invalid );
  2919. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2920. Begin
  2921. float32_to_int32 := $7FFFFFFF;
  2922. exit;
  2923. End;
  2924. End;
  2925. float32_to_int32 := sbits32 ($80000000);
  2926. exit;
  2927. End;
  2928. z := ( aSig or $00800000 ) shl shiftCount;
  2929. if ( aSign<>0 ) then z := - z;
  2930. End
  2931. else
  2932. Begin
  2933. if ( aExp < $7E ) then
  2934. Begin
  2935. aSigExtra := aExp OR aSig;
  2936. z := 0;
  2937. End
  2938. else
  2939. Begin
  2940. aSig := aSig OR $00800000;
  2941. aSigExtra := aSig shl ( shiftCount and 31 );
  2942. z := aSig shr ( - shiftCount );
  2943. End;
  2944. if ( aSigExtra<>0 ) then
  2945. set_inexact_flag;
  2946. roundingMode := softfloat_rounding_mode;
  2947. if ( roundingMode = float_round_nearest_even ) then
  2948. Begin
  2949. if ( sbits32 (aSigExtra) < 0 ) then
  2950. Begin
  2951. Inc(z);
  2952. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2953. z := z and not 1;
  2954. End;
  2955. if ( aSign<>0 ) then
  2956. z := - z;
  2957. End
  2958. else
  2959. Begin
  2960. aSigExtra := flag( aSigExtra <> 0 );
  2961. if ( aSign<>0 ) then
  2962. Begin
  2963. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2964. z := - z;
  2965. End
  2966. else
  2967. Begin
  2968. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2969. End
  2970. End;
  2971. End;
  2972. float32_to_int32 := z;
  2973. End;
  2974. {*
  2975. -------------------------------------------------------------------------------
  2976. Returns the result of converting the single-precision floating-point value
  2977. `a' to the 32-bit two's complement integer format. The conversion is
  2978. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2979. Arithmetic, except that the conversion is always rounded toward zero.
  2980. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2981. the conversion overflows, the largest integer with the same sign as `a' is
  2982. returned.
  2983. -------------------------------------------------------------------------------
  2984. *}
  2985. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2986. Var
  2987. aSign : flag;
  2988. aExp, shiftCount : int16;
  2989. aSig : bits32;
  2990. z : int32;
  2991. Begin
  2992. aSig := extractFloat32Frac( a.float32 );
  2993. aExp := extractFloat32Exp( a.float32 );
  2994. aSign := extractFloat32Sign( a.float32 );
  2995. shiftCount := aExp - $9E;
  2996. if ( 0 <= shiftCount ) then
  2997. Begin
  2998. if ( a.float32 <> $CF000000 ) then
  2999. Begin
  3000. float_raise( float_flag_invalid );
  3001. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  3002. Begin
  3003. float32_to_int32_round_to_zero := $7FFFFFFF;
  3004. exit;
  3005. end;
  3006. End;
  3007. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  3008. exit;
  3009. End
  3010. else
  3011. if ( aExp <= $7E ) then
  3012. Begin
  3013. if ( aExp or aSig )<>0 then
  3014. set_inexact_flag;
  3015. float32_to_int32_round_to_zero := 0;
  3016. exit;
  3017. End;
  3018. aSig := ( aSig or $00800000 ) shl 8;
  3019. z := aSig shr ( - shiftCount );
  3020. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  3021. Begin
  3022. set_inexact_flag;
  3023. End;
  3024. if ( aSign<>0 ) then z := - z;
  3025. float32_to_int32_round_to_zero := z;
  3026. End;
  3027. {*----------------------------------------------------------------------------
  3028. | Returns the result of converting the single-precision floating-point value
  3029. | `a' to the 64-bit two's complement integer format. The conversion is
  3030. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3031. | Arithmetic---which means in particular that the conversion is rounded
  3032. | according to the current rounding mode. If `a' is a NaN, the largest
  3033. | positive integer is returned. Otherwise, if the conversion overflows, the
  3034. | largest integer with the same sign as `a' is returned.
  3035. *----------------------------------------------------------------------------*}
  3036. function float32_to_int64( a: float32 ): int64;
  3037. var
  3038. aSign: flag;
  3039. aExp, shiftCount: int16;
  3040. aSig: bits32;
  3041. aSig64, aSigExtra: bits64;
  3042. begin
  3043. aSig := extractFloat32Frac( a );
  3044. aExp := extractFloat32Exp( a );
  3045. aSign := extractFloat32Sign( a );
  3046. shiftCount := $BE - aExp;
  3047. if ( shiftCount < 0 ) then begin
  3048. float_raise( float_flag_invalid );
  3049. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3050. result := $7FFFFFFFFFFFFFFF;
  3051. exit;
  3052. end;
  3053. result := $8000000000000000;
  3054. exit;
  3055. end;
  3056. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  3057. aSig64 := aSig;
  3058. aSig64 := aSig64 shl 40;
  3059. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  3060. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  3061. end;
  3062. {*----------------------------------------------------------------------------
  3063. | Returns the result of converting the single-precision floating-point value
  3064. | `a' to the 64-bit two's complement integer format. The conversion is
  3065. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3066. | Arithmetic, except that the conversion is always rounded toward zero. If
  3067. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  3068. | conversion overflows, the largest integer with the same sign as `a' is
  3069. | returned.
  3070. *----------------------------------------------------------------------------*}
  3071. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3072. var
  3073. aSign: flag;
  3074. aExp, shiftCount: int16;
  3075. aSig: bits32;
  3076. aSig64: bits64;
  3077. z: int64;
  3078. begin
  3079. aSig := extractFloat32Frac( a );
  3080. aExp := extractFloat32Exp( a );
  3081. aSign := extractFloat32Sign( a );
  3082. shiftCount := aExp - $BE;
  3083. if ( 0 <= shiftCount ) then begin
  3084. if ( a <> $DF000000 ) then begin
  3085. float_raise( float_flag_invalid );
  3086. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3087. result := $7FFFFFFFFFFFFFFF;
  3088. exit;
  3089. end;
  3090. end;
  3091. result := $8000000000000000;
  3092. exit;
  3093. end
  3094. else if ( aExp <= $7E ) then begin
  3095. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3096. result := 0;
  3097. exit;
  3098. end;
  3099. aSig64 := aSig or $00800000;
  3100. aSig64 := aSig64 shl 40;
  3101. z := aSig64 shr ( - shiftCount );
  3102. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3103. set_inexact_flag;
  3104. if ( aSign <> 0 ) then z := - z;
  3105. result := z;
  3106. end;
  3107. {*
  3108. -------------------------------------------------------------------------------
  3109. Returns the result of converting the single-precision floating-point value
  3110. `a' to the double-precision floating-point format. The conversion is
  3111. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3112. Arithmetic.
  3113. -------------------------------------------------------------------------------
  3114. *}
  3115. Function float32_to_float64( a : float32rec) : Float64;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3116. Var
  3117. aSign : flag;
  3118. aExp : int16;
  3119. aSig, zSig0, zSig1: bits32;
  3120. tmp : CommonNanT;
  3121. Begin
  3122. aSig := extractFloat32Frac( a.float32 );
  3123. aExp := extractFloat32Exp( a.float32 );
  3124. aSign := extractFloat32Sign( a.float32 );
  3125. if ( aExp = $FF ) then
  3126. Begin
  3127. if ( aSig<>0 ) then
  3128. Begin
  3129. tmp:=float32ToCommonNaN(a.float32);
  3130. result:=commonNaNToFloat64(tmp);
  3131. exit;
  3132. End;
  3133. packFloat64( aSign, $7FF, 0, 0, result);
  3134. exit;
  3135. End;
  3136. if ( aExp = 0 ) then
  3137. Begin
  3138. if ( aSig = 0 ) then
  3139. Begin
  3140. packFloat64( aSign, 0, 0, 0, result );
  3141. exit;
  3142. end;
  3143. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3144. Dec(aExp);
  3145. End;
  3146. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3147. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3148. End;
  3149. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3150. {*----------------------------------------------------------------------------
  3151. | Returns the result of converting the canonical NaN `a' to the extended
  3152. | double-precision floating-point format.
  3153. *----------------------------------------------------------------------------*}
  3154. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3155. var
  3156. z : floatx80;
  3157. begin
  3158. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3159. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3160. result := z;
  3161. end;
  3162. {*----------------------------------------------------------------------------
  3163. | Returns the result of converting the single-precision floating-point value
  3164. | `a' to the extended double-precision floating-point format. The conversion
  3165. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3166. | Arithmetic.
  3167. *----------------------------------------------------------------------------*}
  3168. function float32_to_floatx80( a: float32 ): floatx80;
  3169. var
  3170. aSign: flag;
  3171. aExp: int16;
  3172. aSig: bits32;
  3173. tmp: commonNaNT;
  3174. begin
  3175. aSig := extractFloat32Frac( a );
  3176. aExp := extractFloat32Exp( a );
  3177. aSign := extractFloat32Sign( a );
  3178. if ( aExp = $FF ) then begin
  3179. if ( aSig <> 0 ) then begin
  3180. tmp:=float32ToCommonNaN(a);
  3181. result := commonNaNToFloatx80( tmp );
  3182. exit;
  3183. end;
  3184. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3185. exit;
  3186. end;
  3187. if ( aExp = 0 ) then begin
  3188. if ( aSig = 0 ) then begin
  3189. result := packFloatx80( aSign, 0, 0 );
  3190. exit;
  3191. end;
  3192. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3193. end;
  3194. aSig := aSig or $00800000;
  3195. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3196. end;
  3197. {$endif FPC_SOFTFLOAT_FLOATX80}
  3198. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3199. {*----------------------------------------------------------------------------
  3200. | Returns the result of converting the single-precision floating-point value
  3201. | `a' to the double-precision floating-point format. The conversion is
  3202. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3203. | Arithmetic.
  3204. *----------------------------------------------------------------------------*}
  3205. function float32_to_float128( a: float32 ): float128;
  3206. var
  3207. aSign: flag;
  3208. aExp: int16;
  3209. aSig: bits32;
  3210. tmp: commonNaNT;
  3211. begin
  3212. aSig := extractFloat32Frac( a );
  3213. aExp := extractFloat32Exp( a );
  3214. aSign := extractFloat32Sign( a );
  3215. if ( aExp = $FF ) then begin
  3216. if ( aSig <> 0 ) then begin
  3217. tmp:=float32ToCommonNaN(a);
  3218. result := commonNaNToFloat128( tmp );
  3219. exit;
  3220. end;
  3221. result := packFloat128( aSign, $7FFF, 0, 0 );
  3222. exit;
  3223. end;
  3224. if ( aExp = 0 ) then begin
  3225. if ( aSig = 0 ) then begin
  3226. result := packFloat128( aSign, 0, 0, 0 );
  3227. exit;
  3228. end;
  3229. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3230. dec( aExp );
  3231. end;
  3232. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3233. end;
  3234. {$endif FPC_SOFTFLOAT_FLOAT128}
  3235. {*
  3236. -------------------------------------------------------------------------------
  3237. Rounds the single-precision floating-point value `a' to an integer,
  3238. and returns the result as a single-precision floating-point value. The
  3239. operation is performed according to the IEC/IEEE Standard for Binary
  3240. Floating-Point Arithmetic.
  3241. -------------------------------------------------------------------------------
  3242. *}
  3243. Function float32_round_to_int( a: float32rec): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3244. Var
  3245. aSign: flag;
  3246. aExp: int16;
  3247. lastBitMask, roundBitsMask: bits32;
  3248. roundingMode: TFPURoundingMode;
  3249. z: float32;
  3250. Begin
  3251. aExp := extractFloat32Exp( a.float32 );
  3252. if ( $96 <= aExp ) then
  3253. Begin
  3254. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3255. Begin
  3256. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3257. exit;
  3258. End;
  3259. float32_round_to_int:=a;
  3260. exit;
  3261. End;
  3262. if ( aExp <= $7E ) then
  3263. Begin
  3264. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3265. Begin
  3266. float32_round_to_int:=a;
  3267. exit;
  3268. end;
  3269. set_inexact_flag;
  3270. aSign := extractFloat32Sign( a.float32 );
  3271. case ( softfloat_rounding_mode ) of
  3272. float_round_nearest_even:
  3273. Begin
  3274. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3275. Begin
  3276. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3277. exit;
  3278. End;
  3279. End;
  3280. float_round_down:
  3281. Begin
  3282. if aSign <> 0 then
  3283. float32_round_to_int.float32 := $BF800000
  3284. else
  3285. float32_round_to_int.float32 := 0;
  3286. exit;
  3287. End;
  3288. float_round_up:
  3289. Begin
  3290. if aSign <> 0 then
  3291. float32_round_to_int.float32 := $80000000
  3292. else
  3293. float32_round_to_int.float32 := $3F800000;
  3294. exit;
  3295. End;
  3296. end;
  3297. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3298. exit;
  3299. End;
  3300. lastBitMask := 1;
  3301. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3302. lastBitMask := lastBitMask shl ($96 - aExp);
  3303. roundBitsMask := lastBitMask - 1;
  3304. z := a.float32;
  3305. roundingMode := softfloat_rounding_mode;
  3306. if ( roundingMode = float_round_nearest_even ) then
  3307. Begin
  3308. z := z + (lastBitMask shr 1);
  3309. if ( ( z and roundBitsMask ) = 0 ) then
  3310. z := z and not lastBitMask;
  3311. End
  3312. else if ( roundingMode <> float_round_to_zero ) then
  3313. Begin
  3314. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3315. Begin
  3316. z := z + roundBitsMask;
  3317. End;
  3318. End;
  3319. z := z and not roundBitsMask;
  3320. if ( z <> a.float32 ) then
  3321. set_inexact_flag;
  3322. float32_round_to_int.float32 := z;
  3323. End;
  3324. {*
  3325. -------------------------------------------------------------------------------
  3326. Returns the result of adding the absolute values of the single-precision
  3327. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3328. before being returned. `zSign' is ignored if the result is a NaN.
  3329. The addition is performed according to the IEC/IEEE Standard for Binary
  3330. Floating-Point Arithmetic.
  3331. -------------------------------------------------------------------------------
  3332. *}
  3333. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3334. Var
  3335. aExp, bExp, zExp: int16;
  3336. aSig, bSig, zSig: bits32;
  3337. expDiff: int16;
  3338. label roundAndPack;
  3339. Begin
  3340. aSig:=extractFloat32Frac( a );
  3341. aExp:=extractFloat32Exp( a );
  3342. bSig:=extractFloat32Frac( b );
  3343. bExp := extractFloat32Exp( b );
  3344. expDiff := aExp - bExp;
  3345. aSig := aSig shl 6;
  3346. bSig := bSig shl 6;
  3347. if ( 0 < expDiff ) then
  3348. Begin
  3349. if ( aExp = $FF ) then
  3350. Begin
  3351. if ( aSig <> 0) then
  3352. Begin
  3353. addFloat32Sigs := propagateFloat32NaN( a, b );
  3354. exit;
  3355. End;
  3356. addFloat32Sigs := a;
  3357. exit;
  3358. End;
  3359. if ( bExp = 0 ) then
  3360. Begin
  3361. Dec(expDiff);
  3362. End
  3363. else
  3364. Begin
  3365. bSig := bSig or $20000000;
  3366. End;
  3367. shift32RightJamming( bSig, expDiff, bSig );
  3368. zExp := aExp;
  3369. End
  3370. else
  3371. If ( expDiff < 0 ) then
  3372. Begin
  3373. if ( bExp = $FF ) then
  3374. Begin
  3375. if ( bSig<>0 ) then
  3376. Begin
  3377. addFloat32Sigs := propagateFloat32NaN( a, b );
  3378. exit;
  3379. end;
  3380. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3381. exit;
  3382. End;
  3383. if ( aExp = 0 ) then
  3384. Begin
  3385. Inc(expDiff);
  3386. End
  3387. else
  3388. Begin
  3389. aSig := aSig OR $20000000;
  3390. End;
  3391. shift32RightJamming( aSig, - expDiff, aSig );
  3392. zExp := bExp;
  3393. End
  3394. else
  3395. Begin
  3396. if ( aExp = $FF ) then
  3397. Begin
  3398. if ( aSig OR bSig )<> 0 then
  3399. Begin
  3400. addFloat32Sigs := propagateFloat32NaN( a, b );
  3401. exit;
  3402. end;
  3403. addFloat32Sigs := a;
  3404. exit;
  3405. End;
  3406. if ( aExp = 0 ) then
  3407. Begin
  3408. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3409. exit;
  3410. end;
  3411. zSig := $40000000 + aSig + bSig;
  3412. zExp := aExp;
  3413. goto roundAndPack;
  3414. End;
  3415. aSig := aSig OR $20000000;
  3416. zSig := ( aSig + bSig ) shl 1;
  3417. Dec(zExp);
  3418. if ( sbits32 (zSig) < 0 ) then
  3419. Begin
  3420. zSig := aSig + bSig;
  3421. Inc(zExp);
  3422. End;
  3423. roundAndPack:
  3424. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3425. End;
  3426. {*
  3427. -------------------------------------------------------------------------------
  3428. Returns the result of subtracting the absolute values of the single-
  3429. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3430. difference is negated before being returned. `zSign' is ignored if the
  3431. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3432. Standard for Binary Floating-Point Arithmetic.
  3433. -------------------------------------------------------------------------------
  3434. *}
  3435. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3436. Var
  3437. aExp, bExp, zExp: int16;
  3438. aSig, bSig, zSig: bits32;
  3439. expDiff : int16;
  3440. label aExpBigger;
  3441. label bExpBigger;
  3442. label aBigger;
  3443. label bBigger;
  3444. label normalizeRoundAndPack;
  3445. Begin
  3446. aSig := extractFloat32Frac( a );
  3447. aExp := extractFloat32Exp( a );
  3448. bSig := extractFloat32Frac( b );
  3449. bExp := extractFloat32Exp( b );
  3450. expDiff := aExp - bExp;
  3451. aSig := aSig shl 7;
  3452. bSig := bSig shl 7;
  3453. if ( 0 < expDiff ) then goto aExpBigger;
  3454. if ( expDiff < 0 ) then goto bExpBigger;
  3455. if ( aExp = $FF ) then
  3456. Begin
  3457. if ( aSig OR bSig )<> 0 then
  3458. Begin
  3459. subFloat32Sigs := propagateFloat32NaN( a, b );
  3460. exit;
  3461. End;
  3462. float_raise( float_flag_invalid );
  3463. subFloat32Sigs := float32_default_nan;
  3464. exit;
  3465. End;
  3466. if ( aExp = 0 ) then
  3467. Begin
  3468. aExp := 1;
  3469. bExp := 1;
  3470. End;
  3471. if ( bSig < aSig ) Then goto aBigger;
  3472. if ( aSig < bSig ) Then goto bBigger;
  3473. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3474. exit;
  3475. bExpBigger:
  3476. if ( bExp = $FF ) then
  3477. Begin
  3478. if ( bSig<>0 ) then
  3479. Begin
  3480. subFloat32Sigs := propagateFloat32NaN( a, b );
  3481. exit;
  3482. End;
  3483. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3484. exit;
  3485. End;
  3486. if ( aExp = 0 ) then
  3487. Begin
  3488. Inc(expDiff);
  3489. End
  3490. else
  3491. Begin
  3492. aSig := aSig OR $40000000;
  3493. End;
  3494. shift32RightJamming( aSig, - expDiff, aSig );
  3495. bSig := bSig OR $40000000;
  3496. bBigger:
  3497. zSig := bSig - aSig;
  3498. zExp := bExp;
  3499. zSign := zSign xor 1;
  3500. goto normalizeRoundAndPack;
  3501. aExpBigger:
  3502. if ( aExp = $FF ) then
  3503. Begin
  3504. if ( aSig <> 0) then
  3505. Begin
  3506. subFloat32Sigs := propagateFloat32NaN( a, b );
  3507. exit;
  3508. End;
  3509. subFloat32Sigs := a;
  3510. exit;
  3511. End;
  3512. if ( bExp = 0 ) then
  3513. Begin
  3514. Dec(expDiff);
  3515. End
  3516. else
  3517. Begin
  3518. bSig := bSig OR $40000000;
  3519. End;
  3520. shift32RightJamming( bSig, expDiff, bSig );
  3521. aSig := aSig OR $40000000;
  3522. aBigger:
  3523. zSig := aSig - bSig;
  3524. zExp := aExp;
  3525. normalizeRoundAndPack:
  3526. Dec(zExp);
  3527. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3528. End;
  3529. {*
  3530. -------------------------------------------------------------------------------
  3531. Returns the result of adding the single-precision floating-point values `a'
  3532. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3533. Binary Floating-Point Arithmetic.
  3534. -------------------------------------------------------------------------------
  3535. *}
  3536. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3537. Var
  3538. aSign, bSign: Flag;
  3539. Begin
  3540. aSign := extractFloat32Sign( a.float32 );
  3541. bSign := extractFloat32Sign( b.float32 );
  3542. if ( aSign = bSign ) then
  3543. Begin
  3544. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3545. End
  3546. else
  3547. Begin
  3548. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3549. End;
  3550. End;
  3551. {*
  3552. -------------------------------------------------------------------------------
  3553. Returns the result of subtracting the single-precision floating-point values
  3554. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3555. for Binary Floating-Point Arithmetic.
  3556. -------------------------------------------------------------------------------
  3557. *}
  3558. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3559. Var
  3560. aSign, bSign: flag;
  3561. Begin
  3562. aSign := extractFloat32Sign( a.float32 );
  3563. bSign := extractFloat32Sign( b.float32 );
  3564. if ( aSign = bSign ) then
  3565. Begin
  3566. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3567. End
  3568. else
  3569. Begin
  3570. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3571. End;
  3572. End;
  3573. {*
  3574. -------------------------------------------------------------------------------
  3575. Returns the result of multiplying the single-precision floating-point values
  3576. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3577. for Binary Floating-Point Arithmetic.
  3578. -------------------------------------------------------------------------------
  3579. *}
  3580. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3581. Var
  3582. aSign, bSign, zSign: flag;
  3583. aExp, bExp, zExp : int16;
  3584. aSig, bSig, zSig0, zSig1: bits32;
  3585. Begin
  3586. aSig := extractFloat32Frac( a.float32 );
  3587. aExp := extractFloat32Exp( a.float32 );
  3588. aSign := extractFloat32Sign( a.float32 );
  3589. bSig := extractFloat32Frac( b.float32 );
  3590. bExp := extractFloat32Exp( b.float32 );
  3591. bSign := extractFloat32Sign( b.float32 );
  3592. zSign := aSign xor bSign;
  3593. if ( aExp = $FF ) then
  3594. Begin
  3595. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3596. Begin
  3597. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3598. exit;
  3599. End;
  3600. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3601. Begin
  3602. float_raise( float_flag_invalid );
  3603. float32_mul.float32 := float32_default_nan;
  3604. exit;
  3605. End;
  3606. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3607. exit;
  3608. End;
  3609. if ( bExp = $FF ) then
  3610. Begin
  3611. if ( bSig <> 0 ) then
  3612. Begin
  3613. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3614. exit;
  3615. End;
  3616. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3617. Begin
  3618. float_raise( float_flag_invalid );
  3619. float32_mul.float32 := float32_default_nan;
  3620. exit;
  3621. End;
  3622. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3623. exit;
  3624. End;
  3625. if ( aExp = 0 ) then
  3626. Begin
  3627. if ( aSig = 0 ) then
  3628. Begin
  3629. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3630. exit;
  3631. End;
  3632. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3633. End;
  3634. if ( bExp = 0 ) then
  3635. Begin
  3636. if ( bSig = 0 ) then
  3637. Begin
  3638. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3639. exit;
  3640. End;
  3641. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3642. End;
  3643. zExp := aExp + bExp - $7F;
  3644. aSig := ( aSig OR $00800000 ) shl 7;
  3645. bSig := ( bSig OR $00800000 ) shl 8;
  3646. mul32To64( aSig, bSig, zSig0, zSig1 );
  3647. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3648. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3649. Begin
  3650. zSig0 := zSig0 shl 1;
  3651. Dec(zExp);
  3652. End;
  3653. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3654. End;
  3655. {*
  3656. -------------------------------------------------------------------------------
  3657. Returns the result of dividing the single-precision floating-point value `a'
  3658. by the corresponding value `b'. The operation is performed according to the
  3659. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3660. -------------------------------------------------------------------------------
  3661. *}
  3662. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3663. Var
  3664. aSign, bSign, zSign: flag;
  3665. aExp, bExp, zExp: int16;
  3666. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3667. Begin
  3668. aSig := extractFloat32Frac( a.float32 );
  3669. aExp := extractFloat32Exp( a.float32 );
  3670. aSign := extractFloat32Sign( a.float32 );
  3671. bSig := extractFloat32Frac( b.float32 );
  3672. bExp := extractFloat32Exp( b.float32 );
  3673. bSign := extractFloat32Sign( b.float32 );
  3674. zSign := aSign xor bSign;
  3675. if ( aExp = $FF ) then
  3676. Begin
  3677. if ( aSig <> 0 ) then
  3678. Begin
  3679. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3680. exit;
  3681. End;
  3682. if ( bExp = $FF ) then
  3683. Begin
  3684. if ( bSig <> 0) then
  3685. Begin
  3686. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3687. exit;
  3688. End;
  3689. float_raise( float_flag_invalid );
  3690. float32_div.float32 := float32_default_nan;
  3691. exit;
  3692. End;
  3693. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3694. exit;
  3695. End;
  3696. if ( bExp = $FF ) then
  3697. Begin
  3698. if ( bSig <> 0) then
  3699. Begin
  3700. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3701. exit;
  3702. End;
  3703. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3704. exit;
  3705. End;
  3706. if ( bExp = 0 ) Then
  3707. Begin
  3708. if ( bSig = 0 ) Then
  3709. Begin
  3710. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3711. Begin
  3712. float_raise( float_flag_invalid );
  3713. float32_div.float32 := float32_default_nan;
  3714. exit;
  3715. End;
  3716. float_raise( float_flag_divbyzero );
  3717. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3718. exit;
  3719. End;
  3720. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3721. End;
  3722. if ( aExp = 0 ) Then
  3723. Begin
  3724. if ( aSig = 0 ) Then
  3725. Begin
  3726. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3727. exit;
  3728. End;
  3729. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3730. End;
  3731. zExp := aExp - bExp + $7D;
  3732. aSig := ( aSig OR $00800000 ) shl 7;
  3733. bSig := ( bSig OR $00800000 ) shl 8;
  3734. if ( bSig <= ( aSig + aSig ) ) then
  3735. Begin
  3736. aSig := aSig shr 1;
  3737. Inc(zExp);
  3738. End;
  3739. zSig := estimateDiv64To32( aSig, 0, bSig );
  3740. if ( ( zSig and $3F ) <= 2 ) then
  3741. Begin
  3742. mul32To64( bSig, zSig, term0, term1 );
  3743. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3744. while ( sbits32 (rem0) < 0 ) do
  3745. Begin
  3746. Dec(zSig);
  3747. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3748. End;
  3749. zSig := zSig or bits32( rem1 <> 0 );
  3750. End;
  3751. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3752. End;
  3753. {*
  3754. -------------------------------------------------------------------------------
  3755. Returns the remainder of the single-precision floating-point value `a'
  3756. with respect to the corresponding value `b'. The operation is performed
  3757. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3758. -------------------------------------------------------------------------------
  3759. *}
  3760. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3761. Var
  3762. aSign, zSign: flag;
  3763. aExp, bExp, expDiff: int16;
  3764. aSig, bSig, q, alternateASig: bits32;
  3765. sigMean: sbits32;
  3766. Begin
  3767. aSig := extractFloat32Frac( a.float32 );
  3768. aExp := extractFloat32Exp( a.float32 );
  3769. aSign := extractFloat32Sign( a.float32 );
  3770. bSig := extractFloat32Frac( b.float32 );
  3771. bExp := extractFloat32Exp( b.float32 );
  3772. if ( aExp = $FF ) then
  3773. Begin
  3774. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3775. Begin
  3776. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3777. exit;
  3778. End;
  3779. float_raise( float_flag_invalid );
  3780. float32_rem.float32 := float32_default_nan;
  3781. exit;
  3782. End;
  3783. if ( bExp = $FF ) then
  3784. Begin
  3785. if ( bSig <> 0 ) then
  3786. Begin
  3787. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3788. exit;
  3789. End;
  3790. float32_rem := a;
  3791. exit;
  3792. End;
  3793. if ( bExp = 0 ) then
  3794. Begin
  3795. if ( bSig = 0 ) then
  3796. Begin
  3797. float_raise( float_flag_invalid );
  3798. float32_rem.float32 := float32_default_nan;
  3799. exit;
  3800. End;
  3801. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3802. End;
  3803. if ( aExp = 0 ) then
  3804. Begin
  3805. if ( aSig = 0 ) then
  3806. Begin
  3807. float32_rem := a;
  3808. exit;
  3809. End;
  3810. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3811. End;
  3812. expDiff := aExp - bExp;
  3813. aSig := ( aSig OR $00800000 ) shl 8;
  3814. bSig := ( bSig OR $00800000 ) shl 8;
  3815. if ( expDiff < 0 ) then
  3816. Begin
  3817. if ( expDiff < -1 ) then
  3818. Begin
  3819. float32_rem := a;
  3820. exit;
  3821. End;
  3822. aSig := aSig shr 1;
  3823. End;
  3824. q := bits32( bSig <= aSig );
  3825. if ( q <> 0) then
  3826. aSig := aSig - bSig;
  3827. expDiff := expDiff - 32;
  3828. while ( 0 < expDiff ) do
  3829. Begin
  3830. q := estimateDiv64To32( aSig, 0, bSig );
  3831. if (2 < q) then
  3832. q := q - 2
  3833. else
  3834. q := 0;
  3835. aSig := - ( ( bSig shr 2 ) * q );
  3836. expDiff := expDiff - 30;
  3837. End;
  3838. expDiff := expDiff + 32;
  3839. if ( 0 < expDiff ) then
  3840. Begin
  3841. q := estimateDiv64To32( aSig, 0, bSig );
  3842. if (2 < q) then
  3843. q := q - 2
  3844. else
  3845. q := 0;
  3846. q := q shr (32 - expDiff);
  3847. bSig := bSig shr 2;
  3848. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3849. End
  3850. else
  3851. Begin
  3852. aSig := aSig shr 2;
  3853. bSig := bSig shr 2;
  3854. End;
  3855. Repeat
  3856. alternateASig := aSig;
  3857. Inc(q);
  3858. aSig := aSig - bSig;
  3859. Until not ( 0 <= sbits32 (aSig) );
  3860. sigMean := aSig + alternateASig;
  3861. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3862. Begin
  3863. aSig := alternateASig;
  3864. End;
  3865. zSign := flag( sbits32 (aSig) < 0 );
  3866. if ( zSign<>0 ) then
  3867. aSig := - aSig;
  3868. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3869. End;
  3870. {*
  3871. -------------------------------------------------------------------------------
  3872. Returns the square root of the single-precision floating-point value `a'.
  3873. The operation is performed according to the IEC/IEEE Standard for Binary
  3874. Floating-Point Arithmetic.
  3875. -------------------------------------------------------------------------------
  3876. *}
  3877. Function float32_sqrt(a: float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3878. Var
  3879. aSign : flag;
  3880. aExp, zExp : int16;
  3881. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3882. label roundAndPack;
  3883. Begin
  3884. aSig := extractFloat32Frac( a.float32 );
  3885. aExp := extractFloat32Exp( a.float32 );
  3886. aSign := extractFloat32Sign( a.float32 );
  3887. if ( aExp = $FF ) then
  3888. Begin
  3889. if ( aSig <> 0) then
  3890. Begin
  3891. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3892. exit;
  3893. End;
  3894. if ( aSign = 0) then
  3895. Begin
  3896. float32_sqrt := a;
  3897. exit;
  3898. End;
  3899. float_raise( float_flag_invalid );
  3900. float32_sqrt.float32 := float32_default_nan;
  3901. exit;
  3902. End;
  3903. if ( aSign <> 0) then
  3904. Begin
  3905. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3906. Begin
  3907. float32_sqrt := a;
  3908. exit;
  3909. End;
  3910. float_raise( float_flag_invalid );
  3911. float32_sqrt.float32 := float32_default_nan;
  3912. exit;
  3913. End;
  3914. if ( aExp = 0 ) then
  3915. Begin
  3916. if ( aSig = 0 ) then
  3917. Begin
  3918. float32_sqrt.float32 := 0;
  3919. exit;
  3920. End;
  3921. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3922. End;
  3923. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3924. aSig := ( aSig OR $00800000 ) shl 8;
  3925. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3926. if ( ( zSig and $7F ) <= 5 ) then
  3927. Begin
  3928. if ( zSig < 2 ) then
  3929. Begin
  3930. zSig := $7FFFFFFF;
  3931. goto roundAndPack;
  3932. End
  3933. else
  3934. Begin
  3935. aSig := aSig shr (aExp and 1);
  3936. mul32To64( zSig, zSig, term0, term1 );
  3937. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3938. while ( sbits32 (rem0) < 0 ) do
  3939. Begin
  3940. Dec(zSig);
  3941. shortShift64Left( 0, zSig, 1, term0, term1 );
  3942. term1 := term1 or 1;
  3943. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3944. End;
  3945. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3946. End;
  3947. End;
  3948. shift32RightJamming( zSig, 1, zSig );
  3949. roundAndPack:
  3950. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3951. End;
  3952. {*
  3953. -------------------------------------------------------------------------------
  3954. Returns 1 if the single-precision floating-point value `a' is equal to
  3955. the corresponding value `b', and 0 otherwise. The comparison is performed
  3956. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3957. -------------------------------------------------------------------------------
  3958. *}
  3959. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3960. Begin
  3961. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3962. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3963. ) then
  3964. Begin
  3965. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3966. Begin
  3967. float_raise( float_flag_invalid );
  3968. End;
  3969. float32_eq := 0;
  3970. exit;
  3971. End;
  3972. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3973. End;
  3974. {*
  3975. -------------------------------------------------------------------------------
  3976. Returns 1 if the single-precision floating-point value `a' is less than
  3977. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3978. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3979. Arithmetic.
  3980. -------------------------------------------------------------------------------
  3981. *}
  3982. Function float32_le( a: float32rec; b : float32rec ):flag;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3983. var
  3984. aSign, bSign: flag;
  3985. Begin
  3986. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3987. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3988. ) then
  3989. Begin
  3990. float_raise( float_flag_invalid );
  3991. float32_le := 0;
  3992. exit;
  3993. End;
  3994. aSign := extractFloat32Sign( a.float32 );
  3995. bSign := extractFloat32Sign( b.float32 );
  3996. if ( aSign <> bSign ) then
  3997. Begin
  3998. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3999. exit;
  4000. End;
  4001. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  4002. End;
  4003. {*
  4004. -------------------------------------------------------------------------------
  4005. Returns 1 if the single-precision floating-point value `a' is less than
  4006. the corresponding value `b', and 0 otherwise. The comparison is performed
  4007. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4008. -------------------------------------------------------------------------------
  4009. *}
  4010. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4011. var
  4012. aSign, bSign: flag;
  4013. Begin
  4014. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  4015. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  4016. ) then
  4017. Begin
  4018. float_raise( float_flag_invalid );
  4019. float32_lt :=0;
  4020. exit;
  4021. End;
  4022. aSign := extractFloat32Sign( a.float32 );
  4023. bSign := extractFloat32Sign( b.float32 );
  4024. if ( aSign <> bSign ) then
  4025. Begin
  4026. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  4027. exit;
  4028. End;
  4029. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  4030. End;
  4031. {*
  4032. -------------------------------------------------------------------------------
  4033. Returns 1 if the single-precision floating-point value `a' is equal to
  4034. the corresponding value `b', and 0 otherwise. The invalid exception is
  4035. raised if either operand is a NaN. Otherwise, the comparison is performed
  4036. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4037. -------------------------------------------------------------------------------
  4038. *}
  4039. Function float32_eq_signaling( a: float32; b: float32) : flag;
  4040. Begin
  4041. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  4042. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  4043. ) then
  4044. Begin
  4045. float_raise( float_flag_invalid );
  4046. float32_eq_signaling := 0;
  4047. exit;
  4048. End;
  4049. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  4050. End;
  4051. {*
  4052. -------------------------------------------------------------------------------
  4053. Returns 1 if the single-precision floating-point value `a' is less than or
  4054. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4055. cause an exception. Otherwise, the comparison is performed according to the
  4056. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4057. -------------------------------------------------------------------------------
  4058. *}
  4059. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  4060. Var
  4061. aSign, bSign: flag;
  4062. Begin
  4063. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4064. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4065. ) then
  4066. Begin
  4067. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4068. Begin
  4069. float_raise( float_flag_invalid );
  4070. End;
  4071. float32_le_quiet := 0;
  4072. exit;
  4073. End;
  4074. aSign := extractFloat32Sign( a );
  4075. bSign := extractFloat32Sign( b );
  4076. if ( aSign <> bSign ) then
  4077. Begin
  4078. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4079. exit;
  4080. End;
  4081. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4082. End;
  4083. {*
  4084. -------------------------------------------------------------------------------
  4085. Returns 1 if the single-precision floating-point value `a' is less than
  4086. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4087. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4088. Standard for Binary Floating-Point Arithmetic.
  4089. -------------------------------------------------------------------------------
  4090. *}
  4091. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4092. Var
  4093. aSign, bSign: flag;
  4094. Begin
  4095. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4096. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4097. ) then
  4098. Begin
  4099. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4100. Begin
  4101. float_raise( float_flag_invalid );
  4102. End;
  4103. float32_lt_quiet := 0;
  4104. exit;
  4105. End;
  4106. aSign := extractFloat32Sign( a );
  4107. bSign := extractFloat32Sign( b );
  4108. if ( aSign <> bSign ) then
  4109. Begin
  4110. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4111. exit;
  4112. End;
  4113. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4114. End;
  4115. {*
  4116. -------------------------------------------------------------------------------
  4117. Returns the result of converting the double-precision floating-point value
  4118. `a' to the 32-bit two's complement integer format. The conversion is
  4119. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4120. Arithmetic---which means in particular that the conversion is rounded
  4121. according to the current rounding mode. If `a' is a NaN, the largest
  4122. positive integer is returned. Otherwise, if the conversion overflows, the
  4123. largest integer with the same sign as `a' is returned.
  4124. -------------------------------------------------------------------------------
  4125. *}
  4126. Function float64_to_int32(a: float64): int32;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4127. var
  4128. aSign: flag;
  4129. aExp, shiftCount: int16;
  4130. aSig0, aSig1, absZ, aSigExtra: bits32;
  4131. z: int32;
  4132. roundingMode: TFPURoundingMode;
  4133. label invalid;
  4134. Begin
  4135. aSig1 := extractFloat64Frac1( a );
  4136. aSig0 := extractFloat64Frac0( a );
  4137. aExp := extractFloat64Exp( a );
  4138. aSign := extractFloat64Sign( a );
  4139. shiftCount := aExp - $413;
  4140. if ( 0 <= shiftCount ) then
  4141. Begin
  4142. if ( $41E < aExp ) then
  4143. Begin
  4144. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4145. aSign := 0;
  4146. goto invalid;
  4147. End;
  4148. shortShift64Left(
  4149. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4150. if ( $80000000 < absZ ) then
  4151. goto invalid;
  4152. End
  4153. else
  4154. Begin
  4155. aSig1 := flag( aSig1 <> 0 );
  4156. if ( aExp < $3FE ) then
  4157. Begin
  4158. aSigExtra := aExp OR aSig0 OR aSig1;
  4159. absZ := 0;
  4160. End
  4161. else
  4162. Begin
  4163. aSig0 := aSig0 OR $00100000;
  4164. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4165. absZ := aSig0 shr ( - shiftCount );
  4166. End;
  4167. End;
  4168. roundingMode := softfloat_rounding_mode;
  4169. if ( roundingMode = float_round_nearest_even ) then
  4170. Begin
  4171. if ( sbits32(aSigExtra) < 0 ) then
  4172. Begin
  4173. Inc(absZ);
  4174. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4175. absZ := absZ and not 1;
  4176. End;
  4177. if aSign <> 0 then
  4178. z := - absZ
  4179. else
  4180. z := absZ;
  4181. End
  4182. else
  4183. Begin
  4184. aSigExtra := bits32( aSigExtra <> 0 );
  4185. if ( aSign <> 0) then
  4186. Begin
  4187. z := - ( absZ
  4188. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4189. End
  4190. else
  4191. Begin
  4192. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4193. End
  4194. End;
  4195. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4196. Begin
  4197. invalid:
  4198. float_raise( float_flag_invalid );
  4199. if (aSign <> 0 ) then
  4200. float64_to_int32 := sbits32 ($80000000)
  4201. else
  4202. float64_to_int32 := $7FFFFFFF;
  4203. exit;
  4204. End;
  4205. if ( aSigExtra <> 0) then
  4206. set_inexact_flag;
  4207. float64_to_int32 := z;
  4208. End;
  4209. {*
  4210. -------------------------------------------------------------------------------
  4211. Returns the result of converting the double-precision floating-point value
  4212. `a' to the 32-bit two's complement integer format. The conversion is
  4213. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4214. Arithmetic, except that the conversion is always rounded toward zero.
  4215. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4216. the conversion overflows, the largest integer with the same sign as `a' is
  4217. returned.
  4218. -------------------------------------------------------------------------------
  4219. *}
  4220. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4221. {$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4222. Var
  4223. aSign: flag;
  4224. aExp, shiftCount: int16;
  4225. aSig0, aSig1, absZ, aSigExtra: bits32;
  4226. z: int32;
  4227. label invalid;
  4228. Begin
  4229. aSig1 := extractFloat64Frac1( a );
  4230. aSig0 := extractFloat64Frac0( a );
  4231. aExp := extractFloat64Exp( a );
  4232. aSign := extractFloat64Sign( a );
  4233. shiftCount := aExp - $413;
  4234. if ( 0 <= shiftCount ) then
  4235. Begin
  4236. if ( $41E < aExp ) then
  4237. Begin
  4238. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4239. aSign := 0;
  4240. goto invalid;
  4241. End;
  4242. shortShift64Left(
  4243. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4244. End
  4245. else
  4246. Begin
  4247. if ( aExp < $3FF ) then
  4248. Begin
  4249. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4250. Begin
  4251. set_inexact_flag;
  4252. End;
  4253. float64_to_int32_round_to_zero := 0;
  4254. exit;
  4255. End;
  4256. aSig0 := aSig0 or $00100000;
  4257. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4258. absZ := aSig0 shr ( - shiftCount );
  4259. End;
  4260. if aSign <> 0 then
  4261. z := - absZ
  4262. else
  4263. z := absZ;
  4264. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4265. Begin
  4266. invalid:
  4267. float_raise( float_flag_invalid );
  4268. if (aSign <> 0) then
  4269. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4270. else
  4271. float64_to_int32_round_to_zero := $7FFFFFFF;
  4272. exit;
  4273. End;
  4274. if ( aSigExtra <> 0) then
  4275. set_inexact_flag;
  4276. float64_to_int32_round_to_zero := z;
  4277. End;
  4278. {*----------------------------------------------------------------------------
  4279. | Returns the result of converting the double-precision floating-point value
  4280. | `a' to the 64-bit two's complement integer format. The conversion is
  4281. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4282. | Arithmetic---which means in particular that the conversion is rounded
  4283. | according to the current rounding mode. If `a' is a NaN, the largest
  4284. | positive integer is returned. Otherwise, if the conversion overflows, the
  4285. | largest integer with the same sign as `a' is returned.
  4286. *----------------------------------------------------------------------------*}
  4287. function float64_to_int64( a: float64 ): int64;
  4288. var
  4289. aSign: flag;
  4290. aExp, shiftCount: int16;
  4291. aSig, aSigExtra: bits64;
  4292. begin
  4293. aSig := extractFloat64Frac( a );
  4294. aExp := extractFloat64Exp( a );
  4295. aSign := extractFloat64Sign( a );
  4296. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4297. shiftCount := $433 - aExp;
  4298. if ( shiftCount <= 0 ) then begin
  4299. if ( $43E < aExp ) then begin
  4300. float_raise( float_flag_invalid );
  4301. if ( ( aSign = 0 )
  4302. or ( ( aExp = $7FF )
  4303. and ( aSig <> $0010000000000000 ) )
  4304. ) then begin
  4305. result := $7FFFFFFFFFFFFFFF;
  4306. exit;
  4307. end;
  4308. result := $8000000000000000;
  4309. exit;
  4310. end;
  4311. aSigExtra := 0;
  4312. aSig := aSig shl ( - shiftCount );
  4313. end
  4314. else
  4315. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4316. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4317. end;
  4318. {*----------------------------------------------------------------------------
  4319. | Returns the result of converting the double-precision floating-point value
  4320. | `a' to the 64-bit two's complement integer format. The conversion is
  4321. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4322. | Arithmetic, except that the conversion is always rounded toward zero.
  4323. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4324. | the conversion overflows, the largest integer with the same sign as `a' is
  4325. | returned.
  4326. *----------------------------------------------------------------------------*}
  4327. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4328. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4329. var
  4330. aSign: flag;
  4331. aExp, shiftCount: int16;
  4332. aSig: bits64;
  4333. z: int64;
  4334. begin
  4335. aSig := extractFloat64Frac( a );
  4336. aExp := extractFloat64Exp( a );
  4337. aSign := extractFloat64Sign( a );
  4338. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4339. shiftCount := aExp - $433;
  4340. if ( 0 <= shiftCount ) then begin
  4341. if ( $43E <= aExp ) then begin
  4342. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4343. float_raise( float_flag_invalid );
  4344. if ( ( aSign = 0 )
  4345. or ( ( aExp = $7FF )
  4346. and ( aSig <> $0010000000000000 ) )
  4347. ) then begin
  4348. result := $7FFFFFFFFFFFFFFF;
  4349. exit;
  4350. end;
  4351. end;
  4352. result := $8000000000000000;
  4353. exit;
  4354. end;
  4355. z := aSig shl shiftCount;
  4356. end
  4357. else begin
  4358. if ( aExp < $3FE ) then begin
  4359. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4360. result := 0;
  4361. exit;
  4362. end;
  4363. z := aSig shr ( - shiftCount );
  4364. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4365. set_inexact_flag;
  4366. end;
  4367. if ( aSign <> 0 ) then z := - z;
  4368. result := z;
  4369. end;
  4370. {*
  4371. -------------------------------------------------------------------------------
  4372. Returns the result of converting the double-precision floating-point value
  4373. `a' to the single-precision floating-point format. The conversion is
  4374. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4375. Arithmetic.
  4376. -------------------------------------------------------------------------------
  4377. *}
  4378. Function float64_to_float32(a: float64 ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4379. Var
  4380. aSign: flag;
  4381. aExp: int16;
  4382. aSig0, aSig1, zSig: bits32;
  4383. allZero: bits32;
  4384. tmp : CommonNanT;
  4385. Begin
  4386. aSig1 := extractFloat64Frac1( a );
  4387. aSig0 := extractFloat64Frac0( a );
  4388. aExp := extractFloat64Exp( a );
  4389. aSign := extractFloat64Sign( a );
  4390. if ( aExp = $7FF ) then
  4391. Begin
  4392. if ( aSig0 OR aSig1 ) <> 0 then
  4393. Begin
  4394. tmp:=float64ToCommonNaN(a);
  4395. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4396. exit;
  4397. End;
  4398. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4399. exit;
  4400. End;
  4401. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4402. if ( aExp <> 0) then
  4403. zSig := zSig OR $40000000;
  4404. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4405. End;
  4406. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4407. {*----------------------------------------------------------------------------
  4408. | Returns the result of converting the double-precision floating-point value
  4409. | `a' to the extended double-precision floating-point format. The conversion
  4410. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4411. | Arithmetic.
  4412. *----------------------------------------------------------------------------*}
  4413. function float64_to_floatx80( a: float64 ): floatx80;
  4414. var
  4415. aSign: flag;
  4416. aExp: int16;
  4417. aSig: bits64;
  4418. begin
  4419. aSig := extractFloat64Frac( a );
  4420. aExp := extractFloat64Exp( a );
  4421. aSign := extractFloat64Sign( a );
  4422. if ( aExp = $7FF ) then begin
  4423. if ( aSig <> 0 ) then begin
  4424. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4425. exit;
  4426. end;
  4427. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4428. exit;
  4429. end;
  4430. if ( aExp = 0 ) then begin
  4431. if ( aSig = 0 ) then begin
  4432. result := packFloatx80( aSign, 0, 0 );
  4433. exit;
  4434. end;
  4435. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4436. end;
  4437. result :=
  4438. packFloatx80(
  4439. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4440. end;
  4441. {$endif FPC_SOFTFLOAT_FLOATX80}
  4442. {*
  4443. -------------------------------------------------------------------------------
  4444. Rounds the double-precision floating-point value `a' to an integer,
  4445. and returns the result as a double-precision floating-point value. The
  4446. operation is performed according to the IEC/IEEE Standard for Binary
  4447. Floating-Point Arithmetic.
  4448. -------------------------------------------------------------------------------
  4449. *}
  4450. function float64_round_to_int(a: float64) : Float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4451. Var
  4452. aSign: flag;
  4453. aExp: int16;
  4454. lastBitMask, roundBitsMask: bits32;
  4455. roundingMode: TFPURoundingMode;
  4456. z: float64;
  4457. Begin
  4458. aExp := extractFloat64Exp( a );
  4459. if ( $413 <= aExp ) then
  4460. Begin
  4461. if ( $433 <= aExp ) then
  4462. Begin
  4463. if ( ( aExp = $7FF )
  4464. AND
  4465. (
  4466. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4467. ) <>0)
  4468. ) then
  4469. Begin
  4470. propagateFloat64NaN( a, a, result );
  4471. exit;
  4472. End;
  4473. result := a;
  4474. exit;
  4475. End;
  4476. lastBitMask := 1;
  4477. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4478. roundBitsMask := lastBitMask - 1;
  4479. z := a;
  4480. roundingMode := softfloat_rounding_mode;
  4481. if ( roundingMode = float_round_nearest_even ) then
  4482. Begin
  4483. if ( lastBitMask <> 0) then
  4484. Begin
  4485. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4486. if ( ( z.low and roundBitsMask ) = 0 ) then
  4487. z.low := z.low and not lastBitMask;
  4488. End
  4489. else
  4490. Begin
  4491. if ( sbits32 (z.low) < 0 ) then
  4492. Begin
  4493. Inc(z.high);
  4494. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4495. z.high := z.high and not 1;
  4496. End;
  4497. End;
  4498. End
  4499. else if ( roundingMode <> float_round_to_zero ) then
  4500. Begin
  4501. if ( extractFloat64Sign( z )
  4502. xor flag( roundingMode = float_round_up ) )<> 0 then
  4503. Begin
  4504. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4505. End;
  4506. End;
  4507. z.low := z.low and not roundBitsMask;
  4508. End
  4509. else
  4510. Begin
  4511. if ( aExp <= $3FE ) then
  4512. Begin
  4513. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4514. Begin
  4515. result := a;
  4516. exit;
  4517. End;
  4518. set_inexact_flag;
  4519. aSign := extractFloat64Sign( a );
  4520. case ( softfloat_rounding_mode ) of
  4521. float_round_nearest_even:
  4522. Begin
  4523. if ( ( aExp = $3FE )
  4524. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4525. ) then
  4526. Begin
  4527. packFloat64( aSign, $3FF, 0, 0, result );
  4528. exit;
  4529. End;
  4530. End;
  4531. float_round_down:
  4532. Begin
  4533. if aSign<>0 then
  4534. packFloat64( 1, $3FF, 0, 0, result )
  4535. else
  4536. packFloat64( 0, 0, 0, 0, result );
  4537. exit;
  4538. End;
  4539. float_round_up:
  4540. Begin
  4541. if aSign <> 0 then
  4542. packFloat64( 1, 0, 0, 0, result )
  4543. else
  4544. packFloat64( 0, $3FF, 0, 0, result );
  4545. exit;
  4546. End;
  4547. end;
  4548. packFloat64( aSign, 0, 0, 0, result );
  4549. exit;
  4550. End;
  4551. lastBitMask := 1;
  4552. lastBitMask := lastBitMask shl ($413 - aExp);
  4553. roundBitsMask := lastBitMask - 1;
  4554. z.low := 0;
  4555. z.high := a.high;
  4556. roundingMode := softfloat_rounding_mode;
  4557. if ( roundingMode = float_round_nearest_even ) then
  4558. Begin
  4559. z.high := z.high + lastBitMask shr 1;
  4560. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4561. Begin
  4562. z.high := z.high and not lastBitMask;
  4563. End;
  4564. End
  4565. else if ( roundingMode <> float_round_to_zero ) then
  4566. Begin
  4567. if ( extractFloat64Sign( z )
  4568. xor flag( roundingMode = float_round_up ) )<> 0 then
  4569. Begin
  4570. z.high := z.high or bits32( a.low <> 0 );
  4571. z.high := z.high + roundBitsMask;
  4572. End;
  4573. End;
  4574. z.high := z.high and not roundBitsMask;
  4575. End;
  4576. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4577. Begin
  4578. set_inexact_flag;
  4579. End;
  4580. result := z;
  4581. End;
  4582. {*
  4583. -------------------------------------------------------------------------------
  4584. Returns the result of adding the absolute values of the double-precision
  4585. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4586. before being returned. `zSign' is ignored if the result is a NaN.
  4587. The addition is performed according to the IEC/IEEE Standard for Binary
  4588. Floating-Point Arithmetic.
  4589. -------------------------------------------------------------------------------
  4590. *}
  4591. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4592. Var
  4593. aExp, bExp, zExp: int16;
  4594. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4595. expDiff: int16;
  4596. label shiftRight1;
  4597. label roundAndPack;
  4598. Begin
  4599. aSig1 := extractFloat64Frac1( a );
  4600. aSig0 := extractFloat64Frac0( a );
  4601. aExp := extractFloat64Exp( a );
  4602. bSig1 := extractFloat64Frac1( b );
  4603. bSig0 := extractFloat64Frac0( b );
  4604. bExp := extractFloat64Exp( b );
  4605. expDiff := aExp - bExp;
  4606. if ( 0 < expDiff ) then
  4607. Begin
  4608. if ( aExp = $7FF ) then
  4609. Begin
  4610. if ( aSig0 OR aSig1 ) <> 0 then
  4611. Begin
  4612. propagateFloat64NaN( a, b, out );
  4613. exit;
  4614. end;
  4615. out := a;
  4616. exit;
  4617. End;
  4618. if ( bExp = 0 ) then
  4619. Begin
  4620. Dec(expDiff);
  4621. End
  4622. else
  4623. Begin
  4624. bSig0 := bSig0 or $00100000;
  4625. End;
  4626. shift64ExtraRightJamming(
  4627. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4628. zExp := aExp;
  4629. End
  4630. else if ( expDiff < 0 ) then
  4631. Begin
  4632. if ( bExp = $7FF ) then
  4633. Begin
  4634. if ( bSig0 OR bSig1 ) <> 0 then
  4635. Begin
  4636. propagateFloat64NaN( a, b, out );
  4637. exit;
  4638. End;
  4639. packFloat64( zSign, $7FF, 0, 0, out );
  4640. exit;
  4641. End;
  4642. if ( aExp = 0 ) then
  4643. Begin
  4644. Inc(expDiff);
  4645. End
  4646. else
  4647. Begin
  4648. aSig0 := aSig0 or $00100000;
  4649. End;
  4650. shift64ExtraRightJamming(
  4651. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4652. zExp := bExp;
  4653. End
  4654. else
  4655. Begin
  4656. if ( aExp = $7FF ) then
  4657. Begin
  4658. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4659. Begin
  4660. propagateFloat64NaN( a, b, out );
  4661. exit;
  4662. End;
  4663. out := a;
  4664. exit;
  4665. End;
  4666. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4667. if ( aExp = 0 ) then
  4668. Begin
  4669. packFloat64( zSign, 0, zSig0, zSig1, out );
  4670. exit;
  4671. End;
  4672. zSig2 := 0;
  4673. zSig0 := zSig0 or $00200000;
  4674. zExp := aExp;
  4675. goto shiftRight1;
  4676. End;
  4677. aSig0 := aSig0 or $00100000;
  4678. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4679. Dec(zExp);
  4680. if ( zSig0 < $00200000 ) then
  4681. goto roundAndPack;
  4682. Inc(zExp);
  4683. shiftRight1:
  4684. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4685. roundAndPack:
  4686. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4687. End;
  4688. {*
  4689. -------------------------------------------------------------------------------
  4690. Returns the result of subtracting the absolute values of the double-
  4691. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4692. difference is negated before being returned. `zSign' is ignored if the
  4693. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4694. Standard for Binary Floating-Point Arithmetic.
  4695. -------------------------------------------------------------------------------
  4696. *}
  4697. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4698. Var
  4699. aExp, bExp, zExp: int16;
  4700. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4701. expDiff: int16;
  4702. z: float64;
  4703. label aExpBigger;
  4704. label bExpBigger;
  4705. label aBigger;
  4706. label bBigger;
  4707. label normalizeRoundAndPack;
  4708. Begin
  4709. aSig1 := extractFloat64Frac1( a );
  4710. aSig0 := extractFloat64Frac0( a );
  4711. aExp := extractFloat64Exp( a );
  4712. bSig1 := extractFloat64Frac1( b );
  4713. bSig0 := extractFloat64Frac0( b );
  4714. bExp := extractFloat64Exp( b );
  4715. expDiff := aExp - bExp;
  4716. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4717. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4718. if ( 0 < expDiff ) then goto aExpBigger;
  4719. if ( expDiff < 0 ) then goto bExpBigger;
  4720. if ( aExp = $7FF ) then
  4721. Begin
  4722. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4723. Begin
  4724. propagateFloat64NaN( a, b, out );
  4725. exit;
  4726. End;
  4727. float_raise( float_flag_invalid );
  4728. z.low := float64_default_nan_low;
  4729. z.high := float64_default_nan_high;
  4730. out := z;
  4731. exit;
  4732. End;
  4733. if ( aExp = 0 ) then
  4734. Begin
  4735. aExp := 1;
  4736. bExp := 1;
  4737. End;
  4738. if ( bSig0 < aSig0 ) then goto aBigger;
  4739. if ( aSig0 < bSig0 ) then goto bBigger;
  4740. if ( bSig1 < aSig1 ) then goto aBigger;
  4741. if ( aSig1 < bSig1 ) then goto bBigger;
  4742. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4743. exit;
  4744. bExpBigger:
  4745. if ( bExp = $7FF ) then
  4746. Begin
  4747. if ( bSig0 OR bSig1 ) <> 0 then
  4748. Begin
  4749. propagateFloat64NaN( a, b, out );
  4750. exit;
  4751. End;
  4752. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4753. exit;
  4754. End;
  4755. if ( aExp = 0 ) then
  4756. Begin
  4757. Inc(expDiff);
  4758. End
  4759. else
  4760. Begin
  4761. aSig0 := aSig0 or $40000000;
  4762. End;
  4763. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4764. bSig0 := bSig0 or $40000000;
  4765. bBigger:
  4766. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4767. zExp := bExp;
  4768. zSign := zSign xor 1;
  4769. goto normalizeRoundAndPack;
  4770. aExpBigger:
  4771. if ( aExp = $7FF ) then
  4772. Begin
  4773. if ( aSig0 OR aSig1 ) <> 0 then
  4774. Begin
  4775. propagateFloat64NaN( a, b, out );
  4776. exit;
  4777. End;
  4778. out := a;
  4779. exit;
  4780. End;
  4781. if ( bExp = 0 ) then
  4782. Begin
  4783. Dec(expDiff);
  4784. End
  4785. else
  4786. Begin
  4787. bSig0 := bSig0 or $40000000;
  4788. End;
  4789. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4790. aSig0 := aSig0 or $40000000;
  4791. aBigger:
  4792. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4793. zExp := aExp;
  4794. normalizeRoundAndPack:
  4795. Dec(zExp);
  4796. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4797. End;
  4798. {*
  4799. -------------------------------------------------------------------------------
  4800. Returns the result of adding the double-precision floating-point values `a'
  4801. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4802. Binary Floating-Point Arithmetic.
  4803. -------------------------------------------------------------------------------
  4804. *}
  4805. Function float64_add( a: float64; b : float64) : Float64;
  4806. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4807. Var
  4808. aSign, bSign: flag;
  4809. Begin
  4810. aSign := extractFloat64Sign( a );
  4811. bSign := extractFloat64Sign( b );
  4812. if ( aSign = bSign ) then
  4813. Begin
  4814. addFloat64Sigs( a, b, aSign, result );
  4815. End
  4816. else
  4817. Begin
  4818. subFloat64Sigs( a, b, aSign, result );
  4819. End;
  4820. End;
  4821. {*
  4822. -------------------------------------------------------------------------------
  4823. Returns the result of subtracting the double-precision floating-point values
  4824. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4825. for Binary Floating-Point Arithmetic.
  4826. -------------------------------------------------------------------------------
  4827. *}
  4828. Function float64_sub(a: float64; b : float64) : Float64;
  4829. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4830. Var
  4831. aSign, bSign: flag;
  4832. Begin
  4833. aSign := extractFloat64Sign( a );
  4834. bSign := extractFloat64Sign( b );
  4835. if ( aSign = bSign ) then
  4836. Begin
  4837. subFloat64Sigs( a, b, aSign, result );
  4838. End
  4839. else
  4840. Begin
  4841. addFloat64Sigs( a, b, aSign, result );
  4842. End;
  4843. End;
  4844. {*
  4845. -------------------------------------------------------------------------------
  4846. Returns the result of multiplying the double-precision floating-point values
  4847. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4848. for Binary Floating-Point Arithmetic.
  4849. -------------------------------------------------------------------------------
  4850. *}
  4851. Function float64_mul( a: float64; b:float64) : Float64;
  4852. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4853. Var
  4854. aSign, bSign, zSign: flag;
  4855. aExp, bExp, zExp: int16;
  4856. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4857. z: float64;
  4858. label invalid;
  4859. Begin
  4860. aSig1 := extractFloat64Frac1( a );
  4861. aSig0 := extractFloat64Frac0( a );
  4862. aExp := extractFloat64Exp( a );
  4863. aSign := extractFloat64Sign( a );
  4864. bSig1 := extractFloat64Frac1( b );
  4865. bSig0 := extractFloat64Frac0( b );
  4866. bExp := extractFloat64Exp( b );
  4867. bSign := extractFloat64Sign( b );
  4868. zSign := aSign xor bSign;
  4869. if ( aExp = $7FF ) then
  4870. Begin
  4871. if ( (( aSig0 OR aSig1 ) <>0)
  4872. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4873. Begin
  4874. propagateFloat64NaN( a, b, result );
  4875. exit;
  4876. End;
  4877. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4878. packFloat64( zSign, $7FF, 0, 0, result );
  4879. exit;
  4880. End;
  4881. if ( bExp = $7FF ) then
  4882. Begin
  4883. if ( bSig0 OR bSig1 )<> 0 then
  4884. Begin
  4885. propagateFloat64NaN( a, b, result );
  4886. exit;
  4887. End;
  4888. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4889. Begin
  4890. invalid:
  4891. float_raise( float_flag_invalid );
  4892. z.low := float64_default_nan_low;
  4893. z.high := float64_default_nan_high;
  4894. result := z;
  4895. exit;
  4896. End;
  4897. packFloat64( zSign, $7FF, 0, 0, result );
  4898. exit;
  4899. End;
  4900. if ( aExp = 0 ) then
  4901. Begin
  4902. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4903. Begin
  4904. packFloat64( zSign, 0, 0, 0, result );
  4905. exit;
  4906. End;
  4907. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4908. End;
  4909. if ( bExp = 0 ) then
  4910. Begin
  4911. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4912. Begin
  4913. packFloat64( zSign, 0, 0, 0, result );
  4914. exit;
  4915. End;
  4916. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4917. End;
  4918. zExp := aExp + bExp - $400;
  4919. aSig0 := aSig0 or $00100000;
  4920. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4921. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4922. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4923. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4924. if ( $00200000 <= zSig0 ) then
  4925. Begin
  4926. shift64ExtraRightJamming(
  4927. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4928. Inc(zExp);
  4929. End;
  4930. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4931. End;
  4932. {*
  4933. -------------------------------------------------------------------------------
  4934. Returns the result of dividing the double-precision floating-point value `a'
  4935. by the corresponding value `b'. The operation is performed according to the
  4936. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4937. -------------------------------------------------------------------------------
  4938. *}
  4939. Function float64_div(a: float64; b : float64) : Float64;
  4940. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4941. Var
  4942. aSign, bSign, zSign: flag;
  4943. aExp, bExp, zExp: int16;
  4944. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4945. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4946. z: float64;
  4947. label invalid;
  4948. Begin
  4949. aSig1 := extractFloat64Frac1( a );
  4950. aSig0 := extractFloat64Frac0( a );
  4951. aExp := extractFloat64Exp( a );
  4952. aSign := extractFloat64Sign( a );
  4953. bSig1 := extractFloat64Frac1( b );
  4954. bSig0 := extractFloat64Frac0( b );
  4955. bExp := extractFloat64Exp( b );
  4956. bSign := extractFloat64Sign( b );
  4957. zSign := aSign xor bSign;
  4958. if ( aExp = $7FF ) then
  4959. Begin
  4960. if ( aSig0 OR aSig1 )<> 0 then
  4961. Begin
  4962. propagateFloat64NaN( a, b, result );
  4963. exit;
  4964. end;
  4965. if ( bExp = $7FF ) then
  4966. Begin
  4967. if ( bSig0 OR bSig1 )<>0 then
  4968. Begin
  4969. propagateFloat64NaN( a, b, result );
  4970. exit;
  4971. End;
  4972. goto invalid;
  4973. End;
  4974. packFloat64( zSign, $7FF, 0, 0, result );
  4975. exit;
  4976. End;
  4977. if ( bExp = $7FF ) then
  4978. Begin
  4979. if ( bSig0 OR bSig1 )<> 0 then
  4980. Begin
  4981. propagateFloat64NaN( a, b, result );
  4982. exit;
  4983. End;
  4984. packFloat64( zSign, 0, 0, 0, result );
  4985. exit;
  4986. End;
  4987. if ( bExp = 0 ) then
  4988. Begin
  4989. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4990. Begin
  4991. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  4992. Begin
  4993. invalid:
  4994. float_raise( float_flag_invalid );
  4995. z.low := float64_default_nan_low;
  4996. z.high := float64_default_nan_high;
  4997. result := z;
  4998. exit;
  4999. End;
  5000. float_raise( float_flag_divbyzero );
  5001. packFloat64( zSign, $7FF, 0, 0, result );
  5002. exit;
  5003. End;
  5004. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5005. End;
  5006. if ( aExp = 0 ) then
  5007. Begin
  5008. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5009. Begin
  5010. packFloat64( zSign, 0, 0, 0, result );
  5011. exit;
  5012. End;
  5013. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5014. End;
  5015. zExp := aExp - bExp + $3FD;
  5016. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  5017. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5018. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  5019. Begin
  5020. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  5021. Inc(zExp);
  5022. End;
  5023. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5024. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  5025. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  5026. while ( sbits32 (rem0) < 0 ) do
  5027. Begin
  5028. Dec(zSig0);
  5029. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  5030. End;
  5031. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  5032. if ( ( zSig1 and $3FF ) <= 4 ) then
  5033. Begin
  5034. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  5035. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  5036. while ( sbits32 (rem1) < 0 ) do
  5037. Begin
  5038. Dec(zSig1);
  5039. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  5040. End;
  5041. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5042. End;
  5043. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  5044. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  5045. End;
  5046. {*
  5047. -------------------------------------------------------------------------------
  5048. Returns the remainder of the double-precision floating-point value `a'
  5049. with respect to the corresponding value `b'. The operation is performed
  5050. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5051. -------------------------------------------------------------------------------
  5052. *}
  5053. Function float64_rem(a: float64; b : float64) : float64;
  5054. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  5055. Var
  5056. aSign, zSign: flag;
  5057. aExp, bExp, expDiff: int16;
  5058. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  5059. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  5060. sigMean0: sbits32;
  5061. z: float64;
  5062. label invalid;
  5063. Begin
  5064. aSig1 := extractFloat64Frac1( a );
  5065. aSig0 := extractFloat64Frac0( a );
  5066. aExp := extractFloat64Exp( a );
  5067. aSign := extractFloat64Sign( a );
  5068. bSig1 := extractFloat64Frac1( b );
  5069. bSig0 := extractFloat64Frac0( b );
  5070. bExp := extractFloat64Exp( b );
  5071. if ( aExp = $7FF ) then
  5072. Begin
  5073. if ((( aSig0 OR aSig1 )<>0)
  5074. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5075. Begin
  5076. propagateFloat64NaN( a, b, result );
  5077. exit;
  5078. End;
  5079. goto invalid;
  5080. End;
  5081. if ( bExp = $7FF ) then
  5082. Begin
  5083. if ( bSig0 OR bSig1 ) <> 0 then
  5084. Begin
  5085. propagateFloat64NaN( a, b, result );
  5086. exit;
  5087. End;
  5088. result := a;
  5089. exit;
  5090. End;
  5091. if ( bExp = 0 ) then
  5092. Begin
  5093. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5094. Begin
  5095. invalid:
  5096. float_raise( float_flag_invalid );
  5097. z.low := float64_default_nan_low;
  5098. z.high := float64_default_nan_high;
  5099. result := z;
  5100. exit;
  5101. End;
  5102. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5103. End;
  5104. if ( aExp = 0 ) then
  5105. Begin
  5106. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5107. Begin
  5108. result := a;
  5109. exit;
  5110. End;
  5111. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5112. End;
  5113. expDiff := aExp - bExp;
  5114. if ( expDiff < -1 ) then
  5115. Begin
  5116. result := a;
  5117. exit;
  5118. End;
  5119. shortShift64Left(
  5120. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5121. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5122. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5123. if ( q )<>0 then
  5124. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5125. expDiff := expDiff - 32;
  5126. while ( 0 < expDiff ) do
  5127. Begin
  5128. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5129. if 4 < q then
  5130. q:= q - 4
  5131. else
  5132. q := 0;
  5133. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5134. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5135. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5136. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5137. expDiff := expDiff - 29;
  5138. End;
  5139. if ( -32 < expDiff ) then
  5140. Begin
  5141. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5142. if 4 < q then
  5143. q := q - 4
  5144. else
  5145. q := 0;
  5146. q := q shr (- expDiff);
  5147. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5148. expDiff := expDiff + 24;
  5149. if ( expDiff < 0 ) then
  5150. Begin
  5151. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5152. End
  5153. else
  5154. Begin
  5155. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5156. End;
  5157. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5158. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5159. End
  5160. else
  5161. Begin
  5162. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5163. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5164. End;
  5165. Repeat
  5166. alternateASig0 := aSig0;
  5167. alternateASig1 := aSig1;
  5168. Inc(q);
  5169. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5170. Until not ( 0 <= sbits32 (aSig0) );
  5171. add64(
  5172. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5173. if ( ( sigMean0 < 0 )
  5174. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5175. Begin
  5176. aSig0 := alternateASig0;
  5177. aSig1 := alternateASig1;
  5178. End;
  5179. zSign := flag( sbits32 (aSig0) < 0 );
  5180. if ( zSign <> 0 ) then
  5181. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5182. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5183. End;
  5184. {*
  5185. -------------------------------------------------------------------------------
  5186. Returns the square root of the double-precision floating-point value `a'.
  5187. The operation is performed according to the IEC/IEEE Standard for Binary
  5188. Floating-Point Arithmetic.
  5189. -------------------------------------------------------------------------------
  5190. *}
  5191. function float64_sqrt( a: float64 ): float64;
  5192. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5193. Var
  5194. aSign: flag;
  5195. aExp, zExp: int16;
  5196. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5197. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5198. label invalid;
  5199. Begin
  5200. aSig1 := extractFloat64Frac1( a );
  5201. aSig0 := extractFloat64Frac0( a );
  5202. aExp := extractFloat64Exp( a );
  5203. aSign := extractFloat64Sign( a );
  5204. if ( aExp = $7FF ) then
  5205. Begin
  5206. if ( aSig0 OR aSig1 ) <> 0 then
  5207. Begin
  5208. propagateFloat64NaN( a, a, result );
  5209. exit;
  5210. End;
  5211. if ( aSign = 0) then
  5212. Begin
  5213. result := a;
  5214. exit;
  5215. End;
  5216. goto invalid;
  5217. End;
  5218. if ( aSign <> 0 ) then
  5219. Begin
  5220. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5221. Begin
  5222. result := a;
  5223. exit;
  5224. End;
  5225. invalid:
  5226. float_raise( float_flag_invalid );
  5227. result.low := float64_default_nan_low;
  5228. result.high := float64_default_nan_high;
  5229. exit;
  5230. End;
  5231. if ( aExp = 0 ) then
  5232. Begin
  5233. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5234. Begin
  5235. packFloat64( 0, 0, 0, 0, result );
  5236. exit;
  5237. End;
  5238. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5239. End;
  5240. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5241. aSig0 := aSig0 or $00100000;
  5242. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5243. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5244. if ( zSig0 = 0 ) then
  5245. zSig0 := $7FFFFFFF;
  5246. doubleZSig0 := zSig0 + zSig0;
  5247. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5248. mul32To64( zSig0, zSig0, term0, term1 );
  5249. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5250. while ( sbits32 (rem0) < 0 ) do
  5251. Begin
  5252. Dec(zSig0);
  5253. doubleZSig0 := doubleZSig0 - 2;
  5254. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5255. End;
  5256. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5257. if ( ( zSig1 and $1FF ) <= 5 ) then
  5258. Begin
  5259. if ( zSig1 = 0 ) then
  5260. zSig1 := 1;
  5261. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5262. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5263. mul32To64( zSig1, zSig1, term2, term3 );
  5264. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5265. while ( sbits32 (rem1) < 0 ) do
  5266. Begin
  5267. Dec(zSig1);
  5268. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5269. term3 := term3 or 1;
  5270. term2 := term2 or doubleZSig0;
  5271. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5272. End;
  5273. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5274. End;
  5275. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5276. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5277. End;
  5278. {*
  5279. -------------------------------------------------------------------------------
  5280. Returns 1 if the double-precision floating-point value `a' is equal to
  5281. the corresponding value `b', and 0 otherwise. The comparison is performed
  5282. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5283. -------------------------------------------------------------------------------
  5284. *}
  5285. Function float64_eq(a: float64; b: float64): flag;
  5286. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5287. Begin
  5288. if
  5289. (
  5290. ( extractFloat64Exp( a ) = $7FF )
  5291. AND
  5292. (
  5293. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5294. )
  5295. )
  5296. OR (
  5297. ( extractFloat64Exp( b ) = $7FF )
  5298. AND (
  5299. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5300. )
  5301. )
  5302. ) then
  5303. Begin
  5304. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5305. float_raise( float_flag_invalid );
  5306. float64_eq := 0;
  5307. exit;
  5308. End;
  5309. float64_eq := flag(
  5310. ( a.low = b.low )
  5311. AND ( ( a.high = b.high )
  5312. OR ( ( a.low = 0 )
  5313. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5314. ));
  5315. End;
  5316. {*
  5317. -------------------------------------------------------------------------------
  5318. Returns 1 if the double-precision floating-point value `a' is less than
  5319. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5320. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5321. Arithmetic.
  5322. -------------------------------------------------------------------------------
  5323. *}
  5324. Function float64_le(a: float64;b: float64): flag;
  5325. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5326. Var
  5327. aSign, bSign: flag;
  5328. Begin
  5329. if
  5330. (
  5331. ( extractFloat64Exp( a ) = $7FF )
  5332. AND
  5333. (
  5334. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5335. )
  5336. )
  5337. OR (
  5338. ( extractFloat64Exp( b ) = $7FF )
  5339. AND (
  5340. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5341. )
  5342. )
  5343. ) then
  5344. Begin
  5345. float_raise( float_flag_invalid );
  5346. float64_le := 0;
  5347. exit;
  5348. End;
  5349. aSign := extractFloat64Sign( a );
  5350. bSign := extractFloat64Sign( b );
  5351. if ( aSign <> bSign ) then
  5352. Begin
  5353. float64_le := flag(
  5354. (aSign <> 0)
  5355. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5356. = 0 ));
  5357. exit;
  5358. End;
  5359. if aSign <> 0 then
  5360. float64_le := le64( b.high, b.low, a.high, a.low )
  5361. else
  5362. float64_le := le64( a.high, a.low, b.high, b.low );
  5363. End;
  5364. {*
  5365. -------------------------------------------------------------------------------
  5366. Returns 1 if the double-precision floating-point value `a' is less than
  5367. the corresponding value `b', and 0 otherwise. The comparison is performed
  5368. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5369. -------------------------------------------------------------------------------
  5370. *}
  5371. Function float64_lt(a: float64;b: float64): flag;
  5372. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5373. Var
  5374. aSign, bSign: flag;
  5375. Begin
  5376. if
  5377. (
  5378. ( extractFloat64Exp( a ) = $7FF )
  5379. AND
  5380. (
  5381. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5382. )
  5383. )
  5384. OR (
  5385. ( extractFloat64Exp( b ) = $7FF )
  5386. AND (
  5387. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5388. )
  5389. )
  5390. ) then
  5391. Begin
  5392. float_raise( float_flag_invalid );
  5393. float64_lt := 0;
  5394. exit;
  5395. End;
  5396. aSign := extractFloat64Sign( a );
  5397. bSign := extractFloat64Sign( b );
  5398. if ( aSign <> bSign ) then
  5399. Begin
  5400. float64_lt := flag(
  5401. (aSign <> 0)
  5402. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5403. <> 0 ));
  5404. exit;
  5405. End;
  5406. if aSign <> 0 then
  5407. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5408. else
  5409. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5410. End;
  5411. {*
  5412. -------------------------------------------------------------------------------
  5413. Returns 1 if the double-precision floating-point value `a' is equal to
  5414. the corresponding value `b', and 0 otherwise. The invalid exception is
  5415. raised if either operand is a NaN. Otherwise, the comparison is performed
  5416. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5417. -------------------------------------------------------------------------------
  5418. *}
  5419. Function float64_eq_signaling( a: float64; b: float64): flag;
  5420. Begin
  5421. if
  5422. (
  5423. ( extractFloat64Exp( a ) = $7FF )
  5424. AND
  5425. (
  5426. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5427. )
  5428. )
  5429. OR (
  5430. ( extractFloat64Exp( b ) = $7FF )
  5431. AND (
  5432. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5433. )
  5434. )
  5435. ) then
  5436. Begin
  5437. float_raise( float_flag_invalid );
  5438. float64_eq_signaling := 0;
  5439. exit;
  5440. End;
  5441. float64_eq_signaling := flag(
  5442. ( a.low = b.low )
  5443. AND ( ( a.high = b.high )
  5444. OR ( ( a.low = 0 )
  5445. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5446. ));
  5447. End;
  5448. {*
  5449. -------------------------------------------------------------------------------
  5450. Returns 1 if the double-precision floating-point value `a' is less than or
  5451. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5452. cause an exception. Otherwise, the comparison is performed according to the
  5453. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5454. -------------------------------------------------------------------------------
  5455. *}
  5456. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5457. Var
  5458. aSign, bSign : flag;
  5459. Begin
  5460. if
  5461. (
  5462. ( extractFloat64Exp( a ) = $7FF )
  5463. AND
  5464. (
  5465. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5466. )
  5467. )
  5468. OR (
  5469. ( extractFloat64Exp( b ) = $7FF )
  5470. AND (
  5471. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5472. )
  5473. )
  5474. ) then
  5475. Begin
  5476. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5477. float_raise( float_flag_invalid );
  5478. float64_le_quiet := 0;
  5479. exit;
  5480. End;
  5481. aSign := extractFloat64Sign( a );
  5482. bSign := extractFloat64Sign( b );
  5483. if ( aSign <> bSign ) then
  5484. Begin
  5485. float64_le_quiet := flag
  5486. ((aSign <> 0)
  5487. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5488. = 0 ));
  5489. exit;
  5490. End;
  5491. if aSign <> 0 then
  5492. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5493. else
  5494. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5495. End;
  5496. {*
  5497. -------------------------------------------------------------------------------
  5498. Returns 1 if the double-precision floating-point value `a' is less than
  5499. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5500. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5501. Standard for Binary Floating-Point Arithmetic.
  5502. -------------------------------------------------------------------------------
  5503. *}
  5504. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5505. Var
  5506. aSign, bSign: flag;
  5507. Begin
  5508. if
  5509. (
  5510. ( extractFloat64Exp( a ) = $7FF )
  5511. AND
  5512. (
  5513. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5514. )
  5515. )
  5516. OR (
  5517. ( extractFloat64Exp( b ) = $7FF )
  5518. AND (
  5519. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5520. )
  5521. )
  5522. ) then
  5523. Begin
  5524. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5525. float_raise( float_flag_invalid );
  5526. float64_lt_quiet := 0;
  5527. exit;
  5528. End;
  5529. aSign := extractFloat64Sign( a );
  5530. bSign := extractFloat64Sign( b );
  5531. if ( aSign <> bSign ) then
  5532. Begin
  5533. float64_lt_quiet := flag(
  5534. (aSign<>0)
  5535. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5536. <> 0 ));
  5537. exit;
  5538. End;
  5539. If aSign <> 0 then
  5540. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5541. else
  5542. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5543. End;
  5544. {*----------------------------------------------------------------------------
  5545. | Returns the result of converting the 64-bit two's complement integer `a'
  5546. | to the single-precision floating-point format. The conversion is performed
  5547. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5548. *----------------------------------------------------------------------------*}
  5549. function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5550. var
  5551. zSign : flag;
  5552. absA : uint64;
  5553. shiftCount: int8;
  5554. Begin
  5555. if ( a = 0 ) then
  5556. begin
  5557. int64_to_float32.float32 := 0;
  5558. exit;
  5559. end;
  5560. if a < 0 then
  5561. zSign := flag(TRUE)
  5562. else
  5563. zSign := flag(FALSE);
  5564. if zSign<>0 then
  5565. absA := -a
  5566. else
  5567. absA := a;
  5568. shiftCount := countLeadingZeros64( absA ) - 40;
  5569. if ( 0 <= shiftCount ) then
  5570. begin
  5571. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5572. end
  5573. else
  5574. begin
  5575. shiftCount := shiftCount + 7;
  5576. if ( shiftCount < 0 ) then
  5577. shift64RightJamming( absA, - shiftCount, absA )
  5578. else
  5579. absA := absA shl shiftCount;
  5580. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5581. end;
  5582. End;
  5583. {*----------------------------------------------------------------------------
  5584. | Returns the result of converting the 64-bit two's complement integer `a'
  5585. | to the single-precision floating-point format. The conversion is performed
  5586. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5587. | Unisgned version.
  5588. *----------------------------------------------------------------------------*}
  5589. function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5590. var
  5591. absA : uint64;
  5592. shiftCount: int8;
  5593. Begin
  5594. if ( a = 0 ) then
  5595. begin
  5596. qword_to_float32.float32 := 0;
  5597. exit;
  5598. end;
  5599. absA := a;
  5600. shiftCount := countLeadingZeros64( absA ) - 40;
  5601. if ( 0 <= shiftCount ) then
  5602. begin
  5603. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5604. end
  5605. else
  5606. begin
  5607. shiftCount := shiftCount + 7;
  5608. if ( shiftCount < 0 ) then
  5609. shift64RightJamming( absA, - shiftCount, absA )
  5610. else
  5611. absA := absA shl shiftCount;
  5612. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5613. end;
  5614. End;
  5615. {*----------------------------------------------------------------------------
  5616. | Returns the result of converting the 64-bit two's complement integer `a'
  5617. | to the double-precision floating-point format. The conversion is performed
  5618. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5619. *----------------------------------------------------------------------------*}
  5620. function qword_to_float64( a: qword ): float64;
  5621. {$ifdef FPC_IS_SYSTEM}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5622. var
  5623. shiftCount: int8;
  5624. Begin
  5625. if ( a = 0 ) then
  5626. result := packFloat64( 0, 0, 0 )
  5627. else
  5628. begin
  5629. shiftCount := countLeadingZeros64(a) - 1;
  5630. { numbers with <= 53 significant bits are converted exactly }
  5631. if (shiftCount > 9) then
  5632. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5633. else if (shiftCount>=0) then
  5634. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5635. else
  5636. begin
  5637. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5638. shift64RightJamming(a, 1, a);
  5639. result := roundAndPackFloat64(0, $43d, a);
  5640. end;
  5641. end;
  5642. End;
  5643. {*----------------------------------------------------------------------------
  5644. | Returns the result of converting the 64-bit two's complement integer `a'
  5645. | to the double-precision floating-point format. The conversion is performed
  5646. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5647. *----------------------------------------------------------------------------*}
  5648. function int64_to_float64( a: int64 ): float64;
  5649. {$ifdef FPC_IS_SYSTEM}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5650. Begin
  5651. if ( a = 0 ) then
  5652. result := packFloat64( 0, 0, 0 )
  5653. else if (a = int64($8000000000000000)) then
  5654. result := packFloat64( 1, $43e, 0 )
  5655. else if (a < 0) then
  5656. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5657. else
  5658. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5659. End;
  5660. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5661. {*----------------------------------------------------------------------------
  5662. | Returns the result of converting the 64-bit two's complement integer `a'
  5663. | to the extended double-precision floating-point format. The conversion
  5664. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5665. | Arithmetic.
  5666. *----------------------------------------------------------------------------*}
  5667. function int64_to_floatx80( a: int64 ): floatx80;
  5668. var
  5669. zSign: flag;
  5670. absA: uint64;
  5671. shiftCount: int8;
  5672. begin
  5673. if ( a = 0 ) then begin
  5674. result := packFloatx80( 0, 0, 0 );
  5675. exit;
  5676. end;
  5677. zSign := ord( a < 0 );
  5678. if zSign <> 0 then absA := - a else absA := a;
  5679. shiftCount := countLeadingZeros64( absA );
  5680. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5681. end;
  5682. {*----------------------------------------------------------------------------
  5683. | Returns the result of converting the 64-bit two's complement integer `a'
  5684. | to the extended double-precision floating-point format. The conversion
  5685. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5686. | Arithmetic.
  5687. | Unsigned version.
  5688. *----------------------------------------------------------------------------*}
  5689. function qword_to_floatx80( a: qword ): floatx80;
  5690. var
  5691. absA: bits64;
  5692. shiftCount: int8;
  5693. begin
  5694. if ( a = 0 ) then begin
  5695. result := packFloatx80( 0, 0, 0 );
  5696. exit;
  5697. end;
  5698. absA := a;
  5699. shiftCount := countLeadingZeros64( absA );
  5700. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5701. end;
  5702. {$endif FPC_SOFTFLOAT_FLOATX80}
  5703. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5704. {*----------------------------------------------------------------------------
  5705. | Returns the result of converting the 64-bit two's complement integer `a' to
  5706. | the quadruple-precision floating-point format. The conversion is performed
  5707. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5708. *----------------------------------------------------------------------------*}
  5709. function int64_to_float128( a: int64 ): float128;
  5710. var
  5711. zSign: flag;
  5712. absA: uint64;
  5713. shiftCount: int8;
  5714. zExp: int32;
  5715. zSig0, zSig1: bits64;
  5716. begin
  5717. if ( a = 0 ) then begin
  5718. result := packFloat128( 0, 0, 0, 0 );
  5719. exit;
  5720. end;
  5721. zSign := ord( a < 0 );
  5722. if zSign <> 0 then absA := - a else absA := a;
  5723. shiftCount := countLeadingZeros64( absA ) + 49;
  5724. zExp := $406E - shiftCount;
  5725. if ( 64 <= shiftCount ) then begin
  5726. zSig1 := 0;
  5727. zSig0 := absA;
  5728. dec( shiftCount, 64 );
  5729. end
  5730. else begin
  5731. zSig1 := absA;
  5732. zSig0 := 0;
  5733. end;
  5734. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5735. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5736. end;
  5737. {*----------------------------------------------------------------------------
  5738. | Returns the result of converting the 64-bit two's complement integer `a' to
  5739. | the quadruple-precision floating-point format. The conversion is performed
  5740. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5741. | Unsigned version.
  5742. *----------------------------------------------------------------------------*}
  5743. function qword_to_float128( a: qword ): float128;
  5744. var
  5745. absA: bits64;
  5746. shiftCount: int8;
  5747. zExp: int32;
  5748. zSig0, zSig1: bits64;
  5749. begin
  5750. if ( a = 0 ) then begin
  5751. result := packFloat128( 0, 0, 0, 0 );
  5752. exit;
  5753. end;
  5754. absA := a;
  5755. shiftCount := countLeadingZeros64( absA ) + 49;
  5756. zExp := $406E - shiftCount;
  5757. if ( 64 <= shiftCount ) then begin
  5758. zSig1 := 0;
  5759. zSig0 := absA;
  5760. dec( shiftCount, 64 );
  5761. end
  5762. else begin
  5763. zSig1 := absA;
  5764. zSig0 := 0;
  5765. end;
  5766. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5767. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5768. end;
  5769. {$endif FPC_SOFTFLOAT_FLOAT128}
  5770. {*----------------------------------------------------------------------------
  5771. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5772. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5773. | Otherwise, returns 0.
  5774. *----------------------------------------------------------------------------*}
  5775. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5776. begin
  5777. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5778. end;
  5779. {*----------------------------------------------------------------------------
  5780. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5781. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5782. | Otherwise, returns 0.
  5783. *----------------------------------------------------------------------------*}
  5784. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5785. begin
  5786. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5787. end;
  5788. {*----------------------------------------------------------------------------
  5789. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5790. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5791. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5792. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5793. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5794. | the most-significant bit of the extra result, and the other 63 bits of the
  5795. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5796. | were all zero. This extra result is stored in the location pointed to by
  5797. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5798. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5799. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5800. | fixed-point value is shifted right by the number of bits given in `count',
  5801. | and the integer part of the result is returned at the locations pointed to
  5802. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5803. | corrupted as described above, and is returned at the location pointed to by
  5804. | `z2Ptr'.)
  5805. *----------------------------------------------------------------------------*}
  5806. procedure shift128ExtraRightJamming(
  5807. a0: bits64;
  5808. a1: bits64;
  5809. a2: bits64;
  5810. count: int16;
  5811. var z0Ptr: bits64;
  5812. var z1Ptr: bits64;
  5813. var z2Ptr: bits64);
  5814. var
  5815. z0, z1, z2: bits64;
  5816. negCount: int8;
  5817. begin
  5818. negCount := ( - count ) and 63;
  5819. if ( count = 0 ) then
  5820. begin
  5821. z2 := a2;
  5822. z1 := a1;
  5823. z0 := a0;
  5824. end
  5825. else begin
  5826. if ( count < 64 ) then
  5827. begin
  5828. z2 := a1 shl negCount;
  5829. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5830. z0 := a0 shr count;
  5831. end
  5832. else begin
  5833. if ( count = 64 ) then
  5834. begin
  5835. z2 := a1;
  5836. z1 := a0;
  5837. end
  5838. else begin
  5839. a2 := a2 or a1;
  5840. if ( count < 128 ) then
  5841. begin
  5842. z2 := a0 shl negCount;
  5843. z1 := a0 shr ( count and 63 );
  5844. end
  5845. else begin
  5846. if ( count = 128 ) then
  5847. z2 := a0
  5848. else
  5849. z2 := ord( a0 <> 0 );
  5850. z1 := 0;
  5851. end;
  5852. end;
  5853. z0 := 0;
  5854. end;
  5855. z2 := z2 or ord( a2 <> 0 );
  5856. end;
  5857. z2Ptr := z2;
  5858. z1Ptr := z1;
  5859. z0Ptr := z0;
  5860. end;
  5861. {*----------------------------------------------------------------------------
  5862. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5863. | _plus_ the number of bits given in `count'. The shifted result is at most
  5864. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5865. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5866. | shifted off is the most-significant bit of the extra result, and the other
  5867. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5868. | bits shifted off were all zero. This extra result is stored in the location
  5869. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5870. | (This routine makes more sense if `a0' and `a1' are considered to form
  5871. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5872. | point value is shifted right by the number of bits given in `count', and
  5873. | the integer part of the result is returned at the location pointed to by
  5874. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5875. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5876. *----------------------------------------------------------------------------*}
  5877. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5878. var
  5879. z0, z1: bits64;
  5880. negCount: int8;
  5881. begin
  5882. negCount := ( - count ) and 63;
  5883. if ( count = 0 ) then
  5884. begin
  5885. z1 := a1;
  5886. z0 := a0;
  5887. end
  5888. else if ( count < 64 ) then
  5889. begin
  5890. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5891. z0 := a0 shr count;
  5892. end
  5893. else begin
  5894. if ( count = 64 ) then
  5895. begin
  5896. z1 := a0 or ord( a1 <> 0 );
  5897. end
  5898. else begin
  5899. z1 := ord( ( a0 or a1 ) <> 0 );
  5900. end;
  5901. z0 := 0;
  5902. end;
  5903. z1Ptr := z1;
  5904. z0Ptr := z0;
  5905. end;
  5906. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5907. {*----------------------------------------------------------------------------
  5908. | Returns the fraction bits of the extended double-precision floating-point
  5909. | value `a'.
  5910. *----------------------------------------------------------------------------*}
  5911. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5912. begin
  5913. result:=a.low;
  5914. end;
  5915. {*----------------------------------------------------------------------------
  5916. | Returns the exponent bits of the extended double-precision floating-point
  5917. | value `a'.
  5918. *----------------------------------------------------------------------------*}
  5919. function extractFloatx80Exp(a : floatx80): int32;inline;
  5920. begin
  5921. result:=a.high and $7FFF;
  5922. end;
  5923. {*----------------------------------------------------------------------------
  5924. | Returns the sign bit of the extended double-precision floating-point value
  5925. | `a'.
  5926. *----------------------------------------------------------------------------*}
  5927. function extractFloatx80Sign(a : floatx80): flag;inline;
  5928. begin
  5929. result:=a.high shr 15;
  5930. end;
  5931. {*----------------------------------------------------------------------------
  5932. | Normalizes the subnormal extended double-precision floating-point value
  5933. | represented by the denormalized significand `aSig'. The normalized exponent
  5934. | and significand are stored at the locations pointed to by `zExpPtr' and
  5935. | `zSigPtr', respectively.
  5936. *----------------------------------------------------------------------------*}
  5937. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5938. var
  5939. shiftCount: int8;
  5940. begin
  5941. shiftCount := countLeadingZeros64( aSig );
  5942. zSigPtr := aSig shl shiftCount;
  5943. zExpPtr := 1 - shiftCount;
  5944. end;
  5945. {*----------------------------------------------------------------------------
  5946. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5947. | extended double-precision floating-point value, returning the result.
  5948. *----------------------------------------------------------------------------*}
  5949. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5950. var
  5951. z: floatx80;
  5952. begin
  5953. z.low := zSig;
  5954. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5955. result:=z;
  5956. end;
  5957. {*----------------------------------------------------------------------------
  5958. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5959. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5960. | and returns the proper extended double-precision floating-point value
  5961. | corresponding to the abstract input. Ordinarily, the abstract value is
  5962. | rounded and packed into the extended double-precision format, with the
  5963. | inexact exception raised if the abstract input cannot be represented
  5964. | exactly. However, if the abstract value is too large, the overflow and
  5965. | inexact exceptions are raised and an infinity or maximal finite value is
  5966. | returned. If the abstract value is too small, the input value is rounded to
  5967. | a subnormal number, and the underflow and inexact exceptions are raised if
  5968. | the abstract input cannot be represented exactly as a subnormal extended
  5969. | double-precision floating-point number.
  5970. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5971. | number of bits as single or double precision, respectively. Otherwise, the
  5972. | result is rounded to the full precision of the extended double-precision
  5973. | format.
  5974. | The input significand must be normalized or smaller. If the input
  5975. | significand is not normalized, `zExp' must be 0; in that case, the result
  5976. | returned is a subnormal number, and it must not require rounding. The
  5977. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5978. | Floating-Point Arithmetic.
  5979. *----------------------------------------------------------------------------*}
  5980. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5981. var
  5982. roundingMode: TFPURoundingMode;
  5983. roundNearestEven, increment, isTiny: flag;
  5984. roundIncrement, roundMask, roundBits: int64;
  5985. label
  5986. precision80, overflow;
  5987. begin
  5988. roundingMode := softfloat_rounding_mode;
  5989. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5990. if ( roundingPrecision = 80 ) then
  5991. goto precision80;
  5992. if ( roundingPrecision = 64 ) then
  5993. begin
  5994. roundIncrement := int64( $0000000000000400 );
  5995. roundMask := int64( $00000000000007FF );
  5996. end
  5997. else if ( roundingPrecision = 32 ) then
  5998. begin
  5999. roundIncrement := int64( $0000008000000000 );
  6000. roundMask := int64( $000000FFFFFFFFFF );
  6001. end
  6002. else begin
  6003. goto precision80;
  6004. end;
  6005. zSig0 := zSig0 or ord( zSig1 <> 0 );
  6006. if ( not (roundNearestEven<>0) ) then
  6007. begin
  6008. if ( roundingMode = float_round_to_zero ) then
  6009. begin
  6010. roundIncrement := 0;
  6011. end
  6012. else begin
  6013. roundIncrement := roundMask;
  6014. if ( zSign<>0 ) then
  6015. begin
  6016. if ( roundingMode = float_round_up ) then
  6017. roundIncrement := 0;
  6018. end
  6019. else begin
  6020. if ( roundingMode = float_round_down ) then
  6021. roundIncrement := 0;
  6022. end;
  6023. end;
  6024. end;
  6025. roundBits := zSig0 and roundMask;
  6026. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6027. if ( ( $7FFE < zExp )
  6028. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6029. ) then begin
  6030. goto overflow;
  6031. end;
  6032. if ( zExp <= 0 ) then begin
  6033. isTiny := ord (
  6034. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6035. or ( zExp < 0 )
  6036. or ( zSig0 <= zSig0 + roundIncrement ) );
  6037. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6038. zExp := 0;
  6039. roundBits := zSig0 and roundMask;
  6040. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6041. if ( roundBits <> 0 ) then set_inexact_flag;
  6042. inc( zSig0, roundIncrement );
  6043. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6044. roundIncrement := roundMask + 1;
  6045. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6046. roundMask := roundMask or roundIncrement;
  6047. end;
  6048. zSig0 := zSig0 and not roundMask;
  6049. result:=packFloatx80( zSign, zExp, zSig0 );
  6050. exit;
  6051. end;
  6052. end;
  6053. if ( roundBits <> 0 ) then set_inexact_flag;
  6054. inc( zSig0, roundIncrement );
  6055. if ( zSig0 < roundIncrement ) then begin
  6056. inc(zExp);
  6057. zSig0 := bits64( $8000000000000000 );
  6058. end;
  6059. roundIncrement := roundMask + 1;
  6060. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6061. roundMask := roundMask or roundIncrement;
  6062. end;
  6063. zSig0 := zSig0 and not roundMask;
  6064. if ( zSig0 = 0 ) then zExp := 0;
  6065. result:=packFloatx80( zSign, zExp, zSig0 );
  6066. exit;
  6067. precision80:
  6068. increment := ord ( sbits64( zSig1 ) < 0 );
  6069. if ( roundNearestEven = 0 ) then begin
  6070. if ( roundingMode = float_round_to_zero ) then begin
  6071. increment := 0;
  6072. end
  6073. else begin
  6074. if ( zSign <> 0 ) then begin
  6075. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6076. end
  6077. else begin
  6078. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6079. end;
  6080. end;
  6081. end;
  6082. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6083. if ( ( $7FFE < zExp )
  6084. or ( ( zExp = $7FFE )
  6085. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6086. and ( increment <> 0 )
  6087. )
  6088. ) then begin
  6089. roundMask := 0;
  6090. overflow:
  6091. float_raise( [float_flag_overflow,float_flag_inexact] );
  6092. if ( ( roundingMode = float_round_to_zero )
  6093. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6094. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6095. ) then begin
  6096. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6097. exit;
  6098. end;
  6099. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6100. exit;
  6101. end;
  6102. if ( zExp <= 0 ) then begin
  6103. isTiny := ord(
  6104. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6105. or ( zExp < 0 )
  6106. or ( increment = 0 )
  6107. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6108. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6109. zExp := 0;
  6110. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6111. if ( zSig1 <> 0 ) then set_inexact_flag;
  6112. if ( roundNearestEven <> 0 ) then begin
  6113. increment := ord( sbits64( zSig1 ) < 0 );
  6114. end
  6115. else begin
  6116. if ( zSign <> 0 ) then begin
  6117. increment := ord( roundingMode = float_round_down ) and zSig1;
  6118. end
  6119. else begin
  6120. increment := ord( roundingMode = float_round_up ) and zSig1;
  6121. end;
  6122. end;
  6123. if ( increment <> 0 ) then begin
  6124. inc(zSig0);
  6125. zSig0 :=
  6126. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6127. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6128. end;
  6129. result:=packFloatx80( zSign, zExp, zSig0 );
  6130. exit;
  6131. end;
  6132. end;
  6133. if ( zSig1 <> 0 ) then set_inexact_flag;
  6134. if ( increment <> 0 ) then begin
  6135. inc(zSig0);
  6136. if ( zSig0 = 0 ) then begin
  6137. inc(zExp);
  6138. zSig0 := bits64( $8000000000000000 );
  6139. end
  6140. else begin
  6141. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6142. end;
  6143. end
  6144. else begin
  6145. if ( zSig0 = 0 ) then zExp := 0;
  6146. end;
  6147. result:=packFloatx80( zSign, zExp, zSig0 );
  6148. end;
  6149. {*----------------------------------------------------------------------------
  6150. | Takes an abstract floating-point value having sign `zSign', exponent
  6151. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6152. | and returns the proper extended double-precision floating-point value
  6153. | corresponding to the abstract input. This routine is just like
  6154. | `roundAndPackFloatx80' except that the input significand does not have to be
  6155. | normalized.
  6156. *----------------------------------------------------------------------------*}
  6157. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6158. var
  6159. shiftCount: int8;
  6160. begin
  6161. if ( zSig0 = 0 ) then begin
  6162. zSig0 := zSig1;
  6163. zSig1 := 0;
  6164. dec( zExp, 64 );
  6165. end;
  6166. shiftCount := countLeadingZeros64( zSig0 );
  6167. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6168. zExp := zExp - shiftCount;
  6169. result :=
  6170. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6171. end;
  6172. {*----------------------------------------------------------------------------
  6173. | Returns the result of converting the extended double-precision floating-
  6174. | point value `a' to the 32-bit two's complement integer format. The
  6175. | conversion is performed according to the IEC/IEEE Standard for Binary
  6176. | Floating-Point Arithmetic---which means in particular that the conversion
  6177. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6178. | largest positive integer is returned. Otherwise, if the conversion
  6179. | overflows, the largest integer with the same sign as `a' is returned.
  6180. *----------------------------------------------------------------------------*}
  6181. function floatx80_to_int32(a: floatx80): int32;
  6182. var
  6183. aSign: flag;
  6184. aExp, shiftCount: int32;
  6185. aSig: bits64;
  6186. begin
  6187. aSig := extractFloatx80Frac( a );
  6188. aExp := extractFloatx80Exp( a );
  6189. aSign := extractFloatx80Sign( a );
  6190. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6191. shiftCount := $4037 - aExp;
  6192. if ( shiftCount <= 0 ) then shiftCount := 1;
  6193. shift64RightJamming( aSig, shiftCount, aSig );
  6194. result := roundAndPackInt32( aSign, aSig );
  6195. end;
  6196. {*----------------------------------------------------------------------------
  6197. | Returns the result of converting the extended double-precision floating-
  6198. | point value `a' to the 32-bit two's complement integer format. The
  6199. | conversion is performed according to the IEC/IEEE Standard for Binary
  6200. | Floating-Point Arithmetic, except that the conversion is always rounded
  6201. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6202. | Otherwise, if the conversion overflows, the largest integer with the same
  6203. | sign as `a' is returned.
  6204. *----------------------------------------------------------------------------*}
  6205. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6206. var
  6207. aSign: flag;
  6208. aExp, shiftCount: int32;
  6209. aSig, savedASig: bits64;
  6210. z: int32;
  6211. label
  6212. invalid;
  6213. begin
  6214. aSig := extractFloatx80Frac( a );
  6215. aExp := extractFloatx80Exp( a );
  6216. aSign := extractFloatx80Sign( a );
  6217. if ( $401E < aExp ) then begin
  6218. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6219. goto invalid;
  6220. end
  6221. else if ( aExp < $3FFF ) then begin
  6222. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6223. result := 0;
  6224. exit;
  6225. end;
  6226. shiftCount := $403E - aExp;
  6227. savedASig := aSig;
  6228. aSig := aSig shr shiftCount;
  6229. z := aSig;
  6230. if ( aSign <> 0 ) then z := - z;
  6231. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6232. invalid:
  6233. float_raise( float_flag_invalid );
  6234. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6235. exit;
  6236. end;
  6237. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6238. set_inexact_flag;
  6239. end;
  6240. result := z;
  6241. end;
  6242. {*----------------------------------------------------------------------------
  6243. | Returns the result of converting the extended double-precision floating-
  6244. | point value `a' to the 64-bit two's complement integer format. The
  6245. | conversion is performed according to the IEC/IEEE Standard for Binary
  6246. | Floating-Point Arithmetic---which means in particular that the conversion
  6247. | is rounded according to the current rounding mode. If `a' is a NaN,
  6248. | the largest positive integer is returned. Otherwise, if the conversion
  6249. | overflows, the largest integer with the same sign as `a' is returned.
  6250. *----------------------------------------------------------------------------*}
  6251. function floatx80_to_int64(a: floatx80): int64;
  6252. var
  6253. aSign: flag;
  6254. aExp, shiftCount: int32;
  6255. aSig, aSigExtra: bits64;
  6256. begin
  6257. aSig := extractFloatx80Frac( a );
  6258. aExp := extractFloatx80Exp( a );
  6259. aSign := extractFloatx80Sign( a );
  6260. shiftCount := $403E - aExp;
  6261. if ( shiftCount <= 0 ) then begin
  6262. if ( shiftCount <> 0 ) then begin
  6263. float_raise( float_flag_invalid );
  6264. if ( ( aSign = 0 )
  6265. or ( ( aExp = $7FFF )
  6266. and ( aSig <> bits64( $8000000000000000 ) ) )
  6267. ) then begin
  6268. result := $7FFFFFFFFFFFFFFF;
  6269. exit;
  6270. end;
  6271. result := $8000000000000000;
  6272. exit;
  6273. end;
  6274. aSigExtra := 0;
  6275. end
  6276. else begin
  6277. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6278. end;
  6279. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6280. end;
  6281. {*----------------------------------------------------------------------------
  6282. | Returns the result of converting the extended double-precision floating-
  6283. | point value `a' to the 64-bit two's complement integer format. The
  6284. | conversion is performed according to the IEC/IEEE Standard for Binary
  6285. | Floating-Point Arithmetic, except that the conversion is always rounded
  6286. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6287. | Otherwise, if the conversion overflows, the largest integer with the same
  6288. | sign as `a' is returned.
  6289. *----------------------------------------------------------------------------*}
  6290. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6291. var
  6292. aSign: flag;
  6293. aExp, shiftCount: int32;
  6294. aSig: bits64;
  6295. z: int64;
  6296. begin
  6297. aSig := extractFloatx80Frac( a );
  6298. aExp := extractFloatx80Exp( a );
  6299. aSign := extractFloatx80Sign( a );
  6300. shiftCount := aExp - $403E;
  6301. if ( 0 <= shiftCount ) then begin
  6302. aSig := $7FFFFFFFFFFFFFFF;
  6303. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6304. float_raise( float_flag_invalid );
  6305. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6306. result := $7FFFFFFFFFFFFFFF;
  6307. exit;
  6308. end;
  6309. end;
  6310. result := $8000000000000000;
  6311. exit;
  6312. end
  6313. else if ( aExp < $3FFF ) then begin
  6314. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6315. result := 0;
  6316. exit;
  6317. end;
  6318. z := aSig shr ( - shiftCount );
  6319. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6320. set_inexact_flag;
  6321. end;
  6322. if ( aSign <> 0 ) then z := - z;
  6323. result := z;
  6324. end;
  6325. {*----------------------------------------------------------------------------
  6326. | The pattern for a default generated extended double-precision NaN. The
  6327. | `high' and `low' values hold the most- and least-significant bits,
  6328. | respectively.
  6329. *----------------------------------------------------------------------------*}
  6330. const
  6331. floatx80_default_nan_high = $FFFF;
  6332. floatx80_default_nan_low = bits64( $C000000000000000 );
  6333. {*----------------------------------------------------------------------------
  6334. | Returns 1 if the extended double-precision floating-point value `a' is a
  6335. | signaling NaN; otherwise returns 0.
  6336. *----------------------------------------------------------------------------*}
  6337. function floatx80_is_signaling_nan(a : floatx80): flag;
  6338. var
  6339. aLow: bits64;
  6340. begin
  6341. aLow := a.low and not $4000000000000000;
  6342. result := ord(
  6343. ( a.high and $7FFF = $7FFF )
  6344. and ( bits64( aLow shl 1 ) <> 0 )
  6345. and ( a.low = aLow ) );
  6346. end;
  6347. {*----------------------------------------------------------------------------
  6348. | Returns the result of converting the extended double-precision floating-
  6349. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6350. | invalid exception is raised.
  6351. *----------------------------------------------------------------------------*}
  6352. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6353. var
  6354. z: commonNaNT;
  6355. begin
  6356. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6357. z.sign := a.high shr 15;
  6358. z.low := 0;
  6359. z.high := a.low shl 1;
  6360. result := z;
  6361. end;
  6362. {*----------------------------------------------------------------------------
  6363. | Returns 1 if the extended double-precision floating-point value `a' is a
  6364. | NaN; otherwise returns 0.
  6365. *----------------------------------------------------------------------------*}
  6366. function floatx80_is_nan(a : floatx80 ): flag;
  6367. begin
  6368. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6369. end;
  6370. {*----------------------------------------------------------------------------
  6371. | Takes two extended double-precision floating-point values `a' and `b', one
  6372. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6373. | `b' is a signaling NaN, the invalid exception is raised.
  6374. *----------------------------------------------------------------------------*}
  6375. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6376. var
  6377. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6378. label
  6379. returnLargerSignificand;
  6380. begin
  6381. aIsNaN := floatx80_is_nan( a );
  6382. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6383. bIsNaN := floatx80_is_nan( b );
  6384. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6385. a.low := a.low or $C000000000000000;
  6386. b.low := b.low or $C000000000000000;
  6387. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6388. if aIsSignalingNaN <> 0 then begin
  6389. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6390. if bIsNaN <> 0 then result := b else result := a;
  6391. exit;
  6392. end
  6393. else if aIsNaN <>0 then begin
  6394. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6395. result := a;
  6396. exit;
  6397. end;
  6398. returnLargerSignificand:
  6399. if ( a.low < b.low ) then begin
  6400. result := b;
  6401. exit;
  6402. end;
  6403. if ( b.low < a.low ) then begin
  6404. result := a;
  6405. exit;
  6406. end;
  6407. if a.high < b.high then result := a else result := b;
  6408. exit;
  6409. end
  6410. else
  6411. result := b;
  6412. end;
  6413. {*----------------------------------------------------------------------------
  6414. | Returns the result of converting the extended double-precision floating-
  6415. | point value `a' to the single-precision floating-point format. The
  6416. | conversion is performed according to the IEC/IEEE Standard for Binary
  6417. | Floating-Point Arithmetic.
  6418. *----------------------------------------------------------------------------*}
  6419. function floatx80_to_float32(a: floatx80): float32;
  6420. var
  6421. aSign: flag;
  6422. aExp: int32;
  6423. aSig: bits64;
  6424. begin
  6425. aSig := extractFloatx80Frac( a );
  6426. aExp := extractFloatx80Exp( a );
  6427. aSign := extractFloatx80Sign( a );
  6428. if ( aExp = $7FFF ) then begin
  6429. if bits64( aSig shl 1 ) <> 0 then begin
  6430. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6431. exit;
  6432. end;
  6433. result := packFloat32( aSign, $FF, 0 );
  6434. exit;
  6435. end;
  6436. shift64RightJamming( aSig, 33, aSig );
  6437. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6438. result := roundAndPackFloat32( aSign, aExp, aSig );
  6439. end;
  6440. {*----------------------------------------------------------------------------
  6441. | Returns the result of converting the extended double-precision floating-
  6442. | point value `a' to the double-precision floating-point format. The
  6443. | conversion is performed according to the IEC/IEEE Standard for Binary
  6444. | Floating-Point Arithmetic.
  6445. *----------------------------------------------------------------------------*}
  6446. function floatx80_to_float64(a: floatx80): float64;
  6447. var
  6448. aSign: flag;
  6449. aExp: int32;
  6450. aSig, zSig: bits64;
  6451. begin
  6452. aSig := extractFloatx80Frac( a );
  6453. aExp := extractFloatx80Exp( a );
  6454. aSign := extractFloatx80Sign( a );
  6455. if ( aExp = $7FFF ) then begin
  6456. if bits64( aSig shl 1 ) <> 0 then begin
  6457. result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
  6458. exit;
  6459. end;
  6460. result := packFloat64( aSign, $7FF, 0 );
  6461. exit;
  6462. end;
  6463. shift64RightJamming( aSig, 1, zSig );
  6464. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6465. result := roundAndPackFloat64( aSign, aExp, zSig );
  6466. end;
  6467. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6468. {*----------------------------------------------------------------------------
  6469. | Returns the result of converting the extended double-precision floating-
  6470. | point value `a' to the quadruple-precision floating-point format. The
  6471. | conversion is performed according to the IEC/IEEE Standard for Binary
  6472. | Floating-Point Arithmetic.
  6473. *----------------------------------------------------------------------------*}
  6474. function floatx80_to_float128(a: floatx80): float128;
  6475. var
  6476. aSign: flag;
  6477. aExp: int16;
  6478. aSig, zSig0, zSig1: bits64;
  6479. begin
  6480. aSig := extractFloatx80Frac( a );
  6481. aExp := extractFloatx80Exp( a );
  6482. aSign := extractFloatx80Sign( a );
  6483. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6484. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6485. exit;
  6486. end;
  6487. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6488. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6489. end;
  6490. {$endif FPC_SOFTFLOAT_FLOAT128}
  6491. {*----------------------------------------------------------------------------
  6492. | Rounds the extended double-precision floating-point value `a' to an integer,
  6493. | and Returns the result as an extended quadruple-precision floating-point
  6494. | value. The operation is performed according to the IEC/IEEE Standard for
  6495. | Binary Floating-Point Arithmetic.
  6496. *----------------------------------------------------------------------------*}
  6497. function floatx80_round_to_int(a: floatx80): floatx80;
  6498. var
  6499. aSign: flag;
  6500. aExp: int32;
  6501. lastBitMask, roundBitsMask: bits64;
  6502. roundingMode: TFPURoundingMode;
  6503. z: floatx80;
  6504. begin
  6505. aExp := extractFloatx80Exp( a );
  6506. if ( $403E <= aExp ) then begin
  6507. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6508. result := propagateFloatx80NaN( a, a );
  6509. exit;
  6510. end;
  6511. result := a;
  6512. exit;
  6513. end;
  6514. if ( aExp < $3FFF ) then begin
  6515. if ( ( aExp = 0 )
  6516. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6517. result := a;
  6518. exit;
  6519. end;
  6520. set_inexact_flag;
  6521. aSign := extractFloatx80Sign( a );
  6522. case softfloat_rounding_mode of
  6523. float_round_nearest_even:
  6524. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6525. ) then begin
  6526. result :=
  6527. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6528. exit;
  6529. end;
  6530. float_round_down: begin
  6531. if aSign <> 0 then
  6532. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6533. else
  6534. result := packFloatx80( 0, 0, 0 );
  6535. exit;
  6536. end;
  6537. float_round_up: begin
  6538. if aSign <> 0 then
  6539. result := packFloatx80( 1, 0, 0 )
  6540. else
  6541. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6542. exit;
  6543. end;
  6544. end;
  6545. result := packFloatx80( aSign, 0, 0 );
  6546. exit;
  6547. end;
  6548. lastBitMask := 1;
  6549. lastBitMask := lastBitMask shl ( $403E - aExp );
  6550. roundBitsMask := lastBitMask - 1;
  6551. z := a;
  6552. roundingMode := softfloat_rounding_mode;
  6553. if ( roundingMode = float_round_nearest_even ) then begin
  6554. inc( z.low, lastBitMask shr 1 );
  6555. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6556. end
  6557. else if ( roundingMode <> float_round_to_zero ) then begin
  6558. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6559. inc( z.low, roundBitsMask );
  6560. end;
  6561. end;
  6562. z.low := z.low and not roundBitsMask;
  6563. if ( z.low = 0 ) then begin
  6564. inc(z.high);
  6565. z.low := bits64( $8000000000000000 );
  6566. end;
  6567. if ( z.low <> a.low ) then set_inexact_flag;
  6568. result := z;
  6569. end;
  6570. {*----------------------------------------------------------------------------
  6571. | Returns the result of adding the absolute values of the extended double-
  6572. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6573. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6574. | The addition is performed according to the IEC/IEEE Standard for Binary
  6575. | Floating-Point Arithmetic.
  6576. *----------------------------------------------------------------------------*}
  6577. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6578. var
  6579. aExp, bExp, zExp: int32;
  6580. aSig, bSig, zSig0, zSig1: bits64;
  6581. expDiff: int32;
  6582. label
  6583. shiftRight1, roundAndPack;
  6584. begin
  6585. aSig := extractFloatx80Frac( a );
  6586. aExp := extractFloatx80Exp( a );
  6587. bSig := extractFloatx80Frac( b );
  6588. bExp := extractFloatx80Exp( b );
  6589. expDiff := aExp - bExp;
  6590. if ( 0 < expDiff ) then begin
  6591. if ( aExp = $7FFF ) then begin
  6592. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6593. result := propagateFloatx80NaN( a, b );
  6594. exit;
  6595. end;
  6596. result := a;
  6597. exit;
  6598. end;
  6599. if ( bExp = 0 ) then dec(expDiff);
  6600. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6601. zExp := aExp;
  6602. end
  6603. else if ( expDiff < 0 ) then begin
  6604. if ( bExp = $7FFF ) then begin
  6605. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6606. result := propagateFloatx80NaN( a, b );
  6607. exit;
  6608. end;
  6609. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6610. exit;
  6611. end;
  6612. if ( aExp = 0 ) then inc(expDiff);
  6613. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6614. zExp := bExp;
  6615. end
  6616. else begin
  6617. if ( aExp = $7FFF ) then begin
  6618. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6619. result := propagateFloatx80NaN( a, b );
  6620. exit;
  6621. end;
  6622. result := a;
  6623. exit;
  6624. end;
  6625. zSig1 := 0;
  6626. zSig0 := aSig + bSig;
  6627. if ( aExp = 0 ) then begin
  6628. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6629. goto roundAndPack;
  6630. end;
  6631. zExp := aExp;
  6632. goto shiftRight1;
  6633. end;
  6634. zSig0 := aSig + bSig;
  6635. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6636. shiftRight1:
  6637. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6638. zSig0 := zSig0 or $8000000000000000;
  6639. inc(zExp);
  6640. roundAndPack:
  6641. result :=
  6642. roundAndPackFloatx80(
  6643. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6644. end;
  6645. {*----------------------------------------------------------------------------
  6646. | Returns the result of subtracting the absolute values of the extended
  6647. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6648. | difference is negated before being returned. `zSign' is ignored if the
  6649. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6650. | Standard for Binary Floating-Point Arithmetic.
  6651. *----------------------------------------------------------------------------*}
  6652. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6653. var
  6654. aExp, bExp, zExp: int32;
  6655. aSig, bSig, zSig0, zSig1: bits64;
  6656. expDiff: int32;
  6657. z: floatx80;
  6658. label
  6659. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6660. begin
  6661. aSig := extractFloatx80Frac( a );
  6662. aExp := extractFloatx80Exp( a );
  6663. bSig := extractFloatx80Frac( b );
  6664. bExp := extractFloatx80Exp( b );
  6665. expDiff := aExp - bExp;
  6666. if ( 0 < expDiff ) then goto aExpBigger;
  6667. if ( expDiff < 0 ) then goto bExpBigger;
  6668. if ( aExp = $7FFF ) then begin
  6669. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6670. result := propagateFloatx80NaN( a, b );
  6671. exit;
  6672. end;
  6673. float_raise( float_flag_invalid );
  6674. z.low := floatx80_default_nan_low;
  6675. z.high := floatx80_default_nan_high;
  6676. result := z;
  6677. exit;
  6678. end;
  6679. if ( aExp = 0 ) then begin
  6680. aExp := 1;
  6681. bExp := 1;
  6682. end;
  6683. zSig1 := 0;
  6684. if ( bSig < aSig ) then goto aBigger;
  6685. if ( aSig < bSig ) then goto bBigger;
  6686. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6687. exit;
  6688. bExpBigger:
  6689. if ( bExp = $7FFF ) then begin
  6690. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6691. result := propagateFloatx80NaN( a, b );
  6692. exit;
  6693. end;
  6694. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6695. exit;
  6696. end;
  6697. if ( aExp = 0 ) then inc(expDiff);
  6698. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6699. bBigger:
  6700. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6701. zExp := bExp;
  6702. zSign := zSign xor 1;
  6703. goto normalizeRoundAndPack;
  6704. aExpBigger:
  6705. if ( aExp = $7FFF ) then begin
  6706. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6707. result := propagateFloatx80NaN( a, b );
  6708. exit;
  6709. end;
  6710. result := a;
  6711. exit;
  6712. end;
  6713. if ( bExp = 0 ) then dec(expDiff);
  6714. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6715. aBigger:
  6716. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6717. zExp := aExp;
  6718. normalizeRoundAndPack:
  6719. result :=
  6720. normalizeRoundAndPackFloatx80(
  6721. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6722. end;
  6723. {*----------------------------------------------------------------------------
  6724. | Returns the result of adding the extended double-precision floating-point
  6725. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6726. | Standard for Binary Floating-Point Arithmetic.
  6727. *----------------------------------------------------------------------------*}
  6728. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6729. var
  6730. aSign, bSign: flag;
  6731. begin
  6732. aSign := extractFloatx80Sign( a );
  6733. bSign := extractFloatx80Sign( b );
  6734. if ( aSign = bSign ) then begin
  6735. result := addFloatx80Sigs( a, b, aSign );
  6736. end
  6737. else begin
  6738. result := subFloatx80Sigs( a, b, aSign );
  6739. end;
  6740. end;
  6741. {*----------------------------------------------------------------------------
  6742. | Returns the result of subtracting the extended double-precision floating-
  6743. | point values `a' and `b'. The operation is performed according to the
  6744. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6745. *----------------------------------------------------------------------------*}
  6746. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6747. var
  6748. aSign, bSign: flag;
  6749. begin
  6750. aSign := extractFloatx80Sign( a );
  6751. bSign := extractFloatx80Sign( b );
  6752. if ( aSign = bSign ) then begin
  6753. result := subFloatx80Sigs( a, b, aSign );
  6754. end
  6755. else begin
  6756. result := addFloatx80Sigs( a, b, aSign );
  6757. end;
  6758. end;
  6759. {*----------------------------------------------------------------------------
  6760. | Returns the result of multiplying the extended double-precision floating-
  6761. | point values `a' and `b'. The operation is performed according to the
  6762. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6763. *----------------------------------------------------------------------------*}
  6764. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6765. var
  6766. aSign, bSign, zSign: flag;
  6767. aExp, bExp, zExp: int32;
  6768. aSig, bSig, zSig0, zSig1: bits64;
  6769. z: floatx80;
  6770. label
  6771. invalid;
  6772. begin
  6773. aSig := extractFloatx80Frac( a );
  6774. aExp := extractFloatx80Exp( a );
  6775. aSign := extractFloatx80Sign( a );
  6776. bSig := extractFloatx80Frac( b );
  6777. bExp := extractFloatx80Exp( b );
  6778. bSign := extractFloatx80Sign( b );
  6779. zSign := aSign xor bSign;
  6780. if ( aExp = $7FFF ) then begin
  6781. if ( bits64( aSig shl 1 ) <> 0 )
  6782. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6783. result := propagateFloatx80NaN( a, b );
  6784. exit;
  6785. end;
  6786. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6787. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6788. exit;
  6789. end;
  6790. if ( bExp = $7FFF ) then begin
  6791. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6792. result := propagateFloatx80NaN( a, b );
  6793. exit;
  6794. end;
  6795. if ( ( aExp or aSig ) = 0 ) then begin
  6796. invalid:
  6797. float_raise( float_flag_invalid );
  6798. z.low := floatx80_default_nan_low;
  6799. z.high := floatx80_default_nan_high;
  6800. result := z;
  6801. exit;
  6802. end;
  6803. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6804. exit;
  6805. end;
  6806. if ( aExp = 0 ) then begin
  6807. if ( aSig = 0 ) then begin
  6808. result := packFloatx80( zSign, 0, 0 );
  6809. exit;
  6810. end;
  6811. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6812. end;
  6813. if ( bExp = 0 ) then begin
  6814. if ( bSig = 0 ) then begin
  6815. result := packFloatx80( zSign, 0, 0 );
  6816. exit;
  6817. end;
  6818. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6819. end;
  6820. zExp := aExp + bExp - $3FFE;
  6821. zSig1 := UMul64x64_128( aSig, bSig, zSig0 );
  6822. if 0 < sbits64( zSig0 ) then begin
  6823. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6824. dec(zExp);
  6825. end;
  6826. result :=
  6827. roundAndPackFloatx80(
  6828. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6829. end;
  6830. {*----------------------------------------------------------------------------
  6831. | Returns the result of dividing the extended double-precision floating-point
  6832. | value `a' by the corresponding value `b'. The operation is performed
  6833. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6834. *----------------------------------------------------------------------------*}
  6835. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6836. var
  6837. aSign, bSign, zSign: flag;
  6838. aExp, bExp, zExp: int32;
  6839. aSig, bSig, zSig0, zSig1: bits64;
  6840. rem0, rem1, rem2, term0, term1, term2: bits64;
  6841. z: floatx80;
  6842. label
  6843. invalid;
  6844. begin
  6845. aSig := extractFloatx80Frac( a );
  6846. aExp := extractFloatx80Exp( a );
  6847. aSign := extractFloatx80Sign( a );
  6848. bSig := extractFloatx80Frac( b );
  6849. bExp := extractFloatx80Exp( b );
  6850. bSign := extractFloatx80Sign( b );
  6851. zSign := aSign xor bSign;
  6852. if ( aExp = $7FFF ) then begin
  6853. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6854. result := propagateFloatx80NaN( a, b );
  6855. exit;
  6856. end;
  6857. if ( bExp = $7FFF ) then begin
  6858. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6859. result := propagateFloatx80NaN( a, b );
  6860. exit;
  6861. end;
  6862. goto invalid;
  6863. end;
  6864. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6865. exit;
  6866. end;
  6867. if ( bExp = $7FFF ) then begin
  6868. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6869. result := propagateFloatx80NaN( a, b );
  6870. exit;
  6871. end;
  6872. result := packFloatx80( zSign, 0, 0 );
  6873. exit;
  6874. end;
  6875. if ( bExp = 0 ) then begin
  6876. if ( bSig = 0 ) then begin
  6877. if ( ( aExp or aSig ) = 0 ) then begin
  6878. invalid:
  6879. float_raise( float_flag_invalid );
  6880. z.low := floatx80_default_nan_low;
  6881. z.high := floatx80_default_nan_high;
  6882. result := z;
  6883. exit;
  6884. end;
  6885. float_raise( float_flag_divbyzero );
  6886. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6887. exit;
  6888. end;
  6889. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6890. end;
  6891. if ( aExp = 0 ) then begin
  6892. if ( aSig = 0 ) then begin
  6893. result := packFloatx80( zSign, 0, 0 );
  6894. exit;
  6895. end;
  6896. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6897. end;
  6898. zExp := aExp - bExp + $3FFE;
  6899. rem1 := 0;
  6900. if ( bSig <= aSig ) then begin
  6901. shift128Right( aSig, 0, 1, aSig, rem1 );
  6902. inc(zExp);
  6903. end;
  6904. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6905. term1 := UMul64x64_128( bSig, zSig0, term0 );
  6906. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6907. while ( sbits64( rem0 ) < 0 ) do begin
  6908. dec(zSig0);
  6909. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6910. end;
  6911. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6912. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6913. term2 := UMul64x64_128( bSig, zSig1, term1 );
  6914. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6915. while ( sbits64( rem1 ) < 0 ) do begin
  6916. dec(zSig1);
  6917. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6918. end;
  6919. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6920. end;
  6921. result :=
  6922. roundAndPackFloatx80(
  6923. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6924. end;
  6925. {*----------------------------------------------------------------------------
  6926. | Returns the remainder of the extended double-precision floating-point value
  6927. | `a' with respect to the corresponding value `b'. The operation is performed
  6928. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6929. *----------------------------------------------------------------------------*}
  6930. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6931. var
  6932. aSign, zSign: flag;
  6933. aExp, bExp, expDiff: int32;
  6934. aSig0, aSig1, bSig: bits64;
  6935. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6936. z: floatx80;
  6937. label
  6938. invalid;
  6939. begin
  6940. aSig0 := extractFloatx80Frac( a );
  6941. aExp := extractFloatx80Exp( a );
  6942. aSign := extractFloatx80Sign( a );
  6943. bSig := extractFloatx80Frac( b );
  6944. bExp := extractFloatx80Exp( b );
  6945. if ( aExp = $7FFF ) then begin
  6946. if ( bits64( aSig0 shl 1 ) <> 0 )
  6947. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6948. result := propagateFloatx80NaN( a, b );
  6949. exit;
  6950. end;
  6951. goto invalid;
  6952. end;
  6953. if ( bExp = $7FFF ) then begin
  6954. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6955. result := propagateFloatx80NaN( a, b );
  6956. exit;
  6957. end;
  6958. result := a;
  6959. exit;
  6960. end;
  6961. if ( bExp = 0 ) then begin
  6962. if ( bSig = 0 ) then begin
  6963. invalid:
  6964. float_raise( float_flag_invalid );
  6965. z.low := floatx80_default_nan_low;
  6966. z.high := floatx80_default_nan_high;
  6967. result := z;
  6968. exit;
  6969. end;
  6970. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6971. end;
  6972. if ( aExp = 0 ) then begin
  6973. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6974. result := a;
  6975. exit;
  6976. end;
  6977. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6978. end;
  6979. bSig := bSig or $8000000000000000;
  6980. zSign := aSign;
  6981. expDiff := aExp - bExp;
  6982. aSig1 := 0;
  6983. if ( expDiff < 0 ) then begin
  6984. if ( expDiff < -1 ) then begin
  6985. result := a;
  6986. exit;
  6987. end;
  6988. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6989. expDiff := 0;
  6990. end;
  6991. q := ord( bSig <= aSig0 );
  6992. if ( q <> 0 ) then dec( aSig0, bSig );
  6993. dec( expDiff, 64 );
  6994. while ( 0 < expDiff ) do begin
  6995. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6996. if ( 2 < q ) then q := q - 2 else q := 0;
  6997. term1 := UMul64x64_128( bSig, q, term0 );
  6998. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6999. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  7000. dec( expDiff, 62 );
  7001. end;
  7002. inc( expDiff, 64 );
  7003. if ( 0 < expDiff ) then begin
  7004. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7005. if ( 2 < q ) then q:= q - 2 else q := 0;
  7006. q := q shr ( 64 - expDiff );
  7007. term1 := UMul64x64_128( bSig, q shl ( 64 - expDiff ), term0 );
  7008. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7009. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  7010. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  7011. inc(q);
  7012. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7013. end;
  7014. end
  7015. else begin
  7016. term1 := 0;
  7017. term0 := bSig;
  7018. end;
  7019. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  7020. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7021. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7022. and ( q and 1 <> 0 ) )
  7023. then begin
  7024. aSig0 := alternateASig0;
  7025. aSig1 := alternateASig1;
  7026. zSign := ord( zSign = 0 );
  7027. end;
  7028. result :=
  7029. normalizeRoundAndPackFloatx80(
  7030. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7031. end;
  7032. {*----------------------------------------------------------------------------
  7033. | Returns the square root of the extended double-precision floating-point
  7034. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7035. | for Binary Floating-Point Arithmetic.
  7036. *----------------------------------------------------------------------------*}
  7037. function floatx80_sqrt(a: floatx80): floatx80;
  7038. var
  7039. aSign: flag;
  7040. aExp, zExp: int32;
  7041. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7042. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7043. z: floatx80;
  7044. label
  7045. invalid;
  7046. begin
  7047. aSig0 := extractFloatx80Frac( a );
  7048. aExp := extractFloatx80Exp( a );
  7049. aSign := extractFloatx80Sign( a );
  7050. if ( aExp = $7FFF ) then begin
  7051. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7052. result := propagateFloatx80NaN( a, a );
  7053. exit;
  7054. end;
  7055. if ( aSign = 0 ) then begin
  7056. result := a;
  7057. exit;
  7058. end;
  7059. goto invalid;
  7060. end;
  7061. if ( aSign <> 0 ) then begin
  7062. if ( ( aExp or aSig0 ) = 0 ) then begin
  7063. result := a;
  7064. exit;
  7065. end;
  7066. invalid:
  7067. float_raise( float_flag_invalid );
  7068. z.low := floatx80_default_nan_low;
  7069. z.high := floatx80_default_nan_high;
  7070. result := z;
  7071. exit;
  7072. end;
  7073. if ( aExp = 0 ) then begin
  7074. if ( aSig0 = 0 ) then begin
  7075. result := packFloatx80( 0, 0, 0 );
  7076. exit;
  7077. end;
  7078. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7079. end;
  7080. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  7081. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  7082. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7083. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7084. doubleZSig0 := zSig0 shl 1;
  7085. term1 := UMul64x64_128( zSig0, zSig0, term0 );
  7086. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7087. while ( sbits64( rem0 ) < 0 ) do begin
  7088. dec(zSig0);
  7089. dec( doubleZSig0, 2 );
  7090. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7091. end;
  7092. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7093. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7094. if ( zSig1 = 0 ) then zSig1 := 1;
  7095. term2 := UMul64x64_128( doubleZSig0, zSig1, term1 );
  7096. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7097. term3 := UMul64x64_128( zSig1, zSig1, term2 );
  7098. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7099. while ( sbits64( rem1 ) < 0 ) do begin
  7100. dec(zSig1);
  7101. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7102. term3 := term3 or 1;
  7103. term2 := term2 or doubleZSig0;
  7104. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7105. end;
  7106. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7107. end;
  7108. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7109. zSig0 := zSig0 or doubleZSig0;
  7110. result :=
  7111. roundAndPackFloatx80(
  7112. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7113. end;
  7114. {*----------------------------------------------------------------------------
  7115. | Returns 1 if the extended double-precision floating-point value `a' is
  7116. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7117. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7118. | Arithmetic.
  7119. *----------------------------------------------------------------------------*}
  7120. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7121. begin
  7122. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7123. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7124. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7125. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7126. ) then begin
  7127. if ( floatx80_is_signaling_nan( a )
  7128. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7129. float_raise( float_flag_invalid );
  7130. end;
  7131. result := 0;
  7132. exit;
  7133. end;
  7134. result := ord(
  7135. ( a.low = b.low )
  7136. and ( ( a.high = b.high )
  7137. or ( ( a.low = 0 )
  7138. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7139. ) );
  7140. end;
  7141. {*----------------------------------------------------------------------------
  7142. | Returns 1 if the extended double-precision floating-point value `a' is
  7143. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7144. | comparison is performed according to the IEC/IEEE Standard for Binary
  7145. | Floating-Point Arithmetic.
  7146. *----------------------------------------------------------------------------*}
  7147. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7148. var
  7149. aSign, bSign: flag;
  7150. begin
  7151. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7152. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7153. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7154. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7155. then begin
  7156. float_raise( float_flag_invalid );
  7157. result := 0;
  7158. exit;
  7159. end;
  7160. aSign := extractFloatx80Sign( a );
  7161. bSign := extractFloatx80Sign( b );
  7162. if ( aSign <> bSign ) then begin
  7163. result := ord(
  7164. ( aSign <> 0 )
  7165. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7166. exit;
  7167. end;
  7168. if aSign<>0 then
  7169. result := le128( b.high, b.low, a.high, a.low )
  7170. else
  7171. result := le128( a.high, a.low, b.high, b.low );
  7172. end;
  7173. {*----------------------------------------------------------------------------
  7174. | Returns 1 if the extended double-precision floating-point value `a' is
  7175. | less than the corresponding value `b', and 0 otherwise. The comparison
  7176. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7177. | Arithmetic.
  7178. *----------------------------------------------------------------------------*}
  7179. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7180. var
  7181. aSign, bSign: flag;
  7182. begin
  7183. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7184. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7185. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7186. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7187. then begin
  7188. float_raise( float_flag_invalid );
  7189. result := 0;
  7190. exit;
  7191. end;
  7192. aSign := extractFloatx80Sign( a );
  7193. bSign := extractFloatx80Sign( b );
  7194. if ( aSign <> bSign ) then begin
  7195. result := ord(
  7196. ( aSign <> 0 )
  7197. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7198. exit;
  7199. end;
  7200. if aSign <> 0 then
  7201. result := lt128( b.high, b.low, a.high, a.low )
  7202. else
  7203. result := lt128( a.high, a.low, b.high, b.low );
  7204. end;
  7205. {*----------------------------------------------------------------------------
  7206. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7207. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7208. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7209. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7210. *----------------------------------------------------------------------------*}
  7211. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7212. begin
  7213. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7214. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7215. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7216. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7217. then begin
  7218. float_raise( float_flag_invalid );
  7219. result := 0;
  7220. exit;
  7221. end;
  7222. result := ord(
  7223. ( a.low = b.low )
  7224. and ( ( a.high = b.high )
  7225. or ( ( a.low = 0 )
  7226. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7227. ) );
  7228. end;
  7229. {*----------------------------------------------------------------------------
  7230. | Returns 1 if the extended double-precision floating-point value `a' is less
  7231. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7232. | do not cause an exception. Otherwise, the comparison is performed according
  7233. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7234. *----------------------------------------------------------------------------*}
  7235. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7236. var
  7237. aSign, bSign: flag;
  7238. begin
  7239. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7240. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7241. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7242. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7243. then begin
  7244. if ( floatx80_is_signaling_nan( a )
  7245. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7246. float_raise( float_flag_invalid );
  7247. end;
  7248. result := 0;
  7249. exit;
  7250. end;
  7251. aSign := extractFloatx80Sign( a );
  7252. bSign := extractFloatx80Sign( b );
  7253. if ( aSign <> bSign ) then begin
  7254. result := ord(
  7255. ( aSign <> 0 )
  7256. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7257. exit;
  7258. end;
  7259. if aSign <> 0 then
  7260. result := le128( b.high, b.low, a.high, a.low )
  7261. else
  7262. result := le128( a.high, a.low, b.high, b.low );
  7263. end;
  7264. {*----------------------------------------------------------------------------
  7265. | Returns 1 if the extended double-precision floating-point value `a' is less
  7266. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7267. | an exception. Otherwise, the comparison is performed according to the
  7268. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7269. *----------------------------------------------------------------------------*}
  7270. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7271. var
  7272. aSign, bSign: flag;
  7273. begin
  7274. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7275. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7276. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7277. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7278. then begin
  7279. if ( floatx80_is_signaling_nan( a )
  7280. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7281. float_raise( float_flag_invalid );
  7282. end;
  7283. result := 0;
  7284. exit;
  7285. end;
  7286. aSign := extractFloatx80Sign( a );
  7287. bSign := extractFloatx80Sign( b );
  7288. if ( aSign <> bSign ) then begin
  7289. result := ord(
  7290. ( aSign <> 0 )
  7291. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7292. exit;
  7293. end;
  7294. if aSign <> 0 then
  7295. result := lt128( b.high, b.low, a.high, a.low )
  7296. else
  7297. result := lt128( a.high, a.low, b.high, b.low );
  7298. end;
  7299. {$endif FPC_SOFTFLOAT_FLOATX80}
  7300. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7301. {*----------------------------------------------------------------------------
  7302. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7303. | floating-point value `a'.
  7304. *----------------------------------------------------------------------------*}
  7305. function extractFloat128Frac1(a : float128): bits64;
  7306. begin
  7307. result:=a.low;
  7308. end;
  7309. {*----------------------------------------------------------------------------
  7310. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7311. | floating-point value `a'.
  7312. *----------------------------------------------------------------------------*}
  7313. function extractFloat128Frac0(a : float128): bits64;
  7314. begin
  7315. result:=a.high and int64($0000FFFFFFFFFFFF);
  7316. end;
  7317. {*----------------------------------------------------------------------------
  7318. | Returns the exponent bits of the quadruple-precision floating-point value
  7319. | `a'.
  7320. *----------------------------------------------------------------------------*}
  7321. function extractFloat128Exp(a : float128): int32;
  7322. begin
  7323. result:=( a.high shr 48 ) and $7FFF;
  7324. end;
  7325. {*----------------------------------------------------------------------------
  7326. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7327. *----------------------------------------------------------------------------*}
  7328. function extractFloat128Sign(a : float128): flag;
  7329. begin
  7330. result:=a.high shr 63;
  7331. end;
  7332. {*----------------------------------------------------------------------------
  7333. | Normalizes the subnormal quadruple-precision floating-point value
  7334. | represented by the denormalized significand formed by the concatenation of
  7335. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7336. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7337. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7338. | least significant 64 bits of the normalized significand are stored at the
  7339. | location pointed to by `zSig1Ptr'.
  7340. *----------------------------------------------------------------------------*}
  7341. procedure normalizeFloat128Subnormal(
  7342. aSig0: bits64;
  7343. aSig1: bits64;
  7344. var zExpPtr: int32;
  7345. var zSig0Ptr: bits64;
  7346. var zSig1Ptr: bits64);
  7347. var
  7348. shiftCount: int8;
  7349. begin
  7350. if ( aSig0 = 0 ) then
  7351. begin
  7352. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7353. if ( shiftCount < 0 ) then
  7354. begin
  7355. zSig0Ptr := aSig1 shr ( - shiftCount );
  7356. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7357. end
  7358. else begin
  7359. zSig0Ptr := aSig1 shl shiftCount;
  7360. zSig1Ptr := 0;
  7361. end;
  7362. zExpPtr := - shiftCount - 63;
  7363. end
  7364. else begin
  7365. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7366. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7367. zExpPtr := 1 - shiftCount;
  7368. end;
  7369. end;
  7370. {*----------------------------------------------------------------------------
  7371. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7372. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7373. | floating-point value, returning the result. After being shifted into the
  7374. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7375. | added together to form the most significant 32 bits of the result. This
  7376. | means that any integer portion of `zSig0' will be added into the exponent.
  7377. | Since a properly normalized significand will have an integer portion equal
  7378. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7379. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7380. | significand.
  7381. *----------------------------------------------------------------------------*}
  7382. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7383. var
  7384. z: float128;
  7385. begin
  7386. z.low := zSig1;
  7387. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7388. result:=z;
  7389. end;
  7390. {*----------------------------------------------------------------------------
  7391. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7392. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7393. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7394. | corresponding to the abstract input. Ordinarily, the abstract value is
  7395. | simply rounded and packed into the quadruple-precision format, with the
  7396. | inexact exception raised if the abstract input cannot be represented
  7397. | exactly. However, if the abstract value is too large, the overflow and
  7398. | inexact exceptions are raised and an infinity or maximal finite value is
  7399. | returned. If the abstract value is too small, the input value is rounded to
  7400. | a subnormal number, and the underflow and inexact exceptions are raised if
  7401. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7402. | precision floating-point number.
  7403. | The input significand must be normalized or smaller. If the input
  7404. | significand is not normalized, `zExp' must be 0; in that case, the result
  7405. | returned is a subnormal number, and it must not require rounding. In the
  7406. | usual case that the input significand is normalized, `zExp' must be 1 less
  7407. | than the ``true'' floating-point exponent. The handling of underflow and
  7408. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7409. *----------------------------------------------------------------------------*}
  7410. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7411. var
  7412. roundingMode: TFPURoundingMode;
  7413. roundNearestEven, increment, isTiny: flag;
  7414. begin
  7415. roundingMode := softfloat_rounding_mode;
  7416. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7417. increment := ord( sbits64(zSig2) < 0 );
  7418. if ( roundNearestEven=0 ) then
  7419. begin
  7420. if ( roundingMode = float_round_to_zero ) then
  7421. begin
  7422. increment := 0;
  7423. end
  7424. else begin
  7425. if ( zSign<>0 ) then
  7426. begin
  7427. increment := ord( roundingMode = float_round_down ) and zSig2;
  7428. end
  7429. else begin
  7430. increment := ord( roundingMode = float_round_up ) and zSig2;
  7431. end;
  7432. end;
  7433. end;
  7434. if ( $7FFD <= bits32(zExp) ) then
  7435. begin
  7436. if ( ord( $7FFD < zExp )
  7437. or ( ord( zExp = $7FFD )
  7438. and eq128(
  7439. int64( $0001FFFFFFFFFFFF ),
  7440. bits64( $FFFFFFFFFFFFFFFF ),
  7441. zSig0,
  7442. zSig1
  7443. )
  7444. and increment
  7445. )
  7446. )<>0 then
  7447. begin
  7448. float_raise( [float_flag_overflow,float_flag_inexact] );
  7449. if ( ord( roundingMode = float_round_to_zero )
  7450. or ( zSign and ord( roundingMode = float_round_up ) )
  7451. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7452. )<>0 then
  7453. begin
  7454. result :=
  7455. packFloat128(
  7456. zSign,
  7457. $7FFE,
  7458. int64( $0000FFFFFFFFFFFF ),
  7459. bits64( $FFFFFFFFFFFFFFFF )
  7460. );
  7461. exit;
  7462. end;
  7463. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7464. exit;
  7465. end;
  7466. if ( zExp < 0 ) then
  7467. begin
  7468. isTiny :=
  7469. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7470. or ( zExp < -1 )
  7471. or not( increment<>0 )
  7472. or boolean(lt128(
  7473. zSig0,
  7474. zSig1,
  7475. int64( $0001FFFFFFFFFFFF ),
  7476. bits64( $FFFFFFFFFFFFFFFF )
  7477. )));
  7478. shift128ExtraRightJamming(
  7479. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7480. zExp := 0;
  7481. if ( isTiny and zSig2 )<>0 then
  7482. float_raise( float_flag_underflow );
  7483. if ( roundNearestEven<>0 ) then
  7484. begin
  7485. increment := ord( sbits64(zSig2) < 0 );
  7486. end
  7487. else begin
  7488. if ( zSign<>0 ) then
  7489. begin
  7490. increment := ord( roundingMode = float_round_down ) and zSig2;
  7491. end
  7492. else begin
  7493. increment := ord( roundingMode = float_round_up ) and zSig2;
  7494. end;
  7495. end;
  7496. end;
  7497. end;
  7498. if ( zSig2<>0 ) then
  7499. set_inexact_flag;
  7500. if ( increment<>0 ) then
  7501. begin
  7502. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7503. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7504. end
  7505. else begin
  7506. if ( ( zSig0 or zSig1 ) = 0 ) then
  7507. zExp := 0;
  7508. end;
  7509. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7510. end;
  7511. {*----------------------------------------------------------------------------
  7512. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7513. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7514. | returns the proper quadruple-precision floating-point value corresponding
  7515. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7516. | except that the input significand has fewer bits and does not have to be
  7517. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7518. | point exponent.
  7519. *----------------------------------------------------------------------------*}
  7520. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7521. var
  7522. shiftCount: int8;
  7523. zSig2: bits64;
  7524. begin
  7525. if ( zSig0 = 0 ) then
  7526. begin
  7527. zSig0 := zSig1;
  7528. zSig1 := 0;
  7529. dec(zExp, 64);
  7530. end;
  7531. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7532. if ( 0 <= shiftCount ) then
  7533. begin
  7534. zSig2 := 0;
  7535. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7536. end
  7537. else begin
  7538. shift128ExtraRightJamming(
  7539. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7540. end;
  7541. dec(zExp, shiftCount);
  7542. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7543. end;
  7544. {*----------------------------------------------------------------------------
  7545. | Returns the result of converting the quadruple-precision floating-point
  7546. | value `a' to the 32-bit two's complement integer format. The conversion
  7547. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7548. | Arithmetic---which means in particular that the conversion is rounded
  7549. | according to the current rounding mode. If `a' is a NaN, the largest
  7550. | positive integer is returned. Otherwise, if the conversion overflows, the
  7551. | largest integer with the same sign as `a' is returned.
  7552. *----------------------------------------------------------------------------*}
  7553. function float128_to_int32(a: float128): int32;
  7554. var
  7555. aSign: flag;
  7556. aExp, shiftCount: int32;
  7557. aSig0, aSig1: bits64;
  7558. begin
  7559. aSig1 := extractFloat128Frac1( a );
  7560. aSig0 := extractFloat128Frac0( a );
  7561. aExp := extractFloat128Exp( a );
  7562. aSign := extractFloat128Sign( a );
  7563. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7564. aSign := 0;
  7565. if ( aExp<>0 ) then
  7566. aSig0 := aSig0 or int64( $0001000000000000 );
  7567. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7568. shiftCount := $4028 - aExp;
  7569. if ( 0 < shiftCount ) then
  7570. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7571. result := roundAndPackInt32( aSign, aSig0 );
  7572. end;
  7573. {*----------------------------------------------------------------------------
  7574. | Returns the result of converting the quadruple-precision floating-point
  7575. | value `a' to the 32-bit two's complement integer format. The conversion
  7576. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7577. | Arithmetic, except that the conversion is always rounded toward zero. If
  7578. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7579. | conversion overflows, the largest integer with the same sign as `a' is
  7580. | returned.
  7581. *----------------------------------------------------------------------------*}
  7582. function float128_to_int32_round_to_zero(a: float128): int32;
  7583. var
  7584. aSign: flag;
  7585. aExp, shiftCount: int32;
  7586. aSig0, aSig1, savedASig: bits64;
  7587. z: int32;
  7588. label
  7589. invalid;
  7590. begin
  7591. aSig1 := extractFloat128Frac1( a );
  7592. aSig0 := extractFloat128Frac0( a );
  7593. aExp := extractFloat128Exp( a );
  7594. aSign := extractFloat128Sign( a );
  7595. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7596. if ( $401E < aExp ) then
  7597. begin
  7598. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7599. aSign := 0;
  7600. goto invalid;
  7601. end
  7602. else if ( aExp < $3FFF ) then
  7603. begin
  7604. if ( aExp or aSig0 )<>0 then
  7605. set_inexact_flag;
  7606. result := 0;
  7607. exit;
  7608. end;
  7609. aSig0 := aSig0 or int64( $0001000000000000 );
  7610. shiftCount := $402F - aExp;
  7611. savedASig := aSig0;
  7612. aSig0 := aSig0 shr shiftCount;
  7613. z := aSig0;
  7614. if ( aSign )<>0 then
  7615. z := - z;
  7616. if ( ord( z < 0 ) xor aSign )<>0 then
  7617. begin
  7618. invalid:
  7619. float_raise( float_flag_invalid );
  7620. if aSign<>0 then
  7621. result:= int32( $80000000 )
  7622. else
  7623. result:=$7FFFFFFF;
  7624. exit;
  7625. end;
  7626. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7627. begin
  7628. set_inexact_flag;
  7629. end;
  7630. result := z;
  7631. end;
  7632. {*----------------------------------------------------------------------------
  7633. | Returns the result of converting the quadruple-precision floating-point
  7634. | value `a' to the 64-bit two's complement integer format. The conversion
  7635. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7636. | Arithmetic---which means in particular that the conversion is rounded
  7637. | according to the current rounding mode. If `a' is a NaN, the largest
  7638. | positive integer is returned. Otherwise, if the conversion overflows, the
  7639. | largest integer with the same sign as `a' is returned.
  7640. *----------------------------------------------------------------------------*}
  7641. function float128_to_int64(a: float128): int64;
  7642. var
  7643. aSign: flag;
  7644. aExp, shiftCount: int32;
  7645. aSig0, aSig1: bits64;
  7646. begin
  7647. aSig1 := extractFloat128Frac1( a );
  7648. aSig0 := extractFloat128Frac0( a );
  7649. aExp := extractFloat128Exp( a );
  7650. aSign := extractFloat128Sign( a );
  7651. if ( aExp<>0 ) then
  7652. aSig0 := aSig0 or int64( $0001000000000000 );
  7653. shiftCount := $402F - aExp;
  7654. if ( shiftCount <= 0 ) then
  7655. begin
  7656. if ( $403E < aExp ) then
  7657. begin
  7658. float_raise( float_flag_invalid );
  7659. if ( (aSign=0)
  7660. or ( ( aExp = $7FFF )
  7661. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7662. )
  7663. ) then
  7664. begin
  7665. result := int64( $7FFFFFFFFFFFFFFF );
  7666. exit;
  7667. end;
  7668. result := int64( $8000000000000000 );
  7669. exit;
  7670. end;
  7671. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7672. end
  7673. else begin
  7674. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7675. end;
  7676. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7677. end;
  7678. {*----------------------------------------------------------------------------
  7679. | Returns the result of converting the quadruple-precision floating-point
  7680. | value `a' to the 64-bit two's complement integer format. The conversion
  7681. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7682. | Arithmetic, except that the conversion is always rounded toward zero.
  7683. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7684. | the conversion overflows, the largest integer with the same sign as `a' is
  7685. | returned.
  7686. *----------------------------------------------------------------------------*}
  7687. function float128_to_int64_round_to_zero(a: float128): int64;
  7688. var
  7689. aSign: flag;
  7690. aExp, shiftCount: int32;
  7691. aSig0, aSig1: bits64;
  7692. z: int64;
  7693. begin
  7694. aSig1 := extractFloat128Frac1( a );
  7695. aSig0 := extractFloat128Frac0( a );
  7696. aExp := extractFloat128Exp( a );
  7697. aSign := extractFloat128Sign( a );
  7698. if ( aExp<>0 ) then
  7699. aSig0 := aSig0 or int64( $0001000000000000 );
  7700. shiftCount := aExp - $402F;
  7701. if ( 0 < shiftCount ) then
  7702. begin
  7703. if ( $403E <= aExp ) then
  7704. begin
  7705. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7706. if ( ( a.high = bits64( $C03E000000000000 ) )
  7707. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7708. begin
  7709. if ( aSig1<>0 ) then
  7710. set_inexact_flag;
  7711. end
  7712. else begin
  7713. float_raise( float_flag_invalid );
  7714. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7715. begin
  7716. result := int64( $7FFFFFFFFFFFFFFF );
  7717. exit;
  7718. end;
  7719. end;
  7720. result := int64( $8000000000000000 );
  7721. exit;
  7722. end;
  7723. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7724. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7725. begin
  7726. set_inexact_flag;
  7727. end;
  7728. end
  7729. else begin
  7730. if ( aExp < $3FFF ) then
  7731. begin
  7732. if ( aExp or aSig0 or aSig1 )<>0 then
  7733. begin
  7734. set_inexact_flag;
  7735. end;
  7736. result := 0;
  7737. exit;
  7738. end;
  7739. z := aSig0 shr ( - shiftCount );
  7740. if ( (aSig1<>0)
  7741. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7742. begin
  7743. set_inexact_flag;
  7744. end;
  7745. end;
  7746. if ( aSign<>0 ) then
  7747. z := - z;
  7748. result := z;
  7749. end;
  7750. {*----------------------------------------------------------------------------
  7751. | Returns the result of converting the quadruple-precision floating-point
  7752. | value `a' to the single-precision floating-point format. The conversion
  7753. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7754. | Arithmetic.
  7755. *----------------------------------------------------------------------------*}
  7756. function float128_to_float32(a: float128): float32;
  7757. var
  7758. aSign: flag;
  7759. aExp: int32;
  7760. aSig0, aSig1: bits64;
  7761. zSig: bits32;
  7762. begin
  7763. aSig1 := extractFloat128Frac1( a );
  7764. aSig0 := extractFloat128Frac0( a );
  7765. aExp := extractFloat128Exp( a );
  7766. aSign := extractFloat128Sign( a );
  7767. if ( aExp = $7FFF ) then
  7768. begin
  7769. if ( aSig0 or aSig1 )<>0 then
  7770. begin
  7771. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7772. exit;
  7773. end;
  7774. result := packFloat32( aSign, $FF, 0 );
  7775. exit;
  7776. end;
  7777. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7778. shift64RightJamming( aSig0, 18, aSig0 );
  7779. zSig := aSig0;
  7780. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7781. begin
  7782. zSig := zSig or $40000000;
  7783. dec(aExp,$3F81);
  7784. end;
  7785. result := roundAndPackFloat32( aSign, aExp, zSig );
  7786. end;
  7787. {*----------------------------------------------------------------------------
  7788. | Returns the result of converting the quadruple-precision floating-point
  7789. | value `a' to the double-precision floating-point format. The conversion
  7790. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7791. | Arithmetic.
  7792. *----------------------------------------------------------------------------*}
  7793. function float128_to_float64(a: float128): float64;
  7794. var
  7795. aSign: flag;
  7796. aExp: int32;
  7797. aSig0, aSig1: bits64;
  7798. begin
  7799. aSig1 := extractFloat128Frac1( a );
  7800. aSig0 := extractFloat128Frac0( a );
  7801. aExp := extractFloat128Exp( a );
  7802. aSign := extractFloat128Sign( a );
  7803. if ( aExp = $7FFF ) then
  7804. begin
  7805. if ( aSig0 or aSig1 )<>0 then
  7806. begin
  7807. result:=commonNaNToFloat64(float128ToCommonNaN(a));
  7808. exit;
  7809. end;
  7810. result:=packFloat64( aSign, $7FF, 0);
  7811. exit;
  7812. end;
  7813. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7814. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7815. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7816. begin
  7817. aSig0 := aSig0 or int64( $4000000000000000 );
  7818. dec(aExp,$3C01);
  7819. end;
  7820. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7821. end;
  7822. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7823. {*----------------------------------------------------------------------------
  7824. | Returns the result of converting the quadruple-precision floating-point
  7825. | value `a' to the extended double-precision floating-point format. The
  7826. | conversion is performed according to the IEC/IEEE Standard for Binary
  7827. | Floating-Point Arithmetic.
  7828. *----------------------------------------------------------------------------*}
  7829. function float128_to_floatx80(a: float128): floatx80;
  7830. var
  7831. aSign: flag;
  7832. aExp: int32;
  7833. aSig0, aSig1: bits64;
  7834. begin
  7835. aSig1 := extractFloat128Frac1( a );
  7836. aSig0 := extractFloat128Frac0( a );
  7837. aExp := extractFloat128Exp( a );
  7838. aSign := extractFloat128Sign( a );
  7839. if ( aExp = $7FFF ) then begin
  7840. if ( aSig0 or aSig1 <> 0 ) then begin
  7841. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7842. exit;
  7843. end;
  7844. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7845. exit;
  7846. end;
  7847. if ( aExp = 0 ) then begin
  7848. if ( ( aSig0 or aSig1 ) = 0 ) then
  7849. begin
  7850. result := packFloatx80( aSign, 0, 0 );
  7851. exit;
  7852. end;
  7853. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7854. end
  7855. else begin
  7856. aSig0 := aSig0 or int64( $0001000000000000 );
  7857. end;
  7858. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7859. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7860. end;
  7861. {$endif FPC_SOFTFLOAT_FLOATX80}
  7862. {*----------------------------------------------------------------------------
  7863. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7864. | Returns the result as a quadruple-precision floating-point value. The
  7865. | operation is performed according to the IEC/IEEE Standard for Binary
  7866. | Floating-Point Arithmetic.
  7867. *----------------------------------------------------------------------------*}
  7868. function float128_round_to_int(a: float128): float128;
  7869. var
  7870. aSign: flag;
  7871. aExp: int32;
  7872. lastBitMask, roundBitsMask: bits64;
  7873. roundingMode: TFPURoundingMode;
  7874. z: float128;
  7875. begin
  7876. aExp := extractFloat128Exp( a );
  7877. if ( $402F <= aExp ) then
  7878. begin
  7879. if ( $406F <= aExp ) then
  7880. begin
  7881. if ( ( aExp = $7FFF )
  7882. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7883. ) then
  7884. begin
  7885. result := propagateFloat128NaN( a, a );
  7886. exit;
  7887. end;
  7888. result := a;
  7889. exit;
  7890. end;
  7891. lastBitMask := 1;
  7892. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7893. roundBitsMask := lastBitMask - 1;
  7894. z := a;
  7895. roundingMode := softfloat_rounding_mode;
  7896. if ( roundingMode = float_round_nearest_even ) then
  7897. begin
  7898. if ( lastBitMask )<>0 then
  7899. begin
  7900. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7901. if ( ( z.low and roundBitsMask ) = 0 ) then
  7902. z.low := z.low and not(lastBitMask);
  7903. end
  7904. else begin
  7905. if ( sbits64(z.low) < 0 ) then
  7906. begin
  7907. inc(z.high);
  7908. if ( bits64( z.low shl 1 ) = 0 ) then
  7909. z.high := z.high and not bits64( 1 );
  7910. end;
  7911. end;
  7912. end
  7913. else if ( roundingMode <> float_round_to_zero ) then
  7914. begin
  7915. if ( extractFloat128Sign( z )
  7916. xor ord( roundingMode = float_round_up ) )<>0 then
  7917. begin
  7918. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7919. end;
  7920. end;
  7921. z.low := z.low and not(roundBitsMask);
  7922. end
  7923. else begin
  7924. if ( aExp < $3FFF ) then
  7925. begin
  7926. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7927. begin
  7928. result := a;
  7929. exit;
  7930. end;
  7931. set_inexact_flag;
  7932. aSign := extractFloat128Sign( a );
  7933. case softfloat_rounding_mode of
  7934. float_round_nearest_even:
  7935. if ( ( aExp = $3FFE )
  7936. and ( (extractFloat128Frac0( a )<>0)
  7937. or (extractFloat128Frac1( a )<>0) )
  7938. ) then begin
  7939. begin
  7940. result := packFloat128( aSign, $3FFF, 0, 0 );
  7941. exit;
  7942. end;
  7943. end;
  7944. float_round_down:
  7945. begin
  7946. if aSign<>0 then
  7947. result:=packFloat128( 1, $3FFF, 0, 0 )
  7948. else
  7949. result:=packFloat128( 0, 0, 0, 0 );
  7950. exit;
  7951. end;
  7952. float_round_up:
  7953. begin
  7954. if aSign<>0 then
  7955. result := packFloat128( 1, 0, 0, 0 )
  7956. else
  7957. result:=packFloat128( 0, $3FFF, 0, 0 );
  7958. exit;
  7959. end;
  7960. end;
  7961. result := packFloat128( aSign, 0, 0, 0 );
  7962. exit;
  7963. end;
  7964. lastBitMask := 1;
  7965. lastBitMask := lastBitMask shl ($402F - aExp);
  7966. roundBitsMask := lastBitMask - 1;
  7967. z.low := 0;
  7968. z.high := a.high;
  7969. roundingMode := softfloat_rounding_mode;
  7970. if ( roundingMode = float_round_nearest_even ) then begin
  7971. inc(z.high,lastBitMask shr 1);
  7972. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7973. z.high := z.high and not(lastBitMask);
  7974. end;
  7975. end
  7976. else if ( roundingMode <> float_round_to_zero ) then begin
  7977. if ( (extractFloat128Sign( z )<>0)
  7978. xor ( roundingMode = float_round_up ) ) then begin
  7979. z.high := z.high or ord( a.low <> 0 );
  7980. z.high := z.high+roundBitsMask;
  7981. end;
  7982. end;
  7983. z.high := z.high and not(roundBitsMask);
  7984. end;
  7985. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7986. set_inexact_flag;
  7987. end;
  7988. result := z;
  7989. end;
  7990. {*----------------------------------------------------------------------------
  7991. | Returns the result of adding the absolute values of the quadruple-precision
  7992. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7993. | before being returned. `zSign' is ignored if the result is a NaN.
  7994. | The addition is performed according to the IEC/IEEE Standard for Binary
  7995. | Floating-Point Arithmetic.
  7996. *----------------------------------------------------------------------------*}
  7997. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7998. var
  7999. aExp, bExp, zExp: int32;
  8000. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8001. expDiff: int32;
  8002. label
  8003. shiftRight1,roundAndPack;
  8004. begin
  8005. aSig1 := extractFloat128Frac1( a );
  8006. aSig0 := extractFloat128Frac0( a );
  8007. aExp := extractFloat128Exp( a );
  8008. bSig1 := extractFloat128Frac1( b );
  8009. bSig0 := extractFloat128Frac0( b );
  8010. bExp := extractFloat128Exp( b );
  8011. expDiff := aExp - bExp;
  8012. if ( 0 < expDiff ) then begin
  8013. if ( aExp = $7FFF ) then begin
  8014. if ( aSig0 or aSig1 )<>0 then
  8015. begin
  8016. result := propagateFloat128NaN( a, b );
  8017. exit;
  8018. end;
  8019. result := a;
  8020. exit;
  8021. end;
  8022. if ( bExp = 0 ) then begin
  8023. dec(expDiff);
  8024. end
  8025. else begin
  8026. bSig0 := bSig0 or int64( $0001000000000000 );
  8027. end;
  8028. shift128ExtraRightJamming(
  8029. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8030. zExp := aExp;
  8031. end
  8032. else if ( expDiff < 0 ) then begin
  8033. if ( bExp = $7FFF ) then begin
  8034. if ( bSig0 or bSig1 )<>0 then
  8035. begin
  8036. result := propagateFloat128NaN( a, b );
  8037. exit;
  8038. end;
  8039. result := packFloat128( zSign, $7FFF, 0, 0 );
  8040. exit;
  8041. end;
  8042. if ( aExp = 0 ) then begin
  8043. inc(expDiff);
  8044. end
  8045. else begin
  8046. aSig0 := aSig0 or int64( $0001000000000000 );
  8047. end;
  8048. shift128ExtraRightJamming(
  8049. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8050. zExp := bExp;
  8051. end
  8052. else begin
  8053. if ( aExp = $7FFF ) then begin
  8054. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8055. result := propagateFloat128NaN( a, b );
  8056. exit;
  8057. end;
  8058. result := a;
  8059. exit;
  8060. end;
  8061. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8062. if ( aExp = 0 ) then
  8063. begin
  8064. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8065. exit;
  8066. end;
  8067. zSig2 := 0;
  8068. zSig0 := zSig0 or int64( $0002000000000000 );
  8069. zExp := aExp;
  8070. goto shiftRight1;
  8071. end;
  8072. aSig0 := aSig0 or int64( $0001000000000000 );
  8073. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8074. dec(zExp);
  8075. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8076. inc(zExp);
  8077. shiftRight1:
  8078. shift128ExtraRightJamming(
  8079. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8080. roundAndPack:
  8081. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8082. end;
  8083. {*----------------------------------------------------------------------------
  8084. | Returns the result of subtracting the absolute values of the quadruple-
  8085. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8086. | difference is negated before being returned. `zSign' is ignored if the
  8087. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8088. | Standard for Binary Floating-Point Arithmetic.
  8089. *----------------------------------------------------------------------------*}
  8090. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8091. var
  8092. aExp, bExp, zExp: int32;
  8093. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8094. expDiff: int32;
  8095. z: float128;
  8096. label
  8097. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8098. begin
  8099. aSig1 := extractFloat128Frac1( a );
  8100. aSig0 := extractFloat128Frac0( a );
  8101. aExp := extractFloat128Exp( a );
  8102. bSig1 := extractFloat128Frac1( b );
  8103. bSig0 := extractFloat128Frac0( b );
  8104. bExp := extractFloat128Exp( b );
  8105. expDiff := aExp - bExp;
  8106. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8107. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8108. if ( 0 < expDiff ) then goto aExpBigger;
  8109. if ( expDiff < 0 ) then goto bExpBigger;
  8110. if ( aExp = $7FFF ) then begin
  8111. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8112. result := propagateFloat128NaN( a, b );
  8113. exit;
  8114. end;
  8115. float_raise( float_flag_invalid );
  8116. z.low := float128_default_nan_low;
  8117. z.high := float128_default_nan_high;
  8118. result := z;
  8119. exit;
  8120. end;
  8121. if ( aExp = 0 ) then begin
  8122. aExp := 1;
  8123. bExp := 1;
  8124. end;
  8125. if ( bSig0 < aSig0 ) then goto aBigger;
  8126. if ( aSig0 < bSig0 ) then goto bBigger;
  8127. if ( bSig1 < aSig1 ) then goto aBigger;
  8128. if ( aSig1 < bSig1 ) then goto bBigger;
  8129. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8130. exit;
  8131. bExpBigger:
  8132. if ( bExp = $7FFF ) then begin
  8133. if ( bSig0 or bSig1 )<>0 then
  8134. begin
  8135. result := propagateFloat128NaN( a, b );
  8136. exit;
  8137. end;
  8138. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8139. exit;
  8140. end;
  8141. if ( aExp = 0 ) then begin
  8142. inc(expDiff);
  8143. end
  8144. else begin
  8145. aSig0 := aSig0 or int64( $4000000000000000 );
  8146. end;
  8147. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8148. bSig0 := bSig0 or int64( $4000000000000000 );
  8149. bBigger:
  8150. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8151. zExp := bExp;
  8152. zSign := zSign xor 1;
  8153. goto normalizeRoundAndPack;
  8154. aExpBigger:
  8155. if ( aExp = $7FFF ) then begin
  8156. if ( aSig0 or aSig1 )<>0 then
  8157. begin
  8158. result := propagateFloat128NaN( a, b );
  8159. exit;
  8160. end;
  8161. result := a;
  8162. exit;
  8163. end;
  8164. if ( bExp = 0 ) then begin
  8165. dec(expDiff);
  8166. end
  8167. else begin
  8168. bSig0 := bSig0 or int64( $4000000000000000 );
  8169. end;
  8170. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8171. aSig0 := aSig0 or int64( $4000000000000000 );
  8172. aBigger:
  8173. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8174. zExp := aExp;
  8175. normalizeRoundAndPack:
  8176. dec(zExp);
  8177. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8178. end;
  8179. {*----------------------------------------------------------------------------
  8180. | Returns the result of adding the quadruple-precision floating-point values
  8181. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8182. | for Binary Floating-Point Arithmetic.
  8183. *----------------------------------------------------------------------------*}
  8184. function float128_add(a: float128; b: float128): float128;
  8185. var
  8186. aSign, bSign: flag;
  8187. begin
  8188. aSign := extractFloat128Sign( a );
  8189. bSign := extractFloat128Sign( b );
  8190. if ( aSign = bSign ) then begin
  8191. result := addFloat128Sigs( a, b, aSign );
  8192. end
  8193. else begin
  8194. result := subFloat128Sigs( a, b, aSign );
  8195. end;
  8196. end;
  8197. {*----------------------------------------------------------------------------
  8198. | Returns the result of subtracting the quadruple-precision floating-point
  8199. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8200. | Standard for Binary Floating-Point Arithmetic.
  8201. *----------------------------------------------------------------------------*}
  8202. function float128_sub(a: float128; b: float128): float128;
  8203. var
  8204. aSign, bSign: flag;
  8205. begin
  8206. aSign := extractFloat128Sign( a );
  8207. bSign := extractFloat128Sign( b );
  8208. if ( aSign = bSign ) then begin
  8209. result := subFloat128Sigs( a, b, aSign );
  8210. end
  8211. else begin
  8212. result := addFloat128Sigs( a, b, aSign );
  8213. end;
  8214. end;
  8215. {*----------------------------------------------------------------------------
  8216. | Returns the result of multiplying the quadruple-precision floating-point
  8217. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8218. | Standard for Binary Floating-Point Arithmetic.
  8219. *----------------------------------------------------------------------------*}
  8220. function float128_mul(a: float128; b: float128): float128;
  8221. var
  8222. aSign, bSign, zSign: flag;
  8223. aExp, bExp, zExp: int32;
  8224. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8225. z: float128;
  8226. label
  8227. invalid;
  8228. begin
  8229. aSig1 := extractFloat128Frac1( a );
  8230. aSig0 := extractFloat128Frac0( a );
  8231. aExp := extractFloat128Exp( a );
  8232. aSign := extractFloat128Sign( a );
  8233. bSig1 := extractFloat128Frac1( b );
  8234. bSig0 := extractFloat128Frac0( b );
  8235. bExp := extractFloat128Exp( b );
  8236. bSign := extractFloat128Sign( b );
  8237. zSign := aSign xor bSign;
  8238. if ( aExp = $7FFF ) then begin
  8239. if ( (( aSig0 or aSig1 )<>0)
  8240. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8241. result := propagateFloat128NaN( a, b );
  8242. exit;
  8243. end;
  8244. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8245. result := packFloat128( zSign, $7FFF, 0, 0 );
  8246. exit;
  8247. end;
  8248. if ( bExp = $7FFF ) then begin
  8249. if ( bSig0 or bSig1 )<>0 then
  8250. begin
  8251. result := propagateFloat128NaN( a, b );
  8252. exit;
  8253. end;
  8254. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8255. invalid:
  8256. float_raise( float_flag_invalid );
  8257. z.low := float128_default_nan_low;
  8258. z.high := float128_default_nan_high;
  8259. result := z;
  8260. exit;
  8261. end;
  8262. result := packFloat128( zSign, $7FFF, 0, 0 );
  8263. exit;
  8264. end;
  8265. if ( aExp = 0 ) then begin
  8266. if ( ( aSig0 or aSig1 ) = 0 ) then
  8267. begin
  8268. result := packFloat128( zSign, 0, 0, 0 );
  8269. exit;
  8270. end;
  8271. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8272. end;
  8273. if ( bExp = 0 ) then begin
  8274. if ( ( bSig0 or bSig1 ) = 0 ) then
  8275. begin
  8276. result := packFloat128( zSign, 0, 0, 0 );
  8277. exit;
  8278. end;
  8279. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8280. end;
  8281. zExp := aExp + bExp - $4000;
  8282. aSig0 := aSig0 or int64( $0001000000000000 );
  8283. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8284. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8285. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8286. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8287. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8288. shift128ExtraRightJamming(
  8289. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8290. inc(zExp);
  8291. end;
  8292. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8293. end;
  8294. {*----------------------------------------------------------------------------
  8295. | Returns the result of dividing the quadruple-precision floating-point value
  8296. | `a' by the corresponding value `b'. The operation is performed according to
  8297. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8298. *----------------------------------------------------------------------------*}
  8299. function float128_div(a: float128; b: float128): float128;
  8300. var
  8301. aSign, bSign, zSign: flag;
  8302. aExp, bExp, zExp: int32;
  8303. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8304. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8305. z: float128;
  8306. label
  8307. invalid;
  8308. begin
  8309. aSig1 := extractFloat128Frac1( a );
  8310. aSig0 := extractFloat128Frac0( a );
  8311. aExp := extractFloat128Exp( a );
  8312. aSign := extractFloat128Sign( a );
  8313. bSig1 := extractFloat128Frac1( b );
  8314. bSig0 := extractFloat128Frac0( b );
  8315. bExp := extractFloat128Exp( b );
  8316. bSign := extractFloat128Sign( b );
  8317. zSign := aSign xor bSign;
  8318. if ( aExp = $7FFF ) then begin
  8319. if ( aSig0 or aSig1 )<>0 then
  8320. begin
  8321. result := propagateFloat128NaN( a, b );
  8322. exit;
  8323. end;
  8324. if ( bExp = $7FFF ) then begin
  8325. if ( bSig0 or bSig1 )<>0 then
  8326. begin
  8327. result := propagateFloat128NaN( a, b );
  8328. exit;
  8329. end;
  8330. goto invalid;
  8331. end;
  8332. result := packFloat128( zSign, $7FFF, 0, 0 );
  8333. exit;
  8334. end;
  8335. if ( bExp = $7FFF ) then begin
  8336. if ( bSig0 or bSig1 )<>0 then
  8337. begin
  8338. result := propagateFloat128NaN( a, b );
  8339. exit;
  8340. end;
  8341. result := packFloat128( zSign, 0, 0, 0 );
  8342. exit;
  8343. end;
  8344. if ( bExp = 0 ) then begin
  8345. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8346. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8347. invalid:
  8348. float_raise( float_flag_invalid );
  8349. z.low := float128_default_nan_low;
  8350. z.high := float128_default_nan_high;
  8351. result := z;
  8352. exit;
  8353. end;
  8354. float_raise( float_flag_divbyzero );
  8355. result := packFloat128( zSign, $7FFF, 0, 0 );
  8356. exit;
  8357. end;
  8358. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8359. end;
  8360. if ( aExp = 0 ) then begin
  8361. if ( ( aSig0 or aSig1 ) = 0 ) then
  8362. begin
  8363. result := packFloat128( zSign, 0, 0, 0 );
  8364. exit;
  8365. end;
  8366. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8367. end;
  8368. zExp := aExp - bExp + $3FFD;
  8369. shortShift128Left(
  8370. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8371. shortShift128Left(
  8372. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8373. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8374. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8375. inc(zExp);
  8376. end;
  8377. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8378. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8379. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8380. while ( sbits64(rem0) < 0 ) do begin
  8381. dec(zSig0);
  8382. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8383. end;
  8384. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8385. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8386. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8387. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8388. while ( sbits64(rem1) < 0 ) do begin
  8389. dec(zSig1);
  8390. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8391. end;
  8392. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8393. end;
  8394. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8395. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8396. end;
  8397. {*----------------------------------------------------------------------------
  8398. | Returns the remainder of the quadruple-precision floating-point value `a'
  8399. | with respect to the corresponding value `b'. The operation is performed
  8400. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8401. *----------------------------------------------------------------------------*}
  8402. function float128_rem(a: float128; b: float128): float128;
  8403. var
  8404. aSign, zSign: flag;
  8405. aExp, bExp, expDiff: int32;
  8406. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8407. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8408. sigMean0: sbits64;
  8409. z: float128;
  8410. label
  8411. invalid;
  8412. begin
  8413. aSig1 := extractFloat128Frac1( a );
  8414. aSig0 := extractFloat128Frac0( a );
  8415. aExp := extractFloat128Exp( a );
  8416. aSign := extractFloat128Sign( a );
  8417. bSig1 := extractFloat128Frac1( b );
  8418. bSig0 := extractFloat128Frac0( b );
  8419. bExp := extractFloat128Exp( b );
  8420. if ( aExp = $7FFF ) then begin
  8421. if ( (( aSig0 or aSig1 )<>0)
  8422. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8423. result := propagateFloat128NaN( a, b );
  8424. exit;
  8425. end;
  8426. goto invalid;
  8427. end;
  8428. if ( bExp = $7FFF ) then begin
  8429. if ( bSig0 or bSig1 )<>0 then
  8430. begin
  8431. result := propagateFloat128NaN( a, b );
  8432. exit;
  8433. end;
  8434. result := a;
  8435. exit;
  8436. end;
  8437. if ( bExp = 0 ) then begin
  8438. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8439. invalid:
  8440. float_raise( float_flag_invalid );
  8441. z.low := float128_default_nan_low;
  8442. z.high := float128_default_nan_high;
  8443. result := z;
  8444. exit;
  8445. end;
  8446. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8447. end;
  8448. if ( aExp = 0 ) then begin
  8449. if ( ( aSig0 or aSig1 ) = 0 ) then
  8450. begin
  8451. result := a;
  8452. exit;
  8453. end;
  8454. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8455. end;
  8456. expDiff := aExp - bExp;
  8457. if ( expDiff < -1 ) then
  8458. begin
  8459. result := a;
  8460. exit;
  8461. end;
  8462. shortShift128Left(
  8463. aSig0 or int64( $0001000000000000 ),
  8464. aSig1,
  8465. 15 - ord( expDiff < 0 ),
  8466. aSig0,
  8467. aSig1
  8468. );
  8469. shortShift128Left(
  8470. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8471. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8472. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8473. dec(expDiff,64);
  8474. while ( 0 < expDiff ) do begin
  8475. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8476. if ( 4 < q ) then
  8477. q := q - 4
  8478. else
  8479. q := 0;
  8480. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8481. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8482. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8483. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8484. dec(expDiff,61);
  8485. end;
  8486. if ( -64 < expDiff ) then begin
  8487. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8488. if ( 4 < q ) then
  8489. q := q - 4
  8490. else
  8491. q := 0;
  8492. q := q shr (- expDiff);
  8493. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8494. inc(expDiff,52);
  8495. if ( expDiff < 0 ) then begin
  8496. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8497. end
  8498. else begin
  8499. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8500. end;
  8501. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8502. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8503. end
  8504. else begin
  8505. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8506. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8507. end;
  8508. repeat
  8509. alternateASig0 := aSig0;
  8510. alternateASig1 := aSig1;
  8511. inc(q);
  8512. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8513. until not( 0 <= sbits64(aSig0) );
  8514. add128(
  8515. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8516. if ( ( sigMean0 < 0 )
  8517. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8518. aSig0 := alternateASig0;
  8519. aSig1 := alternateASig1;
  8520. end;
  8521. zSign := ord( sbits64(aSig0) < 0 );
  8522. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8523. result :=
  8524. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8525. end;
  8526. {*----------------------------------------------------------------------------
  8527. | Returns the square root of the quadruple-precision floating-point value `a'.
  8528. | The operation is performed according to the IEC/IEEE Standard for Binary
  8529. | Floating-Point Arithmetic.
  8530. *----------------------------------------------------------------------------*}
  8531. function float128_sqrt(a: float128): float128;
  8532. var
  8533. aSign: flag;
  8534. aExp, zExp: int32;
  8535. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8536. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8537. z: float128;
  8538. label
  8539. invalid;
  8540. begin
  8541. aSig1 := extractFloat128Frac1( a );
  8542. aSig0 := extractFloat128Frac0( a );
  8543. aExp := extractFloat128Exp( a );
  8544. aSign := extractFloat128Sign( a );
  8545. if ( aExp = $7FFF ) then begin
  8546. if ( aSig0 or aSig1 )<>0 then
  8547. begin
  8548. result := propagateFloat128NaN( a, a );
  8549. exit;
  8550. end;
  8551. if ( aSign=0 ) then
  8552. begin
  8553. result := a;
  8554. exit;
  8555. end;
  8556. goto invalid;
  8557. end;
  8558. if ( aSign<>0 ) then begin
  8559. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8560. begin
  8561. result := a;
  8562. exit;
  8563. end;
  8564. invalid:
  8565. float_raise( float_flag_invalid );
  8566. z.low := float128_default_nan_low;
  8567. z.high := float128_default_nan_high;
  8568. result := z;
  8569. exit;
  8570. end;
  8571. if ( aExp = 0 ) then begin
  8572. if ( ( aSig0 or aSig1 ) = 0 ) then
  8573. begin
  8574. result := packFloat128( 0, 0, 0, 0 );
  8575. exit;
  8576. end;
  8577. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8578. end;
  8579. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8580. aSig0 := aSig0 or int64( $0001000000000000 );
  8581. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8582. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8583. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8584. doubleZSig0 := zSig0 shl 1;
  8585. term1 := UMul64x64_128( zSig0, zSig0, term0);
  8586. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8587. while ( sbits64(rem0) < 0 ) do begin
  8588. dec(zSig0);
  8589. dec(doubleZSig0,2);
  8590. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8591. end;
  8592. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8593. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8594. if ( zSig1 = 0 ) then zSig1 := 1;
  8595. term2 := UMul64x64_128( doubleZSig0, zSig1, term1 );
  8596. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8597. term3 := UMul64x64_128( zSig1, zSig1, term2 );
  8598. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8599. while ( sbits64(rem1) < 0 ) do begin
  8600. dec(zSig1);
  8601. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8602. term3 := term3 or 1;
  8603. term2 := term2 or doubleZSig0;
  8604. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8605. end;
  8606. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8607. end;
  8608. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8609. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8610. end;
  8611. {*----------------------------------------------------------------------------
  8612. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8613. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8614. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8615. *----------------------------------------------------------------------------*}
  8616. function float128_eq(a: float128; b: float128): flag;
  8617. begin
  8618. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8619. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8620. or ( ( extractFloat128Exp( b ) = $7FFF )
  8621. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8622. ) then begin
  8623. if ( (float128_is_signaling_nan( a )<>0)
  8624. or (float128_is_signaling_nan( b )<>0) ) then begin
  8625. float_raise( float_flag_invalid );
  8626. end;
  8627. result := 0;
  8628. exit;
  8629. end;
  8630. result := ord(
  8631. ( a.low = b.low )
  8632. and ( ( a.high = b.high )
  8633. or ( ( a.low = 0 )
  8634. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8635. ));
  8636. end;
  8637. {*----------------------------------------------------------------------------
  8638. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8639. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8640. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8641. | Arithmetic.
  8642. *----------------------------------------------------------------------------*}
  8643. function float128_le(a: float128; b: float128): flag;
  8644. var
  8645. aSign, bSign: flag;
  8646. begin
  8647. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8648. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8649. or ( ( extractFloat128Exp( b ) = $7FFF )
  8650. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8651. ) then begin
  8652. float_raise( float_flag_invalid );
  8653. result := 0;
  8654. exit;
  8655. end;
  8656. aSign := extractFloat128Sign( a );
  8657. bSign := extractFloat128Sign( b );
  8658. if ( aSign <> bSign ) then begin
  8659. result := ord(
  8660. (aSign<>0)
  8661. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8662. = 0 ));
  8663. exit;
  8664. end;
  8665. if aSign<>0 then
  8666. result := le128( b.high, b.low, a.high, a.low )
  8667. else
  8668. result := le128( a.high, a.low, b.high, b.low );
  8669. end;
  8670. {*----------------------------------------------------------------------------
  8671. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8672. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8673. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8674. *----------------------------------------------------------------------------*}
  8675. function float128_lt(a: float128; b: float128): flag;
  8676. var
  8677. aSign, bSign: flag;
  8678. begin
  8679. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8680. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8681. or ( ( extractFloat128Exp( b ) = $7FFF )
  8682. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8683. ) then begin
  8684. float_raise( float_flag_invalid );
  8685. result := 0;
  8686. exit;
  8687. end;
  8688. aSign := extractFloat128Sign( a );
  8689. bSign := extractFloat128Sign( b );
  8690. if ( aSign <> bSign ) then begin
  8691. result := ord(
  8692. (aSign<>0)
  8693. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8694. <> 0 ));
  8695. exit;
  8696. end;
  8697. if aSign<>0 then
  8698. result := lt128( b.high, b.low, a.high, a.low )
  8699. else
  8700. result := lt128( a.high, a.low, b.high, b.low );
  8701. end;
  8702. {*----------------------------------------------------------------------------
  8703. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8704. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8705. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8706. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8707. *----------------------------------------------------------------------------*}
  8708. function float128_eq_signaling(a: float128; b: float128): flag;
  8709. begin
  8710. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8711. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8712. or ( ( extractFloat128Exp( b ) = $7FFF )
  8713. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8714. ) then begin
  8715. float_raise( float_flag_invalid );
  8716. result := 0;
  8717. exit;
  8718. end;
  8719. result := ord(
  8720. ( a.low = b.low )
  8721. and ( ( a.high = b.high )
  8722. or ( ( a.low = 0 )
  8723. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8724. ));
  8725. end;
  8726. {*----------------------------------------------------------------------------
  8727. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8728. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8729. | cause an exception. Otherwise, the comparison is performed according to the
  8730. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8731. *----------------------------------------------------------------------------*}
  8732. function float128_le_quiet(a: float128; b: float128): flag;
  8733. var
  8734. aSign, bSign: flag;
  8735. begin
  8736. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8737. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8738. or ( ( extractFloat128Exp( b ) = $7FFF )
  8739. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8740. ) then begin
  8741. if ( (float128_is_signaling_nan( a )<>0)
  8742. or (float128_is_signaling_nan( b )<>0) ) then begin
  8743. float_raise( float_flag_invalid );
  8744. end;
  8745. result := 0;
  8746. exit;
  8747. end;
  8748. aSign := extractFloat128Sign( a );
  8749. bSign := extractFloat128Sign( b );
  8750. if ( aSign <> bSign ) then begin
  8751. result := ord(
  8752. (aSign<>0)
  8753. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8754. = 0 ));
  8755. exit;
  8756. end;
  8757. if aSign<>0 then
  8758. result := le128( b.high, b.low, a.high, a.low )
  8759. else
  8760. result := le128( a.high, a.low, b.high, b.low );
  8761. end;
  8762. {*----------------------------------------------------------------------------
  8763. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8764. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8765. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8766. | Standard for Binary Floating-Point Arithmetic.
  8767. *----------------------------------------------------------------------------*}
  8768. function float128_lt_quiet(a: float128; b: float128): flag;
  8769. var
  8770. aSign, bSign: flag;
  8771. begin
  8772. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8773. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8774. or ( ( extractFloat128Exp( b ) = $7FFF )
  8775. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8776. ) then begin
  8777. if ( (float128_is_signaling_nan( a )<>0)
  8778. or (float128_is_signaling_nan( b )<>0) ) then begin
  8779. float_raise( float_flag_invalid );
  8780. end;
  8781. result := 0;
  8782. exit;
  8783. end;
  8784. aSign := extractFloat128Sign( a );
  8785. bSign := extractFloat128Sign( b );
  8786. if ( aSign <> bSign ) then begin
  8787. result := ord(
  8788. (aSign<>0)
  8789. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8790. <> 0 ));
  8791. exit;
  8792. end;
  8793. if aSign<>0 then
  8794. result:=lt128( b.high, b.low, a.high, a.low )
  8795. else
  8796. result:=lt128( a.high, a.low, b.high, b.low );
  8797. end;
  8798. {----------------------------------------------------------------------------
  8799. | Returns the result of converting the double-precision floating-point value
  8800. | `a' to the quadruple-precision floating-point format. The conversion is
  8801. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8802. | Arithmetic.
  8803. *----------------------------------------------------------------------------}
  8804. function float64_to_float128( a : float64) : float128;
  8805. var
  8806. aSign : flag;
  8807. aExp : int16;
  8808. aSig, zSig0, zSig1 : bits64;
  8809. begin
  8810. aSig := extractFloat64Frac( a );
  8811. aExp := extractFloat64Exp( a );
  8812. aSign := extractFloat64Sign( a );
  8813. if ( aExp = $7FF ) then begin
  8814. if ( aSig<>0 ) then begin
  8815. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8816. exit;
  8817. end;
  8818. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8819. exit;
  8820. end;
  8821. if ( aExp = 0 ) then begin
  8822. if ( aSig = 0 ) then
  8823. begin
  8824. result:=packFloat128( aSign, 0, 0, 0 );
  8825. exit;
  8826. end;
  8827. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8828. dec(aExp);
  8829. end;
  8830. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8831. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8832. end;
  8833. {$endif FPC_SOFTFLOAT_FLOAT128}
  8834. {$endif not(defined(fpc_softfpu_interface))}
  8835. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8836. end.
  8837. {$ifdef FPC}
  8838. { restore context modified at implmentation start
  8839. to possibly re-enable range and overflow checking explicitly}
  8840. {$pop}
  8841. {$endif FPC}
  8842. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}