softfpu.pp 321 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959596059615962596359645965596659675968596959705971597259735974597559765977597859795980598159825983598459855986598759885989599059915992599359945995599659975998599960006001600260036004600560066007600860096010601160126013601460156016601760186019602060216022602360246025602660276028602960306031603260336034603560366037603860396040604160426043604460456046604760486049605060516052605360546055605660576058605960606061606260636064606560666067606860696070607160726073607460756076607760786079608060816082608360846085608660876088608960906091609260936094609560966097609860996100610161026103610461056106610761086109611061116112611361146115611661176118611961206121612261236124612561266127612861296130613161326133613461356136613761386139614061416142614361446145614661476148614961506151615261536154615561566157615861596160616161626163616461656166616761686169617061716172617361746175617661776178617961806181618261836184618561866187618861896190619161926193619461956196619761986199620062016202620362046205620662076208620962106211621262136214621562166217621862196220622162226223622462256226622762286229623062316232623362346235623662376238623962406241624262436244624562466247624862496250625162526253625462556256625762586259626062616262626362646265626662676268626962706271627262736274627562766277627862796280628162826283628462856286628762886289629062916292629362946295629662976298629963006301630263036304630563066307630863096310631163126313631463156316631763186319632063216322632363246325632663276328632963306331633263336334633563366337633863396340634163426343634463456346634763486349635063516352635363546355635663576358635963606361636263636364636563666367636863696370637163726373637463756376637763786379638063816382638363846385638663876388638963906391639263936394639563966397639863996400640164026403640464056406640764086409641064116412641364146415641664176418641964206421642264236424642564266427642864296430643164326433643464356436643764386439644064416442644364446445644664476448644964506451645264536454645564566457645864596460646164626463646464656466646764686469647064716472647364746475647664776478647964806481648264836484648564866487648864896490649164926493649464956496649764986499650065016502650365046505650665076508650965106511651265136514651565166517651865196520652165226523652465256526652765286529653065316532653365346535653665376538653965406541654265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676667766786679668066816682668366846685668666876688668966906691669266936694669566966697669866996700670167026703670467056706670767086709671067116712671367146715671667176718671967206721672267236724672567266727672867296730673167326733673467356736673767386739674067416742674367446745674667476748674967506751675267536754675567566757675867596760676167626763676467656766676767686769677067716772677367746775677667776778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727772787279728072817282728372847285728672877288728972907291729272937294729572967297729872997300730173027303730473057306730773087309731073117312731373147315731673177318731973207321732273237324732573267327732873297330733173327333733473357336733773387339734073417342734373447345734673477348734973507351735273537354735573567357735873597360736173627363736473657366736773687369737073717372737373747375737673777378737973807381738273837384738573867387738873897390739173927393739473957396739773987399740074017402740374047405740674077408740974107411741274137414741574167417741874197420742174227423742474257426742774287429743074317432743374347435743674377438743974407441744274437444744574467447744874497450745174527453745474557456745774587459746074617462746374647465746674677468746974707471747274737474747574767477747874797480748174827483748474857486748774887489749074917492749374947495749674977498749975007501750275037504750575067507750875097510751175127513751475157516751775187519752075217522752375247525752675277528752975307531753275337534753575367537753875397540754175427543754475457546754775487549755075517552755375547555755675577558755975607561756275637564756575667567756875697570757175727573757475757576757775787579758075817582758375847585758675877588758975907591759275937594759575967597759875997600760176027603760476057606760776087609761076117612761376147615761676177618761976207621762276237624762576267627762876297630763176327633763476357636763776387639764076417642764376447645764676477648764976507651765276537654765576567657765876597660766176627663766476657666766776687669767076717672767376747675767676777678767976807681768276837684768576867687768876897690769176927693769476957696769776987699770077017702770377047705770677077708770977107711771277137714771577167717771877197720772177227723772477257726772777287729773077317732773377347735773677377738773977407741774277437744774577467747774877497750775177527753775477557756775777587759776077617762776377647765776677677768776977707771777277737774777577767777777877797780778177827783778477857786778777887789779077917792779377947795779677977798779978007801780278037804780578067807780878097810781178127813781478157816781778187819782078217822782378247825782678277828782978307831783278337834783578367837783878397840784178427843784478457846784778487849785078517852785378547855785678577858785978607861786278637864786578667867786878697870787178727873787478757876787778787879788078817882788378847885788678877888788978907891789278937894789578967897789878997900790179027903790479057906790779087909791079117912791379147915791679177918791979207921792279237924792579267927792879297930793179327933793479357936793779387939794079417942794379447945794679477948794979507951795279537954795579567957795879597960796179627963796479657966796779687969797079717972797379747975797679777978797979807981798279837984798579867987798879897990799179927993799479957996799779987999800080018002800380048005800680078008800980108011801280138014801580168017801880198020802180228023802480258026802780288029803080318032803380348035803680378038803980408041804280438044804580468047804880498050805180528053805480558056805780588059806080618062806380648065806680678068806980708071807280738074807580768077807880798080808180828083808480858086808780888089809080918092809380948095809680978098809981008101810281038104810581068107810881098110811181128113811481158116811781188119812081218122812381248125812681278128812981308131813281338134813581368137813881398140814181428143814481458146814781488149815081518152815381548155815681578158815981608161816281638164816581668167816881698170817181728173817481758176817781788179818081818182818381848185818681878188818981908191819281938194819581968197819881998200820182028203820482058206820782088209821082118212821382148215821682178218821982208221822282238224822582268227822882298230823182328233823482358236823782388239824082418242824382448245824682478248824982508251825282538254825582568257825882598260826182628263826482658266826782688269827082718272827382748275827682778278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877787788779878087818782878387848785878687878788878987908791879287938794879587968797879887998800880188028803880488058806880788088809881088118812881388148815881688178818881988208821882288238824882588268827882888298830883188328833883488358836883788388839884088418842884388448845884688478848884988508851885288538854885588568857885888598860886188628863886488658866886788688869887088718872887388748875887688778878887988808881888288838884888588868887888888898890889188928893889488958896889788988899890089018902890389048905890689078908890989108911891289138914891589168917891889198920892189228923892489258926892789288929893089318932893389348935893689378938893989408941894289438944894589468947894889498950895189528953895489558956895789588959896089618962896389648965896689678968896989708971897289738974897589768977897889798980898189828983898489858986898789888989899089918992899389948995899689978998899990009001900290039004900590069007900890099010901190129013901490159016901790189019902090219022902390249025902690279028902990309031903290339034903590369037903890399040904190429043904490459046904790489049905090519052905390549055905690579058905990609061906290639064906590669067906890699070907190729073907490759076907790789079908090819082908390849085908690879088908990909091909290939094909590969097909890999100910191029103910491059106910791089109911091119112911391149115911691179118911991209121912291239124912591269127912891299130913191329133913491359136913791389139914091419142914391449145914691479148914991509151915291539154915591569157915891599160916191629163916491659166916791689169917091719172917391749175917691779178917991809181918291839184918591869187918891899190919191929193919491959196919791989199920092019202920392049205920692079208920992109211921292139214921592169217921892199220922192229223922492259226922792289229923092319232923392349235923692379238923992409241924292439244924592469247924892499250925192529253925492559256925792589259926092619262926392649265926692679268926992709271927292739274927592769277927892799280928192829283928492859286928792889289929092919292929392949295929692979298929993009301930293039304930593069307930893099310931193129313
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. float64 = record
  103. case byte of
  104. 1: (low,high : bits32);
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 2: (dummy : double);
  109. end;
  110. int64rec = record
  111. case byte of
  112. 1: (low,high : bits32);
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 2: (dummy : int64);
  117. end;
  118. floatx80 = record
  119. case byte of
  120. 1: (low : qword;high : word);
  121. // force the record to be aligned like a double
  122. // else *_to_double will fail for cpus like sparc
  123. // and avoid expensive unpacking/packing operations
  124. 2: (dummy : extended);
  125. end;
  126. float128 = record
  127. case byte of
  128. 1: (low,high : qword);
  129. // force the record to be aligned like a double
  130. // else *_to_double will fail for cpus like sparc
  131. // and avoid expensive unpacking/packing operations
  132. 2: (dummy : qword);
  133. end;
  134. {$else}
  135. float64 = record
  136. case byte of
  137. 1: (high,low : bits32);
  138. // force the record to be aligned like a double
  139. // else *_to_double will fail for cpus like sparc
  140. 2: (dummy : double);
  141. end;
  142. int64rec = record
  143. case byte of
  144. 1: (high,low : bits32);
  145. // force the record to be aligned like a double
  146. // else *_to_double will fail for cpus like sparc
  147. // and avoid expensive unpacking/packing operations
  148. 2: (dummy : int64);
  149. end;
  150. floatx80 = record
  151. case byte of
  152. 1: (high : word;low : qword);
  153. // force the record to be aligned like a double
  154. // else *_to_double will fail for cpus like sparc
  155. // and avoid expensive unpacking/packing operations
  156. 2: (dummy : qword);
  157. end;
  158. float128 = record
  159. case byte of
  160. 1: (high : qword;low : qword);
  161. // force the record to be aligned like a double
  162. // else *_to_double will fail for cpus like sparc
  163. // and avoid expensive unpacking/packing operations
  164. 2: (dummy : qword);
  165. end;
  166. {$endif}
  167. {$define FPC_SYSTEM_HAS_float64}
  168. {*
  169. -------------------------------------------------------------------------------
  170. Returns 1 if the double-precision floating-point value `a' is less than
  171. the corresponding value `b', and 0 otherwise. The comparison is performed
  172. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  173. -------------------------------------------------------------------------------
  174. *}
  175. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  176. {*
  177. -------------------------------------------------------------------------------
  178. Returns 1 if the double-precision floating-point value `a' is less than
  179. or equal to the corresponding value `b', and 0 otherwise. The comparison
  180. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  181. Arithmetic.
  182. -------------------------------------------------------------------------------
  183. *}
  184. Function float64_le(a: float64;b: float64): flag; compilerproc;
  185. {*
  186. -------------------------------------------------------------------------------
  187. Returns 1 if the double-precision floating-point value `a' is equal to
  188. the corresponding value `b', and 0 otherwise. The comparison is performed
  189. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  190. -------------------------------------------------------------------------------
  191. *}
  192. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  193. {*
  194. -------------------------------------------------------------------------------
  195. Returns the square root of the double-precision floating-point value `a'.
  196. The operation is performed according to the IEC/IEEE Standard for Binary
  197. Floating-Point Arithmetic.
  198. -------------------------------------------------------------------------------
  199. *}
  200. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  201. {*
  202. -------------------------------------------------------------------------------
  203. Returns the remainder of the double-precision floating-point value `a'
  204. with respect to the corresponding value `b'. The operation is performed
  205. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  206. -------------------------------------------------------------------------------
  207. *}
  208. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  209. {*
  210. -------------------------------------------------------------------------------
  211. Returns the result of dividing the double-precision floating-point value `a'
  212. by the corresponding value `b'. The operation is performed according to the
  213. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of multiplying the double-precision floating-point values
  220. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  221. for Binary Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Returns the result of subtracting the double-precision floating-point values
  228. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  229. for Binary Floating-Point Arithmetic.
  230. -------------------------------------------------------------------------------
  231. *}
  232. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  233. {*
  234. -------------------------------------------------------------------------------
  235. Returns the result of adding the double-precision floating-point values `a'
  236. and `b'. The operation is performed according to the IEC/IEEE Standard for
  237. Binary Floating-Point Arithmetic.
  238. -------------------------------------------------------------------------------
  239. *}
  240. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  241. {*
  242. -------------------------------------------------------------------------------
  243. Rounds the double-precision floating-point value `a' to an integer,
  244. and returns the result as a double-precision floating-point value. The
  245. operation is performed according to the IEC/IEEE Standard for Binary
  246. Floating-Point Arithmetic.
  247. -------------------------------------------------------------------------------
  248. *}
  249. Function float64_round_to_int(a: float64) : float64; compilerproc;
  250. {*
  251. -------------------------------------------------------------------------------
  252. Returns the result of converting the double-precision floating-point value
  253. `a' to the single-precision floating-point format. The conversion is
  254. performed according to the IEC/IEEE Standard for Binary Floating-Point
  255. Arithmetic.
  256. -------------------------------------------------------------------------------
  257. *}
  258. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  259. {*
  260. -------------------------------------------------------------------------------
  261. Returns the result of converting the double-precision floating-point value
  262. `a' to the 32-bit two's complement integer format. The conversion is
  263. performed according to the IEC/IEEE Standard for Binary Floating-Point
  264. Arithmetic, except that the conversion is always rounded toward zero.
  265. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  266. the conversion overflows, the largest integer with the same sign as `a' is
  267. returned.
  268. -------------------------------------------------------------------------------
  269. *}
  270. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  271. {*
  272. -------------------------------------------------------------------------------
  273. Returns the result of converting the double-precision floating-point value
  274. `a' to the 32-bit two's complement integer format. The conversion is
  275. performed according to the IEC/IEEE Standard for Binary Floating-Point
  276. Arithmetic---which means in particular that the conversion is rounded
  277. according to the current rounding mode. If `a' is a NaN, the largest
  278. positive integer is returned. Otherwise, if the conversion overflows, the
  279. largest integer with the same sign as `a' is returned.
  280. -------------------------------------------------------------------------------
  281. *}
  282. Function float64_to_int32(a: float64): int32; compilerproc;
  283. {*
  284. -------------------------------------------------------------------------------
  285. Returns 1 if the single-precision floating-point value `a' is less than
  286. the corresponding value `b', and 0 otherwise. The comparison is performed
  287. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  288. -------------------------------------------------------------------------------
  289. *}
  290. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  291. {*
  292. -------------------------------------------------------------------------------
  293. Returns 1 if the single-precision floating-point value `a' is less than
  294. or equal to the corresponding value `b', and 0 otherwise. The comparison
  295. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  296. Arithmetic.
  297. -------------------------------------------------------------------------------
  298. *}
  299. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  300. {*
  301. -------------------------------------------------------------------------------
  302. Returns 1 if the single-precision floating-point value `a' is equal to
  303. the corresponding value `b', and 0 otherwise. The comparison is performed
  304. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  305. -------------------------------------------------------------------------------
  306. *}
  307. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  308. {*
  309. -------------------------------------------------------------------------------
  310. Returns the square root of the single-precision floating-point value `a'.
  311. The operation is performed according to the IEC/IEEE Standard for Binary
  312. Floating-Point Arithmetic.
  313. -------------------------------------------------------------------------------
  314. *}
  315. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  316. {*
  317. -------------------------------------------------------------------------------
  318. Returns the remainder of the single-precision floating-point value `a'
  319. with respect to the corresponding value `b'. The operation is performed
  320. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  321. -------------------------------------------------------------------------------
  322. *}
  323. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  324. {*
  325. -------------------------------------------------------------------------------
  326. Returns the result of dividing the single-precision floating-point value `a'
  327. by the corresponding value `b'. The operation is performed according to the
  328. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of multiplying the single-precision floating-point values
  335. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  336. for Binary Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Returns the result of subtracting the single-precision floating-point values
  343. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  344. for Binary Floating-Point Arithmetic.
  345. -------------------------------------------------------------------------------
  346. *}
  347. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  348. {*
  349. -------------------------------------------------------------------------------
  350. Returns the result of adding the single-precision floating-point values `a'
  351. and `b'. The operation is performed according to the IEC/IEEE Standard for
  352. Binary Floating-Point Arithmetic.
  353. -------------------------------------------------------------------------------
  354. *}
  355. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  356. {*
  357. -------------------------------------------------------------------------------
  358. Rounds the single-precision floating-point value `a' to an integer,
  359. and returns the result as a single-precision floating-point value. The
  360. operation is performed according to the IEC/IEEE Standard for Binary
  361. Floating-Point Arithmetic.
  362. -------------------------------------------------------------------------------
  363. *}
  364. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  365. {*
  366. -------------------------------------------------------------------------------
  367. Returns the result of converting the single-precision floating-point value
  368. `a' to the double-precision floating-point format. The conversion is
  369. performed according to the IEC/IEEE Standard for Binary Floating-Point
  370. Arithmetic.
  371. -------------------------------------------------------------------------------
  372. *}
  373. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  374. {*
  375. -------------------------------------------------------------------------------
  376. Returns the result of converting the single-precision floating-point value
  377. `a' to the 32-bit two's complement integer format. The conversion is
  378. performed according to the IEC/IEEE Standard for Binary Floating-Point
  379. Arithmetic, except that the conversion is always rounded toward zero.
  380. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  381. the conversion overflows, the largest integer with the same sign as `a' is
  382. returned.
  383. -------------------------------------------------------------------------------
  384. *}
  385. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  386. {*
  387. -------------------------------------------------------------------------------
  388. Returns the result of converting the single-precision floating-point value
  389. `a' to the 32-bit two's complement integer format. The conversion is
  390. performed according to the IEC/IEEE Standard for Binary Floating-Point
  391. Arithmetic---which means in particular that the conversion is rounded
  392. according to the current rounding mode. If `a' is a NaN, the largest
  393. positive integer is returned. Otherwise, if the conversion overflows, the
  394. largest integer with the same sign as `a' is returned.
  395. -------------------------------------------------------------------------------
  396. *}
  397. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  398. {*
  399. -------------------------------------------------------------------------------
  400. Returns the result of converting the 32-bit two's complement integer `a' to
  401. the double-precision floating-point format. The conversion is performed
  402. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  403. -------------------------------------------------------------------------------
  404. *}
  405. Function int32_to_float64( a: int32) : float64; compilerproc;
  406. {*
  407. -------------------------------------------------------------------------------
  408. Returns the result of converting the 32-bit two's complement integer `a' to
  409. the single-precision floating-point format. The conversion is performed
  410. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  411. -------------------------------------------------------------------------------
  412. *}
  413. Function int32_to_float32( a: int32): float32rec; compilerproc;
  414. {*----------------------------------------------------------------------------
  415. | Returns the result of converting the 64-bit two's complement integer `a'
  416. | to the double-precision floating-point format. The conversion is performed
  417. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  418. *----------------------------------------------------------------------------*}
  419. Function int64_to_float64( a: int64 ): float64; compilerproc;
  420. Function qword_to_float64( a: qword ): float64; compilerproc;
  421. {*----------------------------------------------------------------------------
  422. | Returns the result of converting the 64-bit two's complement integer `a'
  423. | to the single-precision floating-point format. The conversion is performed
  424. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  425. *----------------------------------------------------------------------------*}
  426. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  427. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  428. // +++
  429. function float32_to_int64( a: float32 ): int64;
  430. function float32_to_int64_round_to_zero( a: float32 ): int64;
  431. function float32_eq_signaling( a: float32; b: float32) : flag;
  432. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  433. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  434. function float32_is_signaling_nan( a : float32 ): flag;
  435. function float32_is_nan( a : float32 ): flag;
  436. function float64_to_int64( a: float64 ): int64;
  437. function float64_to_int64_round_to_zero( a: float64 ): int64;
  438. function float64_eq_signaling( a: float64; b: float64): flag;
  439. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  440. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  441. function float64_is_signaling_nan( a : float64 ): flag;
  442. function float64_is_nan( a : float64 ): flag;
  443. // ===
  444. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  445. {*----------------------------------------------------------------------------
  446. | Extended double-precision rounding precision
  447. *----------------------------------------------------------------------------*}
  448. var // threadvar!?
  449. floatx80_rounding_precision : int8 = 80;
  450. function int32_to_floatx80( a: int32 ): floatx80;
  451. function int64_to_floatx80( a: int64 ): floatx80;
  452. function qword_to_floatx80( a: qword ): floatx80;
  453. function float32_to_floatx80( a: float32 ): floatx80;
  454. function float64_to_floatx80( a: float64 ): floatx80;
  455. function floatx80_to_int32( a: floatx80 ): int32;
  456. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  457. function floatx80_to_int64( a: floatx80 ): int64;
  458. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  459. function floatx80_to_float32( a: floatx80 ): float32;
  460. function floatx80_to_float64( a: floatx80 ): float64;
  461. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  462. function floatx80_to_float128( a: floatx80 ): float128;
  463. {$endif FPC_SOFTFLOAT_FLOAT128}
  464. function floatx80_round_to_int( a: floatx80 ): floatx80;
  465. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  466. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  467. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  468. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  469. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  470. function floatx80_sqrt( a: floatx80 ): floatx80;
  471. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  472. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  473. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  474. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  475. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  476. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  477. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  478. function floatx80_is_nan(a : floatx80 ): flag;
  479. {$endif FPC_SOFTFLOAT_FLOATX80}
  480. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  481. function int32_to_float128( a: int32 ): float128;
  482. function int64_to_float128( a: int64 ): float128;
  483. function qword_to_float128( a: qword ): float128;
  484. function float32_to_float128( a: float32 ): float128;
  485. function float128_is_nan( a : float128): flag;
  486. function float128_is_signaling_nan( a : float128): flag;
  487. function float128_to_int32(a: float128): int32;
  488. function float128_to_int32_round_to_zero(a: float128): int32;
  489. function float128_to_int64(a: float128): int64;
  490. function float128_to_int64_round_to_zero(a: float128): int64;
  491. function float128_to_float32(a: float128): float32;
  492. function float128_to_float64(a: float128): float64;
  493. function float64_to_float128( a : float64) : float128;
  494. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  495. function float128_to_floatx80(a: float128): floatx80;
  496. {$endif FPC_SOFTFLOAT_FLOATX80}
  497. function float128_round_to_int(a: float128): float128;
  498. function float128_add(a: float128; b: float128): float128;
  499. function float128_sub(a: float128; b: float128): float128;
  500. function float128_mul(a: float128; b: float128): float128;
  501. function float128_div(a: float128; b: float128): float128;
  502. function float128_rem(a: float128; b: float128): float128;
  503. function float128_sqrt(a: float128): float128;
  504. function float128_eq(a: float128; b: float128): flag;
  505. function float128_le(a: float128; b: float128): flag;
  506. function float128_lt(a: float128; b: float128): flag;
  507. function float128_eq_signaling(a: float128; b: float128): flag;
  508. function float128_le_quiet(a: float128; b: float128): flag;
  509. function float128_lt_quiet(a: float128; b: float128): flag;
  510. {$endif FPC_SOFTFLOAT_FLOAT128}
  511. CONST
  512. {-------------------------------------------------------------------------------
  513. Software IEC/IEEE floating-point underflow tininess-detection mode.
  514. -------------------------------------------------------------------------------
  515. *}
  516. float_tininess_after_rounding = 0;
  517. float_tininess_before_rounding = 1;
  518. {*
  519. -------------------------------------------------------------------------------
  520. Underflow tininess-detection mode, statically initialized to default value.
  521. (The declaration in `softfloat.h' must match the `int8' type here.)
  522. -------------------------------------------------------------------------------
  523. *}
  524. var // threadvar!?
  525. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  526. {$endif not(defined(fpc_softfpu_implementation))}
  527. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  528. implementation
  529. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  530. {$if not(defined(fpc_softfpu_interface))}
  531. (*****************************************************************************)
  532. (*----------------------------------------------------------------------------*)
  533. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  534. (* division and square root approximations. (Can be specialized to target if *)
  535. (* desired.) *)
  536. (* ---------------------------------------------------------------------------*)
  537. (*****************************************************************************)
  538. { This procedure serves as a single access point to softfloat_exception_flags.
  539. It also helps to reduce code size a bit because softfloat_exception_flags is
  540. a threadvar. }
  541. procedure set_inexact_flag;
  542. begin
  543. include(softfloat_exception_flags,float_flag_inexact);
  544. end;
  545. {*----------------------------------------------------------------------------
  546. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  547. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  548. | input. If `zSign' is 1, the input is negated before being converted to an
  549. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  550. | is simply rounded to an integer, with the inexact exception raised if the
  551. | input cannot be represented exactly as an integer. However, if the fixed-
  552. | point input is too large, the invalid exception is raised and the largest
  553. | positive or negative integer is returned.
  554. *----------------------------------------------------------------------------*}
  555. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  556. var
  557. roundingMode: TFPURoundingMode;
  558. roundNearestEven: flag;
  559. roundIncrement, roundBits: int8;
  560. z: int32;
  561. begin
  562. roundingMode := softfloat_rounding_mode;
  563. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  564. roundIncrement := $40;
  565. if ( roundNearestEven=0 ) then
  566. begin
  567. if ( roundingMode = float_round_to_zero ) then
  568. begin
  569. roundIncrement := 0;
  570. end
  571. else begin
  572. roundIncrement := $7F;
  573. if ( zSign<>0 ) then
  574. begin
  575. if ( roundingMode = float_round_up ) then
  576. roundIncrement := 0;
  577. end
  578. else begin
  579. if ( roundingMode = float_round_down ) then
  580. roundIncrement := 0;
  581. end;
  582. end;
  583. end;
  584. roundBits := absZ and $7F;
  585. absZ := ( absZ + roundIncrement ) shr 7;
  586. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  587. z := absZ;
  588. if ( zSign<>0 ) then
  589. z := - z;
  590. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  591. begin
  592. float_raise( float_flag_invalid );
  593. if zSign<>0 then
  594. result:=sbits32($80000000)
  595. else
  596. result:=$7FFFFFFF;
  597. exit;
  598. end;
  599. if ( roundBits<>0 ) then
  600. set_inexact_flag;
  601. result:=z;
  602. end;
  603. {*----------------------------------------------------------------------------
  604. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  605. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  606. | and returns the properly rounded 64-bit integer corresponding to the input.
  607. | If `zSign' is 1, the input is negated before being converted to an integer.
  608. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  609. | the inexact exception raised if the input cannot be represented exactly as
  610. | an integer. However, if the fixed-point input is too large, the invalid
  611. | exception is raised and the largest positive or negative integer is
  612. | returned.
  613. *----------------------------------------------------------------------------*}
  614. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  615. var
  616. roundingMode: TFPURoundingMode;
  617. roundNearestEven, increment: flag;
  618. z: int64;
  619. label
  620. overflow;
  621. begin
  622. roundingMode := softfloat_rounding_mode;
  623. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  624. increment := ord( sbits64(absZ1) < 0 );
  625. if ( roundNearestEven=0 ) then
  626. begin
  627. if ( roundingMode = float_round_to_zero ) then
  628. begin
  629. increment := 0;
  630. end
  631. else begin
  632. if ( zSign<>0 ) then
  633. begin
  634. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  635. end
  636. else begin
  637. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  638. end;
  639. end;
  640. end;
  641. if ( increment<>0 ) then
  642. begin
  643. inc(absZ0);
  644. if ( absZ0 = 0 ) then
  645. goto overflow;
  646. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  647. end;
  648. z := absZ0;
  649. if ( zSign<>0 ) then
  650. z := - z;
  651. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  652. begin
  653. overflow:
  654. float_raise( float_flag_invalid );
  655. if zSign<>0 then
  656. result:=int64($8000000000000000)
  657. else
  658. result:=int64($7FFFFFFFFFFFFFFF);
  659. exit;
  660. end;
  661. if ( absZ1<>0 ) then
  662. set_inexact_flag;
  663. result:=z;
  664. end;
  665. {*
  666. -------------------------------------------------------------------------------
  667. Shifts `a' right by the number of bits given in `count'. If any nonzero
  668. bits are shifted off, they are ``jammed'' into the least significant bit of
  669. the result by setting the least significant bit to 1. The value of `count'
  670. can be arbitrarily large; in particular, if `count' is greater than 32, the
  671. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  672. The result is stored in the location pointed to by `zPtr'.
  673. -------------------------------------------------------------------------------
  674. *}
  675. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  676. var
  677. z: Bits32;
  678. Begin
  679. if ( count = 0 ) then
  680. z := a
  681. else
  682. if ( count < 32 ) then
  683. Begin
  684. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  685. End
  686. else
  687. Begin
  688. z := bits32( a <> 0 );
  689. End;
  690. zPtr := z;
  691. End;
  692. {*----------------------------------------------------------------------------
  693. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  694. | number of bits given in `count'. Any bits shifted off are lost. The value
  695. | of `count' can be arbitrarily large; in particular, if `count' is greater
  696. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  697. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  698. *----------------------------------------------------------------------------*}
  699. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  700. var
  701. z0, z1: bits64;
  702. negCount: int8;
  703. begin
  704. negCount := ( - count ) and 63;
  705. if ( count = 0 ) then
  706. begin
  707. z1 := a1;
  708. z0 := a0;
  709. end
  710. else if ( count < 64 ) then
  711. begin
  712. z1 := ( a0 shl negCount ) or ( a1 shr count );
  713. z0 := a0 shr count;
  714. end
  715. else
  716. begin
  717. if ( count < 128 ) then
  718. z1 := a0 shr ( count and 63 )
  719. else
  720. z1 := 0;
  721. z0 := 0;
  722. end;
  723. z1Ptr := z1;
  724. z0Ptr := z0;
  725. end;
  726. {*----------------------------------------------------------------------------
  727. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  728. | number of bits given in `count'. If any nonzero bits are shifted off, they
  729. | are ``jammed'' into the least significant bit of the result by setting the
  730. | least significant bit to 1. The value of `count' can be arbitrarily large;
  731. | in particular, if `count' is greater than 128, the result will be either
  732. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  733. | nonzero. The result is broken into two 64-bit pieces which are stored at
  734. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  735. *----------------------------------------------------------------------------*}
  736. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  737. var
  738. z0,z1 : bits64;
  739. negCount : int8;
  740. begin
  741. negCount := ( - count ) and 63;
  742. if ( count = 0 ) then begin
  743. z1 := a1;
  744. z0 := a0;
  745. end
  746. else if ( count < 64 ) then begin
  747. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  748. z0 := a0>>count;
  749. end
  750. else begin
  751. if ( count = 64 ) then begin
  752. z1 := a0 or ord( a1 <> 0 );
  753. end
  754. else if ( count < 128 ) then begin
  755. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  756. end
  757. else begin
  758. z1 := ord( ( a0 or a1 ) <> 0 );
  759. end;
  760. z0 := 0;
  761. end;
  762. z1Ptr := z1;
  763. z0Ptr := z0;
  764. end;
  765. {*
  766. -------------------------------------------------------------------------------
  767. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  768. number of bits given in `count'. Any bits shifted off are lost. The value
  769. of `count' can be arbitrarily large; in particular, if `count' is greater
  770. than 64, the result will be 0. The result is broken into two 32-bit pieces
  771. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  772. -------------------------------------------------------------------------------
  773. *}
  774. Procedure
  775. shift64Right(
  776. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  777. Var
  778. z0, z1: bits32;
  779. negCount : int8;
  780. Begin
  781. negCount := ( - count ) AND 31;
  782. if ( count = 0 ) then
  783. Begin
  784. z1 := a1;
  785. z0 := a0;
  786. End
  787. else if ( count < 32 ) then
  788. Begin
  789. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  790. z0 := a0 shr count;
  791. End
  792. else
  793. Begin
  794. if (count < 64) then
  795. z1 := ( a0 shr ( count AND 31 ) )
  796. else
  797. z1 := 0;
  798. z0 := 0;
  799. End;
  800. z1Ptr := z1;
  801. z0Ptr := z0;
  802. End;
  803. {*
  804. -------------------------------------------------------------------------------
  805. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  806. number of bits given in `count'. If any nonzero bits are shifted off, they
  807. are ``jammed'' into the least significant bit of the result by setting the
  808. least significant bit to 1. The value of `count' can be arbitrarily large;
  809. in particular, if `count' is greater than 64, the result will be either 0
  810. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  811. nonzero. The result is broken into two 32-bit pieces which are stored at
  812. the locations pointed to by `z0Ptr' and `z1Ptr'.
  813. -------------------------------------------------------------------------------
  814. *}
  815. Procedure
  816. shift64RightJamming(
  817. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  818. VAR
  819. z0, z1 : bits32;
  820. negCount : int8;
  821. Begin
  822. negCount := ( - count ) AND 31;
  823. if ( count = 0 ) then
  824. Begin
  825. z1 := a1;
  826. z0 := a0;
  827. End
  828. else
  829. if ( count < 32 ) then
  830. Begin
  831. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  832. z0 := a0 shr count;
  833. End
  834. else
  835. Begin
  836. if ( count = 32 ) then
  837. Begin
  838. z1 := a0 OR bits32( a1 <> 0 );
  839. End
  840. else
  841. if ( count < 64 ) Then
  842. Begin
  843. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  844. End
  845. else
  846. Begin
  847. z1 := bits32( ( a0 OR a1 ) <> 0 );
  848. End;
  849. z0 := 0;
  850. End;
  851. z1Ptr := z1;
  852. z0Ptr := z0;
  853. End;
  854. {*----------------------------------------------------------------------------
  855. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  856. | bits are shifted off, they are ``jammed'' into the least significant bit of
  857. | the result by setting the least significant bit to 1. The value of `count'
  858. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  859. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  860. | The result is stored in the location pointed to by `zPtr'.
  861. *----------------------------------------------------------------------------*}
  862. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  863. var
  864. z: bits64;
  865. begin
  866. if ( count = 0 ) then
  867. begin
  868. z := a;
  869. end
  870. else if ( count < 64 ) then
  871. begin
  872. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  873. end
  874. else
  875. begin
  876. z := ord( a <> 0 );
  877. end;
  878. zPtr := z;
  879. end;
  880. {$if not defined(shift64ExtraRightJamming)}
  881. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  882. overload;
  883. forward;
  884. {$endif}
  885. {*
  886. -------------------------------------------------------------------------------
  887. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  888. by 32 _plus_ the number of bits given in `count'. The shifted result is
  889. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  890. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  891. off form a third 32-bit result as follows: The _last_ bit shifted off is
  892. the most-significant bit of the extra result, and the other 31 bits of the
  893. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  894. were all zero. This extra result is stored in the location pointed to by
  895. `z2Ptr'. The value of `count' can be arbitrarily large.
  896. (This routine makes more sense if `a0', `a1', and `a2' are considered
  897. to form a fixed-point value with binary point between `a1' and `a2'. This
  898. fixed-point value is shifted right by the number of bits given in `count',
  899. and the integer part of the result is returned at the locations pointed to
  900. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  901. corrupted as described above, and is returned at the location pointed to by
  902. `z2Ptr'.)
  903. -------------------------------------------------------------------------------
  904. }
  905. Procedure
  906. shift64ExtraRightJamming(
  907. a0: bits32;
  908. a1: bits32;
  909. a2: bits32;
  910. count: int16;
  911. VAR z0Ptr: bits32;
  912. VAR z1Ptr: bits32;
  913. VAR z2Ptr: bits32
  914. ); overload;
  915. Var
  916. z0, z1, z2: bits32;
  917. negCount : int8;
  918. Begin
  919. negCount := ( - count ) AND 31;
  920. if ( count = 0 ) then
  921. Begin
  922. z2 := a2;
  923. z1 := a1;
  924. z0 := a0;
  925. End
  926. else
  927. Begin
  928. if ( count < 32 ) Then
  929. Begin
  930. z2 := a1 shl negCount;
  931. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  932. z0 := a0 shr count;
  933. End
  934. else
  935. Begin
  936. if ( count = 32 ) then
  937. Begin
  938. z2 := a1;
  939. z1 := a0;
  940. End
  941. else
  942. Begin
  943. a2 := a2 or a1;
  944. if ( count < 64 ) then
  945. Begin
  946. z2 := a0 shl negCount;
  947. z1 := a0 shr ( count AND 31 );
  948. End
  949. else
  950. Begin
  951. if count = 64 then
  952. z2 := a0
  953. else
  954. z2 := bits32(a0 <> 0);
  955. z1 := 0;
  956. End;
  957. End;
  958. z0 := 0;
  959. End;
  960. z2 := z2 or bits32( a2 <> 0 );
  961. End;
  962. z2Ptr := z2;
  963. z1Ptr := z1;
  964. z0Ptr := z0;
  965. End;
  966. {*
  967. -------------------------------------------------------------------------------
  968. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  969. number of bits given in `count'. Any bits shifted off are lost. The value
  970. of `count' must be less than 32. The result is broken into two 32-bit
  971. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  972. -------------------------------------------------------------------------------
  973. *}
  974. Procedure
  975. shortShift64Left(
  976. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  977. Begin
  978. z1Ptr := a1 shl count;
  979. if count = 0 then
  980. z0Ptr := a0
  981. else
  982. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  983. End;
  984. {*
  985. -------------------------------------------------------------------------------
  986. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  987. by the number of bits given in `count'. Any bits shifted off are lost.
  988. The value of `count' must be less than 32. The result is broken into three
  989. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  990. `z1Ptr', and `z2Ptr'.
  991. -------------------------------------------------------------------------------
  992. *}
  993. Procedure
  994. shortShift96Left(
  995. a0: bits32;
  996. a1: bits32;
  997. a2: bits32;
  998. count: int16;
  999. VAR z0Ptr: bits32;
  1000. VAR z1Ptr: bits32;
  1001. VAR z2Ptr: bits32
  1002. );
  1003. Var
  1004. z0, z1, z2: bits32;
  1005. negCount: int8;
  1006. Begin
  1007. z2 := a2 shl count;
  1008. z1 := a1 shl count;
  1009. z0 := a0 shl count;
  1010. if ( 0 < count ) then
  1011. Begin
  1012. negCount := ( ( - count ) AND 31 );
  1013. z1 := z1 or (a2 shr negCount);
  1014. z0 := z0 or (a1 shr negCount);
  1015. End;
  1016. z2Ptr := z2;
  1017. z1Ptr := z1;
  1018. z0Ptr := z0;
  1019. End;
  1020. {*----------------------------------------------------------------------------
  1021. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1022. | number of bits given in `count'. Any bits shifted off are lost. The value
  1023. | of `count' must be less than 64. The result is broken into two 64-bit
  1024. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1025. *----------------------------------------------------------------------------*}
  1026. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1027. begin
  1028. z1Ptr := a1 shl count;
  1029. if count=0 then
  1030. z0Ptr:=a0
  1031. else
  1032. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1033. end;
  1034. {*
  1035. -------------------------------------------------------------------------------
  1036. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1037. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1038. any carry out is lost. The result is broken into two 32-bit pieces which
  1039. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1040. -------------------------------------------------------------------------------
  1041. *}
  1042. Procedure
  1043. add64(
  1044. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  1045. Var
  1046. z1: bits32;
  1047. Begin
  1048. z1 := a1 + b1;
  1049. z1Ptr := z1;
  1050. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1051. End;
  1052. {*
  1053. -------------------------------------------------------------------------------
  1054. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1055. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1056. modulo 2^96, so any carry out is lost. The result is broken into three
  1057. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1058. `z1Ptr', and `z2Ptr'.
  1059. -------------------------------------------------------------------------------
  1060. *}
  1061. Procedure
  1062. add96(
  1063. a0: bits32;
  1064. a1: bits32;
  1065. a2: bits32;
  1066. b0: bits32;
  1067. b1: bits32;
  1068. b2: bits32;
  1069. VAR z0Ptr: bits32;
  1070. VAR z1Ptr: bits32;
  1071. VAR z2Ptr: bits32
  1072. );
  1073. var
  1074. z0, z1, z2: bits32;
  1075. carry0, carry1: int8;
  1076. Begin
  1077. z2 := a2 + b2;
  1078. carry1 := int8( z2 < a2 );
  1079. z1 := a1 + b1;
  1080. carry0 := int8( z1 < a1 );
  1081. z0 := a0 + b0;
  1082. z1 := z1 + carry1;
  1083. z0 := z0 + bits32( z1 < carry1 );
  1084. z0 := z0 + carry0;
  1085. z2Ptr := z2;
  1086. z1Ptr := z1;
  1087. z0Ptr := z0;
  1088. End;
  1089. {*----------------------------------------------------------------------------
  1090. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1091. | by the number of bits given in `count'. Any bits shifted off are lost.
  1092. | The value of `count' must be less than 64. The result is broken into three
  1093. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1094. | `z1Ptr', and `z2Ptr'.
  1095. *----------------------------------------------------------------------------*}
  1096. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1097. var
  1098. z0, z1, z2 : bits64;
  1099. negCount : int8;
  1100. begin
  1101. z2 := a2 shl count;
  1102. z1 := a1 shl count;
  1103. z0 := a0 shl count;
  1104. if ( 0 < count ) then
  1105. begin
  1106. negCount := ( ( - count ) and 63 );
  1107. z1 := z1 or (a2 shr negCount);
  1108. z0 := z0 or (a1 shr negCount);
  1109. end;
  1110. z2Ptr := z2;
  1111. z1Ptr := z1;
  1112. z0Ptr := z0;
  1113. end;
  1114. {*----------------------------------------------------------------------------
  1115. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1116. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1117. | any carry out is lost. The result is broken into two 64-bit pieces which
  1118. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1119. *----------------------------------------------------------------------------*}
  1120. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1121. var
  1122. z1 : bits64;
  1123. begin
  1124. z1 := a1 + b1;
  1125. z1Ptr := z1;
  1126. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1127. end;
  1128. {*----------------------------------------------------------------------------
  1129. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1130. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1131. | modulo 2^192, so any carry out is lost. The result is broken into three
  1132. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1133. | `z1Ptr', and `z2Ptr'.
  1134. *----------------------------------------------------------------------------*}
  1135. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1136. var
  1137. z0, z1, z2 : bits64;
  1138. carry0, carry1 : int8;
  1139. begin
  1140. z2 := a2 + b2;
  1141. carry1 := ord( z2 < a2 );
  1142. z1 := a1 + b1;
  1143. carry0 := ord( z1 < a1 );
  1144. z0 := a0 + b0;
  1145. inc(z1, carry1);
  1146. inc(z0, ord( z1 < carry1 ));
  1147. inc(z0, carry0);
  1148. z2Ptr := z2;
  1149. z1Ptr := z1;
  1150. z0Ptr := z0;
  1151. end;
  1152. {*
  1153. -------------------------------------------------------------------------------
  1154. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1155. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1156. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1157. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1158. `z1Ptr'.
  1159. -------------------------------------------------------------------------------
  1160. *}
  1161. Procedure
  1162. sub64(
  1163. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1164. Begin
  1165. z1Ptr := a1 - b1;
  1166. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1167. End;
  1168. {*
  1169. -------------------------------------------------------------------------------
  1170. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1171. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1172. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1173. into three 32-bit pieces which are stored at the locations pointed to by
  1174. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1175. -------------------------------------------------------------------------------
  1176. *}
  1177. Procedure
  1178. sub96(
  1179. a0:bits32;
  1180. a1:bits32;
  1181. a2:bits32;
  1182. b0:bits32;
  1183. b1:bits32;
  1184. b2:bits32;
  1185. VAR z0Ptr:bits32;
  1186. VAR z1Ptr:bits32;
  1187. VAR z2Ptr:bits32
  1188. );
  1189. Var
  1190. z0, z1, z2: bits32;
  1191. borrow0, borrow1: int8;
  1192. Begin
  1193. z2 := a2 - b2;
  1194. borrow1 := int8( a2 < b2 );
  1195. z1 := a1 - b1;
  1196. borrow0 := int8( a1 < b1 );
  1197. z0 := a0 - b0;
  1198. z0 := z0 - bits32( z1 < borrow1 );
  1199. z1 := z1 - borrow1;
  1200. z0 := z0 -borrow0;
  1201. z2Ptr := z2;
  1202. z1Ptr := z1;
  1203. z0Ptr := z0;
  1204. End;
  1205. {*----------------------------------------------------------------------------
  1206. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1207. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1208. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1209. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1210. | `z1Ptr'.
  1211. *----------------------------------------------------------------------------*}
  1212. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1213. begin
  1214. z1Ptr := a1 - b1;
  1215. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1216. end;
  1217. {*----------------------------------------------------------------------------
  1218. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1219. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1220. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1221. | result is broken into three 64-bit pieces which are stored at the locations
  1222. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1223. *----------------------------------------------------------------------------*}
  1224. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1225. var
  1226. z0, z1, z2 : bits64;
  1227. borrow0, borrow1 : int8;
  1228. begin
  1229. z2 := a2 - b2;
  1230. borrow1 := ord( a2 < b2 );
  1231. z1 := a1 - b1;
  1232. borrow0 := ord( a1 < b1 );
  1233. z0 := a0 - b0;
  1234. dec(z0, ord( z1 < borrow1 ));
  1235. dec(z1, borrow1);
  1236. dec(z0, borrow0);
  1237. z2Ptr := z2;
  1238. z1Ptr := z1;
  1239. z0Ptr := z0;
  1240. end;
  1241. {*
  1242. -------------------------------------------------------------------------------
  1243. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1244. into two 32-bit pieces which are stored at the locations pointed to by
  1245. `z0Ptr' and `z1Ptr'.
  1246. -------------------------------------------------------------------------------
  1247. *}
  1248. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1249. :bits32 );
  1250. Var
  1251. aHigh, aLow, bHigh, bLow: bits16;
  1252. z0, zMiddleA, zMiddleB, z1: bits32;
  1253. Begin
  1254. aLow := a and $ffff;
  1255. aHigh := a shr 16;
  1256. bLow := b and $ffff;
  1257. bHigh := b shr 16;
  1258. z1 := ( bits32( aLow) ) * bLow;
  1259. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1260. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1261. z0 := ( bits32 (aHigh) ) * bHigh;
  1262. zMiddleA := zMiddleA + zMiddleB;
  1263. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1264. zMiddleA := zmiddleA shl 16;
  1265. z1 := z1 + zMiddleA;
  1266. z0 := z0 + bits32( z1 < zMiddleA );
  1267. z1Ptr := z1;
  1268. z0Ptr := z0;
  1269. End;
  1270. {*
  1271. -------------------------------------------------------------------------------
  1272. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1273. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1274. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1275. `z2Ptr'.
  1276. -------------------------------------------------------------------------------
  1277. *}
  1278. Procedure
  1279. mul64By32To96(
  1280. a0:bits32;
  1281. a1:bits32;
  1282. b:bits32;
  1283. VAR z0Ptr:bits32;
  1284. VAR z1Ptr:bits32;
  1285. VAR z2Ptr:bits32
  1286. );
  1287. Var
  1288. z0, z1, z2, more1: bits32;
  1289. Begin
  1290. mul32To64( a1, b, z1, z2 );
  1291. mul32To64( a0, b, z0, more1 );
  1292. add64( z0, more1, 0, z1, z0, z1 );
  1293. z2Ptr := z2;
  1294. z1Ptr := z1;
  1295. z0Ptr := z0;
  1296. End;
  1297. {*
  1298. -------------------------------------------------------------------------------
  1299. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1300. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1301. product. The product is broken into four 32-bit pieces which are stored at
  1302. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1303. -------------------------------------------------------------------------------
  1304. *}
  1305. Procedure
  1306. mul64To128(
  1307. a0:bits32;
  1308. a1:bits32;
  1309. b0:bits32;
  1310. b1:bits32;
  1311. VAR z0Ptr:bits32;
  1312. VAR z1Ptr:bits32;
  1313. VAR z2Ptr:bits32;
  1314. VAR z3Ptr:bits32
  1315. );
  1316. Var
  1317. z0, z1, z2, z3: bits32;
  1318. more1, more2: bits32;
  1319. Begin
  1320. mul32To64( a1, b1, z2, z3 );
  1321. mul32To64( a1, b0, z1, more2 );
  1322. add64( z1, more2, 0, z2, z1, z2 );
  1323. mul32To64( a0, b0, z0, more1 );
  1324. add64( z0, more1, 0, z1, z0, z1 );
  1325. mul32To64( a0, b1, more1, more2 );
  1326. add64( more1, more2, 0, z2, more1, z2 );
  1327. add64( z0, z1, 0, more1, z0, z1 );
  1328. z3Ptr := z3;
  1329. z2Ptr := z2;
  1330. z1Ptr := z1;
  1331. z0Ptr := z0;
  1332. End;
  1333. {*----------------------------------------------------------------------------
  1334. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1335. | into two 64-bit pieces which are stored at the locations pointed to by
  1336. | `z0Ptr' and `z1Ptr'.
  1337. *----------------------------------------------------------------------------*}
  1338. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1339. var
  1340. aHigh, aLow, bHigh, bLow : bits32;
  1341. z0, zMiddleA, zMiddleB, z1 : bits64;
  1342. begin
  1343. aLow := a;
  1344. aHigh := a shr 32;
  1345. bLow := b;
  1346. bHigh := b shr 32;
  1347. z1 := ( bits64(aLow) ) * bLow;
  1348. zMiddleA := ( bits64( aLow )) * bHigh;
  1349. zMiddleB := ( bits64( aHigh )) * bLow;
  1350. z0 := ( bits64(aHigh) ) * bHigh;
  1351. inc(zMiddleA, zMiddleB);
  1352. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1353. zMiddleA := zMiddleA shl 32;
  1354. inc(z1, zMiddleA);
  1355. inc(z0, ord( z1 < zMiddleA ));
  1356. z1Ptr := z1;
  1357. z0Ptr := z0;
  1358. end;
  1359. {*----------------------------------------------------------------------------
  1360. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1361. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1362. | product. The product is broken into four 64-bit pieces which are stored at
  1363. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1364. *----------------------------------------------------------------------------*}
  1365. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1366. var
  1367. z0,z1,z2,z3,more1,more2 : bits64;
  1368. begin
  1369. mul64To128( a1, b1, z2, z3 );
  1370. mul64To128( a1, b0, z1, more2 );
  1371. add128( z1, more2, 0, z2, z1, z2 );
  1372. mul64To128( a0, b0, z0, more1 );
  1373. add128( z0, more1, 0, z1, z0, z1 );
  1374. mul64To128( a0, b1, more1, more2 );
  1375. add128( more1, more2, 0, z2, more1, z2 );
  1376. add128( z0, z1, 0, more1, z0, z1 );
  1377. z3Ptr := z3;
  1378. z2Ptr := z2;
  1379. z1Ptr := z1;
  1380. z0Ptr := z0;
  1381. end;
  1382. {*----------------------------------------------------------------------------
  1383. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1384. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1385. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1386. | `z2Ptr'.
  1387. *----------------------------------------------------------------------------*}
  1388. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1389. var
  1390. z0, z1, z2, more1 : bits64;
  1391. begin
  1392. mul64To128( a1, b, z1, z2 );
  1393. mul64To128( a0, b, z0, more1 );
  1394. add128( z0, more1, 0, z1, z0, z1 );
  1395. z2Ptr := z2;
  1396. z1Ptr := z1;
  1397. z0Ptr := z0;
  1398. end;
  1399. {*----------------------------------------------------------------------------
  1400. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1401. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1402. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1403. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1404. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1405. | unsigned integer is returned.
  1406. *----------------------------------------------------------------------------*}
  1407. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1408. var
  1409. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1410. begin
  1411. if ( b <= a0 ) then
  1412. begin
  1413. result:=qword( $FFFFFFFFFFFFFFFF );
  1414. exit;
  1415. end;
  1416. b0 := b shr 32;
  1417. if ( b0 shl 32 <= a0 ) then
  1418. z:=qword( $FFFFFFFF00000000 )
  1419. else
  1420. z:=( a0 div b0 ) shl 32;
  1421. mul64To128( b, z, term0, term1 );
  1422. sub128( a0, a1, term0, term1, rem0, rem1 );
  1423. while ( ( sbits64(rem0) ) < 0 ) do begin
  1424. dec(z,qword( $100000000 ));
  1425. b1 := b shl 32;
  1426. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1427. end;
  1428. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1429. if ( b0 shl 32 <= rem0 ) then
  1430. z:=z or $FFFFFFFF
  1431. else
  1432. z:=z or rem0 div b0;
  1433. result:=z;
  1434. end;
  1435. {*
  1436. -------------------------------------------------------------------------------
  1437. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1438. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1439. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1440. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1441. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1442. unsigned integer is returned.
  1443. -------------------------------------------------------------------------------
  1444. *}
  1445. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1446. Var
  1447. b0, b1: bits32;
  1448. rem0, rem1, term0, term1: bits32;
  1449. z: bits32;
  1450. Begin
  1451. if ( b <= a0 ) then
  1452. Begin
  1453. estimateDiv64To32 := $FFFFFFFF;
  1454. exit;
  1455. End;
  1456. b0 := b shr 16;
  1457. if ( b0 shl 16 <= a0 ) then
  1458. z:= $FFFF0000
  1459. else
  1460. z:= ( a0 div b0 ) shl 16;
  1461. mul32To64( b, z, term0, term1 );
  1462. sub64( a0, a1, term0, term1, rem0, rem1 );
  1463. while ( ( sbits32 (rem0) ) < 0 ) do
  1464. Begin
  1465. z := z - $10000;
  1466. b1 := b shl 16;
  1467. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1468. End;
  1469. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1470. if ( b0 shl 16 <= rem0 ) then
  1471. z := z or $FFFF
  1472. else
  1473. z := z or (rem0 div b0);
  1474. estimateDiv64To32 := z;
  1475. End;
  1476. {*
  1477. -------------------------------------------------------------------------------
  1478. Returns an approximation to the square root of the 32-bit significand given
  1479. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1480. `aExp' (the least significant bit) is 1, the integer returned approximates
  1481. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1482. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1483. case, the approximation returned lies strictly within +/-2 of the exact
  1484. value.
  1485. -------------------------------------------------------------------------------
  1486. *}
  1487. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1488. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1489. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1490. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1491. );
  1492. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1493. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1494. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1495. );
  1496. Var
  1497. index: int8;
  1498. z: bits32;
  1499. Begin
  1500. index := ( a shr 27 ) AND 15;
  1501. if ( aExp AND 1 ) <> 0 then
  1502. Begin
  1503. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1504. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1505. a := a shr 1;
  1506. End
  1507. else
  1508. Begin
  1509. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1510. z := a div z + z;
  1511. if ( $20000 <= z ) then
  1512. z := $FFFF8000
  1513. else
  1514. z := ( z shl 15 );
  1515. if ( z <= a ) then
  1516. Begin
  1517. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1518. exit;
  1519. End;
  1520. End;
  1521. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1522. End;
  1523. {*
  1524. -------------------------------------------------------------------------------
  1525. Returns the number of leading 0 bits before the most-significant 1 bit of
  1526. `a'. If `a' is zero, 32 is returned.
  1527. -------------------------------------------------------------------------------
  1528. *}
  1529. Function countLeadingZeros32( a:bits32 ): int8;
  1530. const countLeadingZerosHigh:array[0..255] of int8 = (
  1531. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1532. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1533. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1534. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1535. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1536. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1537. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1538. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1539. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1540. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1541. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1542. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1543. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1544. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1545. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1546. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1547. );
  1548. Var
  1549. shiftCount: int8;
  1550. Begin
  1551. shiftCount := 0;
  1552. if ( a < $10000 ) then
  1553. Begin
  1554. shiftCount := shiftcount + 16;
  1555. a := a shl 16;
  1556. End;
  1557. if ( a < $1000000 ) then
  1558. Begin
  1559. shiftCount := shiftcount + 8;
  1560. a := a shl 8;
  1561. end;
  1562. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1563. countLeadingZeros32:= shiftCount;
  1564. End;
  1565. {*----------------------------------------------------------------------------
  1566. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1567. | `a'. If `a' is zero, 64 is returned.
  1568. *----------------------------------------------------------------------------*}
  1569. function countLeadingZeros64( a : bits64): int8;
  1570. var
  1571. shiftcount : int8;
  1572. Begin
  1573. shiftCount := 0;
  1574. if ( a < bits64(bits64(1) shl 32 )) then
  1575. shiftCount := shiftcount + 32
  1576. else
  1577. a := a shr 32;
  1578. shiftCount := shiftCount + countLeadingZeros32( a );
  1579. countLeadingZeros64:= shiftCount;
  1580. End;
  1581. {*
  1582. -------------------------------------------------------------------------------
  1583. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1584. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1585. Otherwise, returns 0.
  1586. -------------------------------------------------------------------------------
  1587. *}
  1588. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1589. Begin
  1590. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1591. End;
  1592. {*
  1593. -------------------------------------------------------------------------------
  1594. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1595. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1596. returns 0.
  1597. -------------------------------------------------------------------------------
  1598. *}
  1599. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1600. Begin
  1601. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1602. End;
  1603. const
  1604. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1605. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1606. (*****************************************************************************)
  1607. (* End Low-Level arithmetic *)
  1608. (*****************************************************************************)
  1609. {*
  1610. -------------------------------------------------------------------------------
  1611. Functions and definitions to determine: (1) whether tininess for underflow
  1612. is detected before or after rounding by default, (2) what (if anything)
  1613. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1614. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1615. are propagated from function inputs to output. These details are ENDIAN
  1616. specific
  1617. -------------------------------------------------------------------------------
  1618. *}
  1619. {$IFDEF ENDIAN_LITTLE}
  1620. {*
  1621. -------------------------------------------------------------------------------
  1622. Internal canonical NaN format.
  1623. -------------------------------------------------------------------------------
  1624. *}
  1625. TYPE
  1626. commonNaNT = record
  1627. high, low : bits32;
  1628. sign: flag;
  1629. end;
  1630. {*
  1631. -------------------------------------------------------------------------------
  1632. The pattern for a default generated single-precision NaN.
  1633. -------------------------------------------------------------------------------
  1634. *}
  1635. const float32_default_nan = $FFC00000;
  1636. {*
  1637. -------------------------------------------------------------------------------
  1638. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1639. otherwise returns 0.
  1640. -------------------------------------------------------------------------------
  1641. *}
  1642. Function float32_is_nan( a : float32 ): flag;
  1643. Begin
  1644. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1645. End;
  1646. {*
  1647. -------------------------------------------------------------------------------
  1648. Returns 1 if the single-precision floating-point value `a' is a signaling
  1649. NaN; otherwise returns 0.
  1650. -------------------------------------------------------------------------------
  1651. *}
  1652. Function float32_is_signaling_nan( a : float32 ): flag;
  1653. Begin
  1654. float32_is_signaling_nan := flag
  1655. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1656. End;
  1657. {*
  1658. -------------------------------------------------------------------------------
  1659. Returns the result of converting the single-precision floating-point NaN
  1660. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1661. exception is raised.
  1662. -------------------------------------------------------------------------------
  1663. *}
  1664. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1665. var
  1666. z : commonNaNT ;
  1667. Begin
  1668. if ( float32_is_signaling_nan( a ) <> 0) then
  1669. float_raise( float_flag_invalid );
  1670. z.sign := a shr 31;
  1671. z.low := 0;
  1672. z.high := a shl 9;
  1673. c := z;
  1674. End;
  1675. {*
  1676. -------------------------------------------------------------------------------
  1677. Returns the result of converting the canonical NaN `a' to the single-
  1678. precision floating-point format.
  1679. -------------------------------------------------------------------------------
  1680. *}
  1681. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1682. Begin
  1683. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1684. End;
  1685. {*
  1686. -------------------------------------------------------------------------------
  1687. Takes two single-precision floating-point values `a' and `b', one of which
  1688. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1689. signaling NaN, the invalid exception is raised.
  1690. -------------------------------------------------------------------------------
  1691. *}
  1692. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1693. Var
  1694. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1695. label returnLargerSignificand;
  1696. Begin
  1697. aIsNaN := float32_is_nan( a );
  1698. aIsSignalingNaN := float32_is_signaling_nan( a );
  1699. bIsNaN := float32_is_nan( b );
  1700. bIsSignalingNaN := float32_is_signaling_nan( b );
  1701. a := a or $00400000;
  1702. b := b or $00400000;
  1703. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1704. float_raise( float_flag_invalid );
  1705. if ( aIsSignalingNaN )<> 0 then
  1706. Begin
  1707. if ( bIsSignalingNaN ) <> 0 then
  1708. goto returnLargerSignificand;
  1709. if bIsNan <> 0 then
  1710. propagateFloat32NaN := b
  1711. else
  1712. propagateFloat32NaN := a;
  1713. exit;
  1714. End
  1715. else if ( aIsNaN <> 0) then
  1716. Begin
  1717. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1718. Begin
  1719. propagateFloat32NaN := a;
  1720. exit;
  1721. End;
  1722. returnLargerSignificand:
  1723. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1724. Begin
  1725. propagateFloat32NaN := b;
  1726. exit;
  1727. End;
  1728. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1729. Begin
  1730. propagateFloat32NaN := a;
  1731. End;
  1732. if a < b then
  1733. propagateFloat32NaN := a
  1734. else
  1735. propagateFloat32NaN := b;
  1736. exit;
  1737. End
  1738. else
  1739. Begin
  1740. propagateFloat32NaN := b;
  1741. exit;
  1742. End;
  1743. End;
  1744. {*
  1745. -------------------------------------------------------------------------------
  1746. The pattern for a default generated double-precision NaN. The `high' and
  1747. `low' values hold the most- and least-significant bits, respectively.
  1748. -------------------------------------------------------------------------------
  1749. *}
  1750. const
  1751. float64_default_nan_high = $FFF80000;
  1752. float64_default_nan_low = $00000000;
  1753. {*
  1754. -------------------------------------------------------------------------------
  1755. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1756. otherwise returns 0.
  1757. -------------------------------------------------------------------------------
  1758. *}
  1759. Function float64_is_nan( a : float64 ) : flag;
  1760. Begin
  1761. float64_is_nan :=
  1762. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1763. and ( a.low or ( a.high and $000FFFFF ) );
  1764. End;
  1765. {*
  1766. -------------------------------------------------------------------------------
  1767. Returns 1 if the double-precision floating-point value `a' is a signaling
  1768. NaN; otherwise returns 0.
  1769. -------------------------------------------------------------------------------
  1770. *}
  1771. Function float64_is_signaling_nan( a : float64 ): flag;
  1772. Begin
  1773. float64_is_signaling_nan :=
  1774. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1775. and ( a.low or ( a.high and $0007FFFF ) );
  1776. End;
  1777. {*
  1778. -------------------------------------------------------------------------------
  1779. Returns the result of converting the double-precision floating-point NaN
  1780. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1781. exception is raised.
  1782. -------------------------------------------------------------------------------
  1783. *}
  1784. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1785. Var
  1786. z : commonNaNT;
  1787. Begin
  1788. if ( float64_is_signaling_nan( a )<>0 ) then
  1789. float_raise( float_flag_invalid );
  1790. z.sign := a.high shr 31;
  1791. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1792. c := z;
  1793. End;
  1794. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1795. Var
  1796. z : commonNaNT;
  1797. Begin
  1798. if ( float64_is_signaling_nan( a )<>0 ) then
  1799. float_raise( float_flag_invalid );
  1800. z.sign := a.high shr 31;
  1801. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1802. result := z;
  1803. End;
  1804. {*
  1805. -------------------------------------------------------------------------------
  1806. Returns the result of converting the canonical NaN `a' to the double-
  1807. precision floating-point format.
  1808. -------------------------------------------------------------------------------
  1809. *}
  1810. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1811. Var
  1812. z: float64;
  1813. Begin
  1814. shift64Right( a.high, a.low, 12, z.high, z.low );
  1815. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1816. c := z;
  1817. End;
  1818. {*
  1819. -------------------------------------------------------------------------------
  1820. Takes two double-precision floating-point values `a' and `b', one of which
  1821. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1822. signaling NaN, the invalid exception is raised.
  1823. -------------------------------------------------------------------------------
  1824. *}
  1825. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1826. Var
  1827. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1828. label returnLargerSignificand;
  1829. Begin
  1830. aIsNaN := float64_is_nan( a );
  1831. aIsSignalingNaN := float64_is_signaling_nan( a );
  1832. bIsNaN := float64_is_nan( b );
  1833. bIsSignalingNaN := float64_is_signaling_nan( b );
  1834. a.high := a.high or $00080000;
  1835. b.high := b.high or $00080000;
  1836. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1837. float_raise( float_flag_invalid );
  1838. if ( aIsSignalingNaN )<>0 then
  1839. Begin
  1840. if ( bIsSignalingNaN )<>0 then
  1841. goto returnLargerSignificand;
  1842. if bIsNan <> 0 then
  1843. c := b
  1844. else
  1845. c := a;
  1846. exit;
  1847. End
  1848. else if ( aIsNaN )<> 0 then
  1849. Begin
  1850. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1851. Begin
  1852. c := a;
  1853. exit;
  1854. End;
  1855. returnLargerSignificand:
  1856. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1857. Begin
  1858. c := b;
  1859. exit;
  1860. End;
  1861. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1862. Begin
  1863. c := a;
  1864. exit;
  1865. End;
  1866. if a.high < b.high then
  1867. c := a
  1868. else
  1869. c := b;
  1870. exit;
  1871. End
  1872. else
  1873. Begin
  1874. c := b;
  1875. exit;
  1876. End;
  1877. End;
  1878. {*----------------------------------------------------------------------------
  1879. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1880. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1881. | returns 0.
  1882. *----------------------------------------------------------------------------*}
  1883. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1884. begin
  1885. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1886. end;
  1887. {*----------------------------------------------------------------------------
  1888. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1889. | otherwise returns 0.
  1890. *----------------------------------------------------------------------------*}
  1891. function float128_is_nan( a : float128): flag;
  1892. begin
  1893. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1894. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1895. end;
  1896. {*----------------------------------------------------------------------------
  1897. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1898. | signaling NaN; otherwise returns 0.
  1899. *----------------------------------------------------------------------------*}
  1900. function float128_is_signaling_nan( a : float128): flag;
  1901. begin
  1902. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1903. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1904. end;
  1905. {*----------------------------------------------------------------------------
  1906. | Returns the result of converting the quadruple-precision floating-point NaN
  1907. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1908. | exception is raised.
  1909. *----------------------------------------------------------------------------*}
  1910. function float128ToCommonNaN( a : float128): commonNaNT;
  1911. var
  1912. z: commonNaNT;
  1913. qhigh,qlow : qword;
  1914. begin
  1915. if ( float128_is_signaling_nan( a )<>0) then
  1916. float_raise( float_flag_invalid );
  1917. z.sign := a.high shr 63;
  1918. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1919. z.high:=qhigh shr 32;
  1920. z.low:=qhigh and $ffffffff;
  1921. result:=z;
  1922. end;
  1923. {*----------------------------------------------------------------------------
  1924. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1925. | precision floating-point format.
  1926. *----------------------------------------------------------------------------*}
  1927. function commonNaNToFloat128( a : commonNaNT): float128;
  1928. var
  1929. z: float128;
  1930. begin
  1931. shift128Right( a.high, a.low, 16, z.high, z.low );
  1932. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1933. result:=z;
  1934. end;
  1935. {*----------------------------------------------------------------------------
  1936. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1937. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1938. | `b' is a signaling NaN, the invalid exception is raised.
  1939. *----------------------------------------------------------------------------*}
  1940. function propagateFloat128NaN( a: float128; b : float128): float128;
  1941. var
  1942. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1943. label
  1944. returnLargerSignificand;
  1945. begin
  1946. aIsNaN := float128_is_nan( a );
  1947. aIsSignalingNaN := float128_is_signaling_nan( a );
  1948. bIsNaN := float128_is_nan( b );
  1949. bIsSignalingNaN := float128_is_signaling_nan( b );
  1950. a.high := a.high or int64( $0000800000000000 );
  1951. b.high := b.high or int64( $0000800000000000 );
  1952. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1953. float_raise( float_flag_invalid );
  1954. if ( aIsSignalingNaN )<>0 then
  1955. begin
  1956. if ( bIsSignalingNaN )<>0 then
  1957. goto returnLargerSignificand;
  1958. if bIsNaN<>0 then
  1959. result := b
  1960. else
  1961. result := a;
  1962. exit;
  1963. end
  1964. else if ( aIsNaN )<>0 then
  1965. begin
  1966. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1967. begin
  1968. result := a;
  1969. exit;
  1970. end;
  1971. returnLargerSignificand:
  1972. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1973. begin
  1974. result := b;
  1975. exit;
  1976. end;
  1977. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1978. begin
  1979. result := a;
  1980. exit
  1981. end;
  1982. if ( a.high < b.high ) then
  1983. result := a
  1984. else
  1985. result := b;
  1986. exit;
  1987. end
  1988. else
  1989. result:=b;
  1990. end;
  1991. {$ELSE}
  1992. { Big endian code }
  1993. (*----------------------------------------------------------------------------
  1994. | Internal canonical NaN format.
  1995. *----------------------------------------------------------------------------*)
  1996. type
  1997. commonNANT = record
  1998. high, low : bits32;
  1999. sign : flag;
  2000. end;
  2001. (*----------------------------------------------------------------------------
  2002. | The pattern for a default generated single-precision NaN.
  2003. *----------------------------------------------------------------------------*)
  2004. const float32_default_nan = $7FFFFFFF;
  2005. (*----------------------------------------------------------------------------
  2006. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2007. | otherwise returns 0.
  2008. *----------------------------------------------------------------------------*)
  2009. function float32_is_nan(a: float32): flag;
  2010. begin
  2011. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2012. end;
  2013. (*----------------------------------------------------------------------------
  2014. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2015. | NaN; otherwise returns 0.
  2016. *----------------------------------------------------------------------------*)
  2017. function float32_is_signaling_nan(a: float32):flag;
  2018. begin
  2019. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2020. end;
  2021. (*----------------------------------------------------------------------------
  2022. | Returns the result of converting the single-precision floating-point NaN
  2023. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2024. | exception is raised.
  2025. *----------------------------------------------------------------------------*)
  2026. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  2027. var
  2028. z: commonNANT;
  2029. begin
  2030. if float32_is_signaling_nan(a)<>0 then
  2031. float_raise(float_flag_invalid);
  2032. z.sign := a shr 31;
  2033. z.low := 0;
  2034. z.high := a shl 9;
  2035. c:=z;
  2036. end;
  2037. (*----------------------------------------------------------------------------
  2038. | Returns the result of converting the canonical NaN `a' to the single-
  2039. | precision floating-point format.
  2040. *----------------------------------------------------------------------------*)
  2041. function CommonNanToFloat32(a : CommonNaNT): float32;
  2042. begin
  2043. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2044. end;
  2045. (*----------------------------------------------------------------------------
  2046. | Takes two single-precision floating-point values `a' and `b', one of which
  2047. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2048. | signaling NaN, the invalid exception is raised.
  2049. *----------------------------------------------------------------------------*)
  2050. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2051. var
  2052. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2053. begin
  2054. aIsNaN := float32_is_nan( a );
  2055. aIsSignalingNaN := float32_is_signaling_nan( a );
  2056. bIsNaN := float32_is_nan( b );
  2057. bIsSignalingNaN := float32_is_signaling_nan( b );
  2058. a := a or $00400000;
  2059. b := b or $00400000;
  2060. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2061. float_raise( float_flag_invalid );
  2062. if bIsSignalingNaN<>0 then
  2063. propagateFloat32Nan := b
  2064. else if aIsSignalingNan<>0 then
  2065. propagateFloat32Nan := a
  2066. else if bIsNan<>0 then
  2067. propagateFloat32Nan := b
  2068. else
  2069. propagateFloat32Nan := a;
  2070. end;
  2071. (*----------------------------------------------------------------------------
  2072. | The pattern for a default generated double-precision NaN. The `high' and
  2073. | `low' values hold the most- and least-significant bits, respectively.
  2074. *----------------------------------------------------------------------------*)
  2075. const
  2076. float64_default_nan_high = $7FFFFFFF;
  2077. float64_default_nan_low = $FFFFFFFF;
  2078. (*----------------------------------------------------------------------------
  2079. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2080. | otherwise returns 0.
  2081. *----------------------------------------------------------------------------*)
  2082. function float64_is_nan(a: float64): flag;
  2083. begin
  2084. float64_is_nan := flag (
  2085. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2086. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2087. end;
  2088. (*----------------------------------------------------------------------------
  2089. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2090. | NaN; otherwise returns 0.
  2091. *----------------------------------------------------------------------------*)
  2092. function float64_is_signaling_nan( a:float64): flag;
  2093. begin
  2094. float64_is_signaling_nan := flag(
  2095. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2096. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2097. end;
  2098. (*----------------------------------------------------------------------------
  2099. | Returns the result of converting the double-precision floating-point NaN
  2100. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2101. | exception is raised.
  2102. *----------------------------------------------------------------------------*)
  2103. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2104. var
  2105. z : commonNaNT;
  2106. begin
  2107. if ( float64_is_signaling_nan( a )<>0 ) then
  2108. float_raise( float_flag_invalid );
  2109. z.sign := a.high shr 31;
  2110. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2111. c:=z;
  2112. end;
  2113. (*----------------------------------------------------------------------------
  2114. | Returns the result of converting the canonical NaN `a' to the double-
  2115. | precision floating-point format.
  2116. *----------------------------------------------------------------------------*)
  2117. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2118. var
  2119. z: float64;
  2120. begin
  2121. shift64Right( a.high, a.low, 12, z.high, z.low );
  2122. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2123. c:=z;
  2124. end;
  2125. (*----------------------------------------------------------------------------
  2126. | Takes two double-precision floating-point values `a' and `b', one of which
  2127. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2128. | signaling NaN, the invalid exception is raised.
  2129. *----------------------------------------------------------------------------*)
  2130. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2131. var
  2132. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2133. begin
  2134. aIsNaN := float64_is_nan( a );
  2135. aIsSignalingNaN := float64_is_signaling_nan( a );
  2136. bIsNaN := float64_is_nan( b );
  2137. bIsSignalingNaN := float64_is_signaling_nan( b );
  2138. a.high := a.high or $00080000;
  2139. b.high := b.high or $00080000;
  2140. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2141. float_raise( float_flag_invalid );
  2142. if bIsSignalingNaN<>0 then
  2143. c := b
  2144. else if aIsSignalingNan<>0 then
  2145. c := a
  2146. else if bIsNan<>0 then
  2147. c := b
  2148. else
  2149. c := a;
  2150. end;
  2151. {$ENDIF}
  2152. (****************************************************************************)
  2153. (* END ENDIAN SPECIFIC CODE *)
  2154. (****************************************************************************)
  2155. {*
  2156. -------------------------------------------------------------------------------
  2157. Returns the fraction bits of the single-precision floating-point value `a'.
  2158. -------------------------------------------------------------------------------
  2159. *}
  2160. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2161. Begin
  2162. ExtractFloat32Frac := A AND $007FFFFF;
  2163. End;
  2164. {*
  2165. -------------------------------------------------------------------------------
  2166. Returns the exponent bits of the single-precision floating-point value `a'.
  2167. -------------------------------------------------------------------------------
  2168. *}
  2169. Function extractFloat32Exp( a: float32 ): Int16;
  2170. Begin
  2171. extractFloat32Exp := (a shr 23) AND $FF;
  2172. End;
  2173. {*
  2174. -------------------------------------------------------------------------------
  2175. Returns the sign bit of the single-precision floating-point value `a'.
  2176. -------------------------------------------------------------------------------
  2177. *}
  2178. Function extractFloat32Sign( a: float32 ): Flag;
  2179. Begin
  2180. extractFloat32Sign := a shr 31;
  2181. End;
  2182. {*
  2183. -------------------------------------------------------------------------------
  2184. Normalizes the subnormal single-precision floating-point value represented
  2185. by the denormalized significand `aSig'. The normalized exponent and
  2186. significand are stored at the locations pointed to by `zExpPtr' and
  2187. `zSigPtr', respectively.
  2188. -------------------------------------------------------------------------------
  2189. *}
  2190. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2191. Var
  2192. ShiftCount : BYTE;
  2193. Begin
  2194. shiftCount := countLeadingZeros32( aSig ) - 8;
  2195. zSigPtr := aSig shl shiftCount;
  2196. zExpPtr := 1 - shiftCount;
  2197. End;
  2198. {*
  2199. -------------------------------------------------------------------------------
  2200. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2201. single-precision floating-point value, returning the result. After being
  2202. shifted into the proper positions, the three fields are simply added
  2203. together to form the result. This means that any integer portion of `zSig'
  2204. will be added into the exponent. Since a properly normalized significand
  2205. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2206. than the desired result exponent whenever `zSig' is a complete, normalized
  2207. significand.
  2208. -------------------------------------------------------------------------------
  2209. *}
  2210. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2211. Begin
  2212. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2213. + zSig;
  2214. End;
  2215. {*
  2216. -------------------------------------------------------------------------------
  2217. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2218. and significand `zSig', and returns the proper single-precision floating-
  2219. point value corresponding to the abstract input. Ordinarily, the abstract
  2220. value is simply rounded and packed into the single-precision format, with
  2221. the inexact exception raised if the abstract input cannot be represented
  2222. exactly. However, if the abstract value is too large, the overflow and
  2223. inexact exceptions are raised and an infinity or maximal finite value is
  2224. returned. If the abstract value is too small, the input value is rounded to
  2225. a subnormal number, and the underflow and inexact exceptions are raised if
  2226. the abstract input cannot be represented exactly as a subnormal single-
  2227. precision floating-point number.
  2228. The input significand `zSig' has its binary point between bits 30
  2229. and 29, which is 7 bits to the left of the usual location. This shifted
  2230. significand must be normalized or smaller. If `zSig' is not normalized,
  2231. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2232. and it must not require rounding. In the usual case that `zSig' is
  2233. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2234. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2235. Binary Floating-Point Arithmetic.
  2236. -------------------------------------------------------------------------------
  2237. *}
  2238. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2239. Var
  2240. roundingMode : TFPURoundingMode;
  2241. roundNearestEven : Flag;
  2242. roundIncrement, roundBits : BYTE;
  2243. IsTiny : Flag;
  2244. Begin
  2245. roundingMode := softfloat_rounding_mode;
  2246. if (roundingMode = float_round_nearest_even) then
  2247. Begin
  2248. roundNearestEven := Flag(TRUE);
  2249. end
  2250. else
  2251. roundNearestEven := Flag(FALSE);
  2252. roundIncrement := $40;
  2253. if ( Boolean(roundNearestEven) = FALSE) then
  2254. Begin
  2255. if ( roundingMode = float_round_to_zero ) Then
  2256. Begin
  2257. roundIncrement := 0;
  2258. End
  2259. else
  2260. Begin
  2261. roundIncrement := $7F;
  2262. if ( zSign <> 0 ) then
  2263. Begin
  2264. if roundingMode = float_round_up then roundIncrement := 0;
  2265. End
  2266. else
  2267. Begin
  2268. if roundingMode = float_round_down then roundIncrement := 0;
  2269. End;
  2270. End
  2271. End;
  2272. roundBits := zSig AND $7F;
  2273. if ($FD <= bits16 (zExp) ) then
  2274. Begin
  2275. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2276. Begin
  2277. float_raise( [float_flag_overflow,float_flag_inexact] );
  2278. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2279. exit;
  2280. End;
  2281. if ( zExp < 0 ) then
  2282. Begin
  2283. isTiny :=
  2284. flag(( softfloat_detect_tininess = float_tininess_before_rounding )
  2285. OR ( zExp < -1 )
  2286. OR ( (zSig + roundIncrement) < $80000000 ));
  2287. shift32RightJamming( zSig, - zExp, zSig );
  2288. zExp := 0;
  2289. roundBits := zSig AND $7F;
  2290. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2291. float_raise( float_flag_underflow );
  2292. End;
  2293. End;
  2294. if ( roundBits )<> 0 then
  2295. set_inexact_flag;
  2296. zSig := ( zSig + roundIncrement ) shr 7;
  2297. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2298. if ( zSig = 0 ) then zExp := 0;
  2299. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2300. exit;
  2301. End;
  2302. {*
  2303. -------------------------------------------------------------------------------
  2304. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2305. and significand `zSig', and returns the proper single-precision floating-
  2306. point value corresponding to the abstract input. This routine is just like
  2307. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2308. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2309. floating-point exponent.
  2310. -------------------------------------------------------------------------------
  2311. *}
  2312. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2313. Var
  2314. ShiftCount : int8;
  2315. Begin
  2316. shiftCount := countLeadingZeros32( zSig ) - 1;
  2317. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2318. End;
  2319. {*
  2320. -------------------------------------------------------------------------------
  2321. Returns the most-significant 20 fraction bits of the double-precision
  2322. floating-point value `a'.
  2323. -------------------------------------------------------------------------------
  2324. *}
  2325. Function extractFloat64Frac0(a: float64): bits32;
  2326. Begin
  2327. extractFloat64Frac0 := a.high and $000FFFFF;
  2328. End;
  2329. {*
  2330. -------------------------------------------------------------------------------
  2331. Returns the least-significant 32 fraction bits of the double-precision
  2332. floating-point value `a'.
  2333. -------------------------------------------------------------------------------
  2334. *}
  2335. Function extractFloat64Frac1(a: float64): bits32;
  2336. Begin
  2337. extractFloat64Frac1 := a.low;
  2338. End;
  2339. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2340. Function extractFloat64Frac(a: float64): bits64;
  2341. Begin
  2342. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2343. End;
  2344. {*
  2345. -------------------------------------------------------------------------------
  2346. Returns the exponent bits of the double-precision floating-point value `a'.
  2347. -------------------------------------------------------------------------------
  2348. *}
  2349. Function extractFloat64Exp(a: float64): int16;
  2350. Begin
  2351. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2352. End;
  2353. {*
  2354. -------------------------------------------------------------------------------
  2355. Returns the sign bit of the double-precision floating-point value `a'.
  2356. -------------------------------------------------------------------------------
  2357. *}
  2358. Function extractFloat64Sign(a: float64) : flag;
  2359. Begin
  2360. extractFloat64Sign := a.high shr 31;
  2361. End;
  2362. {*
  2363. -------------------------------------------------------------------------------
  2364. Normalizes the subnormal double-precision floating-point value represented
  2365. by the denormalized significand formed by the concatenation of `aSig0' and
  2366. `aSig1'. The normalized exponent is stored at the location pointed to by
  2367. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2368. stored at the location pointed to by `zSig0Ptr', and the least significant
  2369. 32 bits of the normalized significand are stored at the location pointed to
  2370. by `zSig1Ptr'.
  2371. -------------------------------------------------------------------------------
  2372. *}
  2373. Procedure normalizeFloat64Subnormal(
  2374. aSig0: bits32;
  2375. aSig1: bits32;
  2376. VAR zExpPtr : Int16;
  2377. VAR zSig0Ptr : Bits32;
  2378. VAR zSig1Ptr : Bits32
  2379. );
  2380. Var
  2381. ShiftCount : Int8;
  2382. Begin
  2383. if ( aSig0 = 0 ) then
  2384. Begin
  2385. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2386. if ( shiftCount < 0 ) then
  2387. Begin
  2388. zSig0Ptr := aSig1 shr ( - shiftCount );
  2389. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2390. End
  2391. else
  2392. Begin
  2393. zSig0Ptr := aSig1 shl shiftCount;
  2394. zSig1Ptr := 0;
  2395. End;
  2396. zExpPtr := - shiftCount - 31;
  2397. End
  2398. else
  2399. Begin
  2400. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2401. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2402. zExpPtr := 1 - shiftCount;
  2403. End;
  2404. End;
  2405. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2406. var
  2407. shiftCount : int8;
  2408. begin
  2409. shiftCount := countLeadingZeros64( aSig ) - 11;
  2410. zSigPtr := aSig shl shiftCount;
  2411. zExpPtr := 1 - shiftCount;
  2412. end;
  2413. {*
  2414. -------------------------------------------------------------------------------
  2415. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2416. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2417. point value, returning the result. After being shifted into the proper
  2418. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2419. together to form the most significant 32 bits of the result. This means
  2420. that any integer portion of `zSig0' will be added into the exponent. Since
  2421. a properly normalized significand will have an integer portion equal to 1,
  2422. the `zExp' input should be 1 less than the desired result exponent whenever
  2423. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2424. -------------------------------------------------------------------------------
  2425. *}
  2426. Procedure
  2427. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2428. var
  2429. z: Float64;
  2430. Begin
  2431. z.low := zSig1;
  2432. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2433. c := z;
  2434. End;
  2435. {*----------------------------------------------------------------------------
  2436. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2437. | double-precision floating-point value, returning the result. After being
  2438. | shifted into the proper positions, the three fields are simply added
  2439. | together to form the result. This means that any integer portion of `zSig'
  2440. | will be added into the exponent. Since a properly normalized significand
  2441. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2442. | than the desired result exponent whenever `zSig' is a complete, normalized
  2443. | significand.
  2444. *----------------------------------------------------------------------------*}
  2445. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2446. begin
  2447. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2448. end;
  2449. {*
  2450. -------------------------------------------------------------------------------
  2451. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2452. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2453. and `zSig2', and returns the proper double-precision floating-point value
  2454. corresponding to the abstract input. Ordinarily, the abstract value is
  2455. simply rounded and packed into the double-precision format, with the inexact
  2456. exception raised if the abstract input cannot be represented exactly.
  2457. However, if the abstract value is too large, the overflow and inexact
  2458. exceptions are raised and an infinity or maximal finite value is returned.
  2459. If the abstract value is too small, the input value is rounded to a
  2460. subnormal number, and the underflow and inexact exceptions are raised if the
  2461. abstract input cannot be represented exactly as a subnormal double-precision
  2462. floating-point number.
  2463. The input significand must be normalized or smaller. If the input
  2464. significand is not normalized, `zExp' must be 0; in that case, the result
  2465. returned is a subnormal number, and it must not require rounding. In the
  2466. usual case that the input significand is normalized, `zExp' must be 1 less
  2467. than the ``true'' floating-point exponent. The handling of underflow and
  2468. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2469. -------------------------------------------------------------------------------
  2470. *}
  2471. Procedure
  2472. roundAndPackFloat64(
  2473. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2474. Var
  2475. roundingMode : TFPURoundingMode;
  2476. roundNearestEven, increment, isTiny : Flag;
  2477. Begin
  2478. roundingMode := softfloat_rounding_mode;
  2479. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2480. increment := flag( sbits32 (zSig2) < 0 );
  2481. if ( roundNearestEven = flag(FALSE) ) then
  2482. Begin
  2483. if ( roundingMode = float_round_to_zero ) then
  2484. increment := 0
  2485. else
  2486. Begin
  2487. if ( zSign )<> 0 then
  2488. Begin
  2489. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2490. End
  2491. else
  2492. Begin
  2493. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2494. End
  2495. End
  2496. End;
  2497. if ( $7FD <= bits16 (zExp) ) then
  2498. Begin
  2499. if (( $7FD < zExp )
  2500. or (( zExp = $7FD )
  2501. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2502. and (increment<>0)
  2503. )
  2504. ) then
  2505. Begin
  2506. float_raise( [float_flag_overflow,float_flag_inexact] );
  2507. if (( roundingMode = float_round_to_zero )
  2508. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2509. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2510. ) then
  2511. Begin
  2512. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2513. exit;
  2514. End;
  2515. packFloat64( zSign, $7FF, 0, 0, c );
  2516. exit;
  2517. End;
  2518. if ( zExp < 0 ) then
  2519. Begin
  2520. isTiny :=
  2521. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2522. or flag( zExp < -1 )
  2523. or flag(increment = 0)
  2524. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2525. shift64ExtraRightJamming(
  2526. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2527. zExp := 0;
  2528. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2529. if ( roundNearestEven )<>0 then
  2530. Begin
  2531. increment := flag( sbits32 (zSig2) < 0 );
  2532. End
  2533. else
  2534. Begin
  2535. if ( zSign )<>0 then
  2536. Begin
  2537. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2538. End
  2539. else
  2540. Begin
  2541. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2542. End
  2543. End;
  2544. End;
  2545. End;
  2546. if ( zSig2 )<>0 then
  2547. set_inexact_flag;
  2548. if ( increment )<>0 then
  2549. Begin
  2550. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2551. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2552. End
  2553. else
  2554. Begin
  2555. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2556. End;
  2557. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2558. End;
  2559. {*----------------------------------------------------------------------------
  2560. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2561. | and significand `zSig', and returns the proper double-precision floating-
  2562. | point value corresponding to the abstract input. Ordinarily, the abstract
  2563. | value is simply rounded and packed into the double-precision format, with
  2564. | the inexact exception raised if the abstract input cannot be represented
  2565. | exactly. However, if the abstract value is too large, the overflow and
  2566. | inexact exceptions are raised and an infinity or maximal finite value is
  2567. | returned. If the abstract value is too small, the input value is rounded
  2568. | to a subnormal number, and the underflow and inexact exceptions are raised
  2569. | if the abstract input cannot be represented exactly as a subnormal double-
  2570. | precision floating-point number.
  2571. | The input significand `zSig' has its binary point between bits 62
  2572. | and 61, which is 10 bits to the left of the usual location. This shifted
  2573. | significand must be normalized or smaller. If `zSig' is not normalized,
  2574. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2575. | and it must not require rounding. In the usual case that `zSig' is
  2576. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2577. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2578. | Binary Floating-Point Arithmetic.
  2579. *----------------------------------------------------------------------------*}
  2580. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2581. var
  2582. roundingMode: TFPURoundingMode;
  2583. roundNearestEven: flag;
  2584. roundIncrement, roundBits: int16;
  2585. isTiny: flag;
  2586. begin
  2587. roundingMode := softfloat_rounding_mode;
  2588. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2589. roundIncrement := $200;
  2590. if ( roundNearestEven=0 ) then
  2591. begin
  2592. if ( roundingMode = float_round_to_zero ) then
  2593. begin
  2594. roundIncrement := 0;
  2595. end
  2596. else begin
  2597. roundIncrement := $3FF;
  2598. if ( zSign<>0 ) then
  2599. begin
  2600. if ( roundingMode = float_round_up ) then
  2601. roundIncrement := 0;
  2602. end
  2603. else begin
  2604. if ( roundingMode = float_round_down ) then
  2605. roundIncrement := 0;
  2606. end
  2607. end
  2608. end;
  2609. roundBits := zSig and $3FF;
  2610. if ( $7FD <= bits16(zExp) ) then
  2611. begin
  2612. if ( ( $7FD < zExp )
  2613. or ( ( zExp = $7FD )
  2614. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2615. ) then
  2616. begin
  2617. float_raise( [float_flag_overflow,float_flag_inexact] );
  2618. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2619. exit;
  2620. end;
  2621. if ( zExp < 0 ) then
  2622. begin
  2623. isTiny := ord(
  2624. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2625. or ( zExp < -1 )
  2626. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2627. shift64RightJamming( zSig, - zExp, zSig );
  2628. zExp := 0;
  2629. roundBits := zSig and $3FF;
  2630. if ( isTiny and roundBits )<>0 then
  2631. float_raise( float_flag_underflow );
  2632. end
  2633. end;
  2634. if ( roundBits<>0 ) then
  2635. set_inexact_flag;
  2636. zSig := ( zSig + roundIncrement ) shr 10;
  2637. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2638. if ( zSig = 0 ) then
  2639. zExp := 0;
  2640. result:=packFloat64( zSign, zExp, zSig );
  2641. end;
  2642. {*
  2643. -------------------------------------------------------------------------------
  2644. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2645. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2646. returns the proper double-precision floating-point value corresponding
  2647. to the abstract input. This routine is just like `roundAndPackFloat64'
  2648. except that the input significand has fewer bits and does not have to be
  2649. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2650. point exponent.
  2651. -------------------------------------------------------------------------------
  2652. *}
  2653. Procedure
  2654. normalizeRoundAndPackFloat64(
  2655. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2656. Var
  2657. shiftCount : int8;
  2658. zSig2 : bits32;
  2659. Begin
  2660. if ( zSig0 = 0 ) then
  2661. Begin
  2662. zSig0 := zSig1;
  2663. zSig1 := 0;
  2664. zExp := zExp -32;
  2665. End;
  2666. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2667. if ( 0 <= shiftCount ) then
  2668. Begin
  2669. zSig2 := 0;
  2670. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2671. End
  2672. else
  2673. Begin
  2674. shift64ExtraRightJamming
  2675. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2676. End;
  2677. zExp := zExp - shiftCount;
  2678. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2679. End;
  2680. {*
  2681. -------------------------------------------------------------------------------
  2682. Returns the result of converting the 32-bit two's complement integer `a' to
  2683. the single-precision floating-point format. The conversion is performed
  2684. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2685. -------------------------------------------------------------------------------
  2686. *}
  2687. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2688. Var
  2689. zSign : Flag;
  2690. Begin
  2691. if ( a = 0 ) then
  2692. Begin
  2693. int32_to_float32.float32 := 0;
  2694. exit;
  2695. End;
  2696. if ( a = sbits32 ($80000000) ) then
  2697. Begin
  2698. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2699. exit;
  2700. end;
  2701. zSign := flag( a < 0 );
  2702. If zSign<>0 then
  2703. a := -a;
  2704. int32_to_float32.float32:=
  2705. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2706. End;
  2707. {*
  2708. -------------------------------------------------------------------------------
  2709. Returns the result of converting the 32-bit two's complement integer `a' to
  2710. the double-precision floating-point format. The conversion is performed
  2711. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2712. -------------------------------------------------------------------------------
  2713. *}
  2714. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2715. var
  2716. zSign : flag;
  2717. absA : bits32;
  2718. shiftCount : int8;
  2719. zSig0, zSig1 : bits32;
  2720. Begin
  2721. if ( a = 0 ) then
  2722. Begin
  2723. packFloat64( 0, 0, 0, 0, result );
  2724. exit;
  2725. end;
  2726. zSign := flag( a < 0 );
  2727. if ZSign<>0 then
  2728. AbsA := -a
  2729. else
  2730. AbsA := a;
  2731. shiftCount := countLeadingZeros32( absA ) - 11;
  2732. if ( 0 <= shiftCount ) then
  2733. Begin
  2734. zSig0 := absA shl shiftCount;
  2735. zSig1 := 0;
  2736. End
  2737. else
  2738. Begin
  2739. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2740. End;
  2741. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2742. End;
  2743. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2744. {$if not defined(packFloatx80)}
  2745. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2746. forward;
  2747. {$endif}
  2748. {*----------------------------------------------------------------------------
  2749. | Returns the result of converting the 32-bit two's complement integer `a'
  2750. | to the extended double-precision floating-point format. The conversion
  2751. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2752. | Arithmetic.
  2753. *----------------------------------------------------------------------------*}
  2754. function int32_to_floatx80( a: int32 ): floatx80;
  2755. var
  2756. zSign: flag;
  2757. absA: uint32;
  2758. shiftCount: int8;
  2759. zSig: bits64;
  2760. begin
  2761. if ( a = 0 ) then begin
  2762. result := packFloatx80( 0, 0, 0 );
  2763. exit;
  2764. end;
  2765. zSign := ord( a < 0 );
  2766. if zSign <> 0 then absA := - a else absA := a;
  2767. shiftCount := countLeadingZeros32( absA ) + 32;
  2768. zSig := absA;
  2769. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2770. end;
  2771. {$endif FPC_SOFTFLOAT_FLOATX80}
  2772. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2773. {$if not defined(packFloat128)}
  2774. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2775. forward;
  2776. {$endif}
  2777. {*----------------------------------------------------------------------------
  2778. | Returns the result of converting the 32-bit two's complement integer `a' to
  2779. | the quadruple-precision floating-point format. The conversion is performed
  2780. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2781. *----------------------------------------------------------------------------*}
  2782. function int32_to_float128( a: int32 ): float128;
  2783. var
  2784. zSign: flag;
  2785. absA: uint32;
  2786. shiftCount: int8;
  2787. zSig0: bits64;
  2788. begin
  2789. if ( a = 0 ) then begin
  2790. result := packFloat128( 0, 0, 0, 0 );
  2791. exit;
  2792. end;
  2793. zSign := ord( a < 0 );
  2794. if zSign <> 0 then absA := - a else absA := a;
  2795. shiftCount := countLeadingZeros32( absA ) + 17;
  2796. zSig0 := absA;
  2797. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2798. end;
  2799. {$endif FPC_SOFTFLOAT_FLOAT128}
  2800. {*
  2801. -------------------------------------------------------------------------------
  2802. Returns the result of converting the single-precision floating-point value
  2803. `a' to the 32-bit two's complement integer format. The conversion is
  2804. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2805. Arithmetic---which means in particular that the conversion is rounded
  2806. according to the current rounding mode. If `a' is a NaN, the largest
  2807. positive integer is returned. Otherwise, if the conversion overflows, the
  2808. largest integer with the same sign as `a' is returned.
  2809. -------------------------------------------------------------------------------
  2810. *}
  2811. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2812. Var
  2813. aSign: flag;
  2814. aExp, shiftCount: int16;
  2815. aSig, aSigExtra: bits32;
  2816. z: int32;
  2817. roundingMode: TFPURoundingMode;
  2818. Begin
  2819. aSig := extractFloat32Frac( a.float32 );
  2820. aExp := extractFloat32Exp( a.float32 );
  2821. aSign := extractFloat32Sign( a.float32 );
  2822. shiftCount := aExp - $96;
  2823. if ( 0 <= shiftCount ) then
  2824. Begin
  2825. if ( $9E <= aExp ) then
  2826. Begin
  2827. if ( a.float32 <> $CF000000 ) then
  2828. Begin
  2829. float_raise( float_flag_invalid );
  2830. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2831. Begin
  2832. float32_to_int32 := $7FFFFFFF;
  2833. exit;
  2834. End;
  2835. End;
  2836. float32_to_int32 := sbits32 ($80000000);
  2837. exit;
  2838. End;
  2839. z := ( aSig or $00800000 ) shl shiftCount;
  2840. if ( aSign<>0 ) then z := - z;
  2841. End
  2842. else
  2843. Begin
  2844. if ( aExp < $7E ) then
  2845. Begin
  2846. aSigExtra := aExp OR aSig;
  2847. z := 0;
  2848. End
  2849. else
  2850. Begin
  2851. aSig := aSig OR $00800000;
  2852. aSigExtra := aSig shl ( shiftCount and 31 );
  2853. z := aSig shr ( - shiftCount );
  2854. End;
  2855. if ( aSigExtra<>0 ) then
  2856. set_inexact_flag;
  2857. roundingMode := softfloat_rounding_mode;
  2858. if ( roundingMode = float_round_nearest_even ) then
  2859. Begin
  2860. if ( sbits32 (aSigExtra) < 0 ) then
  2861. Begin
  2862. Inc(z);
  2863. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2864. z := z and not 1;
  2865. End;
  2866. if ( aSign<>0 ) then
  2867. z := - z;
  2868. End
  2869. else
  2870. Begin
  2871. aSigExtra := flag( aSigExtra <> 0 );
  2872. if ( aSign<>0 ) then
  2873. Begin
  2874. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2875. z := - z;
  2876. End
  2877. else
  2878. Begin
  2879. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2880. End
  2881. End;
  2882. End;
  2883. float32_to_int32 := z;
  2884. End;
  2885. {*
  2886. -------------------------------------------------------------------------------
  2887. Returns the result of converting the single-precision floating-point value
  2888. `a' to the 32-bit two's complement integer format. The conversion is
  2889. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2890. Arithmetic, except that the conversion is always rounded toward zero.
  2891. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2892. the conversion overflows, the largest integer with the same sign as `a' is
  2893. returned.
  2894. -------------------------------------------------------------------------------
  2895. *}
  2896. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2897. Var
  2898. aSign : flag;
  2899. aExp, shiftCount : int16;
  2900. aSig : bits32;
  2901. z : int32;
  2902. Begin
  2903. aSig := extractFloat32Frac( a.float32 );
  2904. aExp := extractFloat32Exp( a.float32 );
  2905. aSign := extractFloat32Sign( a.float32 );
  2906. shiftCount := aExp - $9E;
  2907. if ( 0 <= shiftCount ) then
  2908. Begin
  2909. if ( a.float32 <> $CF000000 ) then
  2910. Begin
  2911. float_raise( float_flag_invalid );
  2912. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2913. Begin
  2914. float32_to_int32_round_to_zero := $7FFFFFFF;
  2915. exit;
  2916. end;
  2917. End;
  2918. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2919. exit;
  2920. End
  2921. else
  2922. if ( aExp <= $7E ) then
  2923. Begin
  2924. if ( aExp or aSig )<>0 then
  2925. set_inexact_flag;
  2926. float32_to_int32_round_to_zero := 0;
  2927. exit;
  2928. End;
  2929. aSig := ( aSig or $00800000 ) shl 8;
  2930. z := aSig shr ( - shiftCount );
  2931. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2932. Begin
  2933. set_inexact_flag;
  2934. End;
  2935. if ( aSign<>0 ) then z := - z;
  2936. float32_to_int32_round_to_zero := z;
  2937. End;
  2938. {*----------------------------------------------------------------------------
  2939. | Returns the result of converting the single-precision floating-point value
  2940. | `a' to the 64-bit two's complement integer format. The conversion is
  2941. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2942. | Arithmetic---which means in particular that the conversion is rounded
  2943. | according to the current rounding mode. If `a' is a NaN, the largest
  2944. | positive integer is returned. Otherwise, if the conversion overflows, the
  2945. | largest integer with the same sign as `a' is returned.
  2946. *----------------------------------------------------------------------------*}
  2947. function float32_to_int64( a: float32 ): int64;
  2948. var
  2949. aSign: flag;
  2950. aExp, shiftCount: int16;
  2951. aSig: bits32;
  2952. aSig64, aSigExtra: bits64;
  2953. begin
  2954. aSig := extractFloat32Frac( a );
  2955. aExp := extractFloat32Exp( a );
  2956. aSign := extractFloat32Sign( a );
  2957. shiftCount := $BE - aExp;
  2958. if ( shiftCount < 0 ) then begin
  2959. float_raise( float_flag_invalid );
  2960. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2961. result := $7FFFFFFFFFFFFFFF;
  2962. exit;
  2963. end;
  2964. result := $8000000000000000;
  2965. exit;
  2966. end;
  2967. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  2968. aSig64 := aSig;
  2969. aSig64 := aSig64 shl 40;
  2970. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  2971. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  2972. end;
  2973. {*----------------------------------------------------------------------------
  2974. | Returns the result of converting the single-precision floating-point value
  2975. | `a' to the 64-bit two's complement integer format. The conversion is
  2976. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2977. | Arithmetic, except that the conversion is always rounded toward zero. If
  2978. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  2979. | conversion overflows, the largest integer with the same sign as `a' is
  2980. | returned.
  2981. *----------------------------------------------------------------------------*}
  2982. function float32_to_int64_round_to_zero( a: float32 ): int64;
  2983. var
  2984. aSign: flag;
  2985. aExp, shiftCount: int16;
  2986. aSig: bits32;
  2987. aSig64: bits64;
  2988. z: int64;
  2989. begin
  2990. aSig := extractFloat32Frac( a );
  2991. aExp := extractFloat32Exp( a );
  2992. aSign := extractFloat32Sign( a );
  2993. shiftCount := aExp - $BE;
  2994. if ( 0 <= shiftCount ) then begin
  2995. if ( a <> $DF000000 ) then begin
  2996. float_raise( float_flag_invalid );
  2997. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2998. result := $7FFFFFFFFFFFFFFF;
  2999. exit;
  3000. end;
  3001. end;
  3002. result := $8000000000000000;
  3003. exit;
  3004. end
  3005. else if ( aExp <= $7E ) then begin
  3006. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3007. result := 0;
  3008. exit;
  3009. end;
  3010. aSig64 := aSig or $00800000;
  3011. aSig64 := aSig64 shl 40;
  3012. z := aSig64 shr ( - shiftCount );
  3013. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3014. set_inexact_flag;
  3015. if ( aSign <> 0 ) then z := - z;
  3016. result := z;
  3017. end;
  3018. {*
  3019. -------------------------------------------------------------------------------
  3020. Returns the result of converting the single-precision floating-point value
  3021. `a' to the double-precision floating-point format. The conversion is
  3022. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3023. Arithmetic.
  3024. -------------------------------------------------------------------------------
  3025. *}
  3026. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3027. Var
  3028. aSign : flag;
  3029. aExp : int16;
  3030. aSig, zSig0, zSig1: bits32;
  3031. tmp : CommonNanT;
  3032. Begin
  3033. aSig := extractFloat32Frac( a.float32 );
  3034. aExp := extractFloat32Exp( a.float32 );
  3035. aSign := extractFloat32Sign( a.float32 );
  3036. if ( aExp = $FF ) then
  3037. Begin
  3038. if ( aSig<>0 ) then
  3039. Begin
  3040. float32ToCommonNaN(a.float32, tmp);
  3041. commonNaNToFloat64(tmp , result);
  3042. exit;
  3043. End;
  3044. packFloat64( aSign, $7FF, 0, 0, result);
  3045. exit;
  3046. End;
  3047. if ( aExp = 0 ) then
  3048. Begin
  3049. if ( aSig = 0 ) then
  3050. Begin
  3051. packFloat64( aSign, 0, 0, 0, result );
  3052. exit;
  3053. end;
  3054. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3055. Dec(aExp);
  3056. End;
  3057. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3058. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3059. End;
  3060. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3061. {*----------------------------------------------------------------------------
  3062. | Returns the result of converting the canonical NaN `a' to the extended
  3063. | double-precision floating-point format.
  3064. *----------------------------------------------------------------------------*}
  3065. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3066. var
  3067. z : floatx80;
  3068. begin
  3069. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3070. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3071. result := z;
  3072. end;
  3073. {*----------------------------------------------------------------------------
  3074. | Returns the result of converting the single-precision floating-point value
  3075. | `a' to the extended double-precision floating-point format. The conversion
  3076. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3077. | Arithmetic.
  3078. *----------------------------------------------------------------------------*}
  3079. function float32_to_floatx80( a: float32 ): floatx80;
  3080. var
  3081. aSign: flag;
  3082. aExp: int16;
  3083. aSig: bits32;
  3084. tmp: commonNaNT;
  3085. begin
  3086. aSig := extractFloat32Frac( a );
  3087. aExp := extractFloat32Exp( a );
  3088. aSign := extractFloat32Sign( a );
  3089. if ( aExp = $FF ) then begin
  3090. if ( aSig <> 0 ) then begin
  3091. float32ToCommonNaN( a, tmp );
  3092. result := commonNaNToFloatx80( tmp );
  3093. exit;
  3094. end;
  3095. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3096. exit;
  3097. end;
  3098. if ( aExp = 0 ) then begin
  3099. if ( aSig = 0 ) then begin
  3100. result := packFloatx80( aSign, 0, 0 );
  3101. exit;
  3102. end;
  3103. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3104. end;
  3105. aSig := aSig or $00800000;
  3106. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3107. end;
  3108. {$endif FPC_SOFTFLOAT_FLOATX80}
  3109. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3110. {*----------------------------------------------------------------------------
  3111. | Returns the result of converting the single-precision floating-point value
  3112. | `a' to the double-precision floating-point format. The conversion is
  3113. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3114. | Arithmetic.
  3115. *----------------------------------------------------------------------------*}
  3116. function float32_to_float128( a: float32 ): float128;
  3117. var
  3118. aSign: flag;
  3119. aExp: int16;
  3120. aSig: bits32;
  3121. tmp: commonNaNT;
  3122. begin
  3123. aSig := extractFloat32Frac( a );
  3124. aExp := extractFloat32Exp( a );
  3125. aSign := extractFloat32Sign( a );
  3126. if ( aExp = $FF ) then begin
  3127. if ( aSig <> 0 ) then begin
  3128. float32ToCommonNaN( a, tmp );
  3129. result := commonNaNToFloat128( tmp );
  3130. exit;
  3131. end;
  3132. result := packFloat128( aSign, $7FFF, 0, 0 );
  3133. exit;
  3134. end;
  3135. if ( aExp = 0 ) then begin
  3136. if ( aSig = 0 ) then begin
  3137. result := packFloat128( aSign, 0, 0, 0 );
  3138. exit;
  3139. end;
  3140. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3141. dec( aExp );
  3142. end;
  3143. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3144. end;
  3145. {$endif FPC_SOFTFLOAT_FLOAT128}
  3146. {*
  3147. -------------------------------------------------------------------------------
  3148. Rounds the single-precision floating-point value `a' to an integer,
  3149. and returns the result as a single-precision floating-point value. The
  3150. operation is performed according to the IEC/IEEE Standard for Binary
  3151. Floating-Point Arithmetic.
  3152. -------------------------------------------------------------------------------
  3153. *}
  3154. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3155. Var
  3156. aSign: flag;
  3157. aExp: int16;
  3158. lastBitMask, roundBitsMask: bits32;
  3159. roundingMode: TFPURoundingMode;
  3160. z: float32;
  3161. Begin
  3162. aExp := extractFloat32Exp( a.float32 );
  3163. if ( $96 <= aExp ) then
  3164. Begin
  3165. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3166. Begin
  3167. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3168. exit;
  3169. End;
  3170. float32_round_to_int:=a;
  3171. exit;
  3172. End;
  3173. if ( aExp <= $7E ) then
  3174. Begin
  3175. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3176. Begin
  3177. float32_round_to_int:=a;
  3178. exit;
  3179. end;
  3180. set_inexact_flag;
  3181. aSign := extractFloat32Sign( a.float32 );
  3182. case ( softfloat_rounding_mode ) of
  3183. float_round_nearest_even:
  3184. Begin
  3185. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3186. Begin
  3187. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3188. exit;
  3189. End;
  3190. End;
  3191. float_round_down:
  3192. Begin
  3193. if aSign <> 0 then
  3194. float32_round_to_int.float32 := $BF800000
  3195. else
  3196. float32_round_to_int.float32 := 0;
  3197. exit;
  3198. End;
  3199. float_round_up:
  3200. Begin
  3201. if aSign <> 0 then
  3202. float32_round_to_int.float32 := $80000000
  3203. else
  3204. float32_round_to_int.float32 := $3F800000;
  3205. exit;
  3206. End;
  3207. end;
  3208. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3209. exit;
  3210. End;
  3211. lastBitMask := 1;
  3212. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3213. lastBitMask := lastBitMask shl ($96 - aExp);
  3214. roundBitsMask := lastBitMask - 1;
  3215. z := a.float32;
  3216. roundingMode := softfloat_rounding_mode;
  3217. if ( roundingMode = float_round_nearest_even ) then
  3218. Begin
  3219. z := z + (lastBitMask shr 1);
  3220. if ( ( z and roundBitsMask ) = 0 ) then
  3221. z := z and not lastBitMask;
  3222. End
  3223. else if ( roundingMode <> float_round_to_zero ) then
  3224. Begin
  3225. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3226. Begin
  3227. z := z + roundBitsMask;
  3228. End;
  3229. End;
  3230. z := z and not roundBitsMask;
  3231. if ( z <> a.float32 ) then
  3232. set_inexact_flag;
  3233. float32_round_to_int.float32 := z;
  3234. End;
  3235. {*
  3236. -------------------------------------------------------------------------------
  3237. Returns the result of adding the absolute values of the single-precision
  3238. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3239. before being returned. `zSign' is ignored if the result is a NaN.
  3240. The addition is performed according to the IEC/IEEE Standard for Binary
  3241. Floating-Point Arithmetic.
  3242. -------------------------------------------------------------------------------
  3243. *}
  3244. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3245. Var
  3246. aExp, bExp, zExp: int16;
  3247. aSig, bSig, zSig: bits32;
  3248. expDiff: int16;
  3249. label roundAndPack;
  3250. Begin
  3251. aSig:=extractFloat32Frac( a );
  3252. aExp:=extractFloat32Exp( a );
  3253. bSig:=extractFloat32Frac( b );
  3254. bExp := extractFloat32Exp( b );
  3255. expDiff := aExp - bExp;
  3256. aSig := aSig shl 6;
  3257. bSig := bSig shl 6;
  3258. if ( 0 < expDiff ) then
  3259. Begin
  3260. if ( aExp = $FF ) then
  3261. Begin
  3262. if ( aSig <> 0) then
  3263. Begin
  3264. addFloat32Sigs := propagateFloat32NaN( a, b );
  3265. exit;
  3266. End;
  3267. addFloat32Sigs := a;
  3268. exit;
  3269. End;
  3270. if ( bExp = 0 ) then
  3271. Begin
  3272. Dec(expDiff);
  3273. End
  3274. else
  3275. Begin
  3276. bSig := bSig or $20000000;
  3277. End;
  3278. shift32RightJamming( bSig, expDiff, bSig );
  3279. zExp := aExp;
  3280. End
  3281. else
  3282. If ( expDiff < 0 ) then
  3283. Begin
  3284. if ( bExp = $FF ) then
  3285. Begin
  3286. if ( bSig<>0 ) then
  3287. Begin
  3288. addFloat32Sigs := propagateFloat32NaN( a, b );
  3289. exit;
  3290. end;
  3291. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3292. exit;
  3293. End;
  3294. if ( aExp = 0 ) then
  3295. Begin
  3296. Inc(expDiff);
  3297. End
  3298. else
  3299. Begin
  3300. aSig := aSig OR $20000000;
  3301. End;
  3302. shift32RightJamming( aSig, - expDiff, aSig );
  3303. zExp := bExp;
  3304. End
  3305. else
  3306. Begin
  3307. if ( aExp = $FF ) then
  3308. Begin
  3309. if ( aSig OR bSig )<> 0 then
  3310. Begin
  3311. addFloat32Sigs := propagateFloat32NaN( a, b );
  3312. exit;
  3313. end;
  3314. addFloat32Sigs := a;
  3315. exit;
  3316. End;
  3317. if ( aExp = 0 ) then
  3318. Begin
  3319. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3320. exit;
  3321. end;
  3322. zSig := $40000000 + aSig + bSig;
  3323. zExp := aExp;
  3324. goto roundAndPack;
  3325. End;
  3326. aSig := aSig OR $20000000;
  3327. zSig := ( aSig + bSig ) shl 1;
  3328. Dec(zExp);
  3329. if ( sbits32 (zSig) < 0 ) then
  3330. Begin
  3331. zSig := aSig + bSig;
  3332. Inc(zExp);
  3333. End;
  3334. roundAndPack:
  3335. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3336. End;
  3337. {*
  3338. -------------------------------------------------------------------------------
  3339. Returns the result of subtracting the absolute values of the single-
  3340. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3341. difference is negated before being returned. `zSign' is ignored if the
  3342. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3343. Standard for Binary Floating-Point Arithmetic.
  3344. -------------------------------------------------------------------------------
  3345. *}
  3346. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3347. Var
  3348. aExp, bExp, zExp: int16;
  3349. aSig, bSig, zSig: bits32;
  3350. expDiff : int16;
  3351. label aExpBigger;
  3352. label bExpBigger;
  3353. label aBigger;
  3354. label bBigger;
  3355. label normalizeRoundAndPack;
  3356. Begin
  3357. aSig := extractFloat32Frac( a );
  3358. aExp := extractFloat32Exp( a );
  3359. bSig := extractFloat32Frac( b );
  3360. bExp := extractFloat32Exp( b );
  3361. expDiff := aExp - bExp;
  3362. aSig := aSig shl 7;
  3363. bSig := bSig shl 7;
  3364. if ( 0 < expDiff ) then goto aExpBigger;
  3365. if ( expDiff < 0 ) then goto bExpBigger;
  3366. if ( aExp = $FF ) then
  3367. Begin
  3368. if ( aSig OR bSig )<> 0 then
  3369. Begin
  3370. subFloat32Sigs := propagateFloat32NaN( a, b );
  3371. exit;
  3372. End;
  3373. float_raise( float_flag_invalid );
  3374. subFloat32Sigs := float32_default_nan;
  3375. exit;
  3376. End;
  3377. if ( aExp = 0 ) then
  3378. Begin
  3379. aExp := 1;
  3380. bExp := 1;
  3381. End;
  3382. if ( bSig < aSig ) Then goto aBigger;
  3383. if ( aSig < bSig ) Then goto bBigger;
  3384. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3385. exit;
  3386. bExpBigger:
  3387. if ( bExp = $FF ) then
  3388. Begin
  3389. if ( bSig<>0 ) then
  3390. Begin
  3391. subFloat32Sigs := propagateFloat32NaN( a, b );
  3392. exit;
  3393. End;
  3394. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3395. exit;
  3396. End;
  3397. if ( aExp = 0 ) then
  3398. Begin
  3399. Inc(expDiff);
  3400. End
  3401. else
  3402. Begin
  3403. aSig := aSig OR $40000000;
  3404. End;
  3405. shift32RightJamming( aSig, - expDiff, aSig );
  3406. bSig := bSig OR $40000000;
  3407. bBigger:
  3408. zSig := bSig - aSig;
  3409. zExp := bExp;
  3410. zSign := zSign xor 1;
  3411. goto normalizeRoundAndPack;
  3412. aExpBigger:
  3413. if ( aExp = $FF ) then
  3414. Begin
  3415. if ( aSig <> 0) then
  3416. Begin
  3417. subFloat32Sigs := propagateFloat32NaN( a, b );
  3418. exit;
  3419. End;
  3420. subFloat32Sigs := a;
  3421. exit;
  3422. End;
  3423. if ( bExp = 0 ) then
  3424. Begin
  3425. Dec(expDiff);
  3426. End
  3427. else
  3428. Begin
  3429. bSig := bSig OR $40000000;
  3430. End;
  3431. shift32RightJamming( bSig, expDiff, bSig );
  3432. aSig := aSig OR $40000000;
  3433. aBigger:
  3434. zSig := aSig - bSig;
  3435. zExp := aExp;
  3436. normalizeRoundAndPack:
  3437. Dec(zExp);
  3438. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3439. End;
  3440. {*
  3441. -------------------------------------------------------------------------------
  3442. Returns the result of adding the single-precision floating-point values `a'
  3443. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3444. Binary Floating-Point Arithmetic.
  3445. -------------------------------------------------------------------------------
  3446. *}
  3447. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3448. Var
  3449. aSign, bSign: Flag;
  3450. Begin
  3451. aSign := extractFloat32Sign( a.float32 );
  3452. bSign := extractFloat32Sign( b.float32 );
  3453. if ( aSign = bSign ) then
  3454. Begin
  3455. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3456. End
  3457. else
  3458. Begin
  3459. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3460. End;
  3461. End;
  3462. {*
  3463. -------------------------------------------------------------------------------
  3464. Returns the result of subtracting the single-precision floating-point values
  3465. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3466. for Binary Floating-Point Arithmetic.
  3467. -------------------------------------------------------------------------------
  3468. *}
  3469. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3470. Var
  3471. aSign, bSign: flag;
  3472. Begin
  3473. aSign := extractFloat32Sign( a.float32 );
  3474. bSign := extractFloat32Sign( b.float32 );
  3475. if ( aSign = bSign ) then
  3476. Begin
  3477. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3478. End
  3479. else
  3480. Begin
  3481. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3482. End;
  3483. End;
  3484. {*
  3485. -------------------------------------------------------------------------------
  3486. Returns the result of multiplying the single-precision floating-point values
  3487. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3488. for Binary Floating-Point Arithmetic.
  3489. -------------------------------------------------------------------------------
  3490. *}
  3491. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3492. Var
  3493. aSign, bSign, zSign: flag;
  3494. aExp, bExp, zExp : int16;
  3495. aSig, bSig, zSig0, zSig1: bits32;
  3496. Begin
  3497. aSig := extractFloat32Frac( a.float32 );
  3498. aExp := extractFloat32Exp( a.float32 );
  3499. aSign := extractFloat32Sign( a.float32 );
  3500. bSig := extractFloat32Frac( b.float32 );
  3501. bExp := extractFloat32Exp( b.float32 );
  3502. bSign := extractFloat32Sign( b.float32 );
  3503. zSign := aSign xor bSign;
  3504. if ( aExp = $FF ) then
  3505. Begin
  3506. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3507. Begin
  3508. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3509. exit;
  3510. End;
  3511. if ( ( bExp OR bSig ) = 0 ) then
  3512. Begin
  3513. float_raise( float_flag_invalid );
  3514. float32_mul.float32 := float32_default_nan;
  3515. exit;
  3516. End;
  3517. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3518. exit;
  3519. End;
  3520. if ( bExp = $FF ) then
  3521. Begin
  3522. if ( bSig <> 0 ) then
  3523. Begin
  3524. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3525. exit;
  3526. End;
  3527. if ( ( aExp OR aSig ) = 0 ) then
  3528. Begin
  3529. float_raise( float_flag_invalid );
  3530. float32_mul.float32 := float32_default_nan;
  3531. exit;
  3532. End;
  3533. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3534. exit;
  3535. End;
  3536. if ( aExp = 0 ) then
  3537. Begin
  3538. if ( aSig = 0 ) then
  3539. Begin
  3540. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3541. exit;
  3542. End;
  3543. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3544. End;
  3545. if ( bExp = 0 ) then
  3546. Begin
  3547. if ( bSig = 0 ) then
  3548. Begin
  3549. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3550. exit;
  3551. End;
  3552. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3553. End;
  3554. zExp := aExp + bExp - $7F;
  3555. aSig := ( aSig OR $00800000 ) shl 7;
  3556. bSig := ( bSig OR $00800000 ) shl 8;
  3557. mul32To64( aSig, bSig, zSig0, zSig1 );
  3558. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3559. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3560. Begin
  3561. zSig0 := zSig0 shl 1;
  3562. Dec(zExp);
  3563. End;
  3564. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3565. End;
  3566. {*
  3567. -------------------------------------------------------------------------------
  3568. Returns the result of dividing the single-precision floating-point value `a'
  3569. by the corresponding value `b'. The operation is performed according to the
  3570. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3571. -------------------------------------------------------------------------------
  3572. *}
  3573. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3574. Var
  3575. aSign, bSign, zSign: flag;
  3576. aExp, bExp, zExp: int16;
  3577. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3578. Begin
  3579. aSig := extractFloat32Frac( a.float32 );
  3580. aExp := extractFloat32Exp( a.float32 );
  3581. aSign := extractFloat32Sign( a.float32 );
  3582. bSig := extractFloat32Frac( b.float32 );
  3583. bExp := extractFloat32Exp( b.float32 );
  3584. bSign := extractFloat32Sign( b.float32 );
  3585. zSign := aSign xor bSign;
  3586. if ( aExp = $FF ) then
  3587. Begin
  3588. if ( aSig <> 0 ) then
  3589. Begin
  3590. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3591. exit;
  3592. End;
  3593. if ( bExp = $FF ) then
  3594. Begin
  3595. if ( bSig <> 0) then
  3596. Begin
  3597. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3598. exit;
  3599. End;
  3600. float_raise( float_flag_invalid );
  3601. float32_div.float32 := float32_default_nan;
  3602. exit;
  3603. End;
  3604. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3605. exit;
  3606. End;
  3607. if ( bExp = $FF ) then
  3608. Begin
  3609. if ( bSig <> 0) then
  3610. Begin
  3611. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3612. exit;
  3613. End;
  3614. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3615. exit;
  3616. End;
  3617. if ( bExp = 0 ) Then
  3618. Begin
  3619. if ( bSig = 0 ) Then
  3620. Begin
  3621. if ( ( aExp OR aSig ) = 0 ) then
  3622. Begin
  3623. float_raise( float_flag_invalid );
  3624. float32_div.float32 := float32_default_nan;
  3625. exit;
  3626. End;
  3627. float_raise( float_flag_divbyzero );
  3628. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3629. exit;
  3630. End;
  3631. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3632. End;
  3633. if ( aExp = 0 ) Then
  3634. Begin
  3635. if ( aSig = 0 ) Then
  3636. Begin
  3637. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3638. exit;
  3639. End;
  3640. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3641. End;
  3642. zExp := aExp - bExp + $7D;
  3643. aSig := ( aSig OR $00800000 ) shl 7;
  3644. bSig := ( bSig OR $00800000 ) shl 8;
  3645. if ( bSig <= ( aSig + aSig ) ) then
  3646. Begin
  3647. aSig := aSig shr 1;
  3648. Inc(zExp);
  3649. End;
  3650. zSig := estimateDiv64To32( aSig, 0, bSig );
  3651. if ( ( zSig and $3F ) <= 2 ) then
  3652. Begin
  3653. mul32To64( bSig, zSig, term0, term1 );
  3654. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3655. while ( sbits32 (rem0) < 0 ) do
  3656. Begin
  3657. Dec(zSig);
  3658. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3659. End;
  3660. zSig := zSig or bits32( rem1 <> 0 );
  3661. End;
  3662. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3663. End;
  3664. {*
  3665. -------------------------------------------------------------------------------
  3666. Returns the remainder of the single-precision floating-point value `a'
  3667. with respect to the corresponding value `b'. The operation is performed
  3668. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3669. -------------------------------------------------------------------------------
  3670. *}
  3671. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3672. Var
  3673. aSign, zSign: flag;
  3674. aExp, bExp, expDiff: int16;
  3675. aSig, bSig, q, alternateASig: bits32;
  3676. sigMean: sbits32;
  3677. Begin
  3678. aSig := extractFloat32Frac( a.float32 );
  3679. aExp := extractFloat32Exp( a.float32 );
  3680. aSign := extractFloat32Sign( a.float32 );
  3681. bSig := extractFloat32Frac( b.float32 );
  3682. bExp := extractFloat32Exp( b.float32 );
  3683. if ( aExp = $FF ) then
  3684. Begin
  3685. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3686. Begin
  3687. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3688. exit;
  3689. End;
  3690. float_raise( float_flag_invalid );
  3691. float32_rem.float32 := float32_default_nan;
  3692. exit;
  3693. End;
  3694. if ( bExp = $FF ) then
  3695. Begin
  3696. if ( bSig <> 0 ) then
  3697. Begin
  3698. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3699. exit;
  3700. End;
  3701. float32_rem := a;
  3702. exit;
  3703. End;
  3704. if ( bExp = 0 ) then
  3705. Begin
  3706. if ( bSig = 0 ) then
  3707. Begin
  3708. float_raise( float_flag_invalid );
  3709. float32_rem.float32 := float32_default_nan;
  3710. exit;
  3711. End;
  3712. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3713. End;
  3714. if ( aExp = 0 ) then
  3715. Begin
  3716. if ( aSig = 0 ) then
  3717. Begin
  3718. float32_rem := a;
  3719. exit;
  3720. End;
  3721. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3722. End;
  3723. expDiff := aExp - bExp;
  3724. aSig := ( aSig OR $00800000 ) shl 8;
  3725. bSig := ( bSig OR $00800000 ) shl 8;
  3726. if ( expDiff < 0 ) then
  3727. Begin
  3728. if ( expDiff < -1 ) then
  3729. Begin
  3730. float32_rem := a;
  3731. exit;
  3732. End;
  3733. aSig := aSig shr 1;
  3734. End;
  3735. q := bits32( bSig <= aSig );
  3736. if ( q <> 0) then
  3737. aSig := aSig - bSig;
  3738. expDiff := expDiff - 32;
  3739. while ( 0 < expDiff ) do
  3740. Begin
  3741. q := estimateDiv64To32( aSig, 0, bSig );
  3742. if (2 < q) then
  3743. q := q - 2
  3744. else
  3745. q := 0;
  3746. aSig := - ( ( bSig shr 2 ) * q );
  3747. expDiff := expDiff - 30;
  3748. End;
  3749. expDiff := expDiff + 32;
  3750. if ( 0 < expDiff ) then
  3751. Begin
  3752. q := estimateDiv64To32( aSig, 0, bSig );
  3753. if (2 < q) then
  3754. q := q - 2
  3755. else
  3756. q := 0;
  3757. q := q shr (32 - expDiff);
  3758. bSig := bSig shr 2;
  3759. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3760. End
  3761. else
  3762. Begin
  3763. aSig := aSig shr 2;
  3764. bSig := bSig shr 2;
  3765. End;
  3766. Repeat
  3767. alternateASig := aSig;
  3768. Inc(q);
  3769. aSig := aSig - bSig;
  3770. Until not ( 0 <= sbits32 (aSig) );
  3771. sigMean := aSig + alternateASig;
  3772. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3773. Begin
  3774. aSig := alternateASig;
  3775. End;
  3776. zSign := flag( sbits32 (aSig) < 0 );
  3777. if ( zSign<>0 ) then
  3778. aSig := - aSig;
  3779. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3780. End;
  3781. {*
  3782. -------------------------------------------------------------------------------
  3783. Returns the square root of the single-precision floating-point value `a'.
  3784. The operation is performed according to the IEC/IEEE Standard for Binary
  3785. Floating-Point Arithmetic.
  3786. -------------------------------------------------------------------------------
  3787. *}
  3788. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3789. Var
  3790. aSign : flag;
  3791. aExp, zExp : int16;
  3792. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3793. label roundAndPack;
  3794. Begin
  3795. aSig := extractFloat32Frac( a.float32 );
  3796. aExp := extractFloat32Exp( a.float32 );
  3797. aSign := extractFloat32Sign( a.float32 );
  3798. if ( aExp = $FF ) then
  3799. Begin
  3800. if ( aSig <> 0) then
  3801. Begin
  3802. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3803. exit;
  3804. End;
  3805. if ( aSign = 0) then
  3806. Begin
  3807. float32_sqrt := a;
  3808. exit;
  3809. End;
  3810. float_raise( float_flag_invalid );
  3811. float32_sqrt.float32 := float32_default_nan;
  3812. exit;
  3813. End;
  3814. if ( aSign <> 0) then
  3815. Begin
  3816. if ( ( aExp OR aSig ) = 0 ) then
  3817. Begin
  3818. float32_sqrt := a;
  3819. exit;
  3820. End;
  3821. float_raise( float_flag_invalid );
  3822. float32_sqrt.float32 := float32_default_nan;
  3823. exit;
  3824. End;
  3825. if ( aExp = 0 ) then
  3826. Begin
  3827. if ( aSig = 0 ) then
  3828. Begin
  3829. float32_sqrt.float32 := 0;
  3830. exit;
  3831. End;
  3832. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3833. End;
  3834. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3835. aSig := ( aSig OR $00800000 ) shl 8;
  3836. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3837. if ( ( zSig and $7F ) <= 5 ) then
  3838. Begin
  3839. if ( zSig < 2 ) then
  3840. Begin
  3841. zSig := $7FFFFFFF;
  3842. goto roundAndPack;
  3843. End
  3844. else
  3845. Begin
  3846. aSig := aSig shr (aExp and 1);
  3847. mul32To64( zSig, zSig, term0, term1 );
  3848. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3849. while ( sbits32 (rem0) < 0 ) do
  3850. Begin
  3851. Dec(zSig);
  3852. shortShift64Left( 0, zSig, 1, term0, term1 );
  3853. term1 := term1 or 1;
  3854. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3855. End;
  3856. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3857. End;
  3858. End;
  3859. shift32RightJamming( zSig, 1, zSig );
  3860. roundAndPack:
  3861. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3862. End;
  3863. {*
  3864. -------------------------------------------------------------------------------
  3865. Returns 1 if the single-precision floating-point value `a' is equal to
  3866. the corresponding value `b', and 0 otherwise. The comparison is performed
  3867. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3868. -------------------------------------------------------------------------------
  3869. *}
  3870. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3871. Begin
  3872. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3873. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3874. ) then
  3875. Begin
  3876. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3877. Begin
  3878. float_raise( float_flag_invalid );
  3879. End;
  3880. float32_eq := 0;
  3881. exit;
  3882. End;
  3883. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3884. End;
  3885. {*
  3886. -------------------------------------------------------------------------------
  3887. Returns 1 if the single-precision floating-point value `a' is less than
  3888. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3889. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3890. Arithmetic.
  3891. -------------------------------------------------------------------------------
  3892. *}
  3893. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3894. var
  3895. aSign, bSign: flag;
  3896. Begin
  3897. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3898. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3899. ) then
  3900. Begin
  3901. float_raise( float_flag_invalid );
  3902. float32_le := 0;
  3903. exit;
  3904. End;
  3905. aSign := extractFloat32Sign( a.float32 );
  3906. bSign := extractFloat32Sign( b.float32 );
  3907. if ( aSign <> bSign ) then
  3908. Begin
  3909. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3910. exit;
  3911. End;
  3912. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3913. End;
  3914. {*
  3915. -------------------------------------------------------------------------------
  3916. Returns 1 if the single-precision floating-point value `a' is less than
  3917. the corresponding value `b', and 0 otherwise. The comparison is performed
  3918. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3919. -------------------------------------------------------------------------------
  3920. *}
  3921. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3922. var
  3923. aSign, bSign: flag;
  3924. Begin
  3925. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3926. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3927. ) then
  3928. Begin
  3929. float_raise( float_flag_invalid );
  3930. float32_lt :=0;
  3931. exit;
  3932. End;
  3933. aSign := extractFloat32Sign( a.float32 );
  3934. bSign := extractFloat32Sign( b.float32 );
  3935. if ( aSign <> bSign ) then
  3936. Begin
  3937. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3938. exit;
  3939. End;
  3940. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3941. End;
  3942. {*
  3943. -------------------------------------------------------------------------------
  3944. Returns 1 if the single-precision floating-point value `a' is equal to
  3945. the corresponding value `b', and 0 otherwise. The invalid exception is
  3946. raised if either operand is a NaN. Otherwise, the comparison is performed
  3947. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3948. -------------------------------------------------------------------------------
  3949. *}
  3950. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3951. Begin
  3952. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3953. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3954. ) then
  3955. Begin
  3956. float_raise( float_flag_invalid );
  3957. float32_eq_signaling := 0;
  3958. exit;
  3959. End;
  3960. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3961. End;
  3962. {*
  3963. -------------------------------------------------------------------------------
  3964. Returns 1 if the single-precision floating-point value `a' is less than or
  3965. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3966. cause an exception. Otherwise, the comparison is performed according to the
  3967. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3968. -------------------------------------------------------------------------------
  3969. *}
  3970. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3971. Var
  3972. aSign, bSign: flag;
  3973. Begin
  3974. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3975. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3976. ) then
  3977. Begin
  3978. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3979. Begin
  3980. float_raise( float_flag_invalid );
  3981. End;
  3982. float32_le_quiet := 0;
  3983. exit;
  3984. End;
  3985. aSign := extractFloat32Sign( a );
  3986. bSign := extractFloat32Sign( b );
  3987. if ( aSign <> bSign ) then
  3988. Begin
  3989. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3990. exit;
  3991. End;
  3992. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3993. End;
  3994. {*
  3995. -------------------------------------------------------------------------------
  3996. Returns 1 if the single-precision floating-point value `a' is less than
  3997. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3998. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3999. Standard for Binary Floating-Point Arithmetic.
  4000. -------------------------------------------------------------------------------
  4001. *}
  4002. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4003. Var
  4004. aSign, bSign: flag;
  4005. Begin
  4006. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4007. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4008. ) then
  4009. Begin
  4010. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4011. Begin
  4012. float_raise( float_flag_invalid );
  4013. End;
  4014. float32_lt_quiet := 0;
  4015. exit;
  4016. End;
  4017. aSign := extractFloat32Sign( a );
  4018. bSign := extractFloat32Sign( b );
  4019. if ( aSign <> bSign ) then
  4020. Begin
  4021. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4022. exit;
  4023. End;
  4024. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4025. End;
  4026. {*
  4027. -------------------------------------------------------------------------------
  4028. Returns the result of converting the double-precision floating-point value
  4029. `a' to the 32-bit two's complement integer format. The conversion is
  4030. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4031. Arithmetic---which means in particular that the conversion is rounded
  4032. according to the current rounding mode. If `a' is a NaN, the largest
  4033. positive integer is returned. Otherwise, if the conversion overflows, the
  4034. largest integer with the same sign as `a' is returned.
  4035. -------------------------------------------------------------------------------
  4036. *}
  4037. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4038. var
  4039. aSign: flag;
  4040. aExp, shiftCount: int16;
  4041. aSig0, aSig1, absZ, aSigExtra: bits32;
  4042. z: int32;
  4043. roundingMode: TFPURoundingMode;
  4044. label invalid;
  4045. Begin
  4046. aSig1 := extractFloat64Frac1( a );
  4047. aSig0 := extractFloat64Frac0( a );
  4048. aExp := extractFloat64Exp( a );
  4049. aSign := extractFloat64Sign( a );
  4050. shiftCount := aExp - $413;
  4051. if ( 0 <= shiftCount ) then
  4052. Begin
  4053. if ( $41E < aExp ) then
  4054. Begin
  4055. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4056. aSign := 0;
  4057. goto invalid;
  4058. End;
  4059. shortShift64Left(
  4060. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4061. if ( $80000000 < absZ ) then
  4062. goto invalid;
  4063. End
  4064. else
  4065. Begin
  4066. aSig1 := flag( aSig1 <> 0 );
  4067. if ( aExp < $3FE ) then
  4068. Begin
  4069. aSigExtra := aExp OR aSig0 OR aSig1;
  4070. absZ := 0;
  4071. End
  4072. else
  4073. Begin
  4074. aSig0 := aSig0 OR $00100000;
  4075. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4076. absZ := aSig0 shr ( - shiftCount );
  4077. End;
  4078. End;
  4079. roundingMode := softfloat_rounding_mode;
  4080. if ( roundingMode = float_round_nearest_even ) then
  4081. Begin
  4082. if ( sbits32(aSigExtra) < 0 ) then
  4083. Begin
  4084. Inc(absZ);
  4085. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4086. absZ := absZ and not 1;
  4087. End;
  4088. if aSign <> 0 then
  4089. z := - absZ
  4090. else
  4091. z := absZ;
  4092. End
  4093. else
  4094. Begin
  4095. aSigExtra := bits32( aSigExtra <> 0 );
  4096. if ( aSign <> 0) then
  4097. Begin
  4098. z := - ( absZ
  4099. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4100. End
  4101. else
  4102. Begin
  4103. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4104. End
  4105. End;
  4106. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4107. Begin
  4108. invalid:
  4109. float_raise( float_flag_invalid );
  4110. if (aSign <> 0 ) then
  4111. float64_to_int32 := sbits32 ($80000000)
  4112. else
  4113. float64_to_int32 := $7FFFFFFF;
  4114. exit;
  4115. End;
  4116. if ( aSigExtra <> 0) then
  4117. set_inexact_flag;
  4118. float64_to_int32 := z;
  4119. End;
  4120. {*
  4121. -------------------------------------------------------------------------------
  4122. Returns the result of converting the double-precision floating-point value
  4123. `a' to the 32-bit two's complement integer format. The conversion is
  4124. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4125. Arithmetic, except that the conversion is always rounded toward zero.
  4126. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4127. the conversion overflows, the largest integer with the same sign as `a' is
  4128. returned.
  4129. -------------------------------------------------------------------------------
  4130. *}
  4131. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4132. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4133. Var
  4134. aSign: flag;
  4135. aExp, shiftCount: int16;
  4136. aSig0, aSig1, absZ, aSigExtra: bits32;
  4137. z: int32;
  4138. label invalid;
  4139. Begin
  4140. aSig1 := extractFloat64Frac1( a );
  4141. aSig0 := extractFloat64Frac0( a );
  4142. aExp := extractFloat64Exp( a );
  4143. aSign := extractFloat64Sign( a );
  4144. shiftCount := aExp - $413;
  4145. if ( 0 <= shiftCount ) then
  4146. Begin
  4147. if ( $41E < aExp ) then
  4148. Begin
  4149. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4150. aSign := 0;
  4151. goto invalid;
  4152. End;
  4153. shortShift64Left(
  4154. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4155. End
  4156. else
  4157. Begin
  4158. if ( aExp < $3FF ) then
  4159. Begin
  4160. if ( aExp OR aSig0 OR aSig1 )<>0 then
  4161. Begin
  4162. set_inexact_flag;
  4163. End;
  4164. float64_to_int32_round_to_zero := 0;
  4165. exit;
  4166. End;
  4167. aSig0 := aSig0 or $00100000;
  4168. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4169. absZ := aSig0 shr ( - shiftCount );
  4170. End;
  4171. if aSign <> 0 then
  4172. z := - absZ
  4173. else
  4174. z := absZ;
  4175. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4176. Begin
  4177. invalid:
  4178. float_raise( float_flag_invalid );
  4179. if (aSign <> 0) then
  4180. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4181. else
  4182. float64_to_int32_round_to_zero := $7FFFFFFF;
  4183. exit;
  4184. End;
  4185. if ( aSigExtra <> 0) then
  4186. set_inexact_flag;
  4187. float64_to_int32_round_to_zero := z;
  4188. End;
  4189. {*----------------------------------------------------------------------------
  4190. | Returns the result of converting the double-precision floating-point value
  4191. | `a' to the 64-bit two's complement integer format. The conversion is
  4192. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4193. | Arithmetic---which means in particular that the conversion is rounded
  4194. | according to the current rounding mode. If `a' is a NaN, the largest
  4195. | positive integer is returned. Otherwise, if the conversion overflows, the
  4196. | largest integer with the same sign as `a' is returned.
  4197. *----------------------------------------------------------------------------*}
  4198. function float64_to_int64( a: float64 ): int64;
  4199. var
  4200. aSign: flag;
  4201. aExp, shiftCount: int16;
  4202. aSig, aSigExtra: bits64;
  4203. begin
  4204. aSig := extractFloat64Frac( a );
  4205. aExp := extractFloat64Exp( a );
  4206. aSign := extractFloat64Sign( a );
  4207. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4208. shiftCount := $433 - aExp;
  4209. if ( shiftCount <= 0 ) then begin
  4210. if ( $43E < aExp ) then begin
  4211. float_raise( float_flag_invalid );
  4212. if ( ( aSign = 0 )
  4213. or ( ( aExp = $7FF )
  4214. and ( aSig <> $0010000000000000 ) )
  4215. ) then begin
  4216. result := $7FFFFFFFFFFFFFFF;
  4217. exit;
  4218. end;
  4219. result := $8000000000000000;
  4220. exit;
  4221. end;
  4222. aSigExtra := 0;
  4223. aSig := aSig shl ( - shiftCount );
  4224. end
  4225. else
  4226. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4227. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4228. end;
  4229. {*----------------------------------------------------------------------------
  4230. | Returns the result of converting the double-precision floating-point value
  4231. | `a' to the 64-bit two's complement integer format. The conversion is
  4232. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4233. | Arithmetic, except that the conversion is always rounded toward zero.
  4234. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4235. | the conversion overflows, the largest integer with the same sign as `a' is
  4236. | returned.
  4237. *----------------------------------------------------------------------------*}
  4238. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4239. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4240. var
  4241. aSign: flag;
  4242. aExp, shiftCount: int16;
  4243. aSig: bits64;
  4244. z: int64;
  4245. begin
  4246. aSig := extractFloat64Frac( a );
  4247. aExp := extractFloat64Exp( a );
  4248. aSign := extractFloat64Sign( a );
  4249. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4250. shiftCount := aExp - $433;
  4251. if ( 0 <= shiftCount ) then begin
  4252. if ( $43E <= aExp ) then begin
  4253. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4254. float_raise( float_flag_invalid );
  4255. if ( ( aSign = 0 )
  4256. or ( ( aExp = $7FF )
  4257. and ( aSig <> $0010000000000000 ) )
  4258. ) then begin
  4259. result := $7FFFFFFFFFFFFFFF;
  4260. exit;
  4261. end;
  4262. end;
  4263. result := $8000000000000000;
  4264. exit;
  4265. end;
  4266. z := aSig shl shiftCount;
  4267. end
  4268. else begin
  4269. if ( aExp < $3FE ) then begin
  4270. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4271. result := 0;
  4272. exit;
  4273. end;
  4274. z := aSig shr ( - shiftCount );
  4275. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4276. set_inexact_flag;
  4277. end;
  4278. if ( aSign <> 0 ) then z := - z;
  4279. result := z;
  4280. end;
  4281. {*
  4282. -------------------------------------------------------------------------------
  4283. Returns the result of converting the double-precision floating-point value
  4284. `a' to the single-precision floating-point format. The conversion is
  4285. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4286. Arithmetic.
  4287. -------------------------------------------------------------------------------
  4288. *}
  4289. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4290. Var
  4291. aSign: flag;
  4292. aExp: int16;
  4293. aSig0, aSig1, zSig: bits32;
  4294. allZero: bits32;
  4295. tmp : CommonNanT;
  4296. Begin
  4297. aSig1 := extractFloat64Frac1( a );
  4298. aSig0 := extractFloat64Frac0( a );
  4299. aExp := extractFloat64Exp( a );
  4300. aSign := extractFloat64Sign( a );
  4301. if ( aExp = $7FF ) then
  4302. Begin
  4303. if ( aSig0 OR aSig1 ) <> 0 then
  4304. Begin
  4305. float64ToCommonNaN( a, tmp );
  4306. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4307. exit;
  4308. End;
  4309. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4310. exit;
  4311. End;
  4312. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4313. if ( aExp <> 0) then
  4314. zSig := zSig OR $40000000;
  4315. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4316. End;
  4317. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4318. {*----------------------------------------------------------------------------
  4319. | Returns the result of converting the double-precision floating-point value
  4320. | `a' to the extended double-precision floating-point format. The conversion
  4321. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4322. | Arithmetic.
  4323. *----------------------------------------------------------------------------*}
  4324. function float64_to_floatx80( a: float64 ): floatx80;
  4325. var
  4326. aSign: flag;
  4327. aExp: int16;
  4328. aSig: bits64;
  4329. begin
  4330. aSig := extractFloat64Frac( a );
  4331. aExp := extractFloat64Exp( a );
  4332. aSign := extractFloat64Sign( a );
  4333. if ( aExp = $7FF ) then begin
  4334. if ( aSig <> 0 ) then begin
  4335. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4336. exit;
  4337. end;
  4338. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4339. exit;
  4340. end;
  4341. if ( aExp = 0 ) then begin
  4342. if ( aSig = 0 ) then begin
  4343. result := packFloatx80( aSign, 0, 0 );
  4344. exit;
  4345. end;
  4346. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4347. end;
  4348. result :=
  4349. packFloatx80(
  4350. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4351. end;
  4352. {$endif FPC_SOFTFLOAT_FLOATX80}
  4353. {*
  4354. -------------------------------------------------------------------------------
  4355. Rounds the double-precision floating-point value `a' to an integer,
  4356. and returns the result as a double-precision floating-point value. The
  4357. operation is performed according to the IEC/IEEE Standard for Binary
  4358. Floating-Point Arithmetic.
  4359. -------------------------------------------------------------------------------
  4360. *}
  4361. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4362. Var
  4363. aSign: flag;
  4364. aExp: int16;
  4365. lastBitMask, roundBitsMask: bits32;
  4366. roundingMode: TFPURoundingMode;
  4367. z: float64;
  4368. Begin
  4369. aExp := extractFloat64Exp( a );
  4370. if ( $413 <= aExp ) then
  4371. Begin
  4372. if ( $433 <= aExp ) then
  4373. Begin
  4374. if ( ( aExp = $7FF )
  4375. AND
  4376. (
  4377. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4378. ) <>0)
  4379. ) then
  4380. Begin
  4381. propagateFloat64NaN( a, a, result );
  4382. exit;
  4383. End;
  4384. result := a;
  4385. exit;
  4386. End;
  4387. lastBitMask := 1;
  4388. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4389. roundBitsMask := lastBitMask - 1;
  4390. z := a;
  4391. roundingMode := softfloat_rounding_mode;
  4392. if ( roundingMode = float_round_nearest_even ) then
  4393. Begin
  4394. if ( lastBitMask <> 0) then
  4395. Begin
  4396. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4397. if ( ( z.low and roundBitsMask ) = 0 ) then
  4398. z.low := z.low and not lastBitMask;
  4399. End
  4400. else
  4401. Begin
  4402. if ( sbits32 (z.low) < 0 ) then
  4403. Begin
  4404. Inc(z.high);
  4405. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4406. z.high := z.high and not 1;
  4407. End;
  4408. End;
  4409. End
  4410. else if ( roundingMode <> float_round_to_zero ) then
  4411. Begin
  4412. if ( extractFloat64Sign( z )
  4413. xor flag( roundingMode = float_round_up ) )<> 0 then
  4414. Begin
  4415. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4416. End;
  4417. End;
  4418. z.low := z.low and not roundBitsMask;
  4419. End
  4420. else
  4421. Begin
  4422. if ( aExp <= $3FE ) then
  4423. Begin
  4424. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4425. Begin
  4426. result := a;
  4427. exit;
  4428. End;
  4429. set_inexact_flag;
  4430. aSign := extractFloat64Sign( a );
  4431. case ( softfloat_rounding_mode ) of
  4432. float_round_nearest_even:
  4433. Begin
  4434. if ( ( aExp = $3FE )
  4435. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4436. ) then
  4437. Begin
  4438. packFloat64( aSign, $3FF, 0, 0, result );
  4439. exit;
  4440. End;
  4441. End;
  4442. float_round_down:
  4443. Begin
  4444. if aSign<>0 then
  4445. packFloat64( 1, $3FF, 0, 0, result )
  4446. else
  4447. packFloat64( 0, 0, 0, 0, result );
  4448. exit;
  4449. End;
  4450. float_round_up:
  4451. Begin
  4452. if aSign <> 0 then
  4453. packFloat64( 1, 0, 0, 0, result )
  4454. else
  4455. packFloat64( 0, $3FF, 0, 0, result );
  4456. exit;
  4457. End;
  4458. end;
  4459. packFloat64( aSign, 0, 0, 0, result );
  4460. exit;
  4461. End;
  4462. lastBitMask := 1;
  4463. lastBitMask := lastBitMask shl ($413 - aExp);
  4464. roundBitsMask := lastBitMask - 1;
  4465. z.low := 0;
  4466. z.high := a.high;
  4467. roundingMode := softfloat_rounding_mode;
  4468. if ( roundingMode = float_round_nearest_even ) then
  4469. Begin
  4470. z.high := z.high + lastBitMask shr 1;
  4471. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4472. Begin
  4473. z.high := z.high and not lastBitMask;
  4474. End;
  4475. End
  4476. else if ( roundingMode <> float_round_to_zero ) then
  4477. Begin
  4478. if ( extractFloat64Sign( z )
  4479. xor flag( roundingMode = float_round_up ) )<> 0 then
  4480. Begin
  4481. z.high := z.high or bits32( a.low <> 0 );
  4482. z.high := z.high + roundBitsMask;
  4483. End;
  4484. End;
  4485. z.high := z.high and not roundBitsMask;
  4486. End;
  4487. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4488. Begin
  4489. set_inexact_flag;
  4490. End;
  4491. result := z;
  4492. End;
  4493. {*
  4494. -------------------------------------------------------------------------------
  4495. Returns the result of adding the absolute values of the double-precision
  4496. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4497. before being returned. `zSign' is ignored if the result is a NaN.
  4498. The addition is performed according to the IEC/IEEE Standard for Binary
  4499. Floating-Point Arithmetic.
  4500. -------------------------------------------------------------------------------
  4501. *}
  4502. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4503. Var
  4504. aExp, bExp, zExp: int16;
  4505. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4506. expDiff: int16;
  4507. label shiftRight1;
  4508. label roundAndPack;
  4509. Begin
  4510. aSig1 := extractFloat64Frac1( a );
  4511. aSig0 := extractFloat64Frac0( a );
  4512. aExp := extractFloat64Exp( a );
  4513. bSig1 := extractFloat64Frac1( b );
  4514. bSig0 := extractFloat64Frac0( b );
  4515. bExp := extractFloat64Exp( b );
  4516. expDiff := aExp - bExp;
  4517. if ( 0 < expDiff ) then
  4518. Begin
  4519. if ( aExp = $7FF ) then
  4520. Begin
  4521. if ( aSig0 OR aSig1 ) <> 0 then
  4522. Begin
  4523. propagateFloat64NaN( a, b, out );
  4524. exit;
  4525. end;
  4526. out := a;
  4527. exit;
  4528. End;
  4529. if ( bExp = 0 ) then
  4530. Begin
  4531. Dec(expDiff);
  4532. End
  4533. else
  4534. Begin
  4535. bSig0 := bSig0 or $00100000;
  4536. End;
  4537. shift64ExtraRightJamming(
  4538. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4539. zExp := aExp;
  4540. End
  4541. else if ( expDiff < 0 ) then
  4542. Begin
  4543. if ( bExp = $7FF ) then
  4544. Begin
  4545. if ( bSig0 OR bSig1 ) <> 0 then
  4546. Begin
  4547. propagateFloat64NaN( a, b, out );
  4548. exit;
  4549. End;
  4550. packFloat64( zSign, $7FF, 0, 0, out );
  4551. exit;
  4552. End;
  4553. if ( aExp = 0 ) then
  4554. Begin
  4555. Inc(expDiff);
  4556. End
  4557. else
  4558. Begin
  4559. aSig0 := aSig0 or $00100000;
  4560. End;
  4561. shift64ExtraRightJamming(
  4562. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4563. zExp := bExp;
  4564. End
  4565. else
  4566. Begin
  4567. if ( aExp = $7FF ) then
  4568. Begin
  4569. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4570. Begin
  4571. propagateFloat64NaN( a, b, out );
  4572. exit;
  4573. End;
  4574. out := a;
  4575. exit;
  4576. End;
  4577. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4578. if ( aExp = 0 ) then
  4579. Begin
  4580. packFloat64( zSign, 0, zSig0, zSig1, out );
  4581. exit;
  4582. End;
  4583. zSig2 := 0;
  4584. zSig0 := zSig0 or $00200000;
  4585. zExp := aExp;
  4586. goto shiftRight1;
  4587. End;
  4588. aSig0 := aSig0 or $00100000;
  4589. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4590. Dec(zExp);
  4591. if ( zSig0 < $00200000 ) then
  4592. goto roundAndPack;
  4593. Inc(zExp);
  4594. shiftRight1:
  4595. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4596. roundAndPack:
  4597. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4598. End;
  4599. {*
  4600. -------------------------------------------------------------------------------
  4601. Returns the result of subtracting the absolute values of the double-
  4602. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4603. difference is negated before being returned. `zSign' is ignored if the
  4604. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4605. Standard for Binary Floating-Point Arithmetic.
  4606. -------------------------------------------------------------------------------
  4607. *}
  4608. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4609. Var
  4610. aExp, bExp, zExp: int16;
  4611. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4612. expDiff: int16;
  4613. z: float64;
  4614. label aExpBigger;
  4615. label bExpBigger;
  4616. label aBigger;
  4617. label bBigger;
  4618. label normalizeRoundAndPack;
  4619. Begin
  4620. aSig1 := extractFloat64Frac1( a );
  4621. aSig0 := extractFloat64Frac0( a );
  4622. aExp := extractFloat64Exp( a );
  4623. bSig1 := extractFloat64Frac1( b );
  4624. bSig0 := extractFloat64Frac0( b );
  4625. bExp := extractFloat64Exp( b );
  4626. expDiff := aExp - bExp;
  4627. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4628. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4629. if ( 0 < expDiff ) then goto aExpBigger;
  4630. if ( expDiff < 0 ) then goto bExpBigger;
  4631. if ( aExp = $7FF ) then
  4632. Begin
  4633. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4634. Begin
  4635. propagateFloat64NaN( a, b, out );
  4636. exit;
  4637. End;
  4638. float_raise( float_flag_invalid );
  4639. z.low := float64_default_nan_low;
  4640. z.high := float64_default_nan_high;
  4641. out := z;
  4642. exit;
  4643. End;
  4644. if ( aExp = 0 ) then
  4645. Begin
  4646. aExp := 1;
  4647. bExp := 1;
  4648. End;
  4649. if ( bSig0 < aSig0 ) then goto aBigger;
  4650. if ( aSig0 < bSig0 ) then goto bBigger;
  4651. if ( bSig1 < aSig1 ) then goto aBigger;
  4652. if ( aSig1 < bSig1 ) then goto bBigger;
  4653. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4654. exit;
  4655. bExpBigger:
  4656. if ( bExp = $7FF ) then
  4657. Begin
  4658. if ( bSig0 OR bSig1 ) <> 0 then
  4659. Begin
  4660. propagateFloat64NaN( a, b, out );
  4661. exit;
  4662. End;
  4663. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4664. exit;
  4665. End;
  4666. if ( aExp = 0 ) then
  4667. Begin
  4668. Inc(expDiff);
  4669. End
  4670. else
  4671. Begin
  4672. aSig0 := aSig0 or $40000000;
  4673. End;
  4674. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4675. bSig0 := bSig0 or $40000000;
  4676. bBigger:
  4677. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4678. zExp := bExp;
  4679. zSign := zSign xor 1;
  4680. goto normalizeRoundAndPack;
  4681. aExpBigger:
  4682. if ( aExp = $7FF ) then
  4683. Begin
  4684. if ( aSig0 OR aSig1 ) <> 0 then
  4685. Begin
  4686. propagateFloat64NaN( a, b, out );
  4687. exit;
  4688. End;
  4689. out := a;
  4690. exit;
  4691. End;
  4692. if ( bExp = 0 ) then
  4693. Begin
  4694. Dec(expDiff);
  4695. End
  4696. else
  4697. Begin
  4698. bSig0 := bSig0 or $40000000;
  4699. End;
  4700. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4701. aSig0 := aSig0 or $40000000;
  4702. aBigger:
  4703. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4704. zExp := aExp;
  4705. normalizeRoundAndPack:
  4706. Dec(zExp);
  4707. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4708. End;
  4709. {*
  4710. -------------------------------------------------------------------------------
  4711. Returns the result of adding the double-precision floating-point values `a'
  4712. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4713. Binary Floating-Point Arithmetic.
  4714. -------------------------------------------------------------------------------
  4715. *}
  4716. Function float64_add( a: float64; b : float64) : Float64;
  4717. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4718. Var
  4719. aSign, bSign: flag;
  4720. Begin
  4721. aSign := extractFloat64Sign( a );
  4722. bSign := extractFloat64Sign( b );
  4723. if ( aSign = bSign ) then
  4724. Begin
  4725. addFloat64Sigs( a, b, aSign, result );
  4726. End
  4727. else
  4728. Begin
  4729. subFloat64Sigs( a, b, aSign, result );
  4730. End;
  4731. End;
  4732. {*
  4733. -------------------------------------------------------------------------------
  4734. Returns the result of subtracting the double-precision floating-point values
  4735. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4736. for Binary Floating-Point Arithmetic.
  4737. -------------------------------------------------------------------------------
  4738. *}
  4739. Function float64_sub(a: float64; b : float64) : Float64;
  4740. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4741. Var
  4742. aSign, bSign: flag;
  4743. Begin
  4744. aSign := extractFloat64Sign( a );
  4745. bSign := extractFloat64Sign( b );
  4746. if ( aSign = bSign ) then
  4747. Begin
  4748. subFloat64Sigs( a, b, aSign, result );
  4749. End
  4750. else
  4751. Begin
  4752. addFloat64Sigs( a, b, aSign, result );
  4753. End;
  4754. End;
  4755. {*
  4756. -------------------------------------------------------------------------------
  4757. Returns the result of multiplying the double-precision floating-point values
  4758. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4759. for Binary Floating-Point Arithmetic.
  4760. -------------------------------------------------------------------------------
  4761. *}
  4762. Function float64_mul( a: float64; b:float64) : Float64;
  4763. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4764. Var
  4765. aSign, bSign, zSign: flag;
  4766. aExp, bExp, zExp: int16;
  4767. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4768. z: float64;
  4769. label invalid;
  4770. Begin
  4771. aSig1 := extractFloat64Frac1( a );
  4772. aSig0 := extractFloat64Frac0( a );
  4773. aExp := extractFloat64Exp( a );
  4774. aSign := extractFloat64Sign( a );
  4775. bSig1 := extractFloat64Frac1( b );
  4776. bSig0 := extractFloat64Frac0( b );
  4777. bExp := extractFloat64Exp( b );
  4778. bSign := extractFloat64Sign( b );
  4779. zSign := aSign xor bSign;
  4780. if ( aExp = $7FF ) then
  4781. Begin
  4782. if ( (( aSig0 OR aSig1 ) <>0)
  4783. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4784. Begin
  4785. propagateFloat64NaN( a, b, result );
  4786. exit;
  4787. End;
  4788. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4789. packFloat64( zSign, $7FF, 0, 0, result );
  4790. exit;
  4791. End;
  4792. if ( bExp = $7FF ) then
  4793. Begin
  4794. if ( bSig0 OR bSig1 )<> 0 then
  4795. Begin
  4796. propagateFloat64NaN( a, b, result );
  4797. exit;
  4798. End;
  4799. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4800. Begin
  4801. invalid:
  4802. float_raise( float_flag_invalid );
  4803. z.low := float64_default_nan_low;
  4804. z.high := float64_default_nan_high;
  4805. result := z;
  4806. exit;
  4807. End;
  4808. packFloat64( zSign, $7FF, 0, 0, result );
  4809. exit;
  4810. End;
  4811. if ( aExp = 0 ) then
  4812. Begin
  4813. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4814. Begin
  4815. packFloat64( zSign, 0, 0, 0, result );
  4816. exit;
  4817. End;
  4818. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4819. End;
  4820. if ( bExp = 0 ) then
  4821. Begin
  4822. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4823. Begin
  4824. packFloat64( zSign, 0, 0, 0, result );
  4825. exit;
  4826. End;
  4827. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4828. End;
  4829. zExp := aExp + bExp - $400;
  4830. aSig0 := aSig0 or $00100000;
  4831. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4832. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4833. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4834. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4835. if ( $00200000 <= zSig0 ) then
  4836. Begin
  4837. shift64ExtraRightJamming(
  4838. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4839. Inc(zExp);
  4840. End;
  4841. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4842. End;
  4843. {*
  4844. -------------------------------------------------------------------------------
  4845. Returns the result of dividing the double-precision floating-point value `a'
  4846. by the corresponding value `b'. The operation is performed according to the
  4847. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4848. -------------------------------------------------------------------------------
  4849. *}
  4850. Function float64_div(a: float64; b : float64) : Float64;
  4851. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4852. Var
  4853. aSign, bSign, zSign: flag;
  4854. aExp, bExp, zExp: int16;
  4855. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4856. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4857. z: float64;
  4858. label invalid;
  4859. Begin
  4860. aSig1 := extractFloat64Frac1( a );
  4861. aSig0 := extractFloat64Frac0( a );
  4862. aExp := extractFloat64Exp( a );
  4863. aSign := extractFloat64Sign( a );
  4864. bSig1 := extractFloat64Frac1( b );
  4865. bSig0 := extractFloat64Frac0( b );
  4866. bExp := extractFloat64Exp( b );
  4867. bSign := extractFloat64Sign( b );
  4868. zSign := aSign xor bSign;
  4869. if ( aExp = $7FF ) then
  4870. Begin
  4871. if ( aSig0 OR aSig1 )<> 0 then
  4872. Begin
  4873. propagateFloat64NaN( a, b, result );
  4874. exit;
  4875. end;
  4876. if ( bExp = $7FF ) then
  4877. Begin
  4878. if ( bSig0 OR bSig1 )<>0 then
  4879. Begin
  4880. propagateFloat64NaN( a, b, result );
  4881. exit;
  4882. End;
  4883. goto invalid;
  4884. End;
  4885. packFloat64( zSign, $7FF, 0, 0, result );
  4886. exit;
  4887. End;
  4888. if ( bExp = $7FF ) then
  4889. Begin
  4890. if ( bSig0 OR bSig1 )<> 0 then
  4891. Begin
  4892. propagateFloat64NaN( a, b, result );
  4893. exit;
  4894. End;
  4895. packFloat64( zSign, 0, 0, 0, result );
  4896. exit;
  4897. End;
  4898. if ( bExp = 0 ) then
  4899. Begin
  4900. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4901. Begin
  4902. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4903. Begin
  4904. invalid:
  4905. float_raise( float_flag_invalid );
  4906. z.low := float64_default_nan_low;
  4907. z.high := float64_default_nan_high;
  4908. result := z;
  4909. exit;
  4910. End;
  4911. float_raise( float_flag_divbyzero );
  4912. packFloat64( zSign, $7FF, 0, 0, result );
  4913. exit;
  4914. End;
  4915. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4916. End;
  4917. if ( aExp = 0 ) then
  4918. Begin
  4919. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4920. Begin
  4921. packFloat64( zSign, 0, 0, 0, result );
  4922. exit;
  4923. End;
  4924. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4925. End;
  4926. zExp := aExp - bExp + $3FD;
  4927. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4928. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4929. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4930. Begin
  4931. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4932. Inc(zExp);
  4933. End;
  4934. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4935. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4936. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4937. while ( sbits32 (rem0) < 0 ) do
  4938. Begin
  4939. Dec(zSig0);
  4940. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4941. End;
  4942. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4943. if ( ( zSig1 and $3FF ) <= 4 ) then
  4944. Begin
  4945. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4946. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4947. while ( sbits32 (rem1) < 0 ) do
  4948. Begin
  4949. Dec(zSig1);
  4950. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4951. End;
  4952. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4953. End;
  4954. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4955. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4956. End;
  4957. {*
  4958. -------------------------------------------------------------------------------
  4959. Returns the remainder of the double-precision floating-point value `a'
  4960. with respect to the corresponding value `b'. The operation is performed
  4961. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4962. -------------------------------------------------------------------------------
  4963. *}
  4964. Function float64_rem(a: float64; b : float64) : float64;
  4965. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4966. Var
  4967. aSign, zSign: flag;
  4968. aExp, bExp, expDiff: int16;
  4969. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4970. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4971. sigMean0: sbits32;
  4972. z: float64;
  4973. label invalid;
  4974. Begin
  4975. aSig1 := extractFloat64Frac1( a );
  4976. aSig0 := extractFloat64Frac0( a );
  4977. aExp := extractFloat64Exp( a );
  4978. aSign := extractFloat64Sign( a );
  4979. bSig1 := extractFloat64Frac1( b );
  4980. bSig0 := extractFloat64Frac0( b );
  4981. bExp := extractFloat64Exp( b );
  4982. if ( aExp = $7FF ) then
  4983. Begin
  4984. if ((( aSig0 OR aSig1 )<>0)
  4985. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4986. Begin
  4987. propagateFloat64NaN( a, b, result );
  4988. exit;
  4989. End;
  4990. goto invalid;
  4991. End;
  4992. if ( bExp = $7FF ) then
  4993. Begin
  4994. if ( bSig0 OR bSig1 ) <> 0 then
  4995. Begin
  4996. propagateFloat64NaN( a, b, result );
  4997. exit;
  4998. End;
  4999. result := a;
  5000. exit;
  5001. End;
  5002. if ( bExp = 0 ) then
  5003. Begin
  5004. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5005. Begin
  5006. invalid:
  5007. float_raise( float_flag_invalid );
  5008. z.low := float64_default_nan_low;
  5009. z.high := float64_default_nan_high;
  5010. result := z;
  5011. exit;
  5012. End;
  5013. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5014. End;
  5015. if ( aExp = 0 ) then
  5016. Begin
  5017. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5018. Begin
  5019. result := a;
  5020. exit;
  5021. End;
  5022. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5023. End;
  5024. expDiff := aExp - bExp;
  5025. if ( expDiff < -1 ) then
  5026. Begin
  5027. result := a;
  5028. exit;
  5029. End;
  5030. shortShift64Left(
  5031. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5032. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5033. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5034. if ( q )<>0 then
  5035. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5036. expDiff := expDiff - 32;
  5037. while ( 0 < expDiff ) do
  5038. Begin
  5039. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5040. if 4 < q then
  5041. q:= q - 4
  5042. else
  5043. q := 0;
  5044. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5045. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5046. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5047. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5048. expDiff := expDiff - 29;
  5049. End;
  5050. if ( -32 < expDiff ) then
  5051. Begin
  5052. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5053. if 4 < q then
  5054. q := q - 4
  5055. else
  5056. q := 0;
  5057. q := q shr (- expDiff);
  5058. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5059. expDiff := expDiff + 24;
  5060. if ( expDiff < 0 ) then
  5061. Begin
  5062. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5063. End
  5064. else
  5065. Begin
  5066. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5067. End;
  5068. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5069. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5070. End
  5071. else
  5072. Begin
  5073. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5074. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5075. End;
  5076. Repeat
  5077. alternateASig0 := aSig0;
  5078. alternateASig1 := aSig1;
  5079. Inc(q);
  5080. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5081. Until not ( 0 <= sbits32 (aSig0) );
  5082. add64(
  5083. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5084. if ( ( sigMean0 < 0 )
  5085. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5086. Begin
  5087. aSig0 := alternateASig0;
  5088. aSig1 := alternateASig1;
  5089. End;
  5090. zSign := flag( sbits32 (aSig0) < 0 );
  5091. if ( zSign <> 0 ) then
  5092. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5093. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5094. End;
  5095. {*
  5096. -------------------------------------------------------------------------------
  5097. Returns the square root of the double-precision floating-point value `a'.
  5098. The operation is performed according to the IEC/IEEE Standard for Binary
  5099. Floating-Point Arithmetic.
  5100. -------------------------------------------------------------------------------
  5101. *}
  5102. Procedure float64_sqrt( a: float64; var out: float64 );
  5103. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5104. Var
  5105. aSign: flag;
  5106. aExp, zExp: int16;
  5107. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5108. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5109. z: float64;
  5110. label invalid;
  5111. Begin
  5112. aSig1 := extractFloat64Frac1( a );
  5113. aSig0 := extractFloat64Frac0( a );
  5114. aExp := extractFloat64Exp( a );
  5115. aSign := extractFloat64Sign( a );
  5116. if ( aExp = $7FF ) then
  5117. Begin
  5118. if ( aSig0 OR aSig1 ) <> 0 then
  5119. Begin
  5120. propagateFloat64NaN( a, a, out );
  5121. exit;
  5122. End;
  5123. if ( aSign = 0) then
  5124. Begin
  5125. out := a;
  5126. exit;
  5127. End;
  5128. goto invalid;
  5129. End;
  5130. if ( aSign <> 0 ) then
  5131. Begin
  5132. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  5133. Begin
  5134. out := a;
  5135. exit;
  5136. End;
  5137. invalid:
  5138. float_raise( float_flag_invalid );
  5139. z.low := float64_default_nan_low;
  5140. z.high := float64_default_nan_high;
  5141. out := z;
  5142. exit;
  5143. End;
  5144. if ( aExp = 0 ) then
  5145. Begin
  5146. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5147. Begin
  5148. packFloat64( 0, 0, 0, 0, out );
  5149. exit;
  5150. End;
  5151. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5152. End;
  5153. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5154. aSig0 := aSig0 or $00100000;
  5155. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5156. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5157. if ( zSig0 = 0 ) then
  5158. zSig0 := $7FFFFFFF;
  5159. doubleZSig0 := zSig0 + zSig0;
  5160. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5161. mul32To64( zSig0, zSig0, term0, term1 );
  5162. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5163. while ( sbits32 (rem0) < 0 ) do
  5164. Begin
  5165. Dec(zSig0);
  5166. doubleZSig0 := doubleZSig0 - 2;
  5167. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5168. End;
  5169. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5170. if ( ( zSig1 and $1FF ) <= 5 ) then
  5171. Begin
  5172. if ( zSig1 = 0 ) then
  5173. zSig1 := 1;
  5174. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5175. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5176. mul32To64( zSig1, zSig1, term2, term3 );
  5177. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5178. while ( sbits32 (rem1) < 0 ) do
  5179. Begin
  5180. Dec(zSig1);
  5181. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5182. term3 := term3 or 1;
  5183. term2 := term2 or doubleZSig0;
  5184. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5185. End;
  5186. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5187. End;
  5188. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5189. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  5190. End;
  5191. {*
  5192. -------------------------------------------------------------------------------
  5193. Returns 1 if the double-precision floating-point value `a' is equal to
  5194. the corresponding value `b', and 0 otherwise. The comparison is performed
  5195. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5196. -------------------------------------------------------------------------------
  5197. *}
  5198. Function float64_eq(a: float64; b: float64): flag;
  5199. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5200. Begin
  5201. if
  5202. (
  5203. ( extractFloat64Exp( a ) = $7FF )
  5204. AND
  5205. (
  5206. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5207. )
  5208. )
  5209. OR (
  5210. ( extractFloat64Exp( b ) = $7FF )
  5211. AND (
  5212. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5213. )
  5214. )
  5215. ) then
  5216. Begin
  5217. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5218. float_raise( float_flag_invalid );
  5219. float64_eq := 0;
  5220. exit;
  5221. End;
  5222. float64_eq := flag(
  5223. ( a.low = b.low )
  5224. AND ( ( a.high = b.high )
  5225. OR ( ( a.low = 0 )
  5226. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5227. ));
  5228. End;
  5229. {*
  5230. -------------------------------------------------------------------------------
  5231. Returns 1 if the double-precision floating-point value `a' is less than
  5232. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5233. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5234. Arithmetic.
  5235. -------------------------------------------------------------------------------
  5236. *}
  5237. Function float64_le(a: float64;b: float64): flag;
  5238. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5239. Var
  5240. aSign, bSign: flag;
  5241. Begin
  5242. if
  5243. (
  5244. ( extractFloat64Exp( a ) = $7FF )
  5245. AND
  5246. (
  5247. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5248. )
  5249. )
  5250. OR (
  5251. ( extractFloat64Exp( b ) = $7FF )
  5252. AND (
  5253. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5254. )
  5255. )
  5256. ) then
  5257. Begin
  5258. float_raise( float_flag_invalid );
  5259. float64_le := 0;
  5260. exit;
  5261. End;
  5262. aSign := extractFloat64Sign( a );
  5263. bSign := extractFloat64Sign( b );
  5264. if ( aSign <> bSign ) then
  5265. Begin
  5266. float64_le := flag(
  5267. (aSign <> 0)
  5268. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5269. = 0 ));
  5270. exit;
  5271. End;
  5272. if aSign <> 0 then
  5273. float64_le := le64( b.high, b.low, a.high, a.low )
  5274. else
  5275. float64_le := le64( a.high, a.low, b.high, b.low );
  5276. End;
  5277. {*
  5278. -------------------------------------------------------------------------------
  5279. Returns 1 if the double-precision floating-point value `a' is less than
  5280. the corresponding value `b', and 0 otherwise. The comparison is performed
  5281. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5282. -------------------------------------------------------------------------------
  5283. *}
  5284. Function float64_lt(a: float64;b: float64): flag;
  5285. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5286. Var
  5287. aSign, bSign: flag;
  5288. Begin
  5289. if
  5290. (
  5291. ( extractFloat64Exp( a ) = $7FF )
  5292. AND
  5293. (
  5294. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5295. )
  5296. )
  5297. OR (
  5298. ( extractFloat64Exp( b ) = $7FF )
  5299. AND (
  5300. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5301. )
  5302. )
  5303. ) then
  5304. Begin
  5305. float_raise( float_flag_invalid );
  5306. float64_lt := 0;
  5307. exit;
  5308. End;
  5309. aSign := extractFloat64Sign( a );
  5310. bSign := extractFloat64Sign( b );
  5311. if ( aSign <> bSign ) then
  5312. Begin
  5313. float64_lt := flag(
  5314. (aSign <> 0)
  5315. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5316. <> 0 ));
  5317. exit;
  5318. End;
  5319. if aSign <> 0 then
  5320. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5321. else
  5322. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5323. End;
  5324. {*
  5325. -------------------------------------------------------------------------------
  5326. Returns 1 if the double-precision floating-point value `a' is equal to
  5327. the corresponding value `b', and 0 otherwise. The invalid exception is
  5328. raised if either operand is a NaN. Otherwise, the comparison is performed
  5329. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5330. -------------------------------------------------------------------------------
  5331. *}
  5332. Function float64_eq_signaling( a: float64; b: float64): flag;
  5333. Begin
  5334. if
  5335. (
  5336. ( extractFloat64Exp( a ) = $7FF )
  5337. AND
  5338. (
  5339. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5340. )
  5341. )
  5342. OR (
  5343. ( extractFloat64Exp( b ) = $7FF )
  5344. AND (
  5345. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5346. )
  5347. )
  5348. ) then
  5349. Begin
  5350. float_raise( float_flag_invalid );
  5351. float64_eq_signaling := 0;
  5352. exit;
  5353. End;
  5354. float64_eq_signaling := flag(
  5355. ( a.low = b.low )
  5356. AND ( ( a.high = b.high )
  5357. OR ( ( a.low = 0 )
  5358. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5359. ));
  5360. End;
  5361. {*
  5362. -------------------------------------------------------------------------------
  5363. Returns 1 if the double-precision floating-point value `a' is less than or
  5364. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5365. cause an exception. Otherwise, the comparison is performed according to the
  5366. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5367. -------------------------------------------------------------------------------
  5368. *}
  5369. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5370. Var
  5371. aSign, bSign : flag;
  5372. Begin
  5373. if
  5374. (
  5375. ( extractFloat64Exp( a ) = $7FF )
  5376. AND
  5377. (
  5378. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5379. )
  5380. )
  5381. OR (
  5382. ( extractFloat64Exp( b ) = $7FF )
  5383. AND (
  5384. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5385. )
  5386. )
  5387. ) then
  5388. Begin
  5389. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5390. float_raise( float_flag_invalid );
  5391. float64_le_quiet := 0;
  5392. exit;
  5393. End;
  5394. aSign := extractFloat64Sign( a );
  5395. bSign := extractFloat64Sign( b );
  5396. if ( aSign <> bSign ) then
  5397. Begin
  5398. float64_le_quiet := flag
  5399. ((aSign <> 0)
  5400. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5401. = 0 ));
  5402. exit;
  5403. End;
  5404. if aSign <> 0 then
  5405. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5406. else
  5407. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5408. End;
  5409. {*
  5410. -------------------------------------------------------------------------------
  5411. Returns 1 if the double-precision floating-point value `a' is less than
  5412. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5413. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5414. Standard for Binary Floating-Point Arithmetic.
  5415. -------------------------------------------------------------------------------
  5416. *}
  5417. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5418. Var
  5419. aSign, bSign: flag;
  5420. Begin
  5421. if
  5422. (
  5423. ( extractFloat64Exp( a ) = $7FF )
  5424. AND
  5425. (
  5426. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5427. )
  5428. )
  5429. OR (
  5430. ( extractFloat64Exp( b ) = $7FF )
  5431. AND (
  5432. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5433. )
  5434. )
  5435. ) then
  5436. Begin
  5437. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5438. float_raise( float_flag_invalid );
  5439. float64_lt_quiet := 0;
  5440. exit;
  5441. End;
  5442. aSign := extractFloat64Sign( a );
  5443. bSign := extractFloat64Sign( b );
  5444. if ( aSign <> bSign ) then
  5445. Begin
  5446. float64_lt_quiet := flag(
  5447. (aSign<>0)
  5448. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5449. <> 0 ));
  5450. exit;
  5451. End;
  5452. If aSign <> 0 then
  5453. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5454. else
  5455. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5456. End;
  5457. {*----------------------------------------------------------------------------
  5458. | Returns the result of converting the 64-bit two's complement integer `a'
  5459. | to the single-precision floating-point format. The conversion is performed
  5460. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5461. *----------------------------------------------------------------------------*}
  5462. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5463. var
  5464. zSign : flag;
  5465. absA : uint64;
  5466. shiftCount: int8;
  5467. intval : int64rec;
  5468. Begin
  5469. if ( a = 0 ) then
  5470. begin
  5471. int64_to_float32.float32 := 0;
  5472. exit;
  5473. end;
  5474. if a < 0 then
  5475. zSign := flag(TRUE)
  5476. else
  5477. zSign := flag(FALSE);
  5478. if zSign<>0 then
  5479. absA := -a
  5480. else
  5481. absA := a;
  5482. shiftCount := countLeadingZeros64( absA ) - 40;
  5483. if ( 0 <= shiftCount ) then
  5484. begin
  5485. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5486. end
  5487. else
  5488. begin
  5489. shiftCount := shiftCount + 7;
  5490. if ( shiftCount < 0 ) then
  5491. begin
  5492. intval.low := int64rec(AbsA).low;
  5493. intval.high := int64rec(AbsA).high;
  5494. shift64RightJamming( intval.high, intval.low, - shiftCount,
  5495. intval.high, intval.low);
  5496. int64rec(absA).low := intval.low;
  5497. int64rec(absA).high := intval.high;
  5498. end
  5499. else
  5500. absA := absA shl shiftCount;
  5501. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5502. end;
  5503. End;
  5504. {*----------------------------------------------------------------------------
  5505. | Returns the result of converting the 64-bit two's complement integer `a'
  5506. | to the single-precision floating-point format. The conversion is performed
  5507. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5508. | Unisgned version.
  5509. *----------------------------------------------------------------------------*}
  5510. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5511. var
  5512. zSign : flag;
  5513. absA : uint64;
  5514. shiftCount: int8;
  5515. intval : int64rec;
  5516. Begin
  5517. if ( a = 0 ) then
  5518. begin
  5519. qword_to_float32.float32 := 0;
  5520. exit;
  5521. end;
  5522. zSign := flag(FALSE);
  5523. absA := a;
  5524. shiftCount := countLeadingZeros64( absA ) - 40;
  5525. if ( 0 <= shiftCount ) then
  5526. begin
  5527. qword_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5528. end
  5529. else
  5530. begin
  5531. shiftCount := shiftCount + 7;
  5532. if ( shiftCount < 0 ) then
  5533. begin
  5534. intval.low := int64rec(AbsA).low;
  5535. intval.high := int64rec(AbsA).high;
  5536. shift64RightJamming( intval.high, intval.low, - shiftCount,
  5537. intval.high, intval.low);
  5538. int64rec(absA).low := intval.low;
  5539. int64rec(absA).high := intval.high;
  5540. end
  5541. else
  5542. absA := absA shl shiftCount;
  5543. qword_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5544. end;
  5545. End;
  5546. {*----------------------------------------------------------------------------
  5547. | Returns the result of converting the 64-bit two's complement integer `a'
  5548. | to the double-precision floating-point format. The conversion is performed
  5549. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5550. *----------------------------------------------------------------------------*}
  5551. function qword_to_float64( a: qword ): float64;
  5552. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5553. var
  5554. zSign : flag;
  5555. float_result : float64;
  5556. AbsA : bits64;
  5557. shiftcount : int8;
  5558. zSig0, zSig1 : bits32;
  5559. Begin
  5560. if ( a = 0 ) then
  5561. Begin
  5562. packFloat64( 0, 0, 0, 0, result );
  5563. exit;
  5564. end;
  5565. zSign := flag(FALSE);
  5566. AbsA := a;
  5567. shiftCount := countLeadingZeros64( absA ) - 11;
  5568. if ( 0 <= shiftCount ) then
  5569. Begin
  5570. absA := absA shl shiftcount;
  5571. zSig0:=int64rec(absA).high;
  5572. zSig1:=int64rec(absA).low;
  5573. End
  5574. else
  5575. Begin
  5576. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5577. - shiftCount, zSig0, zSig1 );
  5578. End;
  5579. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5580. qword_to_float64:= float_result;
  5581. End;
  5582. {*----------------------------------------------------------------------------
  5583. | Returns the result of converting the 64-bit two's complement integer `a'
  5584. | to the double-precision floating-point format. The conversion is performed
  5585. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5586. *----------------------------------------------------------------------------*}
  5587. function int64_to_float64( a: int64 ): float64;
  5588. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5589. var
  5590. zSign : flag;
  5591. float_result : float64;
  5592. AbsA : bits64;
  5593. shiftcount : int8;
  5594. zSig0, zSig1 : bits32;
  5595. Begin
  5596. if ( a = 0 ) then
  5597. Begin
  5598. packFloat64( 0, 0, 0, 0, result );
  5599. exit;
  5600. end;
  5601. zSign := flag( a < 0 );
  5602. if ZSign<>0 then
  5603. AbsA := -a
  5604. else
  5605. AbsA := a;
  5606. shiftCount := countLeadingZeros64( absA ) - 11;
  5607. if ( 0 <= shiftCount ) then
  5608. Begin
  5609. absA := absA shl shiftcount;
  5610. zSig0:=int64rec(absA).high;
  5611. zSig1:=int64rec(absA).low;
  5612. End
  5613. else
  5614. Begin
  5615. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5616. - shiftCount, zSig0, zSig1 );
  5617. End;
  5618. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5619. int64_to_float64:= float_result;
  5620. End;
  5621. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5622. {*----------------------------------------------------------------------------
  5623. | Returns the result of converting the 64-bit two's complement integer `a'
  5624. | to the extended double-precision floating-point format. The conversion
  5625. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5626. | Arithmetic.
  5627. *----------------------------------------------------------------------------*}
  5628. function int64_to_floatx80( a: int64 ): floatx80;
  5629. var
  5630. zSign: flag;
  5631. absA: uint64;
  5632. shiftCount: int8;
  5633. begin
  5634. if ( a = 0 ) then begin
  5635. result := packFloatx80( 0, 0, 0 );
  5636. exit;
  5637. end;
  5638. zSign := ord( a < 0 );
  5639. if zSign <> 0 then absA := - a else absA := a;
  5640. shiftCount := countLeadingZeros64( absA );
  5641. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5642. end;
  5643. {*----------------------------------------------------------------------------
  5644. | Returns the result of converting the 64-bit two's complement integer `a'
  5645. | to the extended double-precision floating-point format. The conversion
  5646. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5647. | Arithmetic.
  5648. | Unsigned version.
  5649. *----------------------------------------------------------------------------*}
  5650. function qword_to_floatx80( a: qword ): floatx80;
  5651. var
  5652. absA: bits64;
  5653. shiftCount: int8;
  5654. begin
  5655. if ( a = 0 ) then begin
  5656. result := packFloatx80( 0, 0, 0 );
  5657. exit;
  5658. end;
  5659. absA := a;
  5660. shiftCount := countLeadingZeros64( absA );
  5661. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5662. end;
  5663. {$endif FPC_SOFTFLOAT_FLOATX80}
  5664. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5665. {*----------------------------------------------------------------------------
  5666. | Returns the result of converting the 64-bit two's complement integer `a' to
  5667. | the quadruple-precision floating-point format. The conversion is performed
  5668. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5669. *----------------------------------------------------------------------------*}
  5670. function int64_to_float128( a: int64 ): float128;
  5671. var
  5672. zSign: flag;
  5673. absA: uint64;
  5674. shiftCount: int8;
  5675. zExp: int32;
  5676. zSig0, zSig1: bits64;
  5677. begin
  5678. if ( a = 0 ) then begin
  5679. result := packFloat128( 0, 0, 0, 0 );
  5680. exit;
  5681. end;
  5682. zSign := ord( a < 0 );
  5683. if zSign <> 0 then absA := - a else absA := a;
  5684. shiftCount := countLeadingZeros64( absA ) + 49;
  5685. zExp := $406E - shiftCount;
  5686. if ( 64 <= shiftCount ) then begin
  5687. zSig1 := 0;
  5688. zSig0 := absA;
  5689. dec( shiftCount, 64 );
  5690. end
  5691. else begin
  5692. zSig1 := absA;
  5693. zSig0 := 0;
  5694. end;
  5695. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5696. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5697. end;
  5698. {*----------------------------------------------------------------------------
  5699. | Returns the result of converting the 64-bit two's complement integer `a' to
  5700. | the quadruple-precision floating-point format. The conversion is performed
  5701. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5702. | Unsigned version.
  5703. *----------------------------------------------------------------------------*}
  5704. function qword_to_float128( a: qword ): float128;
  5705. var
  5706. absA: bits64;
  5707. shiftCount: int8;
  5708. zExp: int32;
  5709. zSig0, zSig1: bits64;
  5710. begin
  5711. if ( a = 0 ) then begin
  5712. result := packFloat128( 0, 0, 0, 0 );
  5713. exit;
  5714. end;
  5715. absA := a;
  5716. shiftCount := countLeadingZeros64( absA ) + 49;
  5717. zExp := $406E - shiftCount;
  5718. if ( 64 <= shiftCount ) then begin
  5719. zSig1 := 0;
  5720. zSig0 := absA;
  5721. dec( shiftCount, 64 );
  5722. end
  5723. else begin
  5724. zSig1 := absA;
  5725. zSig0 := 0;
  5726. end;
  5727. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5728. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5729. end;
  5730. {$endif FPC_SOFTFLOAT_FLOAT128}
  5731. {*----------------------------------------------------------------------------
  5732. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5733. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5734. | Otherwise, returns 0.
  5735. *----------------------------------------------------------------------------*}
  5736. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5737. begin
  5738. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5739. end;
  5740. {*----------------------------------------------------------------------------
  5741. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5742. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5743. | Otherwise, returns 0.
  5744. *----------------------------------------------------------------------------*}
  5745. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5746. begin
  5747. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5748. end;
  5749. {*----------------------------------------------------------------------------
  5750. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5751. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5752. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5753. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5754. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5755. | the most-significant bit of the extra result, and the other 63 bits of the
  5756. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5757. | were all zero. This extra result is stored in the location pointed to by
  5758. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5759. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5760. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5761. | fixed-point value is shifted right by the number of bits given in `count',
  5762. | and the integer part of the result is returned at the locations pointed to
  5763. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5764. | corrupted as described above, and is returned at the location pointed to by
  5765. | `z2Ptr'.)
  5766. *----------------------------------------------------------------------------*}
  5767. procedure shift128ExtraRightJamming(
  5768. a0: bits64;
  5769. a1: bits64;
  5770. a2: bits64;
  5771. count: int16;
  5772. var z0Ptr: bits64;
  5773. var z1Ptr: bits64;
  5774. var z2Ptr: bits64);
  5775. var
  5776. z0, z1, z2: bits64;
  5777. negCount: int8;
  5778. begin
  5779. negCount := ( - count ) and 63;
  5780. if ( count = 0 ) then
  5781. begin
  5782. z2 := a2;
  5783. z1 := a1;
  5784. z0 := a0;
  5785. end
  5786. else begin
  5787. if ( count < 64 ) then
  5788. begin
  5789. z2 := a1 shl negCount;
  5790. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5791. z0 := a0 shr count;
  5792. end
  5793. else begin
  5794. if ( count = 64 ) then
  5795. begin
  5796. z2 := a1;
  5797. z1 := a0;
  5798. end
  5799. else begin
  5800. a2 := a2 or a1;
  5801. if ( count < 128 ) then
  5802. begin
  5803. z2 := a0 shl negCount;
  5804. z1 := a0 shr ( count and 63 );
  5805. end
  5806. else begin
  5807. if ( count = 128 ) then
  5808. z2 := a0
  5809. else
  5810. z2 := ord( a0 <> 0 );
  5811. z1 := 0;
  5812. end;
  5813. end;
  5814. z0 := 0;
  5815. end;
  5816. z2 := z2 or ord( a2 <> 0 );
  5817. end;
  5818. z2Ptr := z2;
  5819. z1Ptr := z1;
  5820. z0Ptr := z0;
  5821. end;
  5822. {*----------------------------------------------------------------------------
  5823. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5824. | _plus_ the number of bits given in `count'. The shifted result is at most
  5825. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5826. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5827. | shifted off is the most-significant bit of the extra result, and the other
  5828. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5829. | bits shifted off were all zero. This extra result is stored in the location
  5830. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5831. | (This routine makes more sense if `a0' and `a1' are considered to form
  5832. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5833. | point value is shifted right by the number of bits given in `count', and
  5834. | the integer part of the result is returned at the location pointed to by
  5835. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5836. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5837. *----------------------------------------------------------------------------*}
  5838. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5839. var
  5840. z0, z1: bits64;
  5841. negCount: int8;
  5842. begin
  5843. negCount := ( - count ) and 63;
  5844. if ( count = 0 ) then
  5845. begin
  5846. z1 := a1;
  5847. z0 := a0;
  5848. end
  5849. else if ( count < 64 ) then
  5850. begin
  5851. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5852. z0 := a0 shr count;
  5853. end
  5854. else begin
  5855. if ( count = 64 ) then
  5856. begin
  5857. z1 := a0 or ord( a1 <> 0 );
  5858. end
  5859. else begin
  5860. z1 := ord( ( a0 or a1 ) <> 0 );
  5861. end;
  5862. z0 := 0;
  5863. end;
  5864. z1Ptr := z1;
  5865. z0Ptr := z0;
  5866. end;
  5867. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5868. {*----------------------------------------------------------------------------
  5869. | Returns the fraction bits of the extended double-precision floating-point
  5870. | value `a'.
  5871. *----------------------------------------------------------------------------*}
  5872. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5873. begin
  5874. result:=a.low;
  5875. end;
  5876. {*----------------------------------------------------------------------------
  5877. | Returns the exponent bits of the extended double-precision floating-point
  5878. | value `a'.
  5879. *----------------------------------------------------------------------------*}
  5880. function extractFloatx80Exp(a : floatx80): int32;inline;
  5881. begin
  5882. result:=a.high and $7FFF;
  5883. end;
  5884. {*----------------------------------------------------------------------------
  5885. | Returns the sign bit of the extended double-precision floating-point value
  5886. | `a'.
  5887. *----------------------------------------------------------------------------*}
  5888. function extractFloatx80Sign(a : floatx80): flag;inline;
  5889. begin
  5890. result:=a.high shr 15;
  5891. end;
  5892. {*----------------------------------------------------------------------------
  5893. | Normalizes the subnormal extended double-precision floating-point value
  5894. | represented by the denormalized significand `aSig'. The normalized exponent
  5895. | and significand are stored at the locations pointed to by `zExpPtr' and
  5896. | `zSigPtr', respectively.
  5897. *----------------------------------------------------------------------------*}
  5898. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5899. var
  5900. shiftCount: int8;
  5901. begin
  5902. shiftCount := countLeadingZeros64( aSig );
  5903. zSigPtr := aSig shl shiftCount;
  5904. zExpPtr := 1 - shiftCount;
  5905. end;
  5906. {*----------------------------------------------------------------------------
  5907. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5908. | extended double-precision floating-point value, returning the result.
  5909. *----------------------------------------------------------------------------*}
  5910. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5911. var
  5912. z: floatx80;
  5913. begin
  5914. z.low := zSig;
  5915. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5916. result:=z;
  5917. end;
  5918. {*----------------------------------------------------------------------------
  5919. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5920. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5921. | and returns the proper extended double-precision floating-point value
  5922. | corresponding to the abstract input. Ordinarily, the abstract value is
  5923. | rounded and packed into the extended double-precision format, with the
  5924. | inexact exception raised if the abstract input cannot be represented
  5925. | exactly. However, if the abstract value is too large, the overflow and
  5926. | inexact exceptions are raised and an infinity or maximal finite value is
  5927. | returned. If the abstract value is too small, the input value is rounded to
  5928. | a subnormal number, and the underflow and inexact exceptions are raised if
  5929. | the abstract input cannot be represented exactly as a subnormal extended
  5930. | double-precision floating-point number.
  5931. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5932. | number of bits as single or double precision, respectively. Otherwise, the
  5933. | result is rounded to the full precision of the extended double-precision
  5934. | format.
  5935. | The input significand must be normalized or smaller. If the input
  5936. | significand is not normalized, `zExp' must be 0; in that case, the result
  5937. | returned is a subnormal number, and it must not require rounding. The
  5938. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5939. | Floating-Point Arithmetic.
  5940. *----------------------------------------------------------------------------*}
  5941. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5942. var
  5943. roundingMode: int8;
  5944. roundNearestEven, increment, isTiny: flag;
  5945. roundIncrement, roundMask, roundBits: int64;
  5946. label
  5947. precision80, overflow;
  5948. begin
  5949. roundingMode := softfloat_rounding_mode;
  5950. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5951. if ( roundingPrecision = 80 ) then
  5952. goto precision80;
  5953. if ( roundingPrecision = 64 ) then
  5954. begin
  5955. roundIncrement := int64( $0000000000000400 );
  5956. roundMask := int64( $00000000000007FF );
  5957. end
  5958. else if ( roundingPrecision = 32 ) then
  5959. begin
  5960. roundIncrement := int64( $0000008000000000 );
  5961. roundMask := int64( $000000FFFFFFFFFF );
  5962. end
  5963. else begin
  5964. goto precision80;
  5965. end;
  5966. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5967. if ( not (roundNearestEven<>0) ) then
  5968. begin
  5969. if ( roundingMode = float_round_to_zero ) then
  5970. begin
  5971. roundIncrement := 0;
  5972. end
  5973. else begin
  5974. roundIncrement := roundMask;
  5975. if ( zSign<>0 ) then
  5976. begin
  5977. if ( roundingMode = float_round_up ) then
  5978. roundIncrement := 0;
  5979. end
  5980. else begin
  5981. if ( roundingMode = float_round_down ) then
  5982. roundIncrement := 0;
  5983. end;
  5984. end;
  5985. end;
  5986. roundBits := zSig0 and roundMask;
  5987. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  5988. if ( ( $7FFE < zExp )
  5989. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5990. ) then begin
  5991. goto overflow;
  5992. end;
  5993. if ( zExp <= 0 ) then begin
  5994. isTiny := ord (
  5995. ( softfloat_detect_tininess = float_tininess_before_rounding )
  5996. or ( zExp < 0 )
  5997. or ( zSig0 <= zSig0 + roundIncrement ) );
  5998. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5999. zExp := 0;
  6000. roundBits := zSig0 and roundMask;
  6001. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6002. if ( roundBits <> 0 ) then set_inexact_flag;
  6003. inc( zSig0, roundIncrement );
  6004. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6005. roundIncrement := roundMask + 1;
  6006. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6007. roundMask := roundMask or roundIncrement;
  6008. end;
  6009. zSig0 := zSig0 and not roundMask;
  6010. result:=packFloatx80( zSign, zExp, zSig0 );
  6011. exit;
  6012. end;
  6013. end;
  6014. if ( roundBits <> 0 ) then set_inexact_flag;
  6015. inc( zSig0, roundIncrement );
  6016. if ( zSig0 < roundIncrement ) then begin
  6017. inc(zExp);
  6018. zSig0 := bits64( $8000000000000000 );
  6019. end;
  6020. roundIncrement := roundMask + 1;
  6021. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6022. roundMask := roundMask or roundIncrement;
  6023. end;
  6024. zSig0 := zSig0 and not roundMask;
  6025. if ( zSig0 = 0 ) then zExp := 0;
  6026. result:=packFloatx80( zSign, zExp, zSig0 );
  6027. exit;
  6028. precision80:
  6029. increment := ord ( sbits64( zSig1 ) < 0 );
  6030. if ( roundNearestEven = 0 ) then begin
  6031. if ( roundingMode = float_round_to_zero ) then begin
  6032. increment := 0;
  6033. end
  6034. else begin
  6035. if ( zSign <> 0 ) then begin
  6036. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6037. end
  6038. else begin
  6039. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6040. end;
  6041. end;
  6042. end;
  6043. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6044. if ( ( $7FFE < zExp )
  6045. or ( ( zExp = $7FFE )
  6046. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6047. and ( increment <> 0 )
  6048. )
  6049. ) then begin
  6050. roundMask := 0;
  6051. overflow:
  6052. float_raise( [float_flag_overflow,float_flag_inexact] );
  6053. if ( ( roundingMode = float_round_to_zero )
  6054. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6055. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6056. ) then begin
  6057. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6058. exit;
  6059. end;
  6060. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6061. exit;
  6062. end;
  6063. if ( zExp <= 0 ) then begin
  6064. isTiny := ord(
  6065. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6066. or ( zExp < 0 )
  6067. or ( increment = 0 )
  6068. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6069. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6070. zExp := 0;
  6071. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6072. if ( zSig1 <> 0 ) then set_inexact_flag;
  6073. if ( roundNearestEven <> 0 ) then begin
  6074. increment := ord( sbits64( zSig1 ) < 0 );
  6075. end
  6076. else begin
  6077. if ( zSign <> 0 ) then begin
  6078. increment := ord( roundingMode = float_round_down ) and zSig1;
  6079. end
  6080. else begin
  6081. increment := ord( roundingMode = float_round_up ) and zSig1;
  6082. end;
  6083. end;
  6084. if ( increment <> 0 ) then begin
  6085. inc(zSig0);
  6086. zSig0 :=
  6087. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6088. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6089. end;
  6090. result:=packFloatx80( zSign, zExp, zSig0 );
  6091. exit;
  6092. end;
  6093. end;
  6094. if ( zSig1 <> 0 ) then set_inexact_flag;
  6095. if ( increment <> 0 ) then begin
  6096. inc(zSig0);
  6097. if ( zSig0 = 0 ) then begin
  6098. inc(zExp);
  6099. zSig0 := bits64( $8000000000000000 );
  6100. end
  6101. else begin
  6102. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6103. end;
  6104. end
  6105. else begin
  6106. if ( zSig0 = 0 ) then zExp := 0;
  6107. end;
  6108. result:=packFloatx80( zSign, zExp, zSig0 );
  6109. end;
  6110. {*----------------------------------------------------------------------------
  6111. | Takes an abstract floating-point value having sign `zSign', exponent
  6112. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6113. | and returns the proper extended double-precision floating-point value
  6114. | corresponding to the abstract input. This routine is just like
  6115. | `roundAndPackFloatx80' except that the input significand does not have to be
  6116. | normalized.
  6117. *----------------------------------------------------------------------------*}
  6118. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6119. var
  6120. shiftCount: int8;
  6121. begin
  6122. if ( zSig0 = 0 ) then begin
  6123. zSig0 := zSig1;
  6124. zSig1 := 0;
  6125. dec( zExp, 64 );
  6126. end;
  6127. shiftCount := countLeadingZeros64( zSig0 );
  6128. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6129. zExp := zExp - shiftCount;
  6130. result :=
  6131. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6132. end;
  6133. {*----------------------------------------------------------------------------
  6134. | Returns the result of converting the extended double-precision floating-
  6135. | point value `a' to the 32-bit two's complement integer format. The
  6136. | conversion is performed according to the IEC/IEEE Standard for Binary
  6137. | Floating-Point Arithmetic---which means in particular that the conversion
  6138. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6139. | largest positive integer is returned. Otherwise, if the conversion
  6140. | overflows, the largest integer with the same sign as `a' is returned.
  6141. *----------------------------------------------------------------------------*}
  6142. function floatx80_to_int32(a: floatx80): int32;
  6143. var
  6144. aSign: flag;
  6145. aExp, shiftCount: int32;
  6146. aSig: bits64;
  6147. begin
  6148. aSig := extractFloatx80Frac( a );
  6149. aExp := extractFloatx80Exp( a );
  6150. aSign := extractFloatx80Sign( a );
  6151. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6152. shiftCount := $4037 - aExp;
  6153. if ( shiftCount <= 0 ) then shiftCount := 1;
  6154. shift64RightJamming( aSig, shiftCount, aSig );
  6155. result := roundAndPackInt32( aSign, aSig );
  6156. end;
  6157. {*----------------------------------------------------------------------------
  6158. | Returns the result of converting the extended double-precision floating-
  6159. | point value `a' to the 32-bit two's complement integer format. The
  6160. | conversion is performed according to the IEC/IEEE Standard for Binary
  6161. | Floating-Point Arithmetic, except that the conversion is always rounded
  6162. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6163. | Otherwise, if the conversion overflows, the largest integer with the same
  6164. | sign as `a' is returned.
  6165. *----------------------------------------------------------------------------*}
  6166. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6167. var
  6168. aSign: flag;
  6169. aExp, shiftCount: int32;
  6170. aSig, savedASig: bits64;
  6171. z: int32;
  6172. label
  6173. invalid;
  6174. begin
  6175. aSig := extractFloatx80Frac( a );
  6176. aExp := extractFloatx80Exp( a );
  6177. aSign := extractFloatx80Sign( a );
  6178. if ( $401E < aExp ) then begin
  6179. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6180. goto invalid;
  6181. end
  6182. else if ( aExp < $3FFF ) then begin
  6183. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6184. result := 0;
  6185. exit;
  6186. end;
  6187. shiftCount := $403E - aExp;
  6188. savedASig := aSig;
  6189. aSig := aSig shr shiftCount;
  6190. z := aSig;
  6191. if ( aSign <> 0 ) then z := - z;
  6192. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6193. invalid:
  6194. float_raise( float_flag_invalid );
  6195. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6196. exit;
  6197. end;
  6198. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6199. set_inexact_flag;
  6200. end;
  6201. result := z;
  6202. end;
  6203. {*----------------------------------------------------------------------------
  6204. | Returns the result of converting the extended double-precision floating-
  6205. | point value `a' to the 64-bit two's complement integer format. The
  6206. | conversion is performed according to the IEC/IEEE Standard for Binary
  6207. | Floating-Point Arithmetic---which means in particular that the conversion
  6208. | is rounded according to the current rounding mode. If `a' is a NaN,
  6209. | the largest positive integer is returned. Otherwise, if the conversion
  6210. | overflows, the largest integer with the same sign as `a' is returned.
  6211. *----------------------------------------------------------------------------*}
  6212. function floatx80_to_int64(a: floatx80): int64;
  6213. var
  6214. aSign: flag;
  6215. aExp, shiftCount: int32;
  6216. aSig, aSigExtra: bits64;
  6217. begin
  6218. aSig := extractFloatx80Frac( a );
  6219. aExp := extractFloatx80Exp( a );
  6220. aSign := extractFloatx80Sign( a );
  6221. shiftCount := $403E - aExp;
  6222. if ( shiftCount <= 0 ) then begin
  6223. if ( shiftCount <> 0 ) then begin
  6224. float_raise( float_flag_invalid );
  6225. if ( ( aSign = 0 )
  6226. or ( ( aExp = $7FFF )
  6227. and ( aSig <> bits64( $8000000000000000 ) ) )
  6228. ) then begin
  6229. result := $7FFFFFFFFFFFFFFF;
  6230. exit;
  6231. end;
  6232. result := $8000000000000000;
  6233. exit;
  6234. end;
  6235. aSigExtra := 0;
  6236. end
  6237. else begin
  6238. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6239. end;
  6240. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6241. end;
  6242. {*----------------------------------------------------------------------------
  6243. | Returns the result of converting the extended double-precision floating-
  6244. | point value `a' to the 64-bit two's complement integer format. The
  6245. | conversion is performed according to the IEC/IEEE Standard for Binary
  6246. | Floating-Point Arithmetic, except that the conversion is always rounded
  6247. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6248. | Otherwise, if the conversion overflows, the largest integer with the same
  6249. | sign as `a' is returned.
  6250. *----------------------------------------------------------------------------*}
  6251. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6252. var
  6253. aSign: flag;
  6254. aExp, shiftCount: int32;
  6255. aSig: bits64;
  6256. z: int64;
  6257. begin
  6258. aSig := extractFloatx80Frac( a );
  6259. aExp := extractFloatx80Exp( a );
  6260. aSign := extractFloatx80Sign( a );
  6261. shiftCount := aExp - $403E;
  6262. if ( 0 <= shiftCount ) then begin
  6263. aSig := $7FFFFFFFFFFFFFFF;
  6264. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6265. float_raise( float_flag_invalid );
  6266. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6267. result := $7FFFFFFFFFFFFFFF;
  6268. exit;
  6269. end;
  6270. end;
  6271. result := $8000000000000000;
  6272. exit;
  6273. end
  6274. else if ( aExp < $3FFF ) then begin
  6275. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6276. result := 0;
  6277. exit;
  6278. end;
  6279. z := aSig shr ( - shiftCount );
  6280. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6281. set_inexact_flag;
  6282. end;
  6283. if ( aSign <> 0 ) then z := - z;
  6284. result := z;
  6285. end;
  6286. {*----------------------------------------------------------------------------
  6287. | The pattern for a default generated extended double-precision NaN. The
  6288. | `high' and `low' values hold the most- and least-significant bits,
  6289. | respectively.
  6290. *----------------------------------------------------------------------------*}
  6291. const
  6292. floatx80_default_nan_high = $FFFF;
  6293. floatx80_default_nan_low = bits64( $C000000000000000 );
  6294. {*----------------------------------------------------------------------------
  6295. | Returns 1 if the extended double-precision floating-point value `a' is a
  6296. | signaling NaN; otherwise returns 0.
  6297. *----------------------------------------------------------------------------*}
  6298. function floatx80_is_signaling_nan(a : floatx80): flag;
  6299. var
  6300. aLow: bits64;
  6301. begin
  6302. aLow := a.low and not $4000000000000000;
  6303. result := ord(
  6304. ( a.high and $7FFF = $7FFF )
  6305. and ( bits64( aLow shl 1 ) <> 0 )
  6306. and ( a.low = aLow ) );
  6307. end;
  6308. {*----------------------------------------------------------------------------
  6309. | Returns the result of converting the extended double-precision floating-
  6310. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6311. | invalid exception is raised.
  6312. *----------------------------------------------------------------------------*}
  6313. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6314. var
  6315. z: commonNaNT;
  6316. begin
  6317. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6318. z.sign := a.high shr 15;
  6319. z.low := 0;
  6320. z.high := a.low shl 1;
  6321. result := z;
  6322. end;
  6323. {*----------------------------------------------------------------------------
  6324. | Returns 1 if the extended double-precision floating-point value `a' is a
  6325. | NaN; otherwise returns 0.
  6326. *----------------------------------------------------------------------------*}
  6327. function floatx80_is_nan(a : floatx80 ): flag;
  6328. begin
  6329. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low<<1 ) <> 0 ) );
  6330. end;
  6331. {*----------------------------------------------------------------------------
  6332. | Takes two extended double-precision floating-point values `a' and `b', one
  6333. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6334. | `b' is a signaling NaN, the invalid exception is raised.
  6335. *----------------------------------------------------------------------------*}
  6336. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6337. var
  6338. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6339. label
  6340. returnLargerSignificand;
  6341. begin
  6342. aIsNaN := floatx80_is_nan( a );
  6343. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6344. bIsNaN := floatx80_is_nan( b );
  6345. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6346. a.low := a.low or $C000000000000000;
  6347. b.low := b.low or $C000000000000000;
  6348. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6349. if aIsSignalingNaN <> 0 then begin
  6350. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6351. if bIsNaN <> 0 then result := b else result := a;
  6352. exit;
  6353. end
  6354. else if aIsNaN <>0 then begin
  6355. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6356. result := a;
  6357. exit;
  6358. end;
  6359. returnLargerSignificand:
  6360. if ( a.low < b.low ) then begin
  6361. result := b;
  6362. exit;
  6363. end;
  6364. if ( b.low < a.low ) then begin
  6365. result := a;
  6366. exit;
  6367. end;
  6368. if a.high < b.high then result := a else result := b;
  6369. exit;
  6370. end
  6371. else
  6372. result := b;
  6373. end;
  6374. {*----------------------------------------------------------------------------
  6375. | Returns the result of converting the extended double-precision floating-
  6376. | point value `a' to the single-precision floating-point format. The
  6377. | conversion is performed according to the IEC/IEEE Standard for Binary
  6378. | Floating-Point Arithmetic.
  6379. *----------------------------------------------------------------------------*}
  6380. function floatx80_to_float32(a: floatx80): float32;
  6381. var
  6382. aSign: flag;
  6383. aExp: int32;
  6384. aSig: bits64;
  6385. begin
  6386. aSig := extractFloatx80Frac( a );
  6387. aExp := extractFloatx80Exp( a );
  6388. aSign := extractFloatx80Sign( a );
  6389. if ( aExp = $7FFF ) then begin
  6390. if bits64( aSig shl 1 ) <> 0 then begin
  6391. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6392. exit;
  6393. end;
  6394. result := packFloat32( aSign, $FF, 0 );
  6395. exit;
  6396. end;
  6397. shift64RightJamming( aSig, 33, aSig );
  6398. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6399. result := roundAndPackFloat32( aSign, aExp, aSig );
  6400. end;
  6401. {*----------------------------------------------------------------------------
  6402. | Returns the result of converting the extended double-precision floating-
  6403. | point value `a' to the double-precision floating-point format. The
  6404. | conversion is performed according to the IEC/IEEE Standard for Binary
  6405. | Floating-Point Arithmetic.
  6406. *----------------------------------------------------------------------------*}
  6407. function floatx80_to_float64(a: floatx80): float64;
  6408. var
  6409. aSign: flag;
  6410. aExp: int32;
  6411. aSig, zSig: bits64;
  6412. begin
  6413. aSig := extractFloatx80Frac( a );
  6414. aExp := extractFloatx80Exp( a );
  6415. aSign := extractFloatx80Sign( a );
  6416. if ( aExp = $7FFF ) then begin
  6417. if bits64( aSig shl 1 ) <> 0 then begin
  6418. commonNaNToFloat64( floatx80ToCommonNaN( a ), result );
  6419. exit;
  6420. end;
  6421. result := packFloat64( aSign, $7FF, 0 );
  6422. exit;
  6423. end;
  6424. shift64RightJamming( aSig, 1, zSig );
  6425. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6426. result := roundAndPackFloat64( aSign, aExp, zSig );
  6427. end;
  6428. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6429. {*----------------------------------------------------------------------------
  6430. | Returns the result of converting the extended double-precision floating-
  6431. | point value `a' to the quadruple-precision floating-point format. The
  6432. | conversion is performed according to the IEC/IEEE Standard for Binary
  6433. | Floating-Point Arithmetic.
  6434. *----------------------------------------------------------------------------*}
  6435. function floatx80_to_float128(a: floatx80): float128;
  6436. var
  6437. aSign: flag;
  6438. aExp: int16;
  6439. aSig, zSig0, zSig1: bits64;
  6440. begin
  6441. aSig := extractFloatx80Frac( a );
  6442. aExp := extractFloatx80Exp( a );
  6443. aSign := extractFloatx80Sign( a );
  6444. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6445. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6446. exit;
  6447. end;
  6448. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6449. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6450. end;
  6451. {$endif FPC_SOFTFLOAT_FLOAT128}
  6452. {*----------------------------------------------------------------------------
  6453. | Rounds the extended double-precision floating-point value `a' to an integer,
  6454. | and Returns the result as an extended quadruple-precision floating-point
  6455. | value. The operation is performed according to the IEC/IEEE Standard for
  6456. | Binary Floating-Point Arithmetic.
  6457. *----------------------------------------------------------------------------*}
  6458. function floatx80_round_to_int(a: floatx80): floatx80;
  6459. var
  6460. aSign: flag;
  6461. aExp: int32;
  6462. lastBitMask, roundBitsMask: bits64;
  6463. roundingMode: int8;
  6464. z: floatx80;
  6465. begin
  6466. aExp := extractFloatx80Exp( a );
  6467. if ( $403E <= aExp ) then begin
  6468. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6469. result := propagateFloatx80NaN( a, a );
  6470. exit;
  6471. end;
  6472. result := a;
  6473. exit;
  6474. end;
  6475. if ( aExp < $3FFF ) then begin
  6476. if ( ( aExp = 0 )
  6477. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6478. result := a;
  6479. exit;
  6480. end;
  6481. set_inexact_flag;
  6482. aSign := extractFloatx80Sign( a );
  6483. case softfloat_rounding_mode of
  6484. float_round_nearest_even:
  6485. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6486. ) then begin
  6487. result :=
  6488. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6489. exit;
  6490. end;
  6491. float_round_down: begin
  6492. if aSign <> 0 then
  6493. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6494. else
  6495. result := packFloatx80( 0, 0, 0 );
  6496. exit;
  6497. end;
  6498. float_round_up: begin
  6499. if aSign <> 0 then
  6500. result := packFloatx80( 1, 0, 0 )
  6501. else
  6502. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6503. exit;
  6504. end;
  6505. end;
  6506. result := packFloatx80( aSign, 0, 0 );
  6507. exit;
  6508. end;
  6509. lastBitMask := 1;
  6510. lastBitMask := lastBitMask shl ( $403E - aExp );
  6511. roundBitsMask := lastBitMask - 1;
  6512. z := a;
  6513. roundingMode := softfloat_rounding_mode;
  6514. if ( roundingMode = float_round_nearest_even ) then begin
  6515. inc( z.low, lastBitMask shr 1 );
  6516. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6517. end
  6518. else if ( roundingMode <> float_round_to_zero ) then begin
  6519. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6520. inc( z.low, roundBitsMask );
  6521. end;
  6522. end;
  6523. z.low := z.low and not roundBitsMask;
  6524. if ( z.low = 0 ) then begin
  6525. inc(z.high);
  6526. z.low := bits64( $8000000000000000 );
  6527. end;
  6528. if ( z.low <> a.low ) then set_inexact_flag;
  6529. result := z;
  6530. end;
  6531. {*----------------------------------------------------------------------------
  6532. | Returns the result of adding the absolute values of the extended double-
  6533. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6534. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6535. | The addition is performed according to the IEC/IEEE Standard for Binary
  6536. | Floating-Point Arithmetic.
  6537. *----------------------------------------------------------------------------*}
  6538. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6539. var
  6540. aExp, bExp, zExp: int32;
  6541. aSig, bSig, zSig0, zSig1: bits64;
  6542. expDiff: int32;
  6543. label
  6544. shiftRight1, roundAndPack;
  6545. begin
  6546. aSig := extractFloatx80Frac( a );
  6547. aExp := extractFloatx80Exp( a );
  6548. bSig := extractFloatx80Frac( b );
  6549. bExp := extractFloatx80Exp( b );
  6550. expDiff := aExp - bExp;
  6551. if ( 0 < expDiff ) then begin
  6552. if ( aExp = $7FFF ) then begin
  6553. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6554. result := propagateFloatx80NaN( a, b );
  6555. exit;
  6556. end;
  6557. result := a;
  6558. exit;
  6559. end;
  6560. if ( bExp = 0 ) then dec(expDiff);
  6561. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6562. zExp := aExp;
  6563. end
  6564. else if ( expDiff < 0 ) then begin
  6565. if ( bExp = $7FFF ) then begin
  6566. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6567. result := propagateFloatx80NaN( a, b );
  6568. exit;
  6569. end;
  6570. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6571. exit;
  6572. end;
  6573. if ( aExp = 0 ) then inc(expDiff);
  6574. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6575. zExp := bExp;
  6576. end
  6577. else begin
  6578. if ( aExp = $7FFF ) then begin
  6579. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6580. result := propagateFloatx80NaN( a, b );
  6581. exit;
  6582. end;
  6583. result := a;
  6584. exit;
  6585. end;
  6586. zSig1 := 0;
  6587. zSig0 := aSig + bSig;
  6588. if ( aExp = 0 ) then begin
  6589. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6590. goto roundAndPack;
  6591. end;
  6592. zExp := aExp;
  6593. goto shiftRight1;
  6594. end;
  6595. zSig0 := aSig + bSig;
  6596. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6597. shiftRight1:
  6598. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6599. zSig0 := zSig0 or $8000000000000000;
  6600. inc(zExp);
  6601. roundAndPack:
  6602. result :=
  6603. roundAndPackFloatx80(
  6604. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6605. end;
  6606. {*----------------------------------------------------------------------------
  6607. | Returns the result of subtracting the absolute values of the extended
  6608. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6609. | difference is negated before being returned. `zSign' is ignored if the
  6610. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6611. | Standard for Binary Floating-Point Arithmetic.
  6612. *----------------------------------------------------------------------------*}
  6613. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6614. var
  6615. aExp, bExp, zExp: int32;
  6616. aSig, bSig, zSig0, zSig1: bits64;
  6617. expDiff: int32;
  6618. z: floatx80;
  6619. label
  6620. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6621. begin
  6622. aSig := extractFloatx80Frac( a );
  6623. aExp := extractFloatx80Exp( a );
  6624. bSig := extractFloatx80Frac( b );
  6625. bExp := extractFloatx80Exp( b );
  6626. expDiff := aExp - bExp;
  6627. if ( 0 < expDiff ) then goto aExpBigger;
  6628. if ( expDiff < 0 ) then goto bExpBigger;
  6629. if ( aExp = $7FFF ) then begin
  6630. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6631. result := propagateFloatx80NaN( a, b );
  6632. exit;
  6633. end;
  6634. float_raise( float_flag_invalid );
  6635. z.low := floatx80_default_nan_low;
  6636. z.high := floatx80_default_nan_high;
  6637. result := z;
  6638. exit;
  6639. end;
  6640. if ( aExp = 0 ) then begin
  6641. aExp := 1;
  6642. bExp := 1;
  6643. end;
  6644. zSig1 := 0;
  6645. if ( bSig < aSig ) then goto aBigger;
  6646. if ( aSig < bSig ) then goto bBigger;
  6647. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6648. exit;
  6649. bExpBigger:
  6650. if ( bExp = $7FFF ) then begin
  6651. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6652. result := propagateFloatx80NaN( a, b );
  6653. exit;
  6654. end;
  6655. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6656. exit;
  6657. end;
  6658. if ( aExp = 0 ) then inc(expDiff);
  6659. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6660. bBigger:
  6661. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6662. zExp := bExp;
  6663. zSign := zSign xor 1;
  6664. goto normalizeRoundAndPack;
  6665. aExpBigger:
  6666. if ( aExp = $7FFF ) then begin
  6667. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6668. result := propagateFloatx80NaN( a, b );
  6669. exit;
  6670. end;
  6671. result := a;
  6672. exit;
  6673. end;
  6674. if ( bExp = 0 ) then dec(expDiff);
  6675. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6676. aBigger:
  6677. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6678. zExp := aExp;
  6679. normalizeRoundAndPack:
  6680. result :=
  6681. normalizeRoundAndPackFloatx80(
  6682. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6683. end;
  6684. {*----------------------------------------------------------------------------
  6685. | Returns the result of adding the extended double-precision floating-point
  6686. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6687. | Standard for Binary Floating-Point Arithmetic.
  6688. *----------------------------------------------------------------------------*}
  6689. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6690. var
  6691. aSign, bSign: flag;
  6692. begin
  6693. aSign := extractFloatx80Sign( a );
  6694. bSign := extractFloatx80Sign( b );
  6695. if ( aSign = bSign ) then begin
  6696. result := addFloatx80Sigs( a, b, aSign );
  6697. end
  6698. else begin
  6699. result := subFloatx80Sigs( a, b, aSign );
  6700. end;
  6701. end;
  6702. {*----------------------------------------------------------------------------
  6703. | Returns the result of subtracting the extended double-precision floating-
  6704. | point values `a' and `b'. The operation is performed according to the
  6705. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6706. *----------------------------------------------------------------------------*}
  6707. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6708. var
  6709. aSign, bSign: flag;
  6710. begin
  6711. aSign := extractFloatx80Sign( a );
  6712. bSign := extractFloatx80Sign( b );
  6713. if ( aSign = bSign ) then begin
  6714. result := subFloatx80Sigs( a, b, aSign );
  6715. end
  6716. else begin
  6717. result := addFloatx80Sigs( a, b, aSign );
  6718. end;
  6719. end;
  6720. {*----------------------------------------------------------------------------
  6721. | Returns the result of multiplying the extended double-precision floating-
  6722. | point values `a' and `b'. The operation is performed according to the
  6723. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6724. *----------------------------------------------------------------------------*}
  6725. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6726. var
  6727. aSign, bSign, zSign: flag;
  6728. aExp, bExp, zExp: int32;
  6729. aSig, bSig, zSig0, zSig1: bits64;
  6730. z: floatx80;
  6731. label
  6732. invalid;
  6733. begin
  6734. aSig := extractFloatx80Frac( a );
  6735. aExp := extractFloatx80Exp( a );
  6736. aSign := extractFloatx80Sign( a );
  6737. bSig := extractFloatx80Frac( b );
  6738. bExp := extractFloatx80Exp( b );
  6739. bSign := extractFloatx80Sign( b );
  6740. zSign := aSign xor bSign;
  6741. if ( aExp = $7FFF ) then begin
  6742. if ( bits64( aSig shl 1 ) <> 0 )
  6743. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6744. result := propagateFloatx80NaN( a, b );
  6745. exit;
  6746. end;
  6747. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6748. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6749. exit;
  6750. end;
  6751. if ( bExp = $7FFF ) then begin
  6752. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6753. result := propagateFloatx80NaN( a, b );
  6754. exit;
  6755. end;
  6756. if ( ( aExp or aSig ) = 0 ) then begin
  6757. invalid:
  6758. float_raise( float_flag_invalid );
  6759. z.low := floatx80_default_nan_low;
  6760. z.high := floatx80_default_nan_high;
  6761. result := z;
  6762. exit;
  6763. end;
  6764. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6765. exit;
  6766. end;
  6767. if ( aExp = 0 ) then begin
  6768. if ( aSig = 0 ) then begin
  6769. result := packFloatx80( zSign, 0, 0 );
  6770. exit;
  6771. end;
  6772. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6773. end;
  6774. if ( bExp = 0 ) then begin
  6775. if ( bSig = 0 ) then begin
  6776. result := packFloatx80( zSign, 0, 0 );
  6777. exit;
  6778. end;
  6779. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6780. end;
  6781. zExp := aExp + bExp - $3FFE;
  6782. mul64To128( aSig, bSig, zSig0, zSig1 );
  6783. if 0 < sbits64( zSig0 ) then begin
  6784. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6785. dec(zExp);
  6786. end;
  6787. result :=
  6788. roundAndPackFloatx80(
  6789. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6790. end;
  6791. {*----------------------------------------------------------------------------
  6792. | Returns the result of dividing the extended double-precision floating-point
  6793. | value `a' by the corresponding value `b'. The operation is performed
  6794. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6795. *----------------------------------------------------------------------------*}
  6796. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6797. var
  6798. aSign, bSign, zSign: flag;
  6799. aExp, bExp, zExp: int32;
  6800. aSig, bSig, zSig0, zSig1: bits64;
  6801. rem0, rem1, rem2, term0, term1, term2: bits64;
  6802. z: floatx80;
  6803. label
  6804. invalid;
  6805. begin
  6806. aSig := extractFloatx80Frac( a );
  6807. aExp := extractFloatx80Exp( a );
  6808. aSign := extractFloatx80Sign( a );
  6809. bSig := extractFloatx80Frac( b );
  6810. bExp := extractFloatx80Exp( b );
  6811. bSign := extractFloatx80Sign( b );
  6812. zSign := aSign xor bSign;
  6813. if ( aExp = $7FFF ) then begin
  6814. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6815. result := propagateFloatx80NaN( a, b );
  6816. exit;
  6817. end;
  6818. if ( bExp = $7FFF ) then begin
  6819. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6820. result := propagateFloatx80NaN( a, b );
  6821. exit;
  6822. end;
  6823. goto invalid;
  6824. end;
  6825. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6826. exit;
  6827. end;
  6828. if ( bExp = $7FFF ) then begin
  6829. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6830. result := propagateFloatx80NaN( a, b );
  6831. exit;
  6832. end;
  6833. result := packFloatx80( zSign, 0, 0 );
  6834. exit;
  6835. end;
  6836. if ( bExp = 0 ) then begin
  6837. if ( bSig = 0 ) then begin
  6838. if ( ( aExp or aSig ) = 0 ) then begin
  6839. invalid:
  6840. float_raise( float_flag_invalid );
  6841. z.low := floatx80_default_nan_low;
  6842. z.high := floatx80_default_nan_high;
  6843. result := z;
  6844. exit;
  6845. end;
  6846. float_raise( float_flag_divbyzero );
  6847. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6848. exit;
  6849. end;
  6850. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6851. end;
  6852. if ( aExp = 0 ) then begin
  6853. if ( aSig = 0 ) then begin
  6854. result := packFloatx80( zSign, 0, 0 );
  6855. exit;
  6856. end;
  6857. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6858. end;
  6859. zExp := aExp - bExp + $3FFE;
  6860. rem1 := 0;
  6861. if ( bSig <= aSig ) then begin
  6862. shift128Right( aSig, 0, 1, aSig, rem1 );
  6863. inc(zExp);
  6864. end;
  6865. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6866. mul64To128( bSig, zSig0, term0, term1 );
  6867. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6868. while ( sbits64( rem0 ) < 0 ) do begin
  6869. dec(zSig0);
  6870. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6871. end;
  6872. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6873. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6874. mul64To128( bSig, zSig1, term1, term2 );
  6875. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6876. while ( sbits64( rem1 ) < 0 ) do begin
  6877. dec(zSig1);
  6878. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6879. end;
  6880. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6881. end;
  6882. result :=
  6883. roundAndPackFloatx80(
  6884. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6885. end;
  6886. {*----------------------------------------------------------------------------
  6887. | Returns the remainder of the extended double-precision floating-point value
  6888. | `a' with respect to the corresponding value `b'. The operation is performed
  6889. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6890. *----------------------------------------------------------------------------*}
  6891. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6892. var
  6893. aSign, zSign: flag;
  6894. aExp, bExp, expDiff: int32;
  6895. aSig0, aSig1, bSig: bits64;
  6896. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6897. z: floatx80;
  6898. label
  6899. invalid;
  6900. begin
  6901. aSig0 := extractFloatx80Frac( a );
  6902. aExp := extractFloatx80Exp( a );
  6903. aSign := extractFloatx80Sign( a );
  6904. bSig := extractFloatx80Frac( b );
  6905. bExp := extractFloatx80Exp( b );
  6906. if ( aExp = $7FFF ) then begin
  6907. if ( bits64( aSig0 shl 1 ) <> 0 )
  6908. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6909. result := propagateFloatx80NaN( a, b );
  6910. exit;
  6911. end;
  6912. goto invalid;
  6913. end;
  6914. if ( bExp = $7FFF ) then begin
  6915. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6916. result := propagateFloatx80NaN( a, b );
  6917. exit;
  6918. end;
  6919. result := a;
  6920. exit;
  6921. end;
  6922. if ( bExp = 0 ) then begin
  6923. if ( bSig = 0 ) then begin
  6924. invalid:
  6925. float_raise( float_flag_invalid );
  6926. z.low := floatx80_default_nan_low;
  6927. z.high := floatx80_default_nan_high;
  6928. result := z;
  6929. exit;
  6930. end;
  6931. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6932. end;
  6933. if ( aExp = 0 ) then begin
  6934. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6935. result := a;
  6936. exit;
  6937. end;
  6938. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6939. end;
  6940. bSig := bSig or $8000000000000000;
  6941. zSign := aSign;
  6942. expDiff := aExp - bExp;
  6943. aSig1 := 0;
  6944. if ( expDiff < 0 ) then begin
  6945. if ( expDiff < -1 ) then begin
  6946. result := a;
  6947. exit;
  6948. end;
  6949. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6950. expDiff := 0;
  6951. end;
  6952. q := ord( bSig <= aSig0 );
  6953. if ( q <> 0 ) then dec( aSig0, bSig );
  6954. dec( expDiff, 64 );
  6955. while ( 0 < expDiff ) do begin
  6956. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6957. if ( 2 < q ) then q := q - 2 else q := 0;
  6958. mul64To128( bSig, q, term0, term1 );
  6959. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6960. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6961. dec( expDiff, 62 );
  6962. end;
  6963. inc( expDiff, 64 );
  6964. if ( 0 < expDiff ) then begin
  6965. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6966. if ( 2 < q ) then q:= q - 2 else q := 0;
  6967. q := q shr ( 64 - expDiff );
  6968. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6969. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6970. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6971. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  6972. inc(q);
  6973. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6974. end;
  6975. end
  6976. else begin
  6977. term1 := 0;
  6978. term0 := bSig;
  6979. end;
  6980. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6981. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6982. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  6983. and ( q and 1 <> 0 ) )
  6984. then begin
  6985. aSig0 := alternateASig0;
  6986. aSig1 := alternateASig1;
  6987. zSign := ord( zSign = 0 );
  6988. end;
  6989. result :=
  6990. normalizeRoundAndPackFloatx80(
  6991. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6992. end;
  6993. {*----------------------------------------------------------------------------
  6994. | Returns the square root of the extended double-precision floating-point
  6995. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6996. | for Binary Floating-Point Arithmetic.
  6997. *----------------------------------------------------------------------------*}
  6998. function floatx80_sqrt(a: floatx80): floatx80;
  6999. var
  7000. aSign: flag;
  7001. aExp, zExp: int32;
  7002. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7003. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7004. z: floatx80;
  7005. label
  7006. invalid;
  7007. begin
  7008. aSig0 := extractFloatx80Frac( a );
  7009. aExp := extractFloatx80Exp( a );
  7010. aSign := extractFloatx80Sign( a );
  7011. if ( aExp = $7FFF ) then begin
  7012. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7013. result := propagateFloatx80NaN( a, a );
  7014. exit;
  7015. end;
  7016. if ( aSign = 0 ) then begin
  7017. result := a;
  7018. exit;
  7019. end;
  7020. goto invalid;
  7021. end;
  7022. if ( aSign <> 0 ) then begin
  7023. if ( ( aExp or aSig0 ) = 0 ) then begin
  7024. result := a;
  7025. exit;
  7026. end;
  7027. invalid:
  7028. float_raise( float_flag_invalid );
  7029. z.low := floatx80_default_nan_low;
  7030. z.high := floatx80_default_nan_high;
  7031. result := z;
  7032. exit;
  7033. end;
  7034. if ( aExp = 0 ) then begin
  7035. if ( aSig0 = 0 ) then begin
  7036. result := packFloatx80( 0, 0, 0 );
  7037. exit;
  7038. end;
  7039. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7040. end;
  7041. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  7042. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  7043. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7044. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7045. doubleZSig0 := zSig0 shl 1;
  7046. mul64To128( zSig0, zSig0, term0, term1 );
  7047. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7048. while ( sbits64( rem0 ) < 0 ) do begin
  7049. dec(zSig0);
  7050. dec( doubleZSig0, 2 );
  7051. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  7052. end;
  7053. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7054. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7055. if ( zSig1 = 0 ) then zSig1 := 1;
  7056. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7057. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7058. mul64To128( zSig1, zSig1, term2, term3 );
  7059. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7060. while ( sbits64( rem1 ) < 0 ) do begin
  7061. dec(zSig1);
  7062. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7063. term3 := term3 or 1;
  7064. term2 := term2 or doubleZSig0;
  7065. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7066. end;
  7067. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7068. end;
  7069. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7070. zSig0 := zSig0 or doubleZSig0;
  7071. result :=
  7072. roundAndPackFloatx80(
  7073. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7074. end;
  7075. {*----------------------------------------------------------------------------
  7076. | Returns 1 if the extended double-precision floating-point value `a' is
  7077. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7078. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7079. | Arithmetic.
  7080. *----------------------------------------------------------------------------*}
  7081. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7082. begin
  7083. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7084. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7085. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7086. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7087. ) then begin
  7088. if ( floatx80_is_signaling_nan( a )
  7089. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7090. float_raise( float_flag_invalid );
  7091. end;
  7092. result := 0;
  7093. exit;
  7094. end;
  7095. result := ord(
  7096. ( a.low = b.low )
  7097. and ( ( a.high = b.high )
  7098. or ( ( a.low = 0 )
  7099. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7100. ) );
  7101. end;
  7102. {*----------------------------------------------------------------------------
  7103. | Returns 1 if the extended double-precision floating-point value `a' is
  7104. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7105. | comparison is performed according to the IEC/IEEE Standard for Binary
  7106. | Floating-Point Arithmetic.
  7107. *----------------------------------------------------------------------------*}
  7108. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7109. var
  7110. aSign, bSign: flag;
  7111. begin
  7112. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7113. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7114. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7115. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7116. then begin
  7117. float_raise( float_flag_invalid );
  7118. result := 0;
  7119. exit;
  7120. end;
  7121. aSign := extractFloatx80Sign( a );
  7122. bSign := extractFloatx80Sign( b );
  7123. if ( aSign <> bSign ) then begin
  7124. result := ord(
  7125. ( aSign <> 0 )
  7126. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7127. exit;
  7128. end;
  7129. if aSign<>0 then
  7130. result := le128( b.high, b.low, a.high, a.low )
  7131. else
  7132. result := le128( a.high, a.low, b.high, b.low );
  7133. end;
  7134. {*----------------------------------------------------------------------------
  7135. | Returns 1 if the extended double-precision floating-point value `a' is
  7136. | less than the corresponding value `b', and 0 otherwise. The comparison
  7137. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7138. | Arithmetic.
  7139. *----------------------------------------------------------------------------*}
  7140. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7141. var
  7142. aSign, bSign: flag;
  7143. begin
  7144. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7145. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7146. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7147. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7148. then begin
  7149. float_raise( float_flag_invalid );
  7150. result := 0;
  7151. exit;
  7152. end;
  7153. aSign := extractFloatx80Sign( a );
  7154. bSign := extractFloatx80Sign( b );
  7155. if ( aSign <> bSign ) then begin
  7156. result := ord(
  7157. ( aSign <> 0 )
  7158. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7159. exit;
  7160. end;
  7161. if aSign <> 0 then
  7162. result := lt128( b.high, b.low, a.high, a.low )
  7163. else
  7164. result := lt128( a.high, a.low, b.high, b.low );
  7165. end;
  7166. {*----------------------------------------------------------------------------
  7167. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7168. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7169. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7170. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7171. *----------------------------------------------------------------------------*}
  7172. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7173. begin
  7174. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7175. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7176. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7177. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7178. then begin
  7179. float_raise( float_flag_invalid );
  7180. result := 0;
  7181. exit;
  7182. end;
  7183. result := ord(
  7184. ( a.low = b.low )
  7185. and ( ( a.high = b.high )
  7186. or ( ( a.low = 0 )
  7187. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7188. ) );
  7189. end;
  7190. {*----------------------------------------------------------------------------
  7191. | Returns 1 if the extended double-precision floating-point value `a' is less
  7192. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7193. | do not cause an exception. Otherwise, the comparison is performed according
  7194. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7195. *----------------------------------------------------------------------------*}
  7196. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7197. var
  7198. aSign, bSign: flag;
  7199. begin
  7200. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7201. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7202. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7203. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7204. then begin
  7205. if ( floatx80_is_signaling_nan( a )
  7206. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7207. float_raise( float_flag_invalid );
  7208. end;
  7209. result := 0;
  7210. exit;
  7211. end;
  7212. aSign := extractFloatx80Sign( a );
  7213. bSign := extractFloatx80Sign( b );
  7214. if ( aSign <> bSign ) then begin
  7215. result := ord(
  7216. ( aSign <> 0 )
  7217. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7218. exit;
  7219. end;
  7220. if aSign <> 0 then
  7221. result := le128( b.high, b.low, a.high, a.low )
  7222. else
  7223. result := le128( a.high, a.low, b.high, b.low );
  7224. end;
  7225. {*----------------------------------------------------------------------------
  7226. | Returns 1 if the extended double-precision floating-point value `a' is less
  7227. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7228. | an exception. Otherwise, the comparison is performed according to the
  7229. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7230. *----------------------------------------------------------------------------*}
  7231. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7232. var
  7233. aSign, bSign: flag;
  7234. begin
  7235. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7236. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7237. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7238. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7239. then begin
  7240. if ( floatx80_is_signaling_nan( a )
  7241. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7242. float_raise( float_flag_invalid );
  7243. end;
  7244. result := 0;
  7245. exit;
  7246. end;
  7247. aSign := extractFloatx80Sign( a );
  7248. bSign := extractFloatx80Sign( b );
  7249. if ( aSign <> bSign ) then begin
  7250. result := ord(
  7251. ( aSign <> 0 )
  7252. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7253. exit;
  7254. end;
  7255. if aSign <> 0 then
  7256. result := lt128( b.high, b.low, a.high, a.low )
  7257. else
  7258. result := lt128( a.high, a.low, b.high, b.low );
  7259. end;
  7260. {$endif FPC_SOFTFLOAT_FLOATX80}
  7261. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7262. {*----------------------------------------------------------------------------
  7263. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7264. | floating-point value `a'.
  7265. *----------------------------------------------------------------------------*}
  7266. function extractFloat128Frac1(a : float128): bits64;
  7267. begin
  7268. result:=a.low;
  7269. end;
  7270. {*----------------------------------------------------------------------------
  7271. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7272. | floating-point value `a'.
  7273. *----------------------------------------------------------------------------*}
  7274. function extractFloat128Frac0(a : float128): bits64;
  7275. begin
  7276. result:=a.high and int64($0000FFFFFFFFFFFF);
  7277. end;
  7278. {*----------------------------------------------------------------------------
  7279. | Returns the exponent bits of the quadruple-precision floating-point value
  7280. | `a'.
  7281. *----------------------------------------------------------------------------*}
  7282. function extractFloat128Exp(a : float128): int32;
  7283. begin
  7284. result:=( a.high shr 48 ) and $7FFF;
  7285. end;
  7286. {*----------------------------------------------------------------------------
  7287. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7288. *----------------------------------------------------------------------------*}
  7289. function extractFloat128Sign(a : float128): flag;
  7290. begin
  7291. result:=a.high shr 63;
  7292. end;
  7293. {*----------------------------------------------------------------------------
  7294. | Normalizes the subnormal quadruple-precision floating-point value
  7295. | represented by the denormalized significand formed by the concatenation of
  7296. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7297. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7298. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7299. | least significant 64 bits of the normalized significand are stored at the
  7300. | location pointed to by `zSig1Ptr'.
  7301. *----------------------------------------------------------------------------*}
  7302. procedure normalizeFloat128Subnormal(
  7303. aSig0: bits64;
  7304. aSig1: bits64;
  7305. var zExpPtr: int32;
  7306. var zSig0Ptr: bits64;
  7307. var zSig1Ptr: bits64);
  7308. var
  7309. shiftCount: int8;
  7310. begin
  7311. if ( aSig0 = 0 ) then
  7312. begin
  7313. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7314. if ( shiftCount < 0 ) then
  7315. begin
  7316. zSig0Ptr := aSig1 shr ( - shiftCount );
  7317. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7318. end
  7319. else begin
  7320. zSig0Ptr := aSig1 shl shiftCount;
  7321. zSig1Ptr := 0;
  7322. end;
  7323. zExpPtr := - shiftCount - 63;
  7324. end
  7325. else begin
  7326. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7327. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7328. zExpPtr := 1 - shiftCount;
  7329. end;
  7330. end;
  7331. {*----------------------------------------------------------------------------
  7332. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7333. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7334. | floating-point value, returning the result. After being shifted into the
  7335. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7336. | added together to form the most significant 32 bits of the result. This
  7337. | means that any integer portion of `zSig0' will be added into the exponent.
  7338. | Since a properly normalized significand will have an integer portion equal
  7339. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7340. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7341. | significand.
  7342. *----------------------------------------------------------------------------*}
  7343. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7344. var
  7345. z: float128;
  7346. begin
  7347. z.low := zSig1;
  7348. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7349. result:=z;
  7350. end;
  7351. {*----------------------------------------------------------------------------
  7352. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7353. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7354. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7355. | corresponding to the abstract input. Ordinarily, the abstract value is
  7356. | simply rounded and packed into the quadruple-precision format, with the
  7357. | inexact exception raised if the abstract input cannot be represented
  7358. | exactly. However, if the abstract value is too large, the overflow and
  7359. | inexact exceptions are raised and an infinity or maximal finite value is
  7360. | returned. If the abstract value is too small, the input value is rounded to
  7361. | a subnormal number, and the underflow and inexact exceptions are raised if
  7362. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7363. | precision floating-point number.
  7364. | The input significand must be normalized or smaller. If the input
  7365. | significand is not normalized, `zExp' must be 0; in that case, the result
  7366. | returned is a subnormal number, and it must not require rounding. In the
  7367. | usual case that the input significand is normalized, `zExp' must be 1 less
  7368. | than the ``true'' floating-point exponent. The handling of underflow and
  7369. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7370. *----------------------------------------------------------------------------*}
  7371. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7372. var
  7373. roundingMode: int8;
  7374. roundNearestEven, increment, isTiny: flag;
  7375. begin
  7376. roundingMode := softfloat_rounding_mode;
  7377. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7378. increment := ord( sbits64(zSig2) < 0 );
  7379. if ( roundNearestEven=0 ) then
  7380. begin
  7381. if ( roundingMode = float_round_to_zero ) then
  7382. begin
  7383. increment := 0;
  7384. end
  7385. else begin
  7386. if ( zSign<>0 ) then
  7387. begin
  7388. increment := ord( roundingMode = float_round_down ) and zSig2;
  7389. end
  7390. else begin
  7391. increment := ord( roundingMode = float_round_up ) and zSig2;
  7392. end;
  7393. end;
  7394. end;
  7395. if ( $7FFD <= bits32(zExp) ) then
  7396. begin
  7397. if ( ord( $7FFD < zExp )
  7398. or ( ord( zExp = $7FFD )
  7399. and eq128(
  7400. int64( $0001FFFFFFFFFFFF ),
  7401. bits64( $FFFFFFFFFFFFFFFF ),
  7402. zSig0,
  7403. zSig1
  7404. )
  7405. and increment
  7406. )
  7407. )<>0 then
  7408. begin
  7409. float_raise( [float_flag_overflow,float_flag_inexact] );
  7410. if ( ord( roundingMode = float_round_to_zero )
  7411. or ( zSign and ord( roundingMode = float_round_up ) )
  7412. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7413. )<>0 then
  7414. begin
  7415. result :=
  7416. packFloat128(
  7417. zSign,
  7418. $7FFE,
  7419. int64( $0000FFFFFFFFFFFF ),
  7420. bits64( $FFFFFFFFFFFFFFFF )
  7421. );
  7422. exit;
  7423. end;
  7424. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7425. exit;
  7426. end;
  7427. if ( zExp < 0 ) then
  7428. begin
  7429. isTiny :=
  7430. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7431. or ( zExp < -1 )
  7432. or not( increment<>0 )
  7433. or boolean(lt128(
  7434. zSig0,
  7435. zSig1,
  7436. int64( $0001FFFFFFFFFFFF ),
  7437. bits64( $FFFFFFFFFFFFFFFF )
  7438. )));
  7439. shift128ExtraRightJamming(
  7440. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7441. zExp := 0;
  7442. if ( isTiny and zSig2 )<>0 then
  7443. float_raise( float_flag_underflow );
  7444. if ( roundNearestEven<>0 ) then
  7445. begin
  7446. increment := ord( sbits64(zSig2) < 0 );
  7447. end
  7448. else begin
  7449. if ( zSign<>0 ) then
  7450. begin
  7451. increment := ord( roundingMode = float_round_down ) and zSig2;
  7452. end
  7453. else begin
  7454. increment := ord( roundingMode = float_round_up ) and zSig2;
  7455. end;
  7456. end;
  7457. end;
  7458. end;
  7459. if ( zSig2<>0 ) then
  7460. set_inexact_flag;
  7461. if ( increment<>0 ) then
  7462. begin
  7463. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7464. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7465. end
  7466. else begin
  7467. if ( ( zSig0 or zSig1 ) = 0 ) then
  7468. zExp := 0;
  7469. end;
  7470. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7471. end;
  7472. {*----------------------------------------------------------------------------
  7473. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7474. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7475. | returns the proper quadruple-precision floating-point value corresponding
  7476. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7477. | except that the input significand has fewer bits and does not have to be
  7478. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7479. | point exponent.
  7480. *----------------------------------------------------------------------------*}
  7481. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7482. var
  7483. shiftCount: int8;
  7484. zSig2: bits64;
  7485. begin
  7486. if ( zSig0 = 0 ) then
  7487. begin
  7488. zSig0 := zSig1;
  7489. zSig1 := 0;
  7490. dec(zExp, 64);
  7491. end;
  7492. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7493. if ( 0 <= shiftCount ) then
  7494. begin
  7495. zSig2 := 0;
  7496. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7497. end
  7498. else begin
  7499. shift128ExtraRightJamming(
  7500. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7501. end;
  7502. dec(zExp, shiftCount);
  7503. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7504. end;
  7505. {*----------------------------------------------------------------------------
  7506. | Returns the result of converting the quadruple-precision floating-point
  7507. | value `a' to the 32-bit two's complement integer format. The conversion
  7508. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7509. | Arithmetic---which means in particular that the conversion is rounded
  7510. | according to the current rounding mode. If `a' is a NaN, the largest
  7511. | positive integer is returned. Otherwise, if the conversion overflows, the
  7512. | largest integer with the same sign as `a' is returned.
  7513. *----------------------------------------------------------------------------*}
  7514. function float128_to_int32(a: float128): int32;
  7515. var
  7516. aSign: flag;
  7517. aExp, shiftCount: int32;
  7518. aSig0, aSig1: bits64;
  7519. begin
  7520. aSig1 := extractFloat128Frac1( a );
  7521. aSig0 := extractFloat128Frac0( a );
  7522. aExp := extractFloat128Exp( a );
  7523. aSign := extractFloat128Sign( a );
  7524. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7525. aSign := 0;
  7526. if ( aExp<>0 ) then
  7527. aSig0 := aSig0 or int64( $0001000000000000 );
  7528. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7529. shiftCount := $4028 - aExp;
  7530. if ( 0 < shiftCount ) then
  7531. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7532. result := roundAndPackInt32( aSign, aSig0 );
  7533. end;
  7534. {*----------------------------------------------------------------------------
  7535. | Returns the result of converting the quadruple-precision floating-point
  7536. | value `a' to the 32-bit two's complement integer format. The conversion
  7537. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7538. | Arithmetic, except that the conversion is always rounded toward zero. If
  7539. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7540. | conversion overflows, the largest integer with the same sign as `a' is
  7541. | returned.
  7542. *----------------------------------------------------------------------------*}
  7543. function float128_to_int32_round_to_zero(a: float128): int32;
  7544. var
  7545. aSign: flag;
  7546. aExp, shiftCount: int32;
  7547. aSig0, aSig1, savedASig: bits64;
  7548. z: int32;
  7549. label
  7550. invalid;
  7551. begin
  7552. aSig1 := extractFloat128Frac1( a );
  7553. aSig0 := extractFloat128Frac0( a );
  7554. aExp := extractFloat128Exp( a );
  7555. aSign := extractFloat128Sign( a );
  7556. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7557. if ( $401E < aExp ) then
  7558. begin
  7559. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7560. aSign := 0;
  7561. goto invalid;
  7562. end
  7563. else if ( aExp < $3FFF ) then
  7564. begin
  7565. if ( aExp or aSig0 )<>0 then
  7566. set_inexact_flag;
  7567. result := 0;
  7568. exit;
  7569. end;
  7570. aSig0 := aSig0 or int64( $0001000000000000 );
  7571. shiftCount := $402F - aExp;
  7572. savedASig := aSig0;
  7573. aSig0 := aSig0 shr shiftCount;
  7574. z := aSig0;
  7575. if ( aSign )<>0 then
  7576. z := - z;
  7577. if ( ord( z < 0 ) xor aSign )<>0 then
  7578. begin
  7579. invalid:
  7580. float_raise( float_flag_invalid );
  7581. if aSign<>0 then
  7582. result:= int32( $80000000 )
  7583. else
  7584. result:=$7FFFFFFF;
  7585. exit;
  7586. end;
  7587. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7588. begin
  7589. set_inexact_flag;
  7590. end;
  7591. result := z;
  7592. end;
  7593. {*----------------------------------------------------------------------------
  7594. | Returns the result of converting the quadruple-precision floating-point
  7595. | value `a' to the 64-bit two's complement integer format. The conversion
  7596. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7597. | Arithmetic---which means in particular that the conversion is rounded
  7598. | according to the current rounding mode. If `a' is a NaN, the largest
  7599. | positive integer is returned. Otherwise, if the conversion overflows, the
  7600. | largest integer with the same sign as `a' is returned.
  7601. *----------------------------------------------------------------------------*}
  7602. function float128_to_int64(a: float128): int64;
  7603. var
  7604. aSign: flag;
  7605. aExp, shiftCount: int32;
  7606. aSig0, aSig1: bits64;
  7607. begin
  7608. aSig1 := extractFloat128Frac1( a );
  7609. aSig0 := extractFloat128Frac0( a );
  7610. aExp := extractFloat128Exp( a );
  7611. aSign := extractFloat128Sign( a );
  7612. if ( aExp<>0 ) then
  7613. aSig0 := aSig0 or int64( $0001000000000000 );
  7614. shiftCount := $402F - aExp;
  7615. if ( shiftCount <= 0 ) then
  7616. begin
  7617. if ( $403E < aExp ) then
  7618. begin
  7619. float_raise( float_flag_invalid );
  7620. if ( (aSign=0)
  7621. or ( ( aExp = $7FFF )
  7622. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7623. )
  7624. ) then
  7625. begin
  7626. result := int64( $7FFFFFFFFFFFFFFF );
  7627. exit;
  7628. end;
  7629. result := int64( $8000000000000000 );
  7630. exit;
  7631. end;
  7632. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7633. end
  7634. else begin
  7635. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7636. end;
  7637. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7638. end;
  7639. {*----------------------------------------------------------------------------
  7640. | Returns the result of converting the quadruple-precision floating-point
  7641. | value `a' to the 64-bit two's complement integer format. The conversion
  7642. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7643. | Arithmetic, except that the conversion is always rounded toward zero.
  7644. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7645. | the conversion overflows, the largest integer with the same sign as `a' is
  7646. | returned.
  7647. *----------------------------------------------------------------------------*}
  7648. function float128_to_int64_round_to_zero(a: float128): int64;
  7649. var
  7650. aSign: flag;
  7651. aExp, shiftCount: int32;
  7652. aSig0, aSig1: bits64;
  7653. z: int64;
  7654. begin
  7655. aSig1 := extractFloat128Frac1( a );
  7656. aSig0 := extractFloat128Frac0( a );
  7657. aExp := extractFloat128Exp( a );
  7658. aSign := extractFloat128Sign( a );
  7659. if ( aExp<>0 ) then
  7660. aSig0 := aSig0 or int64( $0001000000000000 );
  7661. shiftCount := aExp - $402F;
  7662. if ( 0 < shiftCount ) then
  7663. begin
  7664. if ( $403E <= aExp ) then
  7665. begin
  7666. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7667. if ( ( a.high = bits64( $C03E000000000000 ) )
  7668. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7669. begin
  7670. if ( aSig1<>0 ) then
  7671. set_inexact_flag;
  7672. end
  7673. else begin
  7674. float_raise( float_flag_invalid );
  7675. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7676. begin
  7677. result := int64( $7FFFFFFFFFFFFFFF );
  7678. exit;
  7679. end;
  7680. end;
  7681. result := int64( $8000000000000000 );
  7682. exit;
  7683. end;
  7684. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  7685. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7686. begin
  7687. set_inexact_flag;
  7688. end;
  7689. end
  7690. else begin
  7691. if ( aExp < $3FFF ) then
  7692. begin
  7693. if ( aExp or aSig0 or aSig1 )<>0 then
  7694. begin
  7695. set_inexact_flag;
  7696. end;
  7697. result := 0;
  7698. exit;
  7699. end;
  7700. z := aSig0 shr ( - shiftCount );
  7701. if ( (aSig1<>0)
  7702. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7703. begin
  7704. set_inexact_flag;
  7705. end;
  7706. end;
  7707. if ( aSign<>0 ) then
  7708. z := - z;
  7709. result := z;
  7710. end;
  7711. {*----------------------------------------------------------------------------
  7712. | Returns the result of converting the quadruple-precision floating-point
  7713. | value `a' to the single-precision floating-point format. The conversion
  7714. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7715. | Arithmetic.
  7716. *----------------------------------------------------------------------------*}
  7717. function float128_to_float32(a: float128): float32;
  7718. var
  7719. aSign: flag;
  7720. aExp: int32;
  7721. aSig0, aSig1: bits64;
  7722. zSig: bits32;
  7723. begin
  7724. aSig1 := extractFloat128Frac1( a );
  7725. aSig0 := extractFloat128Frac0( a );
  7726. aExp := extractFloat128Exp( a );
  7727. aSign := extractFloat128Sign( a );
  7728. if ( aExp = $7FFF ) then
  7729. begin
  7730. if ( aSig0 or aSig1 )<>0 then
  7731. begin
  7732. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7733. exit;
  7734. end;
  7735. result := packFloat32( aSign, $FF, 0 );
  7736. exit;
  7737. end;
  7738. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7739. shift64RightJamming( aSig0, 18, aSig0 );
  7740. zSig := aSig0;
  7741. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7742. begin
  7743. zSig := zSig or $40000000;
  7744. dec(aExp,$3F81);
  7745. end;
  7746. result := roundAndPackFloat32( aSign, aExp, zSig );
  7747. end;
  7748. {*----------------------------------------------------------------------------
  7749. | Returns the result of converting the quadruple-precision floating-point
  7750. | value `a' to the double-precision floating-point format. The conversion
  7751. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7752. | Arithmetic.
  7753. *----------------------------------------------------------------------------*}
  7754. function float128_to_float64(a: float128): float64;
  7755. var
  7756. aSign: flag;
  7757. aExp: int32;
  7758. aSig0, aSig1: bits64;
  7759. begin
  7760. aSig1 := extractFloat128Frac1( a );
  7761. aSig0 := extractFloat128Frac0( a );
  7762. aExp := extractFloat128Exp( a );
  7763. aSign := extractFloat128Sign( a );
  7764. if ( aExp = $7FFF ) then
  7765. begin
  7766. if ( aSig0 or aSig1 )<>0 then
  7767. begin
  7768. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7769. exit;
  7770. end;
  7771. result:=packFloat64( aSign, $7FF, 0);
  7772. exit;
  7773. end;
  7774. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7775. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7776. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7777. begin
  7778. aSig0 := aSig0 or int64( $4000000000000000 );
  7779. dec(aExp,$3C01);
  7780. end;
  7781. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7782. end;
  7783. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7784. {*----------------------------------------------------------------------------
  7785. | Returns the result of converting the quadruple-precision floating-point
  7786. | value `a' to the extended double-precision floating-point format. The
  7787. | conversion is performed according to the IEC/IEEE Standard for Binary
  7788. | Floating-Point Arithmetic.
  7789. *----------------------------------------------------------------------------*}
  7790. function float128_to_floatx80(a: float128): floatx80;
  7791. var
  7792. aSign: flag;
  7793. aExp: int32;
  7794. aSig0, aSig1: bits64;
  7795. begin
  7796. aSig1 := extractFloat128Frac1( a );
  7797. aSig0 := extractFloat128Frac0( a );
  7798. aExp := extractFloat128Exp( a );
  7799. aSign := extractFloat128Sign( a );
  7800. if ( aExp = $7FFF ) then begin
  7801. if ( aSig0 or aSig1 <> 0 ) then begin
  7802. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7803. exit;
  7804. end;
  7805. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7806. exit;
  7807. end;
  7808. if ( aExp = 0 ) then begin
  7809. if ( ( aSig0 or aSig1 ) = 0 ) then
  7810. begin
  7811. result := packFloatx80( aSign, 0, 0 );
  7812. exit;
  7813. end;
  7814. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7815. end
  7816. else begin
  7817. aSig0 := aSig0 or int64( $0001000000000000 );
  7818. end;
  7819. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7820. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7821. end;
  7822. {$endif FPC_SOFTFLOAT_FLOATX80}
  7823. {*----------------------------------------------------------------------------
  7824. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7825. | Returns the result as a quadruple-precision floating-point value. The
  7826. | operation is performed according to the IEC/IEEE Standard for Binary
  7827. | Floating-Point Arithmetic.
  7828. *----------------------------------------------------------------------------*}
  7829. function float128_round_to_int(a: float128): float128;
  7830. var
  7831. aSign: flag;
  7832. aExp: int32;
  7833. lastBitMask, roundBitsMask: bits64;
  7834. roundingMode: int8;
  7835. z: float128;
  7836. begin
  7837. aExp := extractFloat128Exp( a );
  7838. if ( $402F <= aExp ) then
  7839. begin
  7840. if ( $406F <= aExp ) then
  7841. begin
  7842. if ( ( aExp = $7FFF )
  7843. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7844. ) then
  7845. begin
  7846. result := propagateFloat128NaN( a, a );
  7847. exit;
  7848. end;
  7849. result := a;
  7850. exit;
  7851. end;
  7852. lastBitMask := 1;
  7853. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7854. roundBitsMask := lastBitMask - 1;
  7855. z := a;
  7856. roundingMode := softfloat_rounding_mode;
  7857. if ( roundingMode = float_round_nearest_even ) then
  7858. begin
  7859. if ( lastBitMask )<>0 then
  7860. begin
  7861. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7862. if ( ( z.low and roundBitsMask ) = 0 ) then
  7863. z.low := z.low and not(lastBitMask);
  7864. end
  7865. else begin
  7866. if ( sbits64(z.low) < 0 ) then
  7867. begin
  7868. inc(z.high);
  7869. if ( bits64( z.low shl 1 ) = 0 ) then
  7870. z.high := z.high and not bits64( 1 );
  7871. end;
  7872. end;
  7873. end
  7874. else if ( roundingMode <> float_round_to_zero ) then
  7875. begin
  7876. if ( extractFloat128Sign( z )
  7877. xor ord( roundingMode = float_round_up ) )<>0 then
  7878. begin
  7879. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7880. end;
  7881. end;
  7882. z.low := z.low and not(roundBitsMask);
  7883. end
  7884. else begin
  7885. if ( aExp < $3FFF ) then
  7886. begin
  7887. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7888. begin
  7889. result := a;
  7890. exit;
  7891. end;
  7892. set_inexact_flag;
  7893. aSign := extractFloat128Sign( a );
  7894. case softfloat_rounding_mode of
  7895. float_round_nearest_even:
  7896. if ( ( aExp = $3FFE )
  7897. and ( (extractFloat128Frac0( a )<>0)
  7898. or (extractFloat128Frac1( a )<>0) )
  7899. ) then begin
  7900. begin
  7901. result := packFloat128( aSign, $3FFF, 0, 0 );
  7902. exit;
  7903. end;
  7904. end;
  7905. float_round_down:
  7906. begin
  7907. if aSign<>0 then
  7908. result:=packFloat128( 1, $3FFF, 0, 0 )
  7909. else
  7910. result:=packFloat128( 0, 0, 0, 0 );
  7911. exit;
  7912. end;
  7913. float_round_up:
  7914. begin
  7915. if aSign<>0 then
  7916. result := packFloat128( 1, 0, 0, 0 )
  7917. else
  7918. result:=packFloat128( 0, $3FFF, 0, 0 );
  7919. exit;
  7920. end;
  7921. end;
  7922. result := packFloat128( aSign, 0, 0, 0 );
  7923. exit;
  7924. end;
  7925. lastBitMask := 1;
  7926. lastBitMask := lastBitMask shl ($402F - aExp);
  7927. roundBitsMask := lastBitMask - 1;
  7928. z.low := 0;
  7929. z.high := a.high;
  7930. roundingMode := softfloat_rounding_mode;
  7931. if ( roundingMode = float_round_nearest_even ) then begin
  7932. inc(z.high,lastBitMask shr 1);
  7933. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7934. z.high := z.high and not(lastBitMask);
  7935. end;
  7936. end
  7937. else if ( roundingMode <> float_round_to_zero ) then begin
  7938. if ( (extractFloat128Sign( z )<>0)
  7939. xor ( roundingMode = float_round_up ) ) then begin
  7940. z.high := z.high or ord( a.low <> 0 );
  7941. z.high := z.high+roundBitsMask;
  7942. end;
  7943. end;
  7944. z.high := z.high and not(roundBitsMask);
  7945. end;
  7946. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7947. set_inexact_flag;
  7948. end;
  7949. result := z;
  7950. end;
  7951. {*----------------------------------------------------------------------------
  7952. | Returns the result of adding the absolute values of the quadruple-precision
  7953. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7954. | before being returned. `zSign' is ignored if the result is a NaN.
  7955. | The addition is performed according to the IEC/IEEE Standard for Binary
  7956. | Floating-Point Arithmetic.
  7957. *----------------------------------------------------------------------------*}
  7958. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7959. var
  7960. aExp, bExp, zExp: int32;
  7961. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7962. expDiff: int32;
  7963. label
  7964. shiftRight1,roundAndPack;
  7965. begin
  7966. aSig1 := extractFloat128Frac1( a );
  7967. aSig0 := extractFloat128Frac0( a );
  7968. aExp := extractFloat128Exp( a );
  7969. bSig1 := extractFloat128Frac1( b );
  7970. bSig0 := extractFloat128Frac0( b );
  7971. bExp := extractFloat128Exp( b );
  7972. expDiff := aExp - bExp;
  7973. if ( 0 < expDiff ) then begin
  7974. if ( aExp = $7FFF ) then begin
  7975. if ( aSig0 or aSig1 )<>0 then
  7976. begin
  7977. result := propagateFloat128NaN( a, b );
  7978. exit;
  7979. end;
  7980. result := a;
  7981. exit;
  7982. end;
  7983. if ( bExp = 0 ) then begin
  7984. dec(expDiff);
  7985. end
  7986. else begin
  7987. bSig0 := bSig0 or int64( $0001000000000000 );
  7988. end;
  7989. shift128ExtraRightJamming(
  7990. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7991. zExp := aExp;
  7992. end
  7993. else if ( expDiff < 0 ) then begin
  7994. if ( bExp = $7FFF ) then begin
  7995. if ( bSig0 or bSig1 )<>0 then
  7996. begin
  7997. result := propagateFloat128NaN( a, b );
  7998. exit;
  7999. end;
  8000. result := packFloat128( zSign, $7FFF, 0, 0 );
  8001. exit;
  8002. end;
  8003. if ( aExp = 0 ) then begin
  8004. inc(expDiff);
  8005. end
  8006. else begin
  8007. aSig0 := aSig0 or int64( $0001000000000000 );
  8008. end;
  8009. shift128ExtraRightJamming(
  8010. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8011. zExp := bExp;
  8012. end
  8013. else begin
  8014. if ( aExp = $7FFF ) then begin
  8015. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8016. result := propagateFloat128NaN( a, b );
  8017. exit;
  8018. end;
  8019. result := a;
  8020. exit;
  8021. end;
  8022. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8023. if ( aExp = 0 ) then
  8024. begin
  8025. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8026. exit;
  8027. end;
  8028. zSig2 := 0;
  8029. zSig0 := zSig0 or int64( $0002000000000000 );
  8030. zExp := aExp;
  8031. goto shiftRight1;
  8032. end;
  8033. aSig0 := aSig0 or int64( $0001000000000000 );
  8034. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8035. dec(zExp);
  8036. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8037. inc(zExp);
  8038. shiftRight1:
  8039. shift128ExtraRightJamming(
  8040. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8041. roundAndPack:
  8042. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8043. end;
  8044. {*----------------------------------------------------------------------------
  8045. | Returns the result of subtracting the absolute values of the quadruple-
  8046. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8047. | difference is negated before being returned. `zSign' is ignored if the
  8048. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8049. | Standard for Binary Floating-Point Arithmetic.
  8050. *----------------------------------------------------------------------------*}
  8051. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8052. var
  8053. aExp, bExp, zExp: int32;
  8054. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8055. expDiff: int32;
  8056. z: float128;
  8057. label
  8058. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8059. begin
  8060. aSig1 := extractFloat128Frac1( a );
  8061. aSig0 := extractFloat128Frac0( a );
  8062. aExp := extractFloat128Exp( a );
  8063. bSig1 := extractFloat128Frac1( b );
  8064. bSig0 := extractFloat128Frac0( b );
  8065. bExp := extractFloat128Exp( b );
  8066. expDiff := aExp - bExp;
  8067. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8068. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8069. if ( 0 < expDiff ) then goto aExpBigger;
  8070. if ( expDiff < 0 ) then goto bExpBigger;
  8071. if ( aExp = $7FFF ) then begin
  8072. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8073. result := propagateFloat128NaN( a, b );
  8074. exit;
  8075. end;
  8076. float_raise( float_flag_invalid );
  8077. z.low := float128_default_nan_low;
  8078. z.high := float128_default_nan_high;
  8079. result := z;
  8080. exit;
  8081. end;
  8082. if ( aExp = 0 ) then begin
  8083. aExp := 1;
  8084. bExp := 1;
  8085. end;
  8086. if ( bSig0 < aSig0 ) then goto aBigger;
  8087. if ( aSig0 < bSig0 ) then goto bBigger;
  8088. if ( bSig1 < aSig1 ) then goto aBigger;
  8089. if ( aSig1 < bSig1 ) then goto bBigger;
  8090. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8091. exit;
  8092. bExpBigger:
  8093. if ( bExp = $7FFF ) then begin
  8094. if ( bSig0 or bSig1 )<>0 then
  8095. begin
  8096. result := propagateFloat128NaN( a, b );
  8097. exit;
  8098. end;
  8099. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8100. exit;
  8101. end;
  8102. if ( aExp = 0 ) then begin
  8103. inc(expDiff);
  8104. end
  8105. else begin
  8106. aSig0 := aSig0 or int64( $4000000000000000 );
  8107. end;
  8108. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8109. bSig0 := bSig0 or int64( $4000000000000000 );
  8110. bBigger:
  8111. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8112. zExp := bExp;
  8113. zSign := zSign xor 1;
  8114. goto normalizeRoundAndPack;
  8115. aExpBigger:
  8116. if ( aExp = $7FFF ) then begin
  8117. if ( aSig0 or aSig1 )<>0 then
  8118. begin
  8119. result := propagateFloat128NaN( a, b );
  8120. exit;
  8121. end;
  8122. result := a;
  8123. exit;
  8124. end;
  8125. if ( bExp = 0 ) then begin
  8126. dec(expDiff);
  8127. end
  8128. else begin
  8129. bSig0 := bSig0 or int64( $4000000000000000 );
  8130. end;
  8131. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8132. aSig0 := aSig0 or int64( $4000000000000000 );
  8133. aBigger:
  8134. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8135. zExp := aExp;
  8136. normalizeRoundAndPack:
  8137. dec(zExp);
  8138. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8139. end;
  8140. {*----------------------------------------------------------------------------
  8141. | Returns the result of adding the quadruple-precision floating-point values
  8142. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8143. | for Binary Floating-Point Arithmetic.
  8144. *----------------------------------------------------------------------------*}
  8145. function float128_add(a: float128; b: float128): float128;
  8146. var
  8147. aSign, bSign: flag;
  8148. begin
  8149. aSign := extractFloat128Sign( a );
  8150. bSign := extractFloat128Sign( b );
  8151. if ( aSign = bSign ) then begin
  8152. result := addFloat128Sigs( a, b, aSign );
  8153. end
  8154. else begin
  8155. result := subFloat128Sigs( a, b, aSign );
  8156. end;
  8157. end;
  8158. {*----------------------------------------------------------------------------
  8159. | Returns the result of subtracting the quadruple-precision floating-point
  8160. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8161. | Standard for Binary Floating-Point Arithmetic.
  8162. *----------------------------------------------------------------------------*}
  8163. function float128_sub(a: float128; b: float128): float128;
  8164. var
  8165. aSign, bSign: flag;
  8166. begin
  8167. aSign := extractFloat128Sign( a );
  8168. bSign := extractFloat128Sign( b );
  8169. if ( aSign = bSign ) then begin
  8170. result := subFloat128Sigs( a, b, aSign );
  8171. end
  8172. else begin
  8173. result := addFloat128Sigs( a, b, aSign );
  8174. end;
  8175. end;
  8176. {*----------------------------------------------------------------------------
  8177. | Returns the result of multiplying the quadruple-precision floating-point
  8178. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8179. | Standard for Binary Floating-Point Arithmetic.
  8180. *----------------------------------------------------------------------------*}
  8181. function float128_mul(a: float128; b: float128): float128;
  8182. var
  8183. aSign, bSign, zSign: flag;
  8184. aExp, bExp, zExp: int32;
  8185. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8186. z: float128;
  8187. label
  8188. invalid;
  8189. begin
  8190. aSig1 := extractFloat128Frac1( a );
  8191. aSig0 := extractFloat128Frac0( a );
  8192. aExp := extractFloat128Exp( a );
  8193. aSign := extractFloat128Sign( a );
  8194. bSig1 := extractFloat128Frac1( b );
  8195. bSig0 := extractFloat128Frac0( b );
  8196. bExp := extractFloat128Exp( b );
  8197. bSign := extractFloat128Sign( b );
  8198. zSign := aSign xor bSign;
  8199. if ( aExp = $7FFF ) then begin
  8200. if ( (( aSig0 or aSig1 )<>0)
  8201. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8202. result := propagateFloat128NaN( a, b );
  8203. exit;
  8204. end;
  8205. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8206. result := packFloat128( zSign, $7FFF, 0, 0 );
  8207. exit;
  8208. end;
  8209. if ( bExp = $7FFF ) then begin
  8210. if ( bSig0 or bSig1 )<>0 then
  8211. begin
  8212. result := propagateFloat128NaN( a, b );
  8213. exit;
  8214. end;
  8215. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8216. invalid:
  8217. float_raise( float_flag_invalid );
  8218. z.low := float128_default_nan_low;
  8219. z.high := float128_default_nan_high;
  8220. result := z;
  8221. exit;
  8222. end;
  8223. result := packFloat128( zSign, $7FFF, 0, 0 );
  8224. exit;
  8225. end;
  8226. if ( aExp = 0 ) then begin
  8227. if ( ( aSig0 or aSig1 ) = 0 ) then
  8228. begin
  8229. result := packFloat128( zSign, 0, 0, 0 );
  8230. exit;
  8231. end;
  8232. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8233. end;
  8234. if ( bExp = 0 ) then begin
  8235. if ( ( bSig0 or bSig1 ) = 0 ) then
  8236. begin
  8237. result := packFloat128( zSign, 0, 0, 0 );
  8238. exit;
  8239. end;
  8240. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8241. end;
  8242. zExp := aExp + bExp - $4000;
  8243. aSig0 := aSig0 or int64( $0001000000000000 );
  8244. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8245. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8246. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8247. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8248. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8249. shift128ExtraRightJamming(
  8250. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8251. inc(zExp);
  8252. end;
  8253. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8254. end;
  8255. {*----------------------------------------------------------------------------
  8256. | Returns the result of dividing the quadruple-precision floating-point value
  8257. | `a' by the corresponding value `b'. The operation is performed according to
  8258. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8259. *----------------------------------------------------------------------------*}
  8260. function float128_div(a: float128; b: float128): float128;
  8261. var
  8262. aSign, bSign, zSign: flag;
  8263. aExp, bExp, zExp: int32;
  8264. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8265. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8266. z: float128;
  8267. label
  8268. invalid;
  8269. begin
  8270. aSig1 := extractFloat128Frac1( a );
  8271. aSig0 := extractFloat128Frac0( a );
  8272. aExp := extractFloat128Exp( a );
  8273. aSign := extractFloat128Sign( a );
  8274. bSig1 := extractFloat128Frac1( b );
  8275. bSig0 := extractFloat128Frac0( b );
  8276. bExp := extractFloat128Exp( b );
  8277. bSign := extractFloat128Sign( b );
  8278. zSign := aSign xor bSign;
  8279. if ( aExp = $7FFF ) then begin
  8280. if ( aSig0 or aSig1 )<>0 then
  8281. begin
  8282. result := propagateFloat128NaN( a, b );
  8283. exit;
  8284. end;
  8285. if ( bExp = $7FFF ) then begin
  8286. if ( bSig0 or bSig1 )<>0 then
  8287. begin
  8288. result := propagateFloat128NaN( a, b );
  8289. exit;
  8290. end;
  8291. goto invalid;
  8292. end;
  8293. result := packFloat128( zSign, $7FFF, 0, 0 );
  8294. exit;
  8295. end;
  8296. if ( bExp = $7FFF ) then begin
  8297. if ( bSig0 or bSig1 )<>0 then
  8298. begin
  8299. result := propagateFloat128NaN( a, b );
  8300. exit;
  8301. end;
  8302. result := packFloat128( zSign, 0, 0, 0 );
  8303. exit;
  8304. end;
  8305. if ( bExp = 0 ) then begin
  8306. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8307. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8308. invalid:
  8309. float_raise( float_flag_invalid );
  8310. z.low := float128_default_nan_low;
  8311. z.high := float128_default_nan_high;
  8312. result := z;
  8313. exit;
  8314. end;
  8315. float_raise( float_flag_divbyzero );
  8316. result := packFloat128( zSign, $7FFF, 0, 0 );
  8317. exit;
  8318. end;
  8319. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8320. end;
  8321. if ( aExp = 0 ) then begin
  8322. if ( ( aSig0 or aSig1 ) = 0 ) then
  8323. begin
  8324. result := packFloat128( zSign, 0, 0, 0 );
  8325. exit;
  8326. end;
  8327. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8328. end;
  8329. zExp := aExp - bExp + $3FFD;
  8330. shortShift128Left(
  8331. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8332. shortShift128Left(
  8333. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8334. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8335. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8336. inc(zExp);
  8337. end;
  8338. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8339. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8340. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8341. while ( sbits64(rem0) < 0 ) do begin
  8342. dec(zSig0);
  8343. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8344. end;
  8345. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8346. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8347. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8348. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8349. while ( sbits64(rem1) < 0 ) do begin
  8350. dec(zSig1);
  8351. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8352. end;
  8353. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8354. end;
  8355. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8356. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8357. end;
  8358. {*----------------------------------------------------------------------------
  8359. | Returns the remainder of the quadruple-precision floating-point value `a'
  8360. | with respect to the corresponding value `b'. The operation is performed
  8361. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8362. *----------------------------------------------------------------------------*}
  8363. function float128_rem(a: float128; b: float128): float128;
  8364. var
  8365. aSign, zSign: flag;
  8366. aExp, bExp, expDiff: int32;
  8367. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8368. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8369. sigMean0: sbits64;
  8370. z: float128;
  8371. label
  8372. invalid;
  8373. begin
  8374. aSig1 := extractFloat128Frac1( a );
  8375. aSig0 := extractFloat128Frac0( a );
  8376. aExp := extractFloat128Exp( a );
  8377. aSign := extractFloat128Sign( a );
  8378. bSig1 := extractFloat128Frac1( b );
  8379. bSig0 := extractFloat128Frac0( b );
  8380. bExp := extractFloat128Exp( b );
  8381. if ( aExp = $7FFF ) then begin
  8382. if ( (( aSig0 or aSig1 )<>0)
  8383. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8384. result := propagateFloat128NaN( a, b );
  8385. exit;
  8386. end;
  8387. goto invalid;
  8388. end;
  8389. if ( bExp = $7FFF ) then begin
  8390. if ( bSig0 or bSig1 )<>0 then
  8391. begin
  8392. result := propagateFloat128NaN( a, b );
  8393. exit;
  8394. end;
  8395. result := a;
  8396. exit;
  8397. end;
  8398. if ( bExp = 0 ) then begin
  8399. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8400. invalid:
  8401. float_raise( float_flag_invalid );
  8402. z.low := float128_default_nan_low;
  8403. z.high := float128_default_nan_high;
  8404. result := z;
  8405. exit;
  8406. end;
  8407. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8408. end;
  8409. if ( aExp = 0 ) then begin
  8410. if ( ( aSig0 or aSig1 ) = 0 ) then
  8411. begin
  8412. result := a;
  8413. exit;
  8414. end;
  8415. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8416. end;
  8417. expDiff := aExp - bExp;
  8418. if ( expDiff < -1 ) then
  8419. begin
  8420. result := a;
  8421. exit;
  8422. end;
  8423. shortShift128Left(
  8424. aSig0 or int64( $0001000000000000 ),
  8425. aSig1,
  8426. 15 - ord( expDiff < 0 ),
  8427. aSig0,
  8428. aSig1
  8429. );
  8430. shortShift128Left(
  8431. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8432. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8433. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8434. dec(expDiff,64);
  8435. while ( 0 < expDiff ) do begin
  8436. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8437. if ( 4 < q ) then
  8438. q := q - 4
  8439. else
  8440. q := 0;
  8441. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8442. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8443. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8444. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8445. dec(expDiff,61);
  8446. end;
  8447. if ( -64 < expDiff ) then begin
  8448. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8449. if ( 4 < q ) then
  8450. q := q - 4
  8451. else
  8452. q := 0;
  8453. q := q shr (- expDiff);
  8454. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8455. inc(expDiff,52);
  8456. if ( expDiff < 0 ) then begin
  8457. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8458. end
  8459. else begin
  8460. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8461. end;
  8462. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8463. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8464. end
  8465. else begin
  8466. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8467. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8468. end;
  8469. repeat
  8470. alternateASig0 := aSig0;
  8471. alternateASig1 := aSig1;
  8472. inc(q);
  8473. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8474. until not( 0 <= sbits64(aSig0) );
  8475. add128(
  8476. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8477. if ( ( sigMean0 < 0 )
  8478. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8479. aSig0 := alternateASig0;
  8480. aSig1 := alternateASig1;
  8481. end;
  8482. zSign := ord( sbits64(aSig0) < 0 );
  8483. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8484. result :=
  8485. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8486. end;
  8487. {*----------------------------------------------------------------------------
  8488. | Returns the square root of the quadruple-precision floating-point value `a'.
  8489. | The operation is performed according to the IEC/IEEE Standard for Binary
  8490. | Floating-Point Arithmetic.
  8491. *----------------------------------------------------------------------------*}
  8492. function float128_sqrt(a: float128): float128;
  8493. var
  8494. aSign: flag;
  8495. aExp, zExp: int32;
  8496. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8497. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8498. z: float128;
  8499. label
  8500. invalid;
  8501. begin
  8502. aSig1 := extractFloat128Frac1( a );
  8503. aSig0 := extractFloat128Frac0( a );
  8504. aExp := extractFloat128Exp( a );
  8505. aSign := extractFloat128Sign( a );
  8506. if ( aExp = $7FFF ) then begin
  8507. if ( aSig0 or aSig1 )<>0 then
  8508. begin
  8509. result := propagateFloat128NaN( a, a );
  8510. exit;
  8511. end;
  8512. if ( aSign=0 ) then
  8513. begin
  8514. result := a;
  8515. exit;
  8516. end;
  8517. goto invalid;
  8518. end;
  8519. if ( aSign<>0 ) then begin
  8520. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8521. begin
  8522. result := a;
  8523. exit;
  8524. end;
  8525. invalid:
  8526. float_raise( float_flag_invalid );
  8527. z.low := float128_default_nan_low;
  8528. z.high := float128_default_nan_high;
  8529. result := z;
  8530. exit;
  8531. end;
  8532. if ( aExp = 0 ) then begin
  8533. if ( ( aSig0 or aSig1 ) = 0 ) then
  8534. begin
  8535. result := packFloat128( 0, 0, 0, 0 );
  8536. exit;
  8537. end;
  8538. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8539. end;
  8540. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  8541. aSig0 := aSig0 or int64( $0001000000000000 );
  8542. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  8543. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8544. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8545. doubleZSig0 := zSig0 shl 1;
  8546. mul64To128( zSig0, zSig0, term0, term1 );
  8547. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8548. while ( sbits64(rem0) < 0 ) do begin
  8549. dec(zSig0);
  8550. dec(doubleZSig0,2);
  8551. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8552. end;
  8553. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8554. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8555. if ( zSig1 = 0 ) then zSig1 := 1;
  8556. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8557. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8558. mul64To128( zSig1, zSig1, term2, term3 );
  8559. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8560. while ( sbits64(rem1) < 0 ) do begin
  8561. dec(zSig1);
  8562. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8563. term3 := term3 or 1;
  8564. term2 := term2 or doubleZSig0;
  8565. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8566. end;
  8567. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8568. end;
  8569. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8570. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8571. end;
  8572. {*----------------------------------------------------------------------------
  8573. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8574. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8575. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8576. *----------------------------------------------------------------------------*}
  8577. function float128_eq(a: float128; b: float128): flag;
  8578. begin
  8579. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8580. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8581. or ( ( extractFloat128Exp( b ) = $7FFF )
  8582. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8583. ) then begin
  8584. if ( (float128_is_signaling_nan( a )<>0)
  8585. or (float128_is_signaling_nan( b )<>0) ) then begin
  8586. float_raise( float_flag_invalid );
  8587. end;
  8588. result := 0;
  8589. exit;
  8590. end;
  8591. result := ord(
  8592. ( a.low = b.low )
  8593. and ( ( a.high = b.high )
  8594. or ( ( a.low = 0 )
  8595. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8596. ));
  8597. end;
  8598. {*----------------------------------------------------------------------------
  8599. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8600. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8601. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8602. | Arithmetic.
  8603. *----------------------------------------------------------------------------*}
  8604. function float128_le(a: float128; b: float128): flag;
  8605. var
  8606. aSign, bSign: flag;
  8607. begin
  8608. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8609. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8610. or ( ( extractFloat128Exp( b ) = $7FFF )
  8611. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8612. ) then begin
  8613. float_raise( float_flag_invalid );
  8614. result := 0;
  8615. exit;
  8616. end;
  8617. aSign := extractFloat128Sign( a );
  8618. bSign := extractFloat128Sign( b );
  8619. if ( aSign <> bSign ) then begin
  8620. result := ord(
  8621. (aSign<>0)
  8622. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8623. = 0 ));
  8624. exit;
  8625. end;
  8626. if aSign<>0 then
  8627. result := le128( b.high, b.low, a.high, a.low )
  8628. else
  8629. result := le128( a.high, a.low, b.high, b.low );
  8630. end;
  8631. {*----------------------------------------------------------------------------
  8632. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8633. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8634. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8635. *----------------------------------------------------------------------------*}
  8636. function float128_lt(a: float128; b: float128): flag;
  8637. var
  8638. aSign, bSign: flag;
  8639. begin
  8640. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8641. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8642. or ( ( extractFloat128Exp( b ) = $7FFF )
  8643. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8644. ) then begin
  8645. float_raise( float_flag_invalid );
  8646. result := 0;
  8647. exit;
  8648. end;
  8649. aSign := extractFloat128Sign( a );
  8650. bSign := extractFloat128Sign( b );
  8651. if ( aSign <> bSign ) then begin
  8652. result := ord(
  8653. (aSign<>0)
  8654. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8655. <> 0 ));
  8656. exit;
  8657. end;
  8658. if aSign<>0 then
  8659. result := lt128( b.high, b.low, a.high, a.low )
  8660. else
  8661. result := lt128( a.high, a.low, b.high, b.low );
  8662. end;
  8663. {*----------------------------------------------------------------------------
  8664. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8665. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8666. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8667. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8668. *----------------------------------------------------------------------------*}
  8669. function float128_eq_signaling(a: float128; b: float128): flag;
  8670. begin
  8671. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8672. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8673. or ( ( extractFloat128Exp( b ) = $7FFF )
  8674. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8675. ) then begin
  8676. float_raise( float_flag_invalid );
  8677. result := 0;
  8678. exit;
  8679. end;
  8680. result := ord(
  8681. ( a.low = b.low )
  8682. and ( ( a.high = b.high )
  8683. or ( ( a.low = 0 )
  8684. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8685. ));
  8686. end;
  8687. {*----------------------------------------------------------------------------
  8688. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8689. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8690. | cause an exception. Otherwise, the comparison is performed according to the
  8691. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8692. *----------------------------------------------------------------------------*}
  8693. function float128_le_quiet(a: float128; b: float128): flag;
  8694. var
  8695. aSign, bSign: flag;
  8696. begin
  8697. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8698. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8699. or ( ( extractFloat128Exp( b ) = $7FFF )
  8700. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8701. ) then begin
  8702. if ( (float128_is_signaling_nan( a )<>0)
  8703. or (float128_is_signaling_nan( b )<>0) ) then begin
  8704. float_raise( float_flag_invalid );
  8705. end;
  8706. result := 0;
  8707. exit;
  8708. end;
  8709. aSign := extractFloat128Sign( a );
  8710. bSign := extractFloat128Sign( b );
  8711. if ( aSign <> bSign ) then begin
  8712. result := ord(
  8713. (aSign<>0)
  8714. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8715. = 0 ));
  8716. exit;
  8717. end;
  8718. if aSign<>0 then
  8719. result := le128( b.high, b.low, a.high, a.low )
  8720. else
  8721. result := le128( a.high, a.low, b.high, b.low );
  8722. end;
  8723. {*----------------------------------------------------------------------------
  8724. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8725. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8726. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8727. | Standard for Binary Floating-Point Arithmetic.
  8728. *----------------------------------------------------------------------------*}
  8729. function float128_lt_quiet(a: float128; b: float128): flag;
  8730. var
  8731. aSign, bSign: flag;
  8732. begin
  8733. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8734. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8735. or ( ( extractFloat128Exp( b ) = $7FFF )
  8736. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8737. ) then begin
  8738. if ( (float128_is_signaling_nan( a )<>0)
  8739. or (float128_is_signaling_nan( b )<>0) ) then begin
  8740. float_raise( float_flag_invalid );
  8741. end;
  8742. result := 0;
  8743. exit;
  8744. end;
  8745. aSign := extractFloat128Sign( a );
  8746. bSign := extractFloat128Sign( b );
  8747. if ( aSign <> bSign ) then begin
  8748. result := ord(
  8749. (aSign<>0)
  8750. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8751. <> 0 ));
  8752. exit;
  8753. end;
  8754. if aSign<>0 then
  8755. result:=lt128( b.high, b.low, a.high, a.low )
  8756. else
  8757. result:=lt128( a.high, a.low, b.high, b.low );
  8758. end;
  8759. {----------------------------------------------------------------------------
  8760. | Returns the result of converting the double-precision floating-point value
  8761. | `a' to the quadruple-precision floating-point format. The conversion is
  8762. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8763. | Arithmetic.
  8764. *----------------------------------------------------------------------------}
  8765. function float64_to_float128( a : float64) : float128;
  8766. var
  8767. aSign : flag;
  8768. aExp : int16;
  8769. aSig, zSig0, zSig1 : bits64;
  8770. begin
  8771. aSig := extractFloat64Frac( a );
  8772. aExp := extractFloat64Exp( a );
  8773. aSign := extractFloat64Sign( a );
  8774. if ( aExp = $7FF ) then begin
  8775. if ( aSig<>0 ) then begin
  8776. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8777. exit;
  8778. end;
  8779. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8780. exit;
  8781. end;
  8782. if ( aExp = 0 ) then begin
  8783. if ( aSig = 0 ) then
  8784. begin
  8785. result:=packFloat128( aSign, 0, 0, 0 );
  8786. exit;
  8787. end;
  8788. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8789. dec(aExp);
  8790. end;
  8791. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8792. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8793. end;
  8794. {$endif FPC_SOFTFLOAT_FLOAT128}
  8795. {$endif not(defined(fpc_softfpu_interface))}
  8796. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8797. end.
  8798. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}