softfpu.pp 325 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959596059615962596359645965596659675968596959705971597259735974597559765977597859795980598159825983598459855986598759885989599059915992599359945995599659975998599960006001600260036004600560066007600860096010601160126013601460156016601760186019602060216022602360246025602660276028602960306031603260336034603560366037603860396040604160426043604460456046604760486049605060516052605360546055605660576058605960606061606260636064606560666067606860696070607160726073607460756076607760786079608060816082608360846085608660876088608960906091609260936094609560966097609860996100610161026103610461056106610761086109611061116112611361146115611661176118611961206121612261236124612561266127612861296130613161326133613461356136613761386139614061416142614361446145614661476148614961506151615261536154615561566157615861596160616161626163616461656166616761686169617061716172617361746175617661776178617961806181618261836184618561866187618861896190619161926193619461956196619761986199620062016202620362046205620662076208620962106211621262136214621562166217621862196220622162226223622462256226622762286229623062316232623362346235623662376238623962406241624262436244624562466247624862496250625162526253625462556256625762586259626062616262626362646265626662676268626962706271627262736274627562766277627862796280628162826283628462856286628762886289629062916292629362946295629662976298629963006301630263036304630563066307630863096310631163126313631463156316631763186319632063216322632363246325632663276328632963306331633263336334633563366337633863396340634163426343634463456346634763486349635063516352635363546355635663576358635963606361636263636364636563666367636863696370637163726373637463756376637763786379638063816382638363846385638663876388638963906391639263936394639563966397639863996400640164026403640464056406640764086409641064116412641364146415641664176418641964206421642264236424642564266427642864296430643164326433643464356436643764386439644064416442644364446445644664476448644964506451645264536454645564566457645864596460646164626463646464656466646764686469647064716472647364746475647664776478647964806481648264836484648564866487648864896490649164926493649464956496649764986499650065016502650365046505650665076508650965106511651265136514651565166517651865196520652165226523652465256526652765286529653065316532653365346535653665376538653965406541654265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676667766786679668066816682668366846685668666876688668966906691669266936694669566966697669866996700670167026703670467056706670767086709671067116712671367146715671667176718671967206721672267236724672567266727672867296730673167326733673467356736673767386739674067416742674367446745674667476748674967506751675267536754675567566757675867596760676167626763676467656766676767686769677067716772677367746775677667776778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727772787279728072817282728372847285728672877288728972907291729272937294729572967297729872997300730173027303730473057306730773087309731073117312731373147315731673177318731973207321732273237324732573267327732873297330733173327333733473357336733773387339734073417342734373447345734673477348734973507351735273537354735573567357735873597360736173627363736473657366736773687369737073717372737373747375737673777378737973807381738273837384738573867387738873897390739173927393739473957396739773987399740074017402740374047405740674077408740974107411741274137414741574167417741874197420742174227423742474257426742774287429743074317432743374347435743674377438743974407441744274437444744574467447744874497450745174527453745474557456745774587459746074617462746374647465746674677468746974707471747274737474747574767477747874797480748174827483748474857486748774887489749074917492749374947495749674977498749975007501750275037504750575067507750875097510751175127513751475157516751775187519752075217522752375247525752675277528752975307531753275337534753575367537753875397540754175427543754475457546754775487549755075517552755375547555755675577558755975607561756275637564756575667567756875697570757175727573757475757576757775787579758075817582758375847585758675877588758975907591759275937594759575967597759875997600760176027603760476057606760776087609761076117612761376147615761676177618761976207621762276237624762576267627762876297630763176327633763476357636763776387639764076417642764376447645764676477648764976507651765276537654765576567657765876597660766176627663766476657666766776687669767076717672767376747675767676777678767976807681768276837684768576867687768876897690769176927693769476957696769776987699770077017702770377047705770677077708770977107711771277137714771577167717771877197720772177227723772477257726772777287729773077317732773377347735773677377738773977407741774277437744774577467747774877497750775177527753775477557756775777587759776077617762776377647765776677677768776977707771777277737774777577767777777877797780778177827783778477857786778777887789779077917792779377947795779677977798779978007801780278037804780578067807780878097810781178127813781478157816781778187819782078217822782378247825782678277828782978307831783278337834783578367837783878397840784178427843784478457846784778487849785078517852785378547855785678577858785978607861786278637864786578667867786878697870787178727873787478757876787778787879788078817882788378847885788678877888788978907891789278937894789578967897789878997900790179027903790479057906790779087909791079117912791379147915791679177918791979207921792279237924792579267927792879297930793179327933793479357936793779387939794079417942794379447945794679477948794979507951795279537954795579567957795879597960796179627963796479657966796779687969797079717972797379747975797679777978797979807981798279837984798579867987798879897990799179927993799479957996799779987999800080018002800380048005800680078008800980108011801280138014801580168017801880198020802180228023802480258026802780288029803080318032803380348035803680378038803980408041804280438044804580468047804880498050805180528053805480558056805780588059806080618062806380648065806680678068806980708071807280738074807580768077807880798080808180828083808480858086808780888089809080918092809380948095809680978098809981008101810281038104810581068107810881098110811181128113811481158116811781188119812081218122812381248125812681278128812981308131813281338134813581368137813881398140814181428143814481458146814781488149815081518152815381548155815681578158815981608161816281638164816581668167816881698170817181728173817481758176817781788179818081818182818381848185818681878188818981908191819281938194819581968197819881998200820182028203820482058206820782088209821082118212821382148215821682178218821982208221822282238224822582268227822882298230823182328233823482358236823782388239824082418242824382448245824682478248824982508251825282538254825582568257825882598260826182628263826482658266826782688269827082718272827382748275827682778278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877787788779878087818782878387848785878687878788878987908791879287938794879587968797879887998800880188028803880488058806880788088809881088118812881388148815881688178818881988208821882288238824882588268827882888298830883188328833883488358836883788388839884088418842884388448845884688478848884988508851885288538854885588568857885888598860886188628863886488658866886788688869887088718872887388748875887688778878887988808881888288838884888588868887888888898890889188928893889488958896889788988899890089018902890389048905890689078908890989108911891289138914891589168917891889198920892189228923892489258926892789288929893089318932893389348935893689378938893989408941894289438944894589468947894889498950895189528953895489558956895789588959896089618962896389648965896689678968896989708971897289738974897589768977897889798980898189828983898489858986898789888989899089918992899389948995899689978998899990009001900290039004900590069007900890099010901190129013901490159016901790189019902090219022902390249025902690279028902990309031903290339034903590369037903890399040904190429043904490459046904790489049905090519052905390549055905690579058905990609061906290639064906590669067906890699070907190729073907490759076907790789079908090819082908390849085908690879088908990909091909290939094909590969097909890999100910191029103910491059106910791089109911091119112911391149115911691179118911991209121912291239124912591269127912891299130913191329133913491359136913791389139914091419142914391449145914691479148914991509151915291539154915591569157915891599160916191629163916491659166916791689169917091719172917391749175917691779178917991809181918291839184918591869187918891899190919191929193919491959196919791989199920092019202920392049205920692079208920992109211921292139214921592169217921892199220922192229223922492259226922792289229923092319232923392349235923692379238923992409241924292439244924592469247924892499250925192529253925492559256925792589259926092619262926392649265926692679268926992709271927292739274927592769277927892799280928192829283928492859286928792889289929092919292929392949295929692979298929993009301930293039304930593069307930893099310931193129313931493159316931793189319932093219322932393249325932693279328932993309331933293339334933593369337933893399340934193429343934493459346934793489349935093519352935393549355935693579358935993609361936293639364936593669367936893699370937193729373937493759376
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. {$ifndef FPC_SYSTEM_HAS_float32}
  80. float32 = longword;
  81. {$define FPC_SYSTEM_HAS_float32}
  82. {$endif ndef FPC_SYSTEM_HAS_float32}
  83. { we use here a record in the function header because
  84. the record allows bitwise conversion to single }
  85. float32rec = record
  86. float32 : float32;
  87. end;
  88. flag = byte;
  89. bits8 = byte;
  90. sbits8 = shortint;
  91. bits16 = word;
  92. sbits16 = smallint;
  93. sbits32 = longint;
  94. bits32 = longword;
  95. {$ifndef fpc}
  96. qword = int64;
  97. {$endif}
  98. { now part of the system unit
  99. uint64 = qword;
  100. }
  101. bits64 = qword;
  102. sbits64 = int64;
  103. {$ifdef ENDIAN_LITTLE}
  104. {$ifndef FPC_SYSTEM_HAS_float64}
  105. float64 = record
  106. case byte of
  107. // force the record to be aligned like a double
  108. // else *_to_double will fail for cpus like sparc
  109. // and avoid expensive unpacking/packing operations
  110. 1: (dummy : double);
  111. 2: (low,high : bits32);
  112. end;
  113. {$endif ndef FPC_SYSTEM_HAS_float64}
  114. floatx80 = record
  115. case byte of
  116. // force the record to be aligned like a double
  117. // else *_to_double will fail for cpus like sparc
  118. // and avoid expensive unpacking/packing operations
  119. 1: (dummy : extended);
  120. 2: (low : qword;high : word);
  121. end;
  122. float128 = record
  123. case byte of
  124. // force the record to be aligned like a double
  125. // else *_to_double will fail for cpus like sparc
  126. // and avoid expensive unpacking/packing operations
  127. 1: (dummy : qword);
  128. 2: (low,high : qword);
  129. end;
  130. {$else}
  131. {$ifndef FPC_SYSTEM_HAS_float64}
  132. float64 = record
  133. case byte of
  134. // force the record to be aligned like a double
  135. // else *_to_double will fail for cpus like sparc
  136. 1: (dummy : double);
  137. 2: (high,low : bits32);
  138. end;
  139. {$endif ndef FPC_SYSTEM_HAS_float64}
  140. floatx80 = record
  141. case byte of
  142. // force the record to be aligned like a double
  143. // else *_to_double will fail for cpus like sparc
  144. // and avoid expensive unpacking/packing operations
  145. 1: (dummy : qword);
  146. 2: (high : word;low : qword);
  147. end;
  148. float128 = record
  149. case byte of
  150. // force the record to be aligned like a double
  151. // else *_to_double will fail for cpus like sparc
  152. // and avoid expensive unpacking/packing operations
  153. 1: (dummy : qword);
  154. 2: (high : qword;low : qword);
  155. end;
  156. {$endif}
  157. {$define FPC_SYSTEM_HAS_float64}
  158. {*
  159. -------------------------------------------------------------------------------
  160. Returns 1 if the double-precision floating-point value `a' is less than
  161. the corresponding value `b', and 0 otherwise. The comparison is performed
  162. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  163. -------------------------------------------------------------------------------
  164. *}
  165. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  166. {*
  167. -------------------------------------------------------------------------------
  168. Returns 1 if the double-precision floating-point value `a' is less than
  169. or equal to the corresponding value `b', and 0 otherwise. The comparison
  170. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  171. Arithmetic.
  172. -------------------------------------------------------------------------------
  173. *}
  174. Function float64_le(a: float64;b: float64): flag; compilerproc;
  175. {*
  176. -------------------------------------------------------------------------------
  177. Returns 1 if the double-precision floating-point value `a' is equal to
  178. the corresponding value `b', and 0 otherwise. The comparison is performed
  179. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  180. -------------------------------------------------------------------------------
  181. *}
  182. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  183. {*
  184. -------------------------------------------------------------------------------
  185. Returns the square root of the double-precision floating-point value `a'.
  186. The operation is performed according to the IEC/IEEE Standard for Binary
  187. Floating-Point Arithmetic.
  188. -------------------------------------------------------------------------------
  189. *}
  190. function float64_sqrt( a: float64 ): float64; compilerproc;
  191. {*
  192. -------------------------------------------------------------------------------
  193. Returns the remainder of the double-precision floating-point value `a'
  194. with respect to the corresponding value `b'. The operation is performed
  195. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  196. -------------------------------------------------------------------------------
  197. *}
  198. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  199. {*
  200. -------------------------------------------------------------------------------
  201. Returns the result of dividing the double-precision floating-point value `a'
  202. by the corresponding value `b'. The operation is performed according to the
  203. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  204. -------------------------------------------------------------------------------
  205. *}
  206. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  207. {*
  208. -------------------------------------------------------------------------------
  209. Returns the result of multiplying the double-precision floating-point values
  210. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  211. for Binary Floating-Point Arithmetic.
  212. -------------------------------------------------------------------------------
  213. *}
  214. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  215. {*
  216. -------------------------------------------------------------------------------
  217. Returns the result of subtracting the double-precision floating-point values
  218. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  219. for Binary Floating-Point Arithmetic.
  220. -------------------------------------------------------------------------------
  221. *}
  222. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  223. {*
  224. -------------------------------------------------------------------------------
  225. Returns the result of adding the double-precision floating-point values `a'
  226. and `b'. The operation is performed according to the IEC/IEEE Standard for
  227. Binary Floating-Point Arithmetic.
  228. -------------------------------------------------------------------------------
  229. *}
  230. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  231. {*
  232. -------------------------------------------------------------------------------
  233. Rounds the double-precision floating-point value `a' to an integer,
  234. and returns the result as a double-precision floating-point value. The
  235. operation is performed according to the IEC/IEEE Standard for Binary
  236. Floating-Point Arithmetic.
  237. -------------------------------------------------------------------------------
  238. *}
  239. Function float64_round_to_int(a: float64) : float64; compilerproc;
  240. {*
  241. -------------------------------------------------------------------------------
  242. Returns the result of converting the double-precision floating-point value
  243. `a' to the single-precision floating-point format. The conversion is
  244. performed according to the IEC/IEEE Standard for Binary Floating-Point
  245. Arithmetic.
  246. -------------------------------------------------------------------------------
  247. *}
  248. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  249. {*
  250. -------------------------------------------------------------------------------
  251. Returns the result of converting the double-precision floating-point value
  252. `a' to the 32-bit two's complement integer format. The conversion is
  253. performed according to the IEC/IEEE Standard for Binary Floating-Point
  254. Arithmetic, except that the conversion is always rounded toward zero.
  255. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  256. the conversion overflows, the largest integer with the same sign as `a' is
  257. returned.
  258. -------------------------------------------------------------------------------
  259. *}
  260. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  261. {*
  262. -------------------------------------------------------------------------------
  263. Returns the result of converting the double-precision floating-point value
  264. `a' to the 32-bit two's complement integer format. The conversion is
  265. performed according to the IEC/IEEE Standard for Binary Floating-Point
  266. Arithmetic---which means in particular that the conversion is rounded
  267. according to the current rounding mode. If `a' is a NaN, the largest
  268. positive integer is returned. Otherwise, if the conversion overflows, the
  269. largest integer with the same sign as `a' is returned.
  270. -------------------------------------------------------------------------------
  271. *}
  272. Function float64_to_int32(a: float64): int32; compilerproc;
  273. {*
  274. -------------------------------------------------------------------------------
  275. Returns 1 if the single-precision floating-point value `a' is less than
  276. the corresponding value `b', and 0 otherwise. The comparison is performed
  277. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  278. -------------------------------------------------------------------------------
  279. *}
  280. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  281. {*
  282. -------------------------------------------------------------------------------
  283. Returns 1 if the single-precision floating-point value `a' is less than
  284. or equal to the corresponding value `b', and 0 otherwise. The comparison
  285. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  286. Arithmetic.
  287. -------------------------------------------------------------------------------
  288. *}
  289. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  290. {*
  291. -------------------------------------------------------------------------------
  292. Returns 1 if the single-precision floating-point value `a' is equal to
  293. the corresponding value `b', and 0 otherwise. The comparison is performed
  294. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  295. -------------------------------------------------------------------------------
  296. *}
  297. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  298. {*
  299. -------------------------------------------------------------------------------
  300. Returns the square root of the single-precision floating-point value `a'.
  301. The operation is performed according to the IEC/IEEE Standard for Binary
  302. Floating-Point Arithmetic.
  303. -------------------------------------------------------------------------------
  304. *}
  305. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  306. {*
  307. -------------------------------------------------------------------------------
  308. Returns the remainder of the single-precision floating-point value `a'
  309. with respect to the corresponding value `b'. The operation is performed
  310. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  311. -------------------------------------------------------------------------------
  312. *}
  313. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  314. {*
  315. -------------------------------------------------------------------------------
  316. Returns the result of dividing the single-precision floating-point value `a'
  317. by the corresponding value `b'. The operation is performed according to the
  318. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  319. -------------------------------------------------------------------------------
  320. *}
  321. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  322. {*
  323. -------------------------------------------------------------------------------
  324. Returns the result of multiplying the single-precision floating-point values
  325. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  326. for Binary Floating-Point Arithmetic.
  327. -------------------------------------------------------------------------------
  328. *}
  329. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  330. {*
  331. -------------------------------------------------------------------------------
  332. Returns the result of subtracting the single-precision floating-point values
  333. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  334. for Binary Floating-Point Arithmetic.
  335. -------------------------------------------------------------------------------
  336. *}
  337. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  338. {*
  339. -------------------------------------------------------------------------------
  340. Returns the result of adding the single-precision floating-point values `a'
  341. and `b'. The operation is performed according to the IEC/IEEE Standard for
  342. Binary Floating-Point Arithmetic.
  343. -------------------------------------------------------------------------------
  344. *}
  345. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  346. {*
  347. -------------------------------------------------------------------------------
  348. Rounds the single-precision floating-point value `a' to an integer,
  349. and returns the result as a single-precision floating-point value. The
  350. operation is performed according to the IEC/IEEE Standard for Binary
  351. Floating-Point Arithmetic.
  352. -------------------------------------------------------------------------------
  353. *}
  354. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  355. {*
  356. -------------------------------------------------------------------------------
  357. Returns the result of converting the single-precision floating-point value
  358. `a' to the double-precision floating-point format. The conversion is
  359. performed according to the IEC/IEEE Standard for Binary Floating-Point
  360. Arithmetic.
  361. -------------------------------------------------------------------------------
  362. *}
  363. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  364. {*
  365. -------------------------------------------------------------------------------
  366. Returns the result of converting the single-precision floating-point value
  367. `a' to the 32-bit two's complement integer format. The conversion is
  368. performed according to the IEC/IEEE Standard for Binary Floating-Point
  369. Arithmetic, except that the conversion is always rounded toward zero.
  370. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  371. the conversion overflows, the largest integer with the same sign as `a' is
  372. returned.
  373. -------------------------------------------------------------------------------
  374. *}
  375. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  376. {*
  377. -------------------------------------------------------------------------------
  378. Returns the result of converting the single-precision floating-point value
  379. `a' to the 32-bit two's complement integer format. The conversion is
  380. performed according to the IEC/IEEE Standard for Binary Floating-Point
  381. Arithmetic---which means in particular that the conversion is rounded
  382. according to the current rounding mode. If `a' is a NaN, the largest
  383. positive integer is returned. Otherwise, if the conversion overflows, the
  384. largest integer with the same sign as `a' is returned.
  385. -------------------------------------------------------------------------------
  386. *}
  387. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  388. {*
  389. -------------------------------------------------------------------------------
  390. Returns the result of converting the 32-bit two's complement integer `a' to
  391. the double-precision floating-point format. The conversion is performed
  392. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. -------------------------------------------------------------------------------
  394. *}
  395. Function int32_to_float64( a: int32) : float64; compilerproc;
  396. {*
  397. -------------------------------------------------------------------------------
  398. Returns the result of converting the 32-bit two's complement integer `a' to
  399. the single-precision floating-point format. The conversion is performed
  400. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  401. -------------------------------------------------------------------------------
  402. *}
  403. Function int32_to_float32( a: int32): float32rec; compilerproc;
  404. {*----------------------------------------------------------------------------
  405. | Returns the result of converting the 64-bit two's complement integer `a'
  406. | to the double-precision floating-point format. The conversion is performed
  407. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  408. *----------------------------------------------------------------------------*}
  409. Function int64_to_float64( a: int64 ): float64; compilerproc;
  410. Function qword_to_float64( a: qword ): float64; compilerproc;
  411. {*----------------------------------------------------------------------------
  412. | Returns the result of converting the 64-bit two's complement integer `a'
  413. | to the single-precision floating-point format. The conversion is performed
  414. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  415. *----------------------------------------------------------------------------*}
  416. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  417. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  418. // +++
  419. function float32_to_int64( a: float32 ): int64;
  420. function float32_to_int64_round_to_zero( a: float32 ): int64;
  421. function float32_eq_signaling( a: float32; b: float32) : flag;
  422. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  423. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  424. function float32_is_signaling_nan( a : float32 ): flag;
  425. function float32_is_nan( a : float32 ): flag;
  426. function float64_to_int64( a: float64 ): int64;
  427. function float64_to_int64_round_to_zero( a: float64 ): int64;
  428. function float64_eq_signaling( a: float64; b: float64): flag;
  429. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  430. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  431. function float64_is_signaling_nan( a : float64 ): flag;
  432. function float64_is_nan( a : float64 ): flag;
  433. // ===
  434. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  435. {*----------------------------------------------------------------------------
  436. | Extended double-precision rounding precision
  437. *----------------------------------------------------------------------------*}
  438. var // threadvar!?
  439. floatx80_rounding_precision : int8 = 80;
  440. function int32_to_floatx80( a: int32 ): floatx80;
  441. function int64_to_floatx80( a: int64 ): floatx80;
  442. function qword_to_floatx80( a: qword ): floatx80;
  443. function float32_to_floatx80( a: float32 ): floatx80;
  444. function float64_to_floatx80( a: float64 ): floatx80;
  445. function floatx80_to_int32( a: floatx80 ): int32;
  446. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  447. function floatx80_to_int64( a: floatx80 ): int64;
  448. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  449. function floatx80_to_float32( a: floatx80 ): float32;
  450. function floatx80_to_float64( a: floatx80 ): float64;
  451. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  452. function floatx80_to_float128( a: floatx80 ): float128;
  453. {$endif FPC_SOFTFLOAT_FLOAT128}
  454. function floatx80_round_to_int( a: floatx80 ): floatx80;
  455. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  456. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  457. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  458. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  459. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  460. function floatx80_sqrt( a: floatx80 ): floatx80;
  461. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  462. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  463. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  464. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  465. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  466. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  467. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  468. function floatx80_is_nan(a : floatx80 ): flag;
  469. {$endif FPC_SOFTFLOAT_FLOATX80}
  470. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  471. function int32_to_float128( a: int32 ): float128;
  472. function int64_to_float128( a: int64 ): float128;
  473. function qword_to_float128( a: qword ): float128;
  474. function float32_to_float128( a: float32 ): float128;
  475. function float128_is_nan( a : float128): flag;
  476. function float128_is_signaling_nan( a : float128): flag;
  477. function float128_to_int32(a: float128): int32;
  478. function float128_to_int32_round_to_zero(a: float128): int32;
  479. function float128_to_int64(a: float128): int64;
  480. function float128_to_int64_round_to_zero(a: float128): int64;
  481. function float128_to_float32(a: float128): float32;
  482. function float128_to_float64(a: float128): float64;
  483. function float64_to_float128( a : float64) : float128;
  484. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  485. function float128_to_floatx80(a: float128): floatx80;
  486. {$endif FPC_SOFTFLOAT_FLOATX80}
  487. function float128_round_to_int(a: float128): float128;
  488. function float128_add(a: float128; b: float128): float128;
  489. function float128_sub(a: float128; b: float128): float128;
  490. function float128_mul(a: float128; b: float128): float128;
  491. function float128_div(a: float128; b: float128): float128;
  492. function float128_rem(a: float128; b: float128): float128;
  493. function float128_sqrt(a: float128): float128;
  494. function float128_eq(a: float128; b: float128): flag;
  495. function float128_le(a: float128; b: float128): flag;
  496. function float128_lt(a: float128; b: float128): flag;
  497. function float128_eq_signaling(a: float128; b: float128): flag;
  498. function float128_le_quiet(a: float128; b: float128): flag;
  499. function float128_lt_quiet(a: float128; b: float128): flag;
  500. {$endif FPC_SOFTFLOAT_FLOAT128}
  501. CONST
  502. {-------------------------------------------------------------------------------
  503. Software IEC/IEEE floating-point underflow tininess-detection mode.
  504. -------------------------------------------------------------------------------
  505. *}
  506. float_tininess_after_rounding = 0;
  507. float_tininess_before_rounding = 1;
  508. {*
  509. -------------------------------------------------------------------------------
  510. Underflow tininess-detection mode, statically initialized to default value.
  511. (The declaration in `softfloat.h' must match the `int8' type here.)
  512. -------------------------------------------------------------------------------
  513. *}
  514. var // threadvar!?
  515. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  516. {$endif not(defined(fpc_softfpu_implementation))}
  517. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  518. implementation
  519. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  520. {$if not(defined(fpc_softfpu_interface))}
  521. (*****************************************************************************)
  522. (*----------------------------------------------------------------------------*)
  523. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  524. (* division and square root approximations. (Can be specialized to target if *)
  525. (* desired.) *)
  526. (* ---------------------------------------------------------------------------*)
  527. (*****************************************************************************)
  528. { This procedure serves as a single access point to softfloat_exception_flags.
  529. It also helps to reduce code size a bit because softfloat_exception_flags is
  530. a threadvar. }
  531. procedure set_inexact_flag;
  532. begin
  533. include(softfloat_exception_flags,float_flag_inexact);
  534. end;
  535. {*----------------------------------------------------------------------------
  536. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  537. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  538. | input. If `zSign' is 1, the input is negated before being converted to an
  539. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  540. | is simply rounded to an integer, with the inexact exception raised if the
  541. | input cannot be represented exactly as an integer. However, if the fixed-
  542. | point input is too large, the invalid exception is raised and the largest
  543. | positive or negative integer is returned.
  544. *----------------------------------------------------------------------------*}
  545. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  546. var
  547. roundingMode: TFPURoundingMode;
  548. roundNearestEven: boolean;
  549. roundIncrement, roundBits: int8;
  550. z: int32;
  551. begin
  552. roundingMode := softfloat_rounding_mode;
  553. roundNearestEven := (roundingMode = float_round_nearest_even);
  554. roundIncrement := $40;
  555. if not roundNearestEven then
  556. begin
  557. if ( roundingMode = float_round_to_zero ) then
  558. begin
  559. roundIncrement := 0;
  560. end
  561. else begin
  562. roundIncrement := $7F;
  563. if ( zSign<>0 ) then
  564. begin
  565. if ( roundingMode = float_round_up ) then
  566. roundIncrement := 0;
  567. end
  568. else begin
  569. if ( roundingMode = float_round_down ) then
  570. roundIncrement := 0;
  571. end;
  572. end;
  573. end;
  574. roundBits := lo(absZ) and $7F;
  575. absZ := ( absZ + roundIncrement ) shr 7;
  576. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  577. z := absZ;
  578. if ( zSign<>0 ) then
  579. z := - z;
  580. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  581. begin
  582. float_raise( float_flag_invalid );
  583. if zSign<>0 then
  584. result:=sbits32($80000000)
  585. else
  586. result:=$7FFFFFFF;
  587. exit;
  588. end;
  589. if ( roundBits<>0 ) then
  590. set_inexact_flag;
  591. result:=z;
  592. end;
  593. {*----------------------------------------------------------------------------
  594. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  595. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  596. | and returns the properly rounded 64-bit integer corresponding to the input.
  597. | If `zSign' is 1, the input is negated before being converted to an integer.
  598. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  599. | the inexact exception raised if the input cannot be represented exactly as
  600. | an integer. However, if the fixed-point input is too large, the invalid
  601. | exception is raised and the largest positive or negative integer is
  602. | returned.
  603. *----------------------------------------------------------------------------*}
  604. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  605. var
  606. roundingMode: TFPURoundingMode;
  607. roundNearestEven, increment: flag;
  608. z: int64;
  609. label
  610. overflow;
  611. begin
  612. roundingMode := softfloat_rounding_mode;
  613. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  614. increment := ord( sbits64(absZ1) < 0 );
  615. if ( roundNearestEven=0 ) then
  616. begin
  617. if ( roundingMode = float_round_to_zero ) then
  618. begin
  619. increment := 0;
  620. end
  621. else begin
  622. if ( zSign<>0 ) then
  623. begin
  624. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  625. end
  626. else begin
  627. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  628. end;
  629. end;
  630. end;
  631. if ( increment<>0 ) then
  632. begin
  633. inc(absZ0);
  634. if ( absZ0 = 0 ) then
  635. goto overflow;
  636. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  637. end;
  638. z := absZ0;
  639. if ( zSign<>0 ) then
  640. z := - z;
  641. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  642. begin
  643. overflow:
  644. float_raise( float_flag_invalid );
  645. if zSign<>0 then
  646. result:=int64($8000000000000000)
  647. else
  648. result:=int64($7FFFFFFFFFFFFFFF);
  649. exit;
  650. end;
  651. if ( absZ1<>0 ) then
  652. set_inexact_flag;
  653. result:=z;
  654. end;
  655. {*
  656. -------------------------------------------------------------------------------
  657. Shifts `a' right by the number of bits given in `count'. If any nonzero
  658. bits are shifted off, they are ``jammed'' into the least significant bit of
  659. the result by setting the least significant bit to 1. The value of `count'
  660. can be arbitrarily large; in particular, if `count' is greater than 32, the
  661. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  662. The result is stored in the location pointed to by `zPtr'.
  663. -------------------------------------------------------------------------------
  664. *}
  665. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  666. var
  667. z: Bits32;
  668. Begin
  669. if ( count = 0 ) then
  670. z := a
  671. else
  672. if ( count < 32 ) then
  673. Begin
  674. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  675. End
  676. else
  677. Begin
  678. z := bits32( a <> 0 );
  679. End;
  680. zPtr := z;
  681. End;
  682. {*----------------------------------------------------------------------------
  683. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  684. | number of bits given in `count'. Any bits shifted off are lost. The value
  685. | of `count' can be arbitrarily large; in particular, if `count' is greater
  686. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  687. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  688. *----------------------------------------------------------------------------*}
  689. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  690. var
  691. z0, z1: bits64;
  692. negCount: int8;
  693. begin
  694. negCount := ( - count ) and 63;
  695. if ( count = 0 ) then
  696. begin
  697. z1 := a1;
  698. z0 := a0;
  699. end
  700. else if ( count < 64 ) then
  701. begin
  702. z1 := ( a0 shl negCount ) or ( a1 shr count );
  703. z0 := a0 shr count;
  704. end
  705. else
  706. begin
  707. if ( count < 128 ) then
  708. z1 := a0 shr ( count and 63 )
  709. else
  710. z1 := 0;
  711. z0 := 0;
  712. end;
  713. z1Ptr := z1;
  714. z0Ptr := z0;
  715. end;
  716. {*----------------------------------------------------------------------------
  717. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  718. | number of bits given in `count'. If any nonzero bits are shifted off, they
  719. | are ``jammed'' into the least significant bit of the result by setting the
  720. | least significant bit to 1. The value of `count' can be arbitrarily large;
  721. | in particular, if `count' is greater than 128, the result will be either
  722. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  723. | nonzero. The result is broken into two 64-bit pieces which are stored at
  724. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  725. *----------------------------------------------------------------------------*}
  726. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  727. var
  728. z0,z1 : bits64;
  729. negCount : int8;
  730. begin
  731. negCount := ( - count ) and 63;
  732. if ( count = 0 ) then begin
  733. z1 := a1;
  734. z0 := a0;
  735. end
  736. else if ( count < 64 ) then begin
  737. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  738. z0 := a0 shr count;
  739. end
  740. else begin
  741. if ( count = 64 ) then begin
  742. z1 := a0 or ord( a1 <> 0 );
  743. end
  744. else if ( count < 128 ) then begin
  745. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  746. end
  747. else begin
  748. z1 := ord( ( a0 or a1 ) <> 0 );
  749. end;
  750. z0 := 0;
  751. end;
  752. z1Ptr := z1;
  753. z0Ptr := z0;
  754. end;
  755. {*
  756. -------------------------------------------------------------------------------
  757. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  758. number of bits given in `count'. Any bits shifted off are lost. The value
  759. of `count' can be arbitrarily large; in particular, if `count' is greater
  760. than 64, the result will be 0. The result is broken into two 32-bit pieces
  761. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  762. -------------------------------------------------------------------------------
  763. *}
  764. Procedure
  765. shift64Right(
  766. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  767. Var
  768. z0, z1: bits32;
  769. negCount : int8;
  770. Begin
  771. negCount := ( - count ) AND 31;
  772. if ( count = 0 ) then
  773. Begin
  774. z1 := a1;
  775. z0 := a0;
  776. End
  777. else if ( count < 32 ) then
  778. Begin
  779. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  780. z0 := a0 shr count;
  781. End
  782. else
  783. Begin
  784. if (count < 64) then
  785. z1 := ( a0 shr ( count AND 31 ) )
  786. else
  787. z1 := 0;
  788. z0 := 0;
  789. End;
  790. z1Ptr := z1;
  791. z0Ptr := z0;
  792. End;
  793. {*
  794. -------------------------------------------------------------------------------
  795. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  796. number of bits given in `count'. If any nonzero bits are shifted off, they
  797. are ``jammed'' into the least significant bit of the result by setting the
  798. least significant bit to 1. The value of `count' can be arbitrarily large;
  799. in particular, if `count' is greater than 64, the result will be either 0
  800. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  801. nonzero. The result is broken into two 32-bit pieces which are stored at
  802. the locations pointed to by `z0Ptr' and `z1Ptr'.
  803. -------------------------------------------------------------------------------
  804. *}
  805. Procedure
  806. shift64RightJamming(
  807. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  808. VAR
  809. z0, z1 : bits32;
  810. negCount : int8;
  811. Begin
  812. negCount := ( - count ) AND 31;
  813. if ( count = 0 ) then
  814. Begin
  815. z1 := a1;
  816. z0 := a0;
  817. End
  818. else
  819. if ( count < 32 ) then
  820. Begin
  821. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  822. z0 := a0 shr count;
  823. End
  824. else
  825. Begin
  826. if ( count = 32 ) then
  827. Begin
  828. z1 := a0 OR bits32( a1 <> 0 );
  829. End
  830. else
  831. if ( count < 64 ) Then
  832. Begin
  833. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  834. End
  835. else
  836. Begin
  837. z1 := bits32( ( a0 OR a1 ) <> 0 );
  838. End;
  839. z0 := 0;
  840. End;
  841. z1Ptr := z1;
  842. z0Ptr := z0;
  843. End;
  844. {*----------------------------------------------------------------------------
  845. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  846. | bits are shifted off, they are ``jammed'' into the least significant bit of
  847. | the result by setting the least significant bit to 1. The value of `count'
  848. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  849. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  850. | The result is stored in the location pointed to by `zPtr'.
  851. *----------------------------------------------------------------------------*}
  852. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  853. var
  854. z: bits64;
  855. begin
  856. if ( count = 0 ) then
  857. begin
  858. z := a;
  859. end
  860. else if ( count < 64 ) then
  861. begin
  862. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  863. end
  864. else
  865. begin
  866. z := ord( a <> 0 );
  867. end;
  868. zPtr := z;
  869. end;
  870. {$if not defined(shift64ExtraRightJamming)}
  871. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  872. overload;
  873. forward;
  874. {$endif}
  875. {*
  876. -------------------------------------------------------------------------------
  877. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  878. by 32 _plus_ the number of bits given in `count'. The shifted result is
  879. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  880. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  881. off form a third 32-bit result as follows: The _last_ bit shifted off is
  882. the most-significant bit of the extra result, and the other 31 bits of the
  883. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  884. were all zero. This extra result is stored in the location pointed to by
  885. `z2Ptr'. The value of `count' can be arbitrarily large.
  886. (This routine makes more sense if `a0', `a1', and `a2' are considered
  887. to form a fixed-point value with binary point between `a1' and `a2'. This
  888. fixed-point value is shifted right by the number of bits given in `count',
  889. and the integer part of the result is returned at the locations pointed to
  890. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  891. corrupted as described above, and is returned at the location pointed to by
  892. `z2Ptr'.)
  893. -------------------------------------------------------------------------------
  894. }
  895. Procedure
  896. shift64ExtraRightJamming(
  897. a0: bits32;
  898. a1: bits32;
  899. a2: bits32;
  900. count: int16;
  901. VAR z0Ptr: bits32;
  902. VAR z1Ptr: bits32;
  903. VAR z2Ptr: bits32
  904. ); overload;
  905. Var
  906. z0, z1, z2: bits32;
  907. negCount : int8;
  908. Begin
  909. negCount := ( - count ) AND 31;
  910. if ( count = 0 ) then
  911. Begin
  912. z2 := a2;
  913. z1 := a1;
  914. z0 := a0;
  915. End
  916. else
  917. Begin
  918. if ( count < 32 ) Then
  919. Begin
  920. z2 := a1 shl negCount;
  921. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  922. z0 := a0 shr count;
  923. End
  924. else
  925. Begin
  926. if ( count = 32 ) then
  927. Begin
  928. z2 := a1;
  929. z1 := a0;
  930. End
  931. else
  932. Begin
  933. a2 := a2 or a1;
  934. if ( count < 64 ) then
  935. Begin
  936. z2 := a0 shl negCount;
  937. z1 := a0 shr ( count AND 31 );
  938. End
  939. else
  940. Begin
  941. if count = 64 then
  942. z2 := a0
  943. else
  944. z2 := bits32(a0 <> 0);
  945. z1 := 0;
  946. End;
  947. End;
  948. z0 := 0;
  949. End;
  950. z2 := z2 or bits32( a2 <> 0 );
  951. End;
  952. z2Ptr := z2;
  953. z1Ptr := z1;
  954. z0Ptr := z0;
  955. End;
  956. {*
  957. -------------------------------------------------------------------------------
  958. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  959. number of bits given in `count'. Any bits shifted off are lost. The value
  960. of `count' must be less than 32. The result is broken into two 32-bit
  961. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  962. -------------------------------------------------------------------------------
  963. *}
  964. Procedure
  965. shortShift64Left(
  966. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  967. Begin
  968. z1Ptr := a1 shl count;
  969. if count = 0 then
  970. z0Ptr := a0
  971. else
  972. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  973. End;
  974. {*
  975. -------------------------------------------------------------------------------
  976. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  977. by the number of bits given in `count'. Any bits shifted off are lost.
  978. The value of `count' must be less than 32. The result is broken into three
  979. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  980. `z1Ptr', and `z2Ptr'.
  981. -------------------------------------------------------------------------------
  982. *}
  983. Procedure
  984. shortShift96Left(
  985. a0: bits32;
  986. a1: bits32;
  987. a2: bits32;
  988. count: int16;
  989. VAR z0Ptr: bits32;
  990. VAR z1Ptr: bits32;
  991. VAR z2Ptr: bits32
  992. );
  993. Var
  994. z0, z1, z2: bits32;
  995. negCount: int8;
  996. Begin
  997. z2 := a2 shl count;
  998. z1 := a1 shl count;
  999. z0 := a0 shl count;
  1000. if ( 0 < count ) then
  1001. Begin
  1002. negCount := ( ( - count ) AND 31 );
  1003. z1 := z1 or (a2 shr negCount);
  1004. z0 := z0 or (a1 shr negCount);
  1005. End;
  1006. z2Ptr := z2;
  1007. z1Ptr := z1;
  1008. z0Ptr := z0;
  1009. End;
  1010. {*----------------------------------------------------------------------------
  1011. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1012. | number of bits given in `count'. Any bits shifted off are lost. The value
  1013. | of `count' must be less than 64. The result is broken into two 64-bit
  1014. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1015. *----------------------------------------------------------------------------*}
  1016. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1017. begin
  1018. z1Ptr := a1 shl count;
  1019. if count=0 then
  1020. z0Ptr:=a0
  1021. else
  1022. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1023. end;
  1024. {*
  1025. -------------------------------------------------------------------------------
  1026. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1027. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1028. any carry out is lost. The result is broken into two 32-bit pieces which
  1029. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1030. -------------------------------------------------------------------------------
  1031. *}
  1032. Procedure
  1033. add64(
  1034. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1035. Var
  1036. z1: bits32;
  1037. Begin
  1038. z1 := a1 + b1;
  1039. z1Ptr := z1;
  1040. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1041. End;
  1042. {*
  1043. -------------------------------------------------------------------------------
  1044. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1045. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1046. modulo 2^96, so any carry out is lost. The result is broken into three
  1047. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1048. `z1Ptr', and `z2Ptr'.
  1049. -------------------------------------------------------------------------------
  1050. *}
  1051. Procedure
  1052. add96(
  1053. a0: bits32;
  1054. a1: bits32;
  1055. a2: bits32;
  1056. b0: bits32;
  1057. b1: bits32;
  1058. b2: bits32;
  1059. VAR z0Ptr: bits32;
  1060. VAR z1Ptr: bits32;
  1061. VAR z2Ptr: bits32
  1062. );
  1063. var
  1064. z0, z1, z2: bits32;
  1065. carry0, carry1: int8;
  1066. Begin
  1067. z2 := a2 + b2;
  1068. carry1 := int8( z2 < a2 );
  1069. z1 := a1 + b1;
  1070. carry0 := int8( z1 < a1 );
  1071. z0 := a0 + b0;
  1072. z1 := z1 + carry1;
  1073. z0 := z0 + bits32( z1 < carry1 );
  1074. z0 := z0 + carry0;
  1075. z2Ptr := z2;
  1076. z1Ptr := z1;
  1077. z0Ptr := z0;
  1078. End;
  1079. {*----------------------------------------------------------------------------
  1080. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1081. | by the number of bits given in `count'. Any bits shifted off are lost.
  1082. | The value of `count' must be less than 64. The result is broken into three
  1083. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1084. | `z1Ptr', and `z2Ptr'.
  1085. *----------------------------------------------------------------------------*}
  1086. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1087. var
  1088. z0, z1, z2 : bits64;
  1089. negCount : int8;
  1090. begin
  1091. z2 := a2 shl count;
  1092. z1 := a1 shl count;
  1093. z0 := a0 shl count;
  1094. if ( 0 < count ) then
  1095. begin
  1096. negCount := ( ( - count ) and 63 );
  1097. z1 := z1 or (a2 shr negCount);
  1098. z0 := z0 or (a1 shr negCount);
  1099. end;
  1100. z2Ptr := z2;
  1101. z1Ptr := z1;
  1102. z0Ptr := z0;
  1103. end;
  1104. {*----------------------------------------------------------------------------
  1105. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1106. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1107. | any carry out is lost. The result is broken into two 64-bit pieces which
  1108. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1109. *----------------------------------------------------------------------------*}
  1110. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1111. var
  1112. z1 : bits64;
  1113. begin
  1114. z1 := a1 + b1;
  1115. z1Ptr := z1;
  1116. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1117. end;
  1118. {*----------------------------------------------------------------------------
  1119. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1120. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1121. | modulo 2^192, so any carry out is lost. The result is broken into three
  1122. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1123. | `z1Ptr', and `z2Ptr'.
  1124. *----------------------------------------------------------------------------*}
  1125. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1126. var
  1127. z0, z1, z2 : bits64;
  1128. carry0, carry1 : int8;
  1129. begin
  1130. z2 := a2 + b2;
  1131. carry1 := ord( z2 < a2 );
  1132. z1 := a1 + b1;
  1133. carry0 := ord( z1 < a1 );
  1134. z0 := a0 + b0;
  1135. inc(z1, carry1);
  1136. inc(z0, ord( z1 < carry1 ));
  1137. inc(z0, carry0);
  1138. z2Ptr := z2;
  1139. z1Ptr := z1;
  1140. z0Ptr := z0;
  1141. end;
  1142. {*
  1143. -------------------------------------------------------------------------------
  1144. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1145. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1146. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1147. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1148. `z1Ptr'.
  1149. -------------------------------------------------------------------------------
  1150. *}
  1151. Procedure
  1152. sub64(
  1153. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1154. Begin
  1155. z1Ptr := a1 - b1;
  1156. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1157. End;
  1158. {*
  1159. -------------------------------------------------------------------------------
  1160. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1161. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1162. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1163. into three 32-bit pieces which are stored at the locations pointed to by
  1164. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1165. -------------------------------------------------------------------------------
  1166. *}
  1167. Procedure
  1168. sub96(
  1169. a0:bits32;
  1170. a1:bits32;
  1171. a2:bits32;
  1172. b0:bits32;
  1173. b1:bits32;
  1174. b2:bits32;
  1175. VAR z0Ptr:bits32;
  1176. VAR z1Ptr:bits32;
  1177. VAR z2Ptr:bits32
  1178. );
  1179. Var
  1180. z0, z1, z2: bits32;
  1181. borrow0, borrow1: int8;
  1182. Begin
  1183. z2 := a2 - b2;
  1184. borrow1 := int8( a2 < b2 );
  1185. z1 := a1 - b1;
  1186. borrow0 := int8( a1 < b1 );
  1187. z0 := a0 - b0;
  1188. z0 := z0 - bits32( z1 < borrow1 );
  1189. z1 := z1 - borrow1;
  1190. z0 := z0 -borrow0;
  1191. z2Ptr := z2;
  1192. z1Ptr := z1;
  1193. z0Ptr := z0;
  1194. End;
  1195. {*----------------------------------------------------------------------------
  1196. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1197. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1198. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1199. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1200. | `z1Ptr'.
  1201. *----------------------------------------------------------------------------*}
  1202. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1203. begin
  1204. z1Ptr := a1 - b1;
  1205. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1206. end;
  1207. {*----------------------------------------------------------------------------
  1208. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1209. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1210. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1211. | result is broken into three 64-bit pieces which are stored at the locations
  1212. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1213. *----------------------------------------------------------------------------*}
  1214. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1215. var
  1216. z0, z1, z2 : bits64;
  1217. borrow0, borrow1 : int8;
  1218. begin
  1219. z2 := a2 - b2;
  1220. borrow1 := ord( a2 < b2 );
  1221. z1 := a1 - b1;
  1222. borrow0 := ord( a1 < b1 );
  1223. z0 := a0 - b0;
  1224. dec(z0, ord( z1 < borrow1 ));
  1225. dec(z1, borrow1);
  1226. dec(z0, borrow0);
  1227. z2Ptr := z2;
  1228. z1Ptr := z1;
  1229. z0Ptr := z0;
  1230. end;
  1231. {*
  1232. -------------------------------------------------------------------------------
  1233. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1234. into two 32-bit pieces which are stored at the locations pointed to by
  1235. `z0Ptr' and `z1Ptr'.
  1236. -------------------------------------------------------------------------------
  1237. *}
  1238. {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
  1239. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1240. var
  1241. tmp: qword;
  1242. begin
  1243. tmp:=qword(a) * b;
  1244. z0ptr:=hi(tmp);
  1245. z1ptr:=lo(tmp);
  1246. end;
  1247. {$ELSE}
  1248. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1249. :bits32 );
  1250. Var
  1251. aHigh, aLow, bHigh, bLow: bits16;
  1252. z0, zMiddleA, zMiddleB, z1: bits32;
  1253. Begin
  1254. aLow := bits16(a);
  1255. aHigh := a shr 16;
  1256. bLow := bits16(b);
  1257. bHigh := b shr 16;
  1258. z1 := ( bits32( aLow) ) * bLow;
  1259. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1260. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1261. z0 := ( bits32 (aHigh) ) * bHigh;
  1262. zMiddleA := zMiddleA + zMiddleB;
  1263. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1264. zMiddleA := zmiddleA shl 16;
  1265. z1 := z1 + zMiddleA;
  1266. z0 := z0 + bits32( z1 < zMiddleA );
  1267. z1Ptr := z1;
  1268. z0Ptr := z0;
  1269. End;
  1270. {$ENDIF}
  1271. {*
  1272. -------------------------------------------------------------------------------
  1273. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1274. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1275. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1276. `z2Ptr'.
  1277. -------------------------------------------------------------------------------
  1278. *}
  1279. Procedure
  1280. mul64By32To96(
  1281. a0:bits32;
  1282. a1:bits32;
  1283. b:bits32;
  1284. VAR z0Ptr:bits32;
  1285. VAR z1Ptr:bits32;
  1286. VAR z2Ptr:bits32
  1287. );
  1288. Var
  1289. z0, z1, z2, more1: bits32;
  1290. Begin
  1291. mul32To64( a1, b, z1, z2 );
  1292. mul32To64( a0, b, z0, more1 );
  1293. add64( z0, more1, 0, z1, z0, z1 );
  1294. z2Ptr := z2;
  1295. z1Ptr := z1;
  1296. z0Ptr := z0;
  1297. End;
  1298. {*
  1299. -------------------------------------------------------------------------------
  1300. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1301. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1302. product. The product is broken into four 32-bit pieces which are stored at
  1303. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1304. -------------------------------------------------------------------------------
  1305. *}
  1306. Procedure
  1307. mul64To128(
  1308. a0:bits32;
  1309. a1:bits32;
  1310. b0:bits32;
  1311. b1:bits32;
  1312. VAR z0Ptr:bits32;
  1313. VAR z1Ptr:bits32;
  1314. VAR z2Ptr:bits32;
  1315. VAR z3Ptr:bits32
  1316. );
  1317. Var
  1318. z0, z1, z2, z3: bits32;
  1319. more1, more2: bits32;
  1320. Begin
  1321. mul32To64( a1, b1, z2, z3 );
  1322. mul32To64( a1, b0, z1, more2 );
  1323. add64( z1, more2, 0, z2, z1, z2 );
  1324. mul32To64( a0, b0, z0, more1 );
  1325. add64( z0, more1, 0, z1, z0, z1 );
  1326. mul32To64( a0, b1, more1, more2 );
  1327. add64( more1, more2, 0, z2, more1, z2 );
  1328. add64( z0, z1, 0, more1, z0, z1 );
  1329. z3Ptr := z3;
  1330. z2Ptr := z2;
  1331. z1Ptr := z1;
  1332. z0Ptr := z0;
  1333. End;
  1334. {*----------------------------------------------------------------------------
  1335. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1336. | into two 64-bit pieces which are stored at the locations pointed to by
  1337. | `z0Ptr' and `z1Ptr'.
  1338. *----------------------------------------------------------------------------*}
  1339. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1340. var
  1341. aHigh, aLow, bHigh, bLow : bits32;
  1342. z0, zMiddleA, zMiddleB, z1 : bits64;
  1343. begin
  1344. aLow := a;
  1345. aHigh := a shr 32;
  1346. bLow := b;
  1347. bHigh := b shr 32;
  1348. z1 := ( bits64(aLow) ) * bLow;
  1349. zMiddleA := ( bits64( aLow )) * bHigh;
  1350. zMiddleB := ( bits64( aHigh )) * bLow;
  1351. z0 := ( bits64(aHigh) ) * bHigh;
  1352. inc(zMiddleA, zMiddleB);
  1353. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1354. zMiddleA := zMiddleA shl 32;
  1355. inc(z1, zMiddleA);
  1356. inc(z0, ord( z1 < zMiddleA ));
  1357. z1Ptr := z1;
  1358. z0Ptr := z0;
  1359. end;
  1360. {*----------------------------------------------------------------------------
  1361. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1362. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1363. | product. The product is broken into four 64-bit pieces which are stored at
  1364. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1365. *----------------------------------------------------------------------------*}
  1366. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1367. var
  1368. z0,z1,z2,z3,more1,more2 : bits64;
  1369. begin
  1370. mul64To128( a1, b1, z2, z3 );
  1371. mul64To128( a1, b0, z1, more2 );
  1372. add128( z1, more2, 0, z2, z1, z2 );
  1373. mul64To128( a0, b0, z0, more1 );
  1374. add128( z0, more1, 0, z1, z0, z1 );
  1375. mul64To128( a0, b1, more1, more2 );
  1376. add128( more1, more2, 0, z2, more1, z2 );
  1377. add128( z0, z1, 0, more1, z0, z1 );
  1378. z3Ptr := z3;
  1379. z2Ptr := z2;
  1380. z1Ptr := z1;
  1381. z0Ptr := z0;
  1382. end;
  1383. {*----------------------------------------------------------------------------
  1384. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1385. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1386. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1387. | `z2Ptr'.
  1388. *----------------------------------------------------------------------------*}
  1389. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1390. var
  1391. z0, z1, z2, more1 : bits64;
  1392. begin
  1393. mul64To128( a1, b, z1, z2 );
  1394. mul64To128( a0, b, z0, more1 );
  1395. add128( z0, more1, 0, z1, z0, z1 );
  1396. z2Ptr := z2;
  1397. z1Ptr := z1;
  1398. z0Ptr := z0;
  1399. end;
  1400. {*----------------------------------------------------------------------------
  1401. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1402. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1403. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1404. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1405. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1406. | unsigned integer is returned.
  1407. *----------------------------------------------------------------------------*}
  1408. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1409. var
  1410. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1411. begin
  1412. if ( b <= a0 ) then
  1413. begin
  1414. result:=qword( $FFFFFFFFFFFFFFFF );
  1415. exit;
  1416. end;
  1417. b0 := b shr 32;
  1418. if ( b0 shl 32 <= a0 ) then
  1419. z:=qword( $FFFFFFFF00000000 )
  1420. else
  1421. z:=( a0 div b0 ) shl 32;
  1422. mul64To128( b, z, term0, term1 );
  1423. sub128( a0, a1, term0, term1, rem0, rem1 );
  1424. while ( ( sbits64(rem0) ) < 0 ) do begin
  1425. dec(z,qword( $100000000 ));
  1426. b1 := b shl 32;
  1427. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1428. end;
  1429. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1430. if ( b0 shl 32 <= rem0 ) then
  1431. z:=z or $FFFFFFFF
  1432. else
  1433. z:=z or rem0 div b0;
  1434. result:=z;
  1435. end;
  1436. {*
  1437. -------------------------------------------------------------------------------
  1438. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1439. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1440. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1441. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1442. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1443. unsigned integer is returned.
  1444. -------------------------------------------------------------------------------
  1445. *}
  1446. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1447. Var
  1448. b0, b1: bits32;
  1449. rem0, rem1, term0, term1: bits32;
  1450. z: bits32;
  1451. Begin
  1452. if ( b <= a0 ) then
  1453. Begin
  1454. estimateDiv64To32 := $FFFFFFFF;
  1455. exit;
  1456. End;
  1457. b0 := b shr 16;
  1458. if ( b0 shl 16 <= a0 ) then
  1459. z:= $FFFF0000
  1460. else
  1461. z:= ( a0 div b0 ) shl 16;
  1462. mul32To64( b, z, term0, term1 );
  1463. sub64( a0, a1, term0, term1, rem0, rem1 );
  1464. while ( ( sbits32 (rem0) ) < 0 ) do
  1465. Begin
  1466. z := z - $10000;
  1467. b1 := b shl 16;
  1468. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1469. End;
  1470. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1471. if ( b0 shl 16 <= rem0 ) then
  1472. z := z or $FFFF
  1473. else
  1474. z := z or (rem0 div b0);
  1475. estimateDiv64To32 := z;
  1476. End;
  1477. {*
  1478. -------------------------------------------------------------------------------
  1479. Returns an approximation to the square root of the 32-bit significand given
  1480. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1481. `aExp' (the least significant bit) is 1, the integer returned approximates
  1482. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1483. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1484. case, the approximation returned lies strictly within +/-2 of the exact
  1485. value.
  1486. -------------------------------------------------------------------------------
  1487. *}
  1488. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1489. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1490. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1491. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1492. );
  1493. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1494. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1495. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1496. );
  1497. Var
  1498. index: int8;
  1499. z: bits32;
  1500. Begin
  1501. index := ( a shr 27 ) AND 15;
  1502. if ( aExp AND 1 ) <> 0 then
  1503. Begin
  1504. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1505. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1506. a := a shr 1;
  1507. End
  1508. else
  1509. Begin
  1510. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1511. z := a div z + z;
  1512. if ( $20000 <= z ) then
  1513. z := $FFFF8000
  1514. else
  1515. z := ( z shl 15 );
  1516. if ( z <= a ) then
  1517. Begin
  1518. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1519. exit;
  1520. End;
  1521. End;
  1522. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1523. End;
  1524. {*
  1525. -------------------------------------------------------------------------------
  1526. Returns the number of leading 0 bits before the most-significant 1 bit of
  1527. `a'. If `a' is zero, 32 is returned.
  1528. -------------------------------------------------------------------------------
  1529. *}
  1530. Function countLeadingZeros32( a:bits32 ): int8;
  1531. const countLeadingZerosHigh:array[0..255] of int8 = (
  1532. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1533. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1534. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1535. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1536. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1537. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1538. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1539. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1540. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1541. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1542. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1543. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1544. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1545. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1546. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1547. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1548. );
  1549. Var
  1550. shiftCount: int8;
  1551. Begin
  1552. shiftCount := 0;
  1553. if ( a < $10000 ) then
  1554. Begin
  1555. shiftCount := shiftcount + 16;
  1556. a := a shl 16;
  1557. End;
  1558. if ( a < $1000000 ) then
  1559. Begin
  1560. shiftCount := shiftcount + 8;
  1561. a := a shl 8;
  1562. end;
  1563. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1564. countLeadingZeros32:= shiftCount;
  1565. End;
  1566. {*----------------------------------------------------------------------------
  1567. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1568. | `a'. If `a' is zero, 64 is returned.
  1569. *----------------------------------------------------------------------------*}
  1570. function countLeadingZeros64( a : bits64): int8;
  1571. var
  1572. shiftcount : int8;
  1573. Begin
  1574. shiftCount := 0;
  1575. if ( a < bits64(bits64(1) shl 32 )) then
  1576. shiftCount := shiftcount + 32
  1577. else
  1578. a := a shr 32;
  1579. shiftCount := shiftCount + countLeadingZeros32( a );
  1580. countLeadingZeros64:= shiftCount;
  1581. End;
  1582. {*
  1583. -------------------------------------------------------------------------------
  1584. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1585. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1586. Otherwise, returns 0.
  1587. -------------------------------------------------------------------------------
  1588. *}
  1589. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1590. Begin
  1591. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1592. End;
  1593. {*
  1594. -------------------------------------------------------------------------------
  1595. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1596. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1597. returns 0.
  1598. -------------------------------------------------------------------------------
  1599. *}
  1600. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1601. Begin
  1602. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1603. End;
  1604. const
  1605. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1606. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1607. (*****************************************************************************)
  1608. (* End Low-Level arithmetic *)
  1609. (*****************************************************************************)
  1610. {*----------------------------------------------------------------------------
  1611. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1612. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1613. | returns 0.
  1614. *----------------------------------------------------------------------------*}
  1615. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1616. begin
  1617. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1618. end;
  1619. {*
  1620. -------------------------------------------------------------------------------
  1621. Functions and definitions to determine: (1) whether tininess for underflow
  1622. is detected before or after rounding by default, (2) what (if anything)
  1623. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1624. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1625. are propagated from function inputs to output. These details are ENDIAN
  1626. specific
  1627. -------------------------------------------------------------------------------
  1628. *}
  1629. {$IFDEF ENDIAN_LITTLE}
  1630. {*
  1631. -------------------------------------------------------------------------------
  1632. Internal canonical NaN format.
  1633. -------------------------------------------------------------------------------
  1634. *}
  1635. TYPE
  1636. commonNaNT = record
  1637. high, low : bits32;
  1638. sign: flag;
  1639. end;
  1640. {*
  1641. -------------------------------------------------------------------------------
  1642. The pattern for a default generated single-precision NaN.
  1643. -------------------------------------------------------------------------------
  1644. *}
  1645. const float32_default_nan = $FFC00000;
  1646. {*
  1647. -------------------------------------------------------------------------------
  1648. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1649. otherwise returns 0.
  1650. -------------------------------------------------------------------------------
  1651. *}
  1652. Function float32_is_nan( a : float32 ): flag;
  1653. Begin
  1654. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1655. End;
  1656. {*
  1657. -------------------------------------------------------------------------------
  1658. Returns 1 if the single-precision floating-point value `a' is a signaling
  1659. NaN; otherwise returns 0.
  1660. -------------------------------------------------------------------------------
  1661. *}
  1662. Function float32_is_signaling_nan( a : float32 ): flag;
  1663. Begin
  1664. float32_is_signaling_nan := flag
  1665. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1666. End;
  1667. {*
  1668. -------------------------------------------------------------------------------
  1669. Returns the result of converting the single-precision floating-point NaN
  1670. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1671. exception is raised.
  1672. -------------------------------------------------------------------------------
  1673. *}
  1674. function float32ToCommonNaN(a: float32) : commonNaNT;
  1675. var
  1676. z : commonNaNT ;
  1677. Begin
  1678. if ( float32_is_signaling_nan( a ) <> 0) then
  1679. float_raise( float_flag_invalid );
  1680. z.sign := a shr 31;
  1681. z.low := 0;
  1682. z.high := a shl 9;
  1683. result := z;
  1684. End;
  1685. {*
  1686. -------------------------------------------------------------------------------
  1687. Returns the result of converting the canonical NaN `a' to the single-
  1688. precision floating-point format.
  1689. -------------------------------------------------------------------------------
  1690. *}
  1691. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1692. Begin
  1693. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1694. End;
  1695. {*
  1696. -------------------------------------------------------------------------------
  1697. Takes two single-precision floating-point values `a' and `b', one of which
  1698. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1699. signaling NaN, the invalid exception is raised.
  1700. -------------------------------------------------------------------------------
  1701. *}
  1702. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1703. Var
  1704. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1705. label returnLargerSignificand;
  1706. Begin
  1707. aIsNaN := float32_is_nan( a );
  1708. aIsSignalingNaN := float32_is_signaling_nan( a );
  1709. bIsNaN := float32_is_nan( b );
  1710. bIsSignalingNaN := float32_is_signaling_nan( b );
  1711. a := a or $00400000;
  1712. b := b or $00400000;
  1713. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1714. float_raise( float_flag_invalid );
  1715. if ( aIsSignalingNaN )<> 0 then
  1716. Begin
  1717. if ( bIsSignalingNaN ) <> 0 then
  1718. goto returnLargerSignificand;
  1719. if bIsNan <> 0 then
  1720. propagateFloat32NaN := b
  1721. else
  1722. propagateFloat32NaN := a;
  1723. exit;
  1724. End
  1725. else if ( aIsNaN <> 0) then
  1726. Begin
  1727. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1728. Begin
  1729. propagateFloat32NaN := a;
  1730. exit;
  1731. End;
  1732. returnLargerSignificand:
  1733. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1734. Begin
  1735. propagateFloat32NaN := b;
  1736. exit;
  1737. End;
  1738. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1739. Begin
  1740. propagateFloat32NaN := a;
  1741. End;
  1742. if a < b then
  1743. propagateFloat32NaN := a
  1744. else
  1745. propagateFloat32NaN := b;
  1746. exit;
  1747. End
  1748. else
  1749. Begin
  1750. propagateFloat32NaN := b;
  1751. exit;
  1752. End;
  1753. End;
  1754. {*
  1755. -------------------------------------------------------------------------------
  1756. The pattern for a default generated double-precision NaN. The `high' and
  1757. `low' values hold the most- and least-significant bits, respectively.
  1758. -------------------------------------------------------------------------------
  1759. *}
  1760. const
  1761. float64_default_nan_high = $FFF80000;
  1762. float64_default_nan_low = $00000000;
  1763. {*
  1764. -------------------------------------------------------------------------------
  1765. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1766. otherwise returns 0.
  1767. -------------------------------------------------------------------------------
  1768. *}
  1769. Function float64_is_nan( a : float64 ) : flag;
  1770. Begin
  1771. float64_is_nan :=
  1772. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1773. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1774. End;
  1775. {*
  1776. -------------------------------------------------------------------------------
  1777. Returns 1 if the double-precision floating-point value `a' is a signaling
  1778. NaN; otherwise returns 0.
  1779. -------------------------------------------------------------------------------
  1780. *}
  1781. Function float64_is_signaling_nan( a : float64 ): flag;
  1782. Begin
  1783. float64_is_signaling_nan :=
  1784. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1785. and ( a.low or ( a.high and $0007FFFF ) );
  1786. End;
  1787. {*
  1788. -------------------------------------------------------------------------------
  1789. Returns the result of converting the double-precision floating-point NaN
  1790. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1791. exception is raised.
  1792. -------------------------------------------------------------------------------
  1793. *}
  1794. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1795. Var
  1796. z : commonNaNT;
  1797. Begin
  1798. if ( float64_is_signaling_nan( a )<>0 ) then
  1799. float_raise( float_flag_invalid );
  1800. z.sign := a.high shr 31;
  1801. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1802. result := z;
  1803. End;
  1804. {*
  1805. -------------------------------------------------------------------------------
  1806. Returns the result of converting the canonical NaN `a' to the double-
  1807. precision floating-point format.
  1808. -------------------------------------------------------------------------------
  1809. *}
  1810. function commonNaNToFloat64( a : commonNaNT) : float64;
  1811. Var
  1812. z: float64;
  1813. Begin
  1814. shift64Right( a.high, a.low, 12, z.high, z.low );
  1815. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1816. result := z;
  1817. End;
  1818. {*
  1819. -------------------------------------------------------------------------------
  1820. Takes two double-precision floating-point values `a' and `b', one of which
  1821. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1822. signaling NaN, the invalid exception is raised.
  1823. -------------------------------------------------------------------------------
  1824. *}
  1825. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1826. Var
  1827. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1828. label returnLargerSignificand;
  1829. Begin
  1830. aIsNaN := float64_is_nan( a );
  1831. aIsSignalingNaN := float64_is_signaling_nan( a );
  1832. bIsNaN := float64_is_nan( b );
  1833. bIsSignalingNaN := float64_is_signaling_nan( b );
  1834. a.high := a.high or $00080000;
  1835. b.high := b.high or $00080000;
  1836. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1837. float_raise( float_flag_invalid );
  1838. if ( aIsSignalingNaN )<>0 then
  1839. Begin
  1840. if ( bIsSignalingNaN )<>0 then
  1841. goto returnLargerSignificand;
  1842. if bIsNan <> 0 then
  1843. c := b
  1844. else
  1845. c := a;
  1846. exit;
  1847. End
  1848. else if ( aIsNaN )<> 0 then
  1849. Begin
  1850. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1851. Begin
  1852. c := a;
  1853. exit;
  1854. End;
  1855. returnLargerSignificand:
  1856. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1857. Begin
  1858. c := b;
  1859. exit;
  1860. End;
  1861. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1862. Begin
  1863. c := a;
  1864. exit;
  1865. End;
  1866. if a.high < b.high then
  1867. c := a
  1868. else
  1869. c := b;
  1870. exit;
  1871. End
  1872. else
  1873. Begin
  1874. c := b;
  1875. exit;
  1876. End;
  1877. End;
  1878. {*----------------------------------------------------------------------------
  1879. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1880. | otherwise returns 0.
  1881. *----------------------------------------------------------------------------*}
  1882. function float128_is_nan( a : float128): flag;
  1883. begin
  1884. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1885. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1886. end;
  1887. {*----------------------------------------------------------------------------
  1888. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1889. | signaling NaN; otherwise returns 0.
  1890. *----------------------------------------------------------------------------*}
  1891. function float128_is_signaling_nan( a : float128): flag;
  1892. begin
  1893. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1894. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1895. end;
  1896. {*----------------------------------------------------------------------------
  1897. | Returns the result of converting the quadruple-precision floating-point NaN
  1898. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1899. | exception is raised.
  1900. *----------------------------------------------------------------------------*}
  1901. function float128ToCommonNaN( a : float128): commonNaNT;
  1902. var
  1903. z: commonNaNT;
  1904. qhigh,qlow : qword;
  1905. begin
  1906. if ( float128_is_signaling_nan( a )<>0) then
  1907. float_raise( float_flag_invalid );
  1908. z.sign := a.high shr 63;
  1909. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1910. z.high:=qhigh shr 32;
  1911. z.low:=qhigh and $ffffffff;
  1912. result:=z;
  1913. end;
  1914. {*----------------------------------------------------------------------------
  1915. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1916. | precision floating-point format.
  1917. *----------------------------------------------------------------------------*}
  1918. function commonNaNToFloat128( a : commonNaNT): float128;
  1919. var
  1920. z: float128;
  1921. begin
  1922. shift128Right( a.high, a.low, 16, z.high, z.low );
  1923. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1924. result:=z;
  1925. end;
  1926. {*----------------------------------------------------------------------------
  1927. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1928. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1929. | `b' is a signaling NaN, the invalid exception is raised.
  1930. *----------------------------------------------------------------------------*}
  1931. function propagateFloat128NaN( a: float128; b : float128): float128;
  1932. var
  1933. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1934. label
  1935. returnLargerSignificand;
  1936. begin
  1937. aIsNaN := float128_is_nan( a );
  1938. aIsSignalingNaN := float128_is_signaling_nan( a );
  1939. bIsNaN := float128_is_nan( b );
  1940. bIsSignalingNaN := float128_is_signaling_nan( b );
  1941. a.high := a.high or int64( $0000800000000000 );
  1942. b.high := b.high or int64( $0000800000000000 );
  1943. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1944. float_raise( float_flag_invalid );
  1945. if ( aIsSignalingNaN )<>0 then
  1946. begin
  1947. if ( bIsSignalingNaN )<>0 then
  1948. goto returnLargerSignificand;
  1949. if bIsNaN<>0 then
  1950. result := b
  1951. else
  1952. result := a;
  1953. exit;
  1954. end
  1955. else if ( aIsNaN )<>0 then
  1956. begin
  1957. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1958. begin
  1959. result := a;
  1960. exit;
  1961. end;
  1962. returnLargerSignificand:
  1963. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1964. begin
  1965. result := b;
  1966. exit;
  1967. end;
  1968. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1969. begin
  1970. result := a;
  1971. exit
  1972. end;
  1973. if ( a.high < b.high ) then
  1974. result := a
  1975. else
  1976. result := b;
  1977. exit;
  1978. end
  1979. else
  1980. result:=b;
  1981. end;
  1982. {$ELSE}
  1983. { Big endian code }
  1984. (*----------------------------------------------------------------------------
  1985. | Internal canonical NaN format.
  1986. *----------------------------------------------------------------------------*)
  1987. type
  1988. commonNANT = record
  1989. high, low : bits32;
  1990. sign : flag;
  1991. end;
  1992. (*----------------------------------------------------------------------------
  1993. | The pattern for a default generated single-precision NaN.
  1994. *----------------------------------------------------------------------------*)
  1995. const float32_default_nan = $7FFFFFFF;
  1996. (*----------------------------------------------------------------------------
  1997. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1998. | otherwise returns 0.
  1999. *----------------------------------------------------------------------------*)
  2000. function float32_is_nan(a: float32): flag;
  2001. begin
  2002. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2003. end;
  2004. (*----------------------------------------------------------------------------
  2005. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2006. | NaN; otherwise returns 0.
  2007. *----------------------------------------------------------------------------*)
  2008. function float32_is_signaling_nan(a: float32):flag;
  2009. begin
  2010. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2011. end;
  2012. (*----------------------------------------------------------------------------
  2013. | Returns the result of converting the single-precision floating-point NaN
  2014. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2015. | exception is raised.
  2016. *----------------------------------------------------------------------------*)
  2017. function float32ToCommonNaN( a: float32) : commonNaNT;
  2018. var
  2019. z: commonNANT;
  2020. begin
  2021. if float32_is_signaling_nan(a)<>0 then
  2022. float_raise(float_flag_invalid);
  2023. z.sign := a shr 31;
  2024. z.low := 0;
  2025. z.high := a shl 9;
  2026. result:=z;
  2027. end;
  2028. (*----------------------------------------------------------------------------
  2029. | Returns the result of converting the canonical NaN `a' to the single-
  2030. | precision floating-point format.
  2031. *----------------------------------------------------------------------------*)
  2032. function CommonNanToFloat32(a : CommonNaNT): float32;
  2033. begin
  2034. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2035. end;
  2036. (*----------------------------------------------------------------------------
  2037. | Takes two single-precision floating-point values `a' and `b', one of which
  2038. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2039. | signaling NaN, the invalid exception is raised.
  2040. *----------------------------------------------------------------------------*)
  2041. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2042. var
  2043. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2044. begin
  2045. aIsNaN := float32_is_nan( a );
  2046. aIsSignalingNaN := float32_is_signaling_nan( a );
  2047. bIsNaN := float32_is_nan( b );
  2048. bIsSignalingNaN := float32_is_signaling_nan( b );
  2049. a := a or $00400000;
  2050. b := b or $00400000;
  2051. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2052. float_raise( float_flag_invalid );
  2053. if bIsSignalingNaN<>0 then
  2054. propagateFloat32Nan := b
  2055. else if aIsSignalingNan<>0 then
  2056. propagateFloat32Nan := a
  2057. else if bIsNan<>0 then
  2058. propagateFloat32Nan := b
  2059. else
  2060. propagateFloat32Nan := a;
  2061. end;
  2062. (*----------------------------------------------------------------------------
  2063. | The pattern for a default generated double-precision NaN. The `high' and
  2064. | `low' values hold the most- and least-significant bits, respectively.
  2065. *----------------------------------------------------------------------------*)
  2066. const
  2067. float64_default_nan_high = $7FFFFFFF;
  2068. float64_default_nan_low = $FFFFFFFF;
  2069. (*----------------------------------------------------------------------------
  2070. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2071. | otherwise returns 0.
  2072. *----------------------------------------------------------------------------*)
  2073. function float64_is_nan(a: float64): flag;
  2074. begin
  2075. float64_is_nan := flag (
  2076. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2077. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2078. end;
  2079. (*----------------------------------------------------------------------------
  2080. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2081. | NaN; otherwise returns 0.
  2082. *----------------------------------------------------------------------------*)
  2083. function float64_is_signaling_nan( a:float64): flag;
  2084. begin
  2085. float64_is_signaling_nan := flag(
  2086. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2087. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2088. end;
  2089. (*----------------------------------------------------------------------------
  2090. | Returns the result of converting the double-precision floating-point NaN
  2091. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2092. | exception is raised.
  2093. *----------------------------------------------------------------------------*)
  2094. function float64ToCommonNaN( a : float64) : commonNaNT;
  2095. var
  2096. z : commonNaNT;
  2097. begin
  2098. if ( float64_is_signaling_nan( a )<>0 ) then
  2099. float_raise( float_flag_invalid );
  2100. z.sign := a.high shr 31;
  2101. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2102. result:=z;
  2103. end;
  2104. (*----------------------------------------------------------------------------
  2105. | Returns the result of converting the canonical NaN `a' to the double-
  2106. | precision floating-point format.
  2107. *----------------------------------------------------------------------------*)
  2108. function commonNaNToFloat64( a : commonNaNT): float64;
  2109. var
  2110. z: float64;
  2111. begin
  2112. shift64Right( a.high, a.low, 12, z.high, z.low );
  2113. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2114. result:=z;
  2115. end;
  2116. (*----------------------------------------------------------------------------
  2117. | Takes two double-precision floating-point values `a' and `b', one of which
  2118. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2119. | signaling NaN, the invalid exception is raised.
  2120. *----------------------------------------------------------------------------*)
  2121. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2122. var
  2123. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2124. begin
  2125. aIsNaN := float64_is_nan( a );
  2126. aIsSignalingNaN := float64_is_signaling_nan( a );
  2127. bIsNaN := float64_is_nan( b );
  2128. bIsSignalingNaN := float64_is_signaling_nan( b );
  2129. a.high := a.high or $00080000;
  2130. b.high := b.high or $00080000;
  2131. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2132. float_raise( float_flag_invalid );
  2133. if bIsSignalingNaN<>0 then
  2134. c := b
  2135. else if aIsSignalingNan<>0 then
  2136. c := a
  2137. else if bIsNan<>0 then
  2138. c := b
  2139. else
  2140. c := a;
  2141. end;
  2142. {*----------------------------------------------------------------------------
  2143. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  2144. | otherwise returns 0.
  2145. *----------------------------------------------------------------------------*}
  2146. function float128_is_nan( a : float128): flag;
  2147. begin
  2148. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  2149. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  2150. end;
  2151. {*----------------------------------------------------------------------------
  2152. | Returns 1 if the quadruple-precision floating-point value `a' is a
  2153. | signaling NaN; otherwise returns 0.
  2154. *----------------------------------------------------------------------------*}
  2155. function float128_is_signaling_nan( a : float128): flag;
  2156. begin
  2157. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  2158. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  2159. end;
  2160. {*----------------------------------------------------------------------------
  2161. | Returns the result of converting the quadruple-precision floating-point NaN
  2162. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2163. | exception is raised.
  2164. *----------------------------------------------------------------------------*}
  2165. function float128ToCommonNaN( a : float128): commonNaNT;
  2166. var
  2167. z: commonNaNT;
  2168. qhigh,qlow : qword;
  2169. begin
  2170. if ( float128_is_signaling_nan( a )<>0) then
  2171. float_raise( float_flag_invalid );
  2172. z.sign := a.high shr 63;
  2173. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  2174. z.high:=qhigh shr 32;
  2175. z.low:=qhigh and $ffffffff;
  2176. result:=z;
  2177. end;
  2178. {*----------------------------------------------------------------------------
  2179. | Returns the result of converting the canonical NaN `a' to the quadruple-
  2180. | precision floating-point format.
  2181. *----------------------------------------------------------------------------*}
  2182. function commonNaNToFloat128( a : commonNaNT): float128;
  2183. var
  2184. z: float128;
  2185. begin
  2186. shift128Right( a.high, a.low, 16, z.high, z.low );
  2187. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  2188. result:=z;
  2189. end;
  2190. {*----------------------------------------------------------------------------
  2191. | Takes two quadruple-precision floating-point values `a' and `b', one of
  2192. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  2193. | `b' is a signaling NaN, the invalid exception is raised.
  2194. *----------------------------------------------------------------------------*}
  2195. function propagateFloat128NaN( a: float128; b : float128): float128;
  2196. var
  2197. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2198. label
  2199. returnLargerSignificand;
  2200. begin
  2201. aIsNaN := float128_is_nan( a );
  2202. aIsSignalingNaN := float128_is_signaling_nan( a );
  2203. bIsNaN := float128_is_nan( b );
  2204. bIsSignalingNaN := float128_is_signaling_nan( b );
  2205. a.high := a.high or int64( $0000800000000000 );
  2206. b.high := b.high or int64( $0000800000000000 );
  2207. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2208. float_raise( float_flag_invalid );
  2209. if ( aIsSignalingNaN )<>0 then
  2210. begin
  2211. if ( bIsSignalingNaN )<>0 then
  2212. goto returnLargerSignificand;
  2213. if bIsNaN<>0 then
  2214. result := b
  2215. else
  2216. result := a;
  2217. exit;
  2218. end
  2219. else if ( aIsNaN )<>0 then
  2220. begin
  2221. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  2222. begin
  2223. result := a;
  2224. exit;
  2225. end;
  2226. returnLargerSignificand:
  2227. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  2228. begin
  2229. result := b;
  2230. exit;
  2231. end;
  2232. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  2233. begin
  2234. result := a;
  2235. exit
  2236. end;
  2237. if ( a.high < b.high ) then
  2238. result := a
  2239. else
  2240. result := b;
  2241. exit;
  2242. end
  2243. else
  2244. result:=b;
  2245. end;
  2246. {$ENDIF}
  2247. (****************************************************************************)
  2248. (* END ENDIAN SPECIFIC CODE *)
  2249. (****************************************************************************)
  2250. {*
  2251. -------------------------------------------------------------------------------
  2252. Returns the fraction bits of the single-precision floating-point value `a'.
  2253. -------------------------------------------------------------------------------
  2254. *}
  2255. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2256. Begin
  2257. ExtractFloat32Frac := A AND $007FFFFF;
  2258. End;
  2259. {*
  2260. -------------------------------------------------------------------------------
  2261. Returns the exponent bits of the single-precision floating-point value `a'.
  2262. -------------------------------------------------------------------------------
  2263. *}
  2264. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2265. Begin
  2266. extractFloat32Exp := (a shr 23) AND $FF;
  2267. End;
  2268. {*
  2269. -------------------------------------------------------------------------------
  2270. Returns the sign bit of the single-precision floating-point value `a'.
  2271. -------------------------------------------------------------------------------
  2272. *}
  2273. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2274. Begin
  2275. extractFloat32Sign := a shr 31;
  2276. End;
  2277. {*
  2278. -------------------------------------------------------------------------------
  2279. Normalizes the subnormal single-precision floating-point value represented
  2280. by the denormalized significand `aSig'. The normalized exponent and
  2281. significand are stored at the locations pointed to by `zExpPtr' and
  2282. `zSigPtr', respectively.
  2283. -------------------------------------------------------------------------------
  2284. *}
  2285. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2286. Var
  2287. ShiftCount : BYTE;
  2288. Begin
  2289. shiftCount := countLeadingZeros32( aSig ) - 8;
  2290. zSigPtr := aSig shl shiftCount;
  2291. zExpPtr := 1 - shiftCount;
  2292. End;
  2293. {*
  2294. -------------------------------------------------------------------------------
  2295. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2296. single-precision floating-point value, returning the result. After being
  2297. shifted into the proper positions, the three fields are simply added
  2298. together to form the result. This means that any integer portion of `zSig'
  2299. will be added into the exponent. Since a properly normalized significand
  2300. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2301. than the desired result exponent whenever `zSig' is a complete, normalized
  2302. significand.
  2303. -------------------------------------------------------------------------------
  2304. *}
  2305. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2306. Begin
  2307. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2308. + zSig;
  2309. End;
  2310. {*
  2311. -------------------------------------------------------------------------------
  2312. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2313. and significand `zSig', and returns the proper single-precision floating-
  2314. point value corresponding to the abstract input. Ordinarily, the abstract
  2315. value is simply rounded and packed into the single-precision format, with
  2316. the inexact exception raised if the abstract input cannot be represented
  2317. exactly. However, if the abstract value is too large, the overflow and
  2318. inexact exceptions are raised and an infinity or maximal finite value is
  2319. returned. If the abstract value is too small, the input value is rounded to
  2320. a subnormal number, and the underflow and inexact exceptions are raised if
  2321. the abstract input cannot be represented exactly as a subnormal single-
  2322. precision floating-point number.
  2323. The input significand `zSig' has its binary point between bits 30
  2324. and 29, which is 7 bits to the left of the usual location. This shifted
  2325. significand must be normalized or smaller. If `zSig' is not normalized,
  2326. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2327. and it must not require rounding. In the usual case that `zSig' is
  2328. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2329. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2330. Binary Floating-Point Arithmetic.
  2331. -------------------------------------------------------------------------------
  2332. *}
  2333. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2334. Var
  2335. roundingMode : TFPURoundingMode;
  2336. roundNearestEven : boolean;
  2337. roundIncrement, roundBits : BYTE;
  2338. IsTiny : boolean;
  2339. Begin
  2340. roundingMode := softfloat_rounding_mode;
  2341. roundNearestEven := (roundingMode = float_round_nearest_even);
  2342. roundIncrement := $40;
  2343. if not roundNearestEven then
  2344. Begin
  2345. if ( roundingMode = float_round_to_zero ) Then
  2346. Begin
  2347. roundIncrement := 0;
  2348. End
  2349. else
  2350. Begin
  2351. roundIncrement := $7F;
  2352. if ( zSign <> 0 ) then
  2353. Begin
  2354. if roundingMode = float_round_up then roundIncrement := 0;
  2355. End
  2356. else
  2357. Begin
  2358. if roundingMode = float_round_down then roundIncrement := 0;
  2359. End;
  2360. End
  2361. End;
  2362. roundBits := zSig AND $7F;
  2363. if ($FD <= bits16 (zExp) ) then
  2364. Begin
  2365. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2366. Begin
  2367. float_raise( [float_flag_overflow,float_flag_inexact] );
  2368. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2369. exit;
  2370. End;
  2371. if ( zExp < 0 ) then
  2372. Begin
  2373. isTiny :=
  2374. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2375. OR ( zExp < -1 )
  2376. OR ( (zSig + roundIncrement) < $80000000 );
  2377. shift32RightJamming( zSig, - zExp, zSig );
  2378. zExp := 0;
  2379. roundBits := zSig AND $7F;
  2380. if ( isTiny and (roundBits<>0) ) then
  2381. float_raise( float_flag_underflow );
  2382. End;
  2383. End;
  2384. if ( roundBits )<> 0 then
  2385. set_inexact_flag;
  2386. zSig := ( zSig + roundIncrement ) shr 7;
  2387. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2388. if ( zSig = 0 ) then zExp := 0;
  2389. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2390. End;
  2391. {*
  2392. -------------------------------------------------------------------------------
  2393. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2394. and significand `zSig', and returns the proper single-precision floating-
  2395. point value corresponding to the abstract input. This routine is just like
  2396. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2397. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2398. floating-point exponent.
  2399. -------------------------------------------------------------------------------
  2400. *}
  2401. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2402. Var
  2403. ShiftCount : int8;
  2404. Begin
  2405. shiftCount := countLeadingZeros32( zSig ) - 1;
  2406. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2407. End;
  2408. {*
  2409. -------------------------------------------------------------------------------
  2410. Returns the most-significant 20 fraction bits of the double-precision
  2411. floating-point value `a'.
  2412. -------------------------------------------------------------------------------
  2413. *}
  2414. Function extractFloat64Frac0(a: float64): bits32; inline;
  2415. Begin
  2416. extractFloat64Frac0 := a.high and $000FFFFF;
  2417. End;
  2418. {*
  2419. -------------------------------------------------------------------------------
  2420. Returns the least-significant 32 fraction bits of the double-precision
  2421. floating-point value `a'.
  2422. -------------------------------------------------------------------------------
  2423. *}
  2424. Function extractFloat64Frac1(a: float64): bits32; inline;
  2425. Begin
  2426. extractFloat64Frac1 := a.low;
  2427. End;
  2428. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2429. Function extractFloat64Frac(a: float64): bits64; inline;
  2430. Begin
  2431. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2432. End;
  2433. {*
  2434. -------------------------------------------------------------------------------
  2435. Returns the exponent bits of the double-precision floating-point value `a'.
  2436. -------------------------------------------------------------------------------
  2437. *}
  2438. Function extractFloat64Exp(a: float64): int16; inline;
  2439. Begin
  2440. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2441. End;
  2442. {*
  2443. -------------------------------------------------------------------------------
  2444. Returns the sign bit of the double-precision floating-point value `a'.
  2445. -------------------------------------------------------------------------------
  2446. *}
  2447. Function extractFloat64Sign(a: float64) : flag; inline;
  2448. Begin
  2449. extractFloat64Sign := a.high shr 31;
  2450. End;
  2451. {*
  2452. -------------------------------------------------------------------------------
  2453. Normalizes the subnormal double-precision floating-point value represented
  2454. by the denormalized significand formed by the concatenation of `aSig0' and
  2455. `aSig1'. The normalized exponent is stored at the location pointed to by
  2456. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2457. stored at the location pointed to by `zSig0Ptr', and the least significant
  2458. 32 bits of the normalized significand are stored at the location pointed to
  2459. by `zSig1Ptr'.
  2460. -------------------------------------------------------------------------------
  2461. *}
  2462. Procedure normalizeFloat64Subnormal(
  2463. aSig0: bits32;
  2464. aSig1: bits32;
  2465. VAR zExpPtr : Int16;
  2466. VAR zSig0Ptr : Bits32;
  2467. VAR zSig1Ptr : Bits32
  2468. );
  2469. Var
  2470. ShiftCount : Int8;
  2471. Begin
  2472. if ( aSig0 = 0 ) then
  2473. Begin
  2474. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2475. if ( shiftCount < 0 ) then
  2476. Begin
  2477. zSig0Ptr := aSig1 shr ( - shiftCount );
  2478. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2479. End
  2480. else
  2481. Begin
  2482. zSig0Ptr := aSig1 shl shiftCount;
  2483. zSig1Ptr := 0;
  2484. End;
  2485. zExpPtr := - shiftCount - 31;
  2486. End
  2487. else
  2488. Begin
  2489. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2490. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2491. zExpPtr := 1 - shiftCount;
  2492. End;
  2493. End;
  2494. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2495. var
  2496. shiftCount : int8;
  2497. begin
  2498. shiftCount := countLeadingZeros64( aSig ) - 11;
  2499. zSigPtr := aSig shl shiftCount;
  2500. zExpPtr := 1 - shiftCount;
  2501. end;
  2502. {*
  2503. -------------------------------------------------------------------------------
  2504. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2505. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2506. point value, returning the result. After being shifted into the proper
  2507. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2508. together to form the most significant 32 bits of the result. This means
  2509. that any integer portion of `zSig0' will be added into the exponent. Since
  2510. a properly normalized significand will have an integer portion equal to 1,
  2511. the `zExp' input should be 1 less than the desired result exponent whenever
  2512. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2513. -------------------------------------------------------------------------------
  2514. *}
  2515. Procedure
  2516. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2517. var
  2518. z: Float64;
  2519. Begin
  2520. z.low := zSig1;
  2521. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2522. c := z;
  2523. End;
  2524. {*----------------------------------------------------------------------------
  2525. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2526. | double-precision floating-point value, returning the result. After being
  2527. | shifted into the proper positions, the three fields are simply added
  2528. | together to form the result. This means that any integer portion of `zSig'
  2529. | will be added into the exponent. Since a properly normalized significand
  2530. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2531. | than the desired result exponent whenever `zSig' is a complete, normalized
  2532. | significand.
  2533. *----------------------------------------------------------------------------*}
  2534. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2535. begin
  2536. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2537. end;
  2538. {*
  2539. -------------------------------------------------------------------------------
  2540. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2541. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2542. and `zSig2', and returns the proper double-precision floating-point value
  2543. corresponding to the abstract input. Ordinarily, the abstract value is
  2544. simply rounded and packed into the double-precision format, with the inexact
  2545. exception raised if the abstract input cannot be represented exactly.
  2546. However, if the abstract value is too large, the overflow and inexact
  2547. exceptions are raised and an infinity or maximal finite value is returned.
  2548. If the abstract value is too small, the input value is rounded to a
  2549. subnormal number, and the underflow and inexact exceptions are raised if the
  2550. abstract input cannot be represented exactly as a subnormal double-precision
  2551. floating-point number.
  2552. The input significand must be normalized or smaller. If the input
  2553. significand is not normalized, `zExp' must be 0; in that case, the result
  2554. returned is a subnormal number, and it must not require rounding. In the
  2555. usual case that the input significand is normalized, `zExp' must be 1 less
  2556. than the ``true'' floating-point exponent. The handling of underflow and
  2557. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2558. -------------------------------------------------------------------------------
  2559. *}
  2560. Procedure
  2561. roundAndPackFloat64(
  2562. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2563. Var
  2564. roundingMode : TFPURoundingMode;
  2565. roundNearestEven, increment, isTiny : Flag;
  2566. Begin
  2567. roundingMode := softfloat_rounding_mode;
  2568. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2569. increment := flag( sbits32 (zSig2) < 0 );
  2570. if ( roundNearestEven = flag(FALSE) ) then
  2571. Begin
  2572. if ( roundingMode = float_round_to_zero ) then
  2573. increment := 0
  2574. else
  2575. Begin
  2576. if ( zSign )<> 0 then
  2577. Begin
  2578. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2579. End
  2580. else
  2581. Begin
  2582. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2583. End
  2584. End
  2585. End;
  2586. if ( $7FD <= bits16 (zExp) ) then
  2587. Begin
  2588. if (( $7FD < zExp )
  2589. or (( zExp = $7FD )
  2590. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2591. and (increment<>0)
  2592. )
  2593. ) then
  2594. Begin
  2595. float_raise( [float_flag_overflow,float_flag_inexact] );
  2596. if (( roundingMode = float_round_to_zero )
  2597. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2598. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2599. ) then
  2600. Begin
  2601. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2602. exit;
  2603. End;
  2604. packFloat64( zSign, $7FF, 0, 0, c );
  2605. exit;
  2606. End;
  2607. if ( zExp < 0 ) then
  2608. Begin
  2609. isTiny :=
  2610. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2611. or flag( zExp < -1 )
  2612. or flag(increment = 0)
  2613. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2614. shift64ExtraRightJamming(
  2615. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2616. zExp := 0;
  2617. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2618. if ( roundNearestEven )<>0 then
  2619. Begin
  2620. increment := flag( sbits32 (zSig2) < 0 );
  2621. End
  2622. else
  2623. Begin
  2624. if ( zSign )<>0 then
  2625. Begin
  2626. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2627. End
  2628. else
  2629. Begin
  2630. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2631. End
  2632. End;
  2633. End;
  2634. End;
  2635. if ( zSig2 )<>0 then
  2636. set_inexact_flag;
  2637. if ( increment )<>0 then
  2638. Begin
  2639. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2640. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2641. End
  2642. else
  2643. Begin
  2644. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2645. End;
  2646. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2647. End;
  2648. {*----------------------------------------------------------------------------
  2649. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2650. | and significand `zSig', and returns the proper double-precision floating-
  2651. | point value corresponding to the abstract input. Ordinarily, the abstract
  2652. | value is simply rounded and packed into the double-precision format, with
  2653. | the inexact exception raised if the abstract input cannot be represented
  2654. | exactly. However, if the abstract value is too large, the overflow and
  2655. | inexact exceptions are raised and an infinity or maximal finite value is
  2656. | returned. If the abstract value is too small, the input value is rounded
  2657. | to a subnormal number, and the underflow and inexact exceptions are raised
  2658. | if the abstract input cannot be represented exactly as a subnormal double-
  2659. | precision floating-point number.
  2660. | The input significand `zSig' has its binary point between bits 62
  2661. | and 61, which is 10 bits to the left of the usual location. This shifted
  2662. | significand must be normalized or smaller. If `zSig' is not normalized,
  2663. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2664. | and it must not require rounding. In the usual case that `zSig' is
  2665. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2666. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2667. | Binary Floating-Point Arithmetic.
  2668. *----------------------------------------------------------------------------*}
  2669. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2670. var
  2671. roundingMode: TFPURoundingMode;
  2672. roundNearestEven: flag;
  2673. roundIncrement, roundBits: int16;
  2674. isTiny: flag;
  2675. begin
  2676. roundingMode := softfloat_rounding_mode;
  2677. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2678. roundIncrement := $200;
  2679. if ( roundNearestEven=0 ) then
  2680. begin
  2681. if ( roundingMode = float_round_to_zero ) then
  2682. begin
  2683. roundIncrement := 0;
  2684. end
  2685. else begin
  2686. roundIncrement := $3FF;
  2687. if ( zSign<>0 ) then
  2688. begin
  2689. if ( roundingMode = float_round_up ) then
  2690. roundIncrement := 0;
  2691. end
  2692. else begin
  2693. if ( roundingMode = float_round_down ) then
  2694. roundIncrement := 0;
  2695. end
  2696. end
  2697. end;
  2698. roundBits := zSig and $3FF;
  2699. if ( $7FD <= bits16(zExp) ) then
  2700. begin
  2701. if ( ( $7FD < zExp )
  2702. or ( ( zExp = $7FD )
  2703. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2704. ) then
  2705. begin
  2706. float_raise( [float_flag_overflow,float_flag_inexact] );
  2707. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2708. exit;
  2709. end;
  2710. if ( zExp < 0 ) then
  2711. begin
  2712. isTiny := ord(
  2713. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2714. or ( zExp < -1 )
  2715. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2716. shift64RightJamming( zSig, - zExp, zSig );
  2717. zExp := 0;
  2718. roundBits := zSig and $3FF;
  2719. if ( isTiny and roundBits )<>0 then
  2720. float_raise( float_flag_underflow );
  2721. end
  2722. end;
  2723. if ( roundBits<>0 ) then
  2724. set_inexact_flag;
  2725. zSig := ( zSig + roundIncrement ) shr 10;
  2726. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2727. if ( zSig = 0 ) then
  2728. zExp := 0;
  2729. result:=packFloat64( zSign, zExp, zSig );
  2730. end;
  2731. {*
  2732. -------------------------------------------------------------------------------
  2733. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2734. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2735. returns the proper double-precision floating-point value corresponding
  2736. to the abstract input. This routine is just like `roundAndPackFloat64'
  2737. except that the input significand has fewer bits and does not have to be
  2738. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2739. point exponent.
  2740. -------------------------------------------------------------------------------
  2741. *}
  2742. Procedure
  2743. normalizeRoundAndPackFloat64(
  2744. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2745. Var
  2746. shiftCount : int8;
  2747. zSig2 : bits32;
  2748. Begin
  2749. if ( zSig0 = 0 ) then
  2750. Begin
  2751. zSig0 := zSig1;
  2752. zSig1 := 0;
  2753. zExp := zExp -32;
  2754. End;
  2755. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2756. if ( 0 <= shiftCount ) then
  2757. Begin
  2758. zSig2 := 0;
  2759. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2760. End
  2761. else
  2762. Begin
  2763. shift64ExtraRightJamming
  2764. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2765. End;
  2766. zExp := zExp - shiftCount;
  2767. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2768. End;
  2769. {*
  2770. ----------------------------------------------------------------------------
  2771. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2772. and significand `zSig', and returns the proper double-precision floating-
  2773. point value corresponding to the abstract input. This routine is just like
  2774. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2775. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2776. floating-point exponent.
  2777. ----------------------------------------------------------------------------
  2778. *}
  2779. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2780. var
  2781. shiftCount: int8;
  2782. begin
  2783. shiftCount := countLeadingZeros64( zSig ) - 1;
  2784. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2785. end;
  2786. {*
  2787. -------------------------------------------------------------------------------
  2788. Returns the result of converting the 32-bit two's complement integer `a' to
  2789. the single-precision floating-point format. The conversion is performed
  2790. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2791. -------------------------------------------------------------------------------
  2792. *}
  2793. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2794. Var
  2795. zSign : Flag;
  2796. Begin
  2797. if ( a = 0 ) then
  2798. Begin
  2799. int32_to_float32.float32 := 0;
  2800. exit;
  2801. End;
  2802. if ( a = sbits32 ($80000000) ) then
  2803. Begin
  2804. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2805. exit;
  2806. end;
  2807. zSign := flag( a < 0 );
  2808. If zSign<>0 then
  2809. a := -a;
  2810. int32_to_float32.float32:=
  2811. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2812. End;
  2813. {*
  2814. -------------------------------------------------------------------------------
  2815. Returns the result of converting the 32-bit two's complement integer `a' to
  2816. the double-precision floating-point format. The conversion is performed
  2817. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2818. -------------------------------------------------------------------------------
  2819. *}
  2820. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2821. var
  2822. zSign : flag;
  2823. absA : bits32;
  2824. shiftCount : int8;
  2825. zSig0, zSig1 : bits32;
  2826. Begin
  2827. if ( a = 0 ) then
  2828. Begin
  2829. packFloat64( 0, 0, 0, 0, result );
  2830. exit;
  2831. end;
  2832. zSign := flag( a < 0 );
  2833. if ZSign<>0 then
  2834. AbsA := -a
  2835. else
  2836. AbsA := a;
  2837. shiftCount := countLeadingZeros32( absA ) - 11;
  2838. if ( 0 <= shiftCount ) then
  2839. Begin
  2840. zSig0 := absA shl shiftCount;
  2841. zSig1 := 0;
  2842. End
  2843. else
  2844. Begin
  2845. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2846. End;
  2847. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2848. End;
  2849. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2850. {$if not defined(packFloatx80)}
  2851. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2852. forward;
  2853. {$endif}
  2854. {*----------------------------------------------------------------------------
  2855. | Returns the result of converting the 32-bit two's complement integer `a'
  2856. | to the extended double-precision floating-point format. The conversion
  2857. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2858. | Arithmetic.
  2859. *----------------------------------------------------------------------------*}
  2860. function int32_to_floatx80( a: int32 ): floatx80;
  2861. var
  2862. zSign: flag;
  2863. absA: uint32;
  2864. shiftCount: int8;
  2865. zSig: bits64;
  2866. begin
  2867. if ( a = 0 ) then begin
  2868. result := packFloatx80( 0, 0, 0 );
  2869. exit;
  2870. end;
  2871. zSign := ord( a < 0 );
  2872. if zSign <> 0 then absA := - a else absA := a;
  2873. shiftCount := countLeadingZeros32( absA ) + 32;
  2874. zSig := absA;
  2875. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2876. end;
  2877. {$endif FPC_SOFTFLOAT_FLOATX80}
  2878. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2879. {$if not defined(packFloat128)}
  2880. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2881. forward;
  2882. {$endif}
  2883. {*----------------------------------------------------------------------------
  2884. | Returns the result of converting the 32-bit two's complement integer `a' to
  2885. | the quadruple-precision floating-point format. The conversion is performed
  2886. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2887. *----------------------------------------------------------------------------*}
  2888. function int32_to_float128( a: int32 ): float128;
  2889. var
  2890. zSign: flag;
  2891. absA: uint32;
  2892. shiftCount: int8;
  2893. zSig0: bits64;
  2894. begin
  2895. if ( a = 0 ) then begin
  2896. result := packFloat128( 0, 0, 0, 0 );
  2897. exit;
  2898. end;
  2899. zSign := ord( a < 0 );
  2900. if zSign <> 0 then absA := - a else absA := a;
  2901. shiftCount := countLeadingZeros32( absA ) + 17;
  2902. zSig0 := absA;
  2903. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2904. end;
  2905. {$endif FPC_SOFTFLOAT_FLOAT128}
  2906. {*
  2907. -------------------------------------------------------------------------------
  2908. Returns the result of converting the single-precision floating-point value
  2909. `a' to the 32-bit two's complement integer format. The conversion is
  2910. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2911. Arithmetic---which means in particular that the conversion is rounded
  2912. according to the current rounding mode. If `a' is a NaN, the largest
  2913. positive integer is returned. Otherwise, if the conversion overflows, the
  2914. largest integer with the same sign as `a' is returned.
  2915. -------------------------------------------------------------------------------
  2916. *}
  2917. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2918. Var
  2919. aSign: flag;
  2920. aExp, shiftCount: int16;
  2921. aSig, aSigExtra: bits32;
  2922. z: int32;
  2923. roundingMode: TFPURoundingMode;
  2924. Begin
  2925. aSig := extractFloat32Frac( a.float32 );
  2926. aExp := extractFloat32Exp( a.float32 );
  2927. aSign := extractFloat32Sign( a.float32 );
  2928. shiftCount := aExp - $96;
  2929. if ( 0 <= shiftCount ) then
  2930. Begin
  2931. if ( $9E <= aExp ) then
  2932. Begin
  2933. if ( a.float32 <> $CF000000 ) then
  2934. Begin
  2935. float_raise( float_flag_invalid );
  2936. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2937. Begin
  2938. float32_to_int32 := $7FFFFFFF;
  2939. exit;
  2940. End;
  2941. End;
  2942. float32_to_int32 := sbits32 ($80000000);
  2943. exit;
  2944. End;
  2945. z := ( aSig or $00800000 ) shl shiftCount;
  2946. if ( aSign<>0 ) then z := - z;
  2947. End
  2948. else
  2949. Begin
  2950. if ( aExp < $7E ) then
  2951. Begin
  2952. aSigExtra := aExp OR aSig;
  2953. z := 0;
  2954. End
  2955. else
  2956. Begin
  2957. aSig := aSig OR $00800000;
  2958. aSigExtra := aSig shl ( shiftCount and 31 );
  2959. z := aSig shr ( - shiftCount );
  2960. End;
  2961. if ( aSigExtra<>0 ) then
  2962. set_inexact_flag;
  2963. roundingMode := softfloat_rounding_mode;
  2964. if ( roundingMode = float_round_nearest_even ) then
  2965. Begin
  2966. if ( sbits32 (aSigExtra) < 0 ) then
  2967. Begin
  2968. Inc(z);
  2969. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2970. z := z and not 1;
  2971. End;
  2972. if ( aSign<>0 ) then
  2973. z := - z;
  2974. End
  2975. else
  2976. Begin
  2977. aSigExtra := flag( aSigExtra <> 0 );
  2978. if ( aSign<>0 ) then
  2979. Begin
  2980. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2981. z := - z;
  2982. End
  2983. else
  2984. Begin
  2985. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2986. End
  2987. End;
  2988. End;
  2989. float32_to_int32 := z;
  2990. End;
  2991. {*
  2992. -------------------------------------------------------------------------------
  2993. Returns the result of converting the single-precision floating-point value
  2994. `a' to the 32-bit two's complement integer format. The conversion is
  2995. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2996. Arithmetic, except that the conversion is always rounded toward zero.
  2997. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2998. the conversion overflows, the largest integer with the same sign as `a' is
  2999. returned.
  3000. -------------------------------------------------------------------------------
  3001. *}
  3002. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  3003. Var
  3004. aSign : flag;
  3005. aExp, shiftCount : int16;
  3006. aSig : bits32;
  3007. z : int32;
  3008. Begin
  3009. aSig := extractFloat32Frac( a.float32 );
  3010. aExp := extractFloat32Exp( a.float32 );
  3011. aSign := extractFloat32Sign( a.float32 );
  3012. shiftCount := aExp - $9E;
  3013. if ( 0 <= shiftCount ) then
  3014. Begin
  3015. if ( a.float32 <> $CF000000 ) then
  3016. Begin
  3017. float_raise( float_flag_invalid );
  3018. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  3019. Begin
  3020. float32_to_int32_round_to_zero := $7FFFFFFF;
  3021. exit;
  3022. end;
  3023. End;
  3024. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  3025. exit;
  3026. End
  3027. else
  3028. if ( aExp <= $7E ) then
  3029. Begin
  3030. if ( aExp or aSig )<>0 then
  3031. set_inexact_flag;
  3032. float32_to_int32_round_to_zero := 0;
  3033. exit;
  3034. End;
  3035. aSig := ( aSig or $00800000 ) shl 8;
  3036. z := aSig shr ( - shiftCount );
  3037. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  3038. Begin
  3039. set_inexact_flag;
  3040. End;
  3041. if ( aSign<>0 ) then z := - z;
  3042. float32_to_int32_round_to_zero := z;
  3043. End;
  3044. {*----------------------------------------------------------------------------
  3045. | Returns the result of converting the single-precision floating-point value
  3046. | `a' to the 64-bit two's complement integer format. The conversion is
  3047. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3048. | Arithmetic---which means in particular that the conversion is rounded
  3049. | according to the current rounding mode. If `a' is a NaN, the largest
  3050. | positive integer is returned. Otherwise, if the conversion overflows, the
  3051. | largest integer with the same sign as `a' is returned.
  3052. *----------------------------------------------------------------------------*}
  3053. function float32_to_int64( a: float32 ): int64;
  3054. var
  3055. aSign: flag;
  3056. aExp, shiftCount: int16;
  3057. aSig: bits32;
  3058. aSig64, aSigExtra: bits64;
  3059. begin
  3060. aSig := extractFloat32Frac( a );
  3061. aExp := extractFloat32Exp( a );
  3062. aSign := extractFloat32Sign( a );
  3063. shiftCount := $BE - aExp;
  3064. if ( shiftCount < 0 ) then begin
  3065. float_raise( float_flag_invalid );
  3066. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3067. result := $7FFFFFFFFFFFFFFF;
  3068. exit;
  3069. end;
  3070. result := $8000000000000000;
  3071. exit;
  3072. end;
  3073. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  3074. aSig64 := aSig;
  3075. aSig64 := aSig64 shl 40;
  3076. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  3077. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  3078. end;
  3079. {*----------------------------------------------------------------------------
  3080. | Returns the result of converting the single-precision floating-point value
  3081. | `a' to the 64-bit two's complement integer format. The conversion is
  3082. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3083. | Arithmetic, except that the conversion is always rounded toward zero. If
  3084. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  3085. | conversion overflows, the largest integer with the same sign as `a' is
  3086. | returned.
  3087. *----------------------------------------------------------------------------*}
  3088. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3089. var
  3090. aSign: flag;
  3091. aExp, shiftCount: int16;
  3092. aSig: bits32;
  3093. aSig64: bits64;
  3094. z: int64;
  3095. begin
  3096. aSig := extractFloat32Frac( a );
  3097. aExp := extractFloat32Exp( a );
  3098. aSign := extractFloat32Sign( a );
  3099. shiftCount := aExp - $BE;
  3100. if ( 0 <= shiftCount ) then begin
  3101. if ( a <> $DF000000 ) then begin
  3102. float_raise( float_flag_invalid );
  3103. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3104. result := $7FFFFFFFFFFFFFFF;
  3105. exit;
  3106. end;
  3107. end;
  3108. result := $8000000000000000;
  3109. exit;
  3110. end
  3111. else if ( aExp <= $7E ) then begin
  3112. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3113. result := 0;
  3114. exit;
  3115. end;
  3116. aSig64 := aSig or $00800000;
  3117. aSig64 := aSig64 shl 40;
  3118. z := aSig64 shr ( - shiftCount );
  3119. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3120. set_inexact_flag;
  3121. if ( aSign <> 0 ) then z := - z;
  3122. result := z;
  3123. end;
  3124. {*
  3125. -------------------------------------------------------------------------------
  3126. Returns the result of converting the single-precision floating-point value
  3127. `a' to the double-precision floating-point format. The conversion is
  3128. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3129. Arithmetic.
  3130. -------------------------------------------------------------------------------
  3131. *}
  3132. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3133. Var
  3134. aSign : flag;
  3135. aExp : int16;
  3136. aSig, zSig0, zSig1: bits32;
  3137. tmp : CommonNanT;
  3138. Begin
  3139. aSig := extractFloat32Frac( a.float32 );
  3140. aExp := extractFloat32Exp( a.float32 );
  3141. aSign := extractFloat32Sign( a.float32 );
  3142. if ( aExp = $FF ) then
  3143. Begin
  3144. if ( aSig<>0 ) then
  3145. Begin
  3146. tmp:=float32ToCommonNaN(a.float32);
  3147. result:=commonNaNToFloat64(tmp);
  3148. exit;
  3149. End;
  3150. packFloat64( aSign, $7FF, 0, 0, result);
  3151. exit;
  3152. End;
  3153. if ( aExp = 0 ) then
  3154. Begin
  3155. if ( aSig = 0 ) then
  3156. Begin
  3157. packFloat64( aSign, 0, 0, 0, result );
  3158. exit;
  3159. end;
  3160. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3161. Dec(aExp);
  3162. End;
  3163. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3164. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3165. End;
  3166. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3167. {*----------------------------------------------------------------------------
  3168. | Returns the result of converting the canonical NaN `a' to the extended
  3169. | double-precision floating-point format.
  3170. *----------------------------------------------------------------------------*}
  3171. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3172. var
  3173. z : floatx80;
  3174. begin
  3175. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3176. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3177. result := z;
  3178. end;
  3179. {*----------------------------------------------------------------------------
  3180. | Returns the result of converting the single-precision floating-point value
  3181. | `a' to the extended double-precision floating-point format. The conversion
  3182. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3183. | Arithmetic.
  3184. *----------------------------------------------------------------------------*}
  3185. function float32_to_floatx80( a: float32 ): floatx80;
  3186. var
  3187. aSign: flag;
  3188. aExp: int16;
  3189. aSig: bits32;
  3190. tmp: commonNaNT;
  3191. begin
  3192. aSig := extractFloat32Frac( a );
  3193. aExp := extractFloat32Exp( a );
  3194. aSign := extractFloat32Sign( a );
  3195. if ( aExp = $FF ) then begin
  3196. if ( aSig <> 0 ) then begin
  3197. tmp:=float32ToCommonNaN(a);
  3198. result := commonNaNToFloatx80( tmp );
  3199. exit;
  3200. end;
  3201. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3202. exit;
  3203. end;
  3204. if ( aExp = 0 ) then begin
  3205. if ( aSig = 0 ) then begin
  3206. result := packFloatx80( aSign, 0, 0 );
  3207. exit;
  3208. end;
  3209. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3210. end;
  3211. aSig := aSig or $00800000;
  3212. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3213. end;
  3214. {$endif FPC_SOFTFLOAT_FLOATX80}
  3215. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3216. {*----------------------------------------------------------------------------
  3217. | Returns the result of converting the single-precision floating-point value
  3218. | `a' to the double-precision floating-point format. The conversion is
  3219. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3220. | Arithmetic.
  3221. *----------------------------------------------------------------------------*}
  3222. function float32_to_float128( a: float32 ): float128;
  3223. var
  3224. aSign: flag;
  3225. aExp: int16;
  3226. aSig: bits32;
  3227. tmp: commonNaNT;
  3228. begin
  3229. aSig := extractFloat32Frac( a );
  3230. aExp := extractFloat32Exp( a );
  3231. aSign := extractFloat32Sign( a );
  3232. if ( aExp = $FF ) then begin
  3233. if ( aSig <> 0 ) then begin
  3234. tmp:=float32ToCommonNaN(a);
  3235. result := commonNaNToFloat128( tmp );
  3236. exit;
  3237. end;
  3238. result := packFloat128( aSign, $7FFF, 0, 0 );
  3239. exit;
  3240. end;
  3241. if ( aExp = 0 ) then begin
  3242. if ( aSig = 0 ) then begin
  3243. result := packFloat128( aSign, 0, 0, 0 );
  3244. exit;
  3245. end;
  3246. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3247. dec( aExp );
  3248. end;
  3249. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3250. end;
  3251. {$endif FPC_SOFTFLOAT_FLOAT128}
  3252. {*
  3253. -------------------------------------------------------------------------------
  3254. Rounds the single-precision floating-point value `a' to an integer,
  3255. and returns the result as a single-precision floating-point value. The
  3256. operation is performed according to the IEC/IEEE Standard for Binary
  3257. Floating-Point Arithmetic.
  3258. -------------------------------------------------------------------------------
  3259. *}
  3260. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3261. Var
  3262. aSign: flag;
  3263. aExp: int16;
  3264. lastBitMask, roundBitsMask: bits32;
  3265. roundingMode: TFPURoundingMode;
  3266. z: float32;
  3267. Begin
  3268. aExp := extractFloat32Exp( a.float32 );
  3269. if ( $96 <= aExp ) then
  3270. Begin
  3271. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3272. Begin
  3273. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3274. exit;
  3275. End;
  3276. float32_round_to_int:=a;
  3277. exit;
  3278. End;
  3279. if ( aExp <= $7E ) then
  3280. Begin
  3281. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3282. Begin
  3283. float32_round_to_int:=a;
  3284. exit;
  3285. end;
  3286. set_inexact_flag;
  3287. aSign := extractFloat32Sign( a.float32 );
  3288. case ( softfloat_rounding_mode ) of
  3289. float_round_nearest_even:
  3290. Begin
  3291. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3292. Begin
  3293. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3294. exit;
  3295. End;
  3296. End;
  3297. float_round_down:
  3298. Begin
  3299. if aSign <> 0 then
  3300. float32_round_to_int.float32 := $BF800000
  3301. else
  3302. float32_round_to_int.float32 := 0;
  3303. exit;
  3304. End;
  3305. float_round_up:
  3306. Begin
  3307. if aSign <> 0 then
  3308. float32_round_to_int.float32 := $80000000
  3309. else
  3310. float32_round_to_int.float32 := $3F800000;
  3311. exit;
  3312. End;
  3313. end;
  3314. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3315. exit;
  3316. End;
  3317. lastBitMask := 1;
  3318. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3319. lastBitMask := lastBitMask shl ($96 - aExp);
  3320. roundBitsMask := lastBitMask - 1;
  3321. z := a.float32;
  3322. roundingMode := softfloat_rounding_mode;
  3323. if ( roundingMode = float_round_nearest_even ) then
  3324. Begin
  3325. z := z + (lastBitMask shr 1);
  3326. if ( ( z and roundBitsMask ) = 0 ) then
  3327. z := z and not lastBitMask;
  3328. End
  3329. else if ( roundingMode <> float_round_to_zero ) then
  3330. Begin
  3331. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3332. Begin
  3333. z := z + roundBitsMask;
  3334. End;
  3335. End;
  3336. z := z and not roundBitsMask;
  3337. if ( z <> a.float32 ) then
  3338. set_inexact_flag;
  3339. float32_round_to_int.float32 := z;
  3340. End;
  3341. {*
  3342. -------------------------------------------------------------------------------
  3343. Returns the result of adding the absolute values of the single-precision
  3344. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3345. before being returned. `zSign' is ignored if the result is a NaN.
  3346. The addition is performed according to the IEC/IEEE Standard for Binary
  3347. Floating-Point Arithmetic.
  3348. -------------------------------------------------------------------------------
  3349. *}
  3350. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3351. Var
  3352. aExp, bExp, zExp: int16;
  3353. aSig, bSig, zSig: bits32;
  3354. expDiff: int16;
  3355. label roundAndPack;
  3356. Begin
  3357. aSig:=extractFloat32Frac( a );
  3358. aExp:=extractFloat32Exp( a );
  3359. bSig:=extractFloat32Frac( b );
  3360. bExp := extractFloat32Exp( b );
  3361. expDiff := aExp - bExp;
  3362. aSig := aSig shl 6;
  3363. bSig := bSig shl 6;
  3364. if ( 0 < expDiff ) then
  3365. Begin
  3366. if ( aExp = $FF ) then
  3367. Begin
  3368. if ( aSig <> 0) then
  3369. Begin
  3370. addFloat32Sigs := propagateFloat32NaN( a, b );
  3371. exit;
  3372. End;
  3373. addFloat32Sigs := a;
  3374. exit;
  3375. End;
  3376. if ( bExp = 0 ) then
  3377. Begin
  3378. Dec(expDiff);
  3379. End
  3380. else
  3381. Begin
  3382. bSig := bSig or $20000000;
  3383. End;
  3384. shift32RightJamming( bSig, expDiff, bSig );
  3385. zExp := aExp;
  3386. End
  3387. else
  3388. If ( expDiff < 0 ) then
  3389. Begin
  3390. if ( bExp = $FF ) then
  3391. Begin
  3392. if ( bSig<>0 ) then
  3393. Begin
  3394. addFloat32Sigs := propagateFloat32NaN( a, b );
  3395. exit;
  3396. end;
  3397. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3398. exit;
  3399. End;
  3400. if ( aExp = 0 ) then
  3401. Begin
  3402. Inc(expDiff);
  3403. End
  3404. else
  3405. Begin
  3406. aSig := aSig OR $20000000;
  3407. End;
  3408. shift32RightJamming( aSig, - expDiff, aSig );
  3409. zExp := bExp;
  3410. End
  3411. else
  3412. Begin
  3413. if ( aExp = $FF ) then
  3414. Begin
  3415. if ( aSig OR bSig )<> 0 then
  3416. Begin
  3417. addFloat32Sigs := propagateFloat32NaN( a, b );
  3418. exit;
  3419. end;
  3420. addFloat32Sigs := a;
  3421. exit;
  3422. End;
  3423. if ( aExp = 0 ) then
  3424. Begin
  3425. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3426. exit;
  3427. end;
  3428. zSig := $40000000 + aSig + bSig;
  3429. zExp := aExp;
  3430. goto roundAndPack;
  3431. End;
  3432. aSig := aSig OR $20000000;
  3433. zSig := ( aSig + bSig ) shl 1;
  3434. Dec(zExp);
  3435. if ( sbits32 (zSig) < 0 ) then
  3436. Begin
  3437. zSig := aSig + bSig;
  3438. Inc(zExp);
  3439. End;
  3440. roundAndPack:
  3441. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3442. End;
  3443. {*
  3444. -------------------------------------------------------------------------------
  3445. Returns the result of subtracting the absolute values of the single-
  3446. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3447. difference is negated before being returned. `zSign' is ignored if the
  3448. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3449. Standard for Binary Floating-Point Arithmetic.
  3450. -------------------------------------------------------------------------------
  3451. *}
  3452. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3453. Var
  3454. aExp, bExp, zExp: int16;
  3455. aSig, bSig, zSig: bits32;
  3456. expDiff : int16;
  3457. label aExpBigger;
  3458. label bExpBigger;
  3459. label aBigger;
  3460. label bBigger;
  3461. label normalizeRoundAndPack;
  3462. Begin
  3463. aSig := extractFloat32Frac( a );
  3464. aExp := extractFloat32Exp( a );
  3465. bSig := extractFloat32Frac( b );
  3466. bExp := extractFloat32Exp( b );
  3467. expDiff := aExp - bExp;
  3468. aSig := aSig shl 7;
  3469. bSig := bSig shl 7;
  3470. if ( 0 < expDiff ) then goto aExpBigger;
  3471. if ( expDiff < 0 ) then goto bExpBigger;
  3472. if ( aExp = $FF ) then
  3473. Begin
  3474. if ( aSig OR bSig )<> 0 then
  3475. Begin
  3476. subFloat32Sigs := propagateFloat32NaN( a, b );
  3477. exit;
  3478. End;
  3479. float_raise( float_flag_invalid );
  3480. subFloat32Sigs := float32_default_nan;
  3481. exit;
  3482. End;
  3483. if ( aExp = 0 ) then
  3484. Begin
  3485. aExp := 1;
  3486. bExp := 1;
  3487. End;
  3488. if ( bSig < aSig ) Then goto aBigger;
  3489. if ( aSig < bSig ) Then goto bBigger;
  3490. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3491. exit;
  3492. bExpBigger:
  3493. if ( bExp = $FF ) then
  3494. Begin
  3495. if ( bSig<>0 ) then
  3496. Begin
  3497. subFloat32Sigs := propagateFloat32NaN( a, b );
  3498. exit;
  3499. End;
  3500. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3501. exit;
  3502. End;
  3503. if ( aExp = 0 ) then
  3504. Begin
  3505. Inc(expDiff);
  3506. End
  3507. else
  3508. Begin
  3509. aSig := aSig OR $40000000;
  3510. End;
  3511. shift32RightJamming( aSig, - expDiff, aSig );
  3512. bSig := bSig OR $40000000;
  3513. bBigger:
  3514. zSig := bSig - aSig;
  3515. zExp := bExp;
  3516. zSign := zSign xor 1;
  3517. goto normalizeRoundAndPack;
  3518. aExpBigger:
  3519. if ( aExp = $FF ) then
  3520. Begin
  3521. if ( aSig <> 0) then
  3522. Begin
  3523. subFloat32Sigs := propagateFloat32NaN( a, b );
  3524. exit;
  3525. End;
  3526. subFloat32Sigs := a;
  3527. exit;
  3528. End;
  3529. if ( bExp = 0 ) then
  3530. Begin
  3531. Dec(expDiff);
  3532. End
  3533. else
  3534. Begin
  3535. bSig := bSig OR $40000000;
  3536. End;
  3537. shift32RightJamming( bSig, expDiff, bSig );
  3538. aSig := aSig OR $40000000;
  3539. aBigger:
  3540. zSig := aSig - bSig;
  3541. zExp := aExp;
  3542. normalizeRoundAndPack:
  3543. Dec(zExp);
  3544. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3545. End;
  3546. {*
  3547. -------------------------------------------------------------------------------
  3548. Returns the result of adding the single-precision floating-point values `a'
  3549. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3550. Binary Floating-Point Arithmetic.
  3551. -------------------------------------------------------------------------------
  3552. *}
  3553. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3554. Var
  3555. aSign, bSign: Flag;
  3556. Begin
  3557. aSign := extractFloat32Sign( a.float32 );
  3558. bSign := extractFloat32Sign( b.float32 );
  3559. if ( aSign = bSign ) then
  3560. Begin
  3561. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3562. End
  3563. else
  3564. Begin
  3565. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3566. End;
  3567. End;
  3568. {*
  3569. -------------------------------------------------------------------------------
  3570. Returns the result of subtracting the single-precision floating-point values
  3571. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3572. for Binary Floating-Point Arithmetic.
  3573. -------------------------------------------------------------------------------
  3574. *}
  3575. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3576. Var
  3577. aSign, bSign: flag;
  3578. Begin
  3579. aSign := extractFloat32Sign( a.float32 );
  3580. bSign := extractFloat32Sign( b.float32 );
  3581. if ( aSign = bSign ) then
  3582. Begin
  3583. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3584. End
  3585. else
  3586. Begin
  3587. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3588. End;
  3589. End;
  3590. {*
  3591. -------------------------------------------------------------------------------
  3592. Returns the result of multiplying the single-precision floating-point values
  3593. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3594. for Binary Floating-Point Arithmetic.
  3595. -------------------------------------------------------------------------------
  3596. *}
  3597. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3598. Var
  3599. aSign, bSign, zSign: flag;
  3600. aExp, bExp, zExp : int16;
  3601. aSig, bSig, zSig0, zSig1: bits32;
  3602. Begin
  3603. aSig := extractFloat32Frac( a.float32 );
  3604. aExp := extractFloat32Exp( a.float32 );
  3605. aSign := extractFloat32Sign( a.float32 );
  3606. bSig := extractFloat32Frac( b.float32 );
  3607. bExp := extractFloat32Exp( b.float32 );
  3608. bSign := extractFloat32Sign( b.float32 );
  3609. zSign := aSign xor bSign;
  3610. if ( aExp = $FF ) then
  3611. Begin
  3612. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3613. Begin
  3614. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3615. exit;
  3616. End;
  3617. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3618. Begin
  3619. float_raise( float_flag_invalid );
  3620. float32_mul.float32 := float32_default_nan;
  3621. exit;
  3622. End;
  3623. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3624. exit;
  3625. End;
  3626. if ( bExp = $FF ) then
  3627. Begin
  3628. if ( bSig <> 0 ) then
  3629. Begin
  3630. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3631. exit;
  3632. End;
  3633. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3634. Begin
  3635. float_raise( float_flag_invalid );
  3636. float32_mul.float32 := float32_default_nan;
  3637. exit;
  3638. End;
  3639. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3640. exit;
  3641. End;
  3642. if ( aExp = 0 ) then
  3643. Begin
  3644. if ( aSig = 0 ) then
  3645. Begin
  3646. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3647. exit;
  3648. End;
  3649. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3650. End;
  3651. if ( bExp = 0 ) then
  3652. Begin
  3653. if ( bSig = 0 ) then
  3654. Begin
  3655. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3656. exit;
  3657. End;
  3658. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3659. End;
  3660. zExp := aExp + bExp - $7F;
  3661. aSig := ( aSig OR $00800000 ) shl 7;
  3662. bSig := ( bSig OR $00800000 ) shl 8;
  3663. mul32To64( aSig, bSig, zSig0, zSig1 );
  3664. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3665. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3666. Begin
  3667. zSig0 := zSig0 shl 1;
  3668. Dec(zExp);
  3669. End;
  3670. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3671. End;
  3672. {*
  3673. -------------------------------------------------------------------------------
  3674. Returns the result of dividing the single-precision floating-point value `a'
  3675. by the corresponding value `b'. The operation is performed according to the
  3676. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3677. -------------------------------------------------------------------------------
  3678. *}
  3679. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3680. Var
  3681. aSign, bSign, zSign: flag;
  3682. aExp, bExp, zExp: int16;
  3683. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3684. Begin
  3685. aSig := extractFloat32Frac( a.float32 );
  3686. aExp := extractFloat32Exp( a.float32 );
  3687. aSign := extractFloat32Sign( a.float32 );
  3688. bSig := extractFloat32Frac( b.float32 );
  3689. bExp := extractFloat32Exp( b.float32 );
  3690. bSign := extractFloat32Sign( b.float32 );
  3691. zSign := aSign xor bSign;
  3692. if ( aExp = $FF ) then
  3693. Begin
  3694. if ( aSig <> 0 ) then
  3695. Begin
  3696. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3697. exit;
  3698. End;
  3699. if ( bExp = $FF ) then
  3700. Begin
  3701. if ( bSig <> 0) then
  3702. Begin
  3703. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3704. exit;
  3705. End;
  3706. float_raise( float_flag_invalid );
  3707. float32_div.float32 := float32_default_nan;
  3708. exit;
  3709. End;
  3710. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3711. exit;
  3712. End;
  3713. if ( bExp = $FF ) then
  3714. Begin
  3715. if ( bSig <> 0) then
  3716. Begin
  3717. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3718. exit;
  3719. End;
  3720. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3721. exit;
  3722. End;
  3723. if ( bExp = 0 ) Then
  3724. Begin
  3725. if ( bSig = 0 ) Then
  3726. Begin
  3727. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3728. Begin
  3729. float_raise( float_flag_invalid );
  3730. float32_div.float32 := float32_default_nan;
  3731. exit;
  3732. End;
  3733. float_raise( float_flag_divbyzero );
  3734. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3735. exit;
  3736. End;
  3737. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3738. End;
  3739. if ( aExp = 0 ) Then
  3740. Begin
  3741. if ( aSig = 0 ) Then
  3742. Begin
  3743. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3744. exit;
  3745. End;
  3746. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3747. End;
  3748. zExp := aExp - bExp + $7D;
  3749. aSig := ( aSig OR $00800000 ) shl 7;
  3750. bSig := ( bSig OR $00800000 ) shl 8;
  3751. if ( bSig <= ( aSig + aSig ) ) then
  3752. Begin
  3753. aSig := aSig shr 1;
  3754. Inc(zExp);
  3755. End;
  3756. zSig := estimateDiv64To32( aSig, 0, bSig );
  3757. if ( ( zSig and $3F ) <= 2 ) then
  3758. Begin
  3759. mul32To64( bSig, zSig, term0, term1 );
  3760. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3761. while ( sbits32 (rem0) < 0 ) do
  3762. Begin
  3763. Dec(zSig);
  3764. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3765. End;
  3766. zSig := zSig or bits32( rem1 <> 0 );
  3767. End;
  3768. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3769. End;
  3770. {*
  3771. -------------------------------------------------------------------------------
  3772. Returns the remainder of the single-precision floating-point value `a'
  3773. with respect to the corresponding value `b'. The operation is performed
  3774. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3775. -------------------------------------------------------------------------------
  3776. *}
  3777. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3778. Var
  3779. aSign, zSign: flag;
  3780. aExp, bExp, expDiff: int16;
  3781. aSig, bSig, q, alternateASig: bits32;
  3782. sigMean: sbits32;
  3783. Begin
  3784. aSig := extractFloat32Frac( a.float32 );
  3785. aExp := extractFloat32Exp( a.float32 );
  3786. aSign := extractFloat32Sign( a.float32 );
  3787. bSig := extractFloat32Frac( b.float32 );
  3788. bExp := extractFloat32Exp( b.float32 );
  3789. if ( aExp = $FF ) then
  3790. Begin
  3791. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3792. Begin
  3793. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3794. exit;
  3795. End;
  3796. float_raise( float_flag_invalid );
  3797. float32_rem.float32 := float32_default_nan;
  3798. exit;
  3799. End;
  3800. if ( bExp = $FF ) then
  3801. Begin
  3802. if ( bSig <> 0 ) then
  3803. Begin
  3804. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3805. exit;
  3806. End;
  3807. float32_rem := a;
  3808. exit;
  3809. End;
  3810. if ( bExp = 0 ) then
  3811. Begin
  3812. if ( bSig = 0 ) then
  3813. Begin
  3814. float_raise( float_flag_invalid );
  3815. float32_rem.float32 := float32_default_nan;
  3816. exit;
  3817. End;
  3818. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3819. End;
  3820. if ( aExp = 0 ) then
  3821. Begin
  3822. if ( aSig = 0 ) then
  3823. Begin
  3824. float32_rem := a;
  3825. exit;
  3826. End;
  3827. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3828. End;
  3829. expDiff := aExp - bExp;
  3830. aSig := ( aSig OR $00800000 ) shl 8;
  3831. bSig := ( bSig OR $00800000 ) shl 8;
  3832. if ( expDiff < 0 ) then
  3833. Begin
  3834. if ( expDiff < -1 ) then
  3835. Begin
  3836. float32_rem := a;
  3837. exit;
  3838. End;
  3839. aSig := aSig shr 1;
  3840. End;
  3841. q := bits32( bSig <= aSig );
  3842. if ( q <> 0) then
  3843. aSig := aSig - bSig;
  3844. expDiff := expDiff - 32;
  3845. while ( 0 < expDiff ) do
  3846. Begin
  3847. q := estimateDiv64To32( aSig, 0, bSig );
  3848. if (2 < q) then
  3849. q := q - 2
  3850. else
  3851. q := 0;
  3852. aSig := - ( ( bSig shr 2 ) * q );
  3853. expDiff := expDiff - 30;
  3854. End;
  3855. expDiff := expDiff + 32;
  3856. if ( 0 < expDiff ) then
  3857. Begin
  3858. q := estimateDiv64To32( aSig, 0, bSig );
  3859. if (2 < q) then
  3860. q := q - 2
  3861. else
  3862. q := 0;
  3863. q := q shr (32 - expDiff);
  3864. bSig := bSig shr 2;
  3865. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3866. End
  3867. else
  3868. Begin
  3869. aSig := aSig shr 2;
  3870. bSig := bSig shr 2;
  3871. End;
  3872. Repeat
  3873. alternateASig := aSig;
  3874. Inc(q);
  3875. aSig := aSig - bSig;
  3876. Until not ( 0 <= sbits32 (aSig) );
  3877. sigMean := aSig + alternateASig;
  3878. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3879. Begin
  3880. aSig := alternateASig;
  3881. End;
  3882. zSign := flag( sbits32 (aSig) < 0 );
  3883. if ( zSign<>0 ) then
  3884. aSig := - aSig;
  3885. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3886. End;
  3887. {*
  3888. -------------------------------------------------------------------------------
  3889. Returns the square root of the single-precision floating-point value `a'.
  3890. The operation is performed according to the IEC/IEEE Standard for Binary
  3891. Floating-Point Arithmetic.
  3892. -------------------------------------------------------------------------------
  3893. *}
  3894. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3895. Var
  3896. aSign : flag;
  3897. aExp, zExp : int16;
  3898. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3899. label roundAndPack;
  3900. Begin
  3901. aSig := extractFloat32Frac( a.float32 );
  3902. aExp := extractFloat32Exp( a.float32 );
  3903. aSign := extractFloat32Sign( a.float32 );
  3904. if ( aExp = $FF ) then
  3905. Begin
  3906. if ( aSig <> 0) then
  3907. Begin
  3908. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3909. exit;
  3910. End;
  3911. if ( aSign = 0) then
  3912. Begin
  3913. float32_sqrt := a;
  3914. exit;
  3915. End;
  3916. float_raise( float_flag_invalid );
  3917. float32_sqrt.float32 := float32_default_nan;
  3918. exit;
  3919. End;
  3920. if ( aSign <> 0) then
  3921. Begin
  3922. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3923. Begin
  3924. float32_sqrt := a;
  3925. exit;
  3926. End;
  3927. float_raise( float_flag_invalid );
  3928. float32_sqrt.float32 := float32_default_nan;
  3929. exit;
  3930. End;
  3931. if ( aExp = 0 ) then
  3932. Begin
  3933. if ( aSig = 0 ) then
  3934. Begin
  3935. float32_sqrt.float32 := 0;
  3936. exit;
  3937. End;
  3938. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3939. End;
  3940. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3941. aSig := ( aSig OR $00800000 ) shl 8;
  3942. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3943. if ( ( zSig and $7F ) <= 5 ) then
  3944. Begin
  3945. if ( zSig < 2 ) then
  3946. Begin
  3947. zSig := $7FFFFFFF;
  3948. goto roundAndPack;
  3949. End
  3950. else
  3951. Begin
  3952. aSig := aSig shr (aExp and 1);
  3953. mul32To64( zSig, zSig, term0, term1 );
  3954. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3955. while ( sbits32 (rem0) < 0 ) do
  3956. Begin
  3957. Dec(zSig);
  3958. shortShift64Left( 0, zSig, 1, term0, term1 );
  3959. term1 := term1 or 1;
  3960. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3961. End;
  3962. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3963. End;
  3964. End;
  3965. shift32RightJamming( zSig, 1, zSig );
  3966. roundAndPack:
  3967. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3968. End;
  3969. {*
  3970. -------------------------------------------------------------------------------
  3971. Returns 1 if the single-precision floating-point value `a' is equal to
  3972. the corresponding value `b', and 0 otherwise. The comparison is performed
  3973. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3974. -------------------------------------------------------------------------------
  3975. *}
  3976. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3977. Begin
  3978. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3979. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3980. ) then
  3981. Begin
  3982. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3983. Begin
  3984. float_raise( float_flag_invalid );
  3985. End;
  3986. float32_eq := 0;
  3987. exit;
  3988. End;
  3989. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3990. End;
  3991. {*
  3992. -------------------------------------------------------------------------------
  3993. Returns 1 if the single-precision floating-point value `a' is less than
  3994. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3995. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3996. Arithmetic.
  3997. -------------------------------------------------------------------------------
  3998. *}
  3999. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  4000. var
  4001. aSign, bSign: flag;
  4002. Begin
  4003. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  4004. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  4005. ) then
  4006. Begin
  4007. float_raise( float_flag_invalid );
  4008. float32_le := 0;
  4009. exit;
  4010. End;
  4011. aSign := extractFloat32Sign( a.float32 );
  4012. bSign := extractFloat32Sign( b.float32 );
  4013. if ( aSign <> bSign ) then
  4014. Begin
  4015. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  4016. exit;
  4017. End;
  4018. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  4019. End;
  4020. {*
  4021. -------------------------------------------------------------------------------
  4022. Returns 1 if the single-precision floating-point value `a' is less than
  4023. the corresponding value `b', and 0 otherwise. The comparison is performed
  4024. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4025. -------------------------------------------------------------------------------
  4026. *}
  4027. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  4028. var
  4029. aSign, bSign: flag;
  4030. Begin
  4031. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  4032. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  4033. ) then
  4034. Begin
  4035. float_raise( float_flag_invalid );
  4036. float32_lt :=0;
  4037. exit;
  4038. End;
  4039. aSign := extractFloat32Sign( a.float32 );
  4040. bSign := extractFloat32Sign( b.float32 );
  4041. if ( aSign <> bSign ) then
  4042. Begin
  4043. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  4044. exit;
  4045. End;
  4046. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  4047. End;
  4048. {*
  4049. -------------------------------------------------------------------------------
  4050. Returns 1 if the single-precision floating-point value `a' is equal to
  4051. the corresponding value `b', and 0 otherwise. The invalid exception is
  4052. raised if either operand is a NaN. Otherwise, the comparison is performed
  4053. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4054. -------------------------------------------------------------------------------
  4055. *}
  4056. Function float32_eq_signaling( a: float32; b: float32) : flag;
  4057. Begin
  4058. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  4059. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  4060. ) then
  4061. Begin
  4062. float_raise( float_flag_invalid );
  4063. float32_eq_signaling := 0;
  4064. exit;
  4065. End;
  4066. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  4067. End;
  4068. {*
  4069. -------------------------------------------------------------------------------
  4070. Returns 1 if the single-precision floating-point value `a' is less than or
  4071. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4072. cause an exception. Otherwise, the comparison is performed according to the
  4073. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4074. -------------------------------------------------------------------------------
  4075. *}
  4076. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  4077. Var
  4078. aSign, bSign: flag;
  4079. Begin
  4080. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4081. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4082. ) then
  4083. Begin
  4084. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4085. Begin
  4086. float_raise( float_flag_invalid );
  4087. End;
  4088. float32_le_quiet := 0;
  4089. exit;
  4090. End;
  4091. aSign := extractFloat32Sign( a );
  4092. bSign := extractFloat32Sign( b );
  4093. if ( aSign <> bSign ) then
  4094. Begin
  4095. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4096. exit;
  4097. End;
  4098. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4099. End;
  4100. {*
  4101. -------------------------------------------------------------------------------
  4102. Returns 1 if the single-precision floating-point value `a' is less than
  4103. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4104. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4105. Standard for Binary Floating-Point Arithmetic.
  4106. -------------------------------------------------------------------------------
  4107. *}
  4108. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4109. Var
  4110. aSign, bSign: flag;
  4111. Begin
  4112. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4113. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4114. ) then
  4115. Begin
  4116. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4117. Begin
  4118. float_raise( float_flag_invalid );
  4119. End;
  4120. float32_lt_quiet := 0;
  4121. exit;
  4122. End;
  4123. aSign := extractFloat32Sign( a );
  4124. bSign := extractFloat32Sign( b );
  4125. if ( aSign <> bSign ) then
  4126. Begin
  4127. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4128. exit;
  4129. End;
  4130. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4131. End;
  4132. {*
  4133. -------------------------------------------------------------------------------
  4134. Returns the result of converting the double-precision floating-point value
  4135. `a' to the 32-bit two's complement integer format. The conversion is
  4136. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4137. Arithmetic---which means in particular that the conversion is rounded
  4138. according to the current rounding mode. If `a' is a NaN, the largest
  4139. positive integer is returned. Otherwise, if the conversion overflows, the
  4140. largest integer with the same sign as `a' is returned.
  4141. -------------------------------------------------------------------------------
  4142. *}
  4143. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4144. var
  4145. aSign: flag;
  4146. aExp, shiftCount: int16;
  4147. aSig0, aSig1, absZ, aSigExtra: bits32;
  4148. z: int32;
  4149. roundingMode: TFPURoundingMode;
  4150. label invalid;
  4151. Begin
  4152. aSig1 := extractFloat64Frac1( a );
  4153. aSig0 := extractFloat64Frac0( a );
  4154. aExp := extractFloat64Exp( a );
  4155. aSign := extractFloat64Sign( a );
  4156. shiftCount := aExp - $413;
  4157. if ( 0 <= shiftCount ) then
  4158. Begin
  4159. if ( $41E < aExp ) then
  4160. Begin
  4161. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4162. aSign := 0;
  4163. goto invalid;
  4164. End;
  4165. shortShift64Left(
  4166. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4167. if ( $80000000 < absZ ) then
  4168. goto invalid;
  4169. End
  4170. else
  4171. Begin
  4172. aSig1 := flag( aSig1 <> 0 );
  4173. if ( aExp < $3FE ) then
  4174. Begin
  4175. aSigExtra := aExp OR aSig0 OR aSig1;
  4176. absZ := 0;
  4177. End
  4178. else
  4179. Begin
  4180. aSig0 := aSig0 OR $00100000;
  4181. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4182. absZ := aSig0 shr ( - shiftCount );
  4183. End;
  4184. End;
  4185. roundingMode := softfloat_rounding_mode;
  4186. if ( roundingMode = float_round_nearest_even ) then
  4187. Begin
  4188. if ( sbits32(aSigExtra) < 0 ) then
  4189. Begin
  4190. Inc(absZ);
  4191. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4192. absZ := absZ and not 1;
  4193. End;
  4194. if aSign <> 0 then
  4195. z := - absZ
  4196. else
  4197. z := absZ;
  4198. End
  4199. else
  4200. Begin
  4201. aSigExtra := bits32( aSigExtra <> 0 );
  4202. if ( aSign <> 0) then
  4203. Begin
  4204. z := - ( absZ
  4205. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4206. End
  4207. else
  4208. Begin
  4209. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4210. End
  4211. End;
  4212. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4213. Begin
  4214. invalid:
  4215. float_raise( float_flag_invalid );
  4216. if (aSign <> 0 ) then
  4217. float64_to_int32 := sbits32 ($80000000)
  4218. else
  4219. float64_to_int32 := $7FFFFFFF;
  4220. exit;
  4221. End;
  4222. if ( aSigExtra <> 0) then
  4223. set_inexact_flag;
  4224. float64_to_int32 := z;
  4225. End;
  4226. {*
  4227. -------------------------------------------------------------------------------
  4228. Returns the result of converting the double-precision floating-point value
  4229. `a' to the 32-bit two's complement integer format. The conversion is
  4230. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4231. Arithmetic, except that the conversion is always rounded toward zero.
  4232. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4233. the conversion overflows, the largest integer with the same sign as `a' is
  4234. returned.
  4235. -------------------------------------------------------------------------------
  4236. *}
  4237. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4238. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4239. Var
  4240. aSign: flag;
  4241. aExp, shiftCount: int16;
  4242. aSig0, aSig1, absZ, aSigExtra: bits32;
  4243. z: int32;
  4244. label invalid;
  4245. Begin
  4246. aSig1 := extractFloat64Frac1( a );
  4247. aSig0 := extractFloat64Frac0( a );
  4248. aExp := extractFloat64Exp( a );
  4249. aSign := extractFloat64Sign( a );
  4250. shiftCount := aExp - $413;
  4251. if ( 0 <= shiftCount ) then
  4252. Begin
  4253. if ( $41E < aExp ) then
  4254. Begin
  4255. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4256. aSign := 0;
  4257. goto invalid;
  4258. End;
  4259. shortShift64Left(
  4260. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4261. End
  4262. else
  4263. Begin
  4264. if ( aExp < $3FF ) then
  4265. Begin
  4266. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4267. Begin
  4268. set_inexact_flag;
  4269. End;
  4270. float64_to_int32_round_to_zero := 0;
  4271. exit;
  4272. End;
  4273. aSig0 := aSig0 or $00100000;
  4274. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4275. absZ := aSig0 shr ( - shiftCount );
  4276. End;
  4277. if aSign <> 0 then
  4278. z := - absZ
  4279. else
  4280. z := absZ;
  4281. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4282. Begin
  4283. invalid:
  4284. float_raise( float_flag_invalid );
  4285. if (aSign <> 0) then
  4286. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4287. else
  4288. float64_to_int32_round_to_zero := $7FFFFFFF;
  4289. exit;
  4290. End;
  4291. if ( aSigExtra <> 0) then
  4292. set_inexact_flag;
  4293. float64_to_int32_round_to_zero := z;
  4294. End;
  4295. {*----------------------------------------------------------------------------
  4296. | Returns the result of converting the double-precision floating-point value
  4297. | `a' to the 64-bit two's complement integer format. The conversion is
  4298. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4299. | Arithmetic---which means in particular that the conversion is rounded
  4300. | according to the current rounding mode. If `a' is a NaN, the largest
  4301. | positive integer is returned. Otherwise, if the conversion overflows, the
  4302. | largest integer with the same sign as `a' is returned.
  4303. *----------------------------------------------------------------------------*}
  4304. function float64_to_int64( a: float64 ): int64;
  4305. var
  4306. aSign: flag;
  4307. aExp, shiftCount: int16;
  4308. aSig, aSigExtra: bits64;
  4309. begin
  4310. aSig := extractFloat64Frac( a );
  4311. aExp := extractFloat64Exp( a );
  4312. aSign := extractFloat64Sign( a );
  4313. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4314. shiftCount := $433 - aExp;
  4315. if ( shiftCount <= 0 ) then begin
  4316. if ( $43E < aExp ) then begin
  4317. float_raise( float_flag_invalid );
  4318. if ( ( aSign = 0 )
  4319. or ( ( aExp = $7FF )
  4320. and ( aSig <> $0010000000000000 ) )
  4321. ) then begin
  4322. result := $7FFFFFFFFFFFFFFF;
  4323. exit;
  4324. end;
  4325. result := $8000000000000000;
  4326. exit;
  4327. end;
  4328. aSigExtra := 0;
  4329. aSig := aSig shl ( - shiftCount );
  4330. end
  4331. else
  4332. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4333. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4334. end;
  4335. {*----------------------------------------------------------------------------
  4336. | Returns the result of converting the double-precision floating-point value
  4337. | `a' to the 64-bit two's complement integer format. The conversion is
  4338. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4339. | Arithmetic, except that the conversion is always rounded toward zero.
  4340. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4341. | the conversion overflows, the largest integer with the same sign as `a' is
  4342. | returned.
  4343. *----------------------------------------------------------------------------*}
  4344. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4345. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4346. var
  4347. aSign: flag;
  4348. aExp, shiftCount: int16;
  4349. aSig: bits64;
  4350. z: int64;
  4351. begin
  4352. aSig := extractFloat64Frac( a );
  4353. aExp := extractFloat64Exp( a );
  4354. aSign := extractFloat64Sign( a );
  4355. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4356. shiftCount := aExp - $433;
  4357. if ( 0 <= shiftCount ) then begin
  4358. if ( $43E <= aExp ) then begin
  4359. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4360. float_raise( float_flag_invalid );
  4361. if ( ( aSign = 0 )
  4362. or ( ( aExp = $7FF )
  4363. and ( aSig <> $0010000000000000 ) )
  4364. ) then begin
  4365. result := $7FFFFFFFFFFFFFFF;
  4366. exit;
  4367. end;
  4368. end;
  4369. result := $8000000000000000;
  4370. exit;
  4371. end;
  4372. z := aSig shl shiftCount;
  4373. end
  4374. else begin
  4375. if ( aExp < $3FE ) then begin
  4376. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4377. result := 0;
  4378. exit;
  4379. end;
  4380. z := aSig shr ( - shiftCount );
  4381. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4382. set_inexact_flag;
  4383. end;
  4384. if ( aSign <> 0 ) then z := - z;
  4385. result := z;
  4386. end;
  4387. {*
  4388. -------------------------------------------------------------------------------
  4389. Returns the result of converting the double-precision floating-point value
  4390. `a' to the single-precision floating-point format. The conversion is
  4391. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4392. Arithmetic.
  4393. -------------------------------------------------------------------------------
  4394. *}
  4395. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4396. Var
  4397. aSign: flag;
  4398. aExp: int16;
  4399. aSig0, aSig1, zSig: bits32;
  4400. allZero: bits32;
  4401. tmp : CommonNanT;
  4402. Begin
  4403. aSig1 := extractFloat64Frac1( a );
  4404. aSig0 := extractFloat64Frac0( a );
  4405. aExp := extractFloat64Exp( a );
  4406. aSign := extractFloat64Sign( a );
  4407. if ( aExp = $7FF ) then
  4408. Begin
  4409. if ( aSig0 OR aSig1 ) <> 0 then
  4410. Begin
  4411. tmp:=float64ToCommonNaN(a);
  4412. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4413. exit;
  4414. End;
  4415. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4416. exit;
  4417. End;
  4418. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4419. if ( aExp <> 0) then
  4420. zSig := zSig OR $40000000;
  4421. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4422. End;
  4423. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4424. {*----------------------------------------------------------------------------
  4425. | Returns the result of converting the double-precision floating-point value
  4426. | `a' to the extended double-precision floating-point format. The conversion
  4427. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4428. | Arithmetic.
  4429. *----------------------------------------------------------------------------*}
  4430. function float64_to_floatx80( a: float64 ): floatx80;
  4431. var
  4432. aSign: flag;
  4433. aExp: int16;
  4434. aSig: bits64;
  4435. begin
  4436. aSig := extractFloat64Frac( a );
  4437. aExp := extractFloat64Exp( a );
  4438. aSign := extractFloat64Sign( a );
  4439. if ( aExp = $7FF ) then begin
  4440. if ( aSig <> 0 ) then begin
  4441. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4442. exit;
  4443. end;
  4444. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4445. exit;
  4446. end;
  4447. if ( aExp = 0 ) then begin
  4448. if ( aSig = 0 ) then begin
  4449. result := packFloatx80( aSign, 0, 0 );
  4450. exit;
  4451. end;
  4452. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4453. end;
  4454. result :=
  4455. packFloatx80(
  4456. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4457. end;
  4458. {$endif FPC_SOFTFLOAT_FLOATX80}
  4459. {*
  4460. -------------------------------------------------------------------------------
  4461. Rounds the double-precision floating-point value `a' to an integer,
  4462. and returns the result as a double-precision floating-point value. The
  4463. operation is performed according to the IEC/IEEE Standard for Binary
  4464. Floating-Point Arithmetic.
  4465. -------------------------------------------------------------------------------
  4466. *}
  4467. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4468. Var
  4469. aSign: flag;
  4470. aExp: int16;
  4471. lastBitMask, roundBitsMask: bits32;
  4472. roundingMode: TFPURoundingMode;
  4473. z: float64;
  4474. Begin
  4475. aExp := extractFloat64Exp( a );
  4476. if ( $413 <= aExp ) then
  4477. Begin
  4478. if ( $433 <= aExp ) then
  4479. Begin
  4480. if ( ( aExp = $7FF )
  4481. AND
  4482. (
  4483. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4484. ) <>0)
  4485. ) then
  4486. Begin
  4487. propagateFloat64NaN( a, a, result );
  4488. exit;
  4489. End;
  4490. result := a;
  4491. exit;
  4492. End;
  4493. lastBitMask := 1;
  4494. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4495. roundBitsMask := lastBitMask - 1;
  4496. z := a;
  4497. roundingMode := softfloat_rounding_mode;
  4498. if ( roundingMode = float_round_nearest_even ) then
  4499. Begin
  4500. if ( lastBitMask <> 0) then
  4501. Begin
  4502. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4503. if ( ( z.low and roundBitsMask ) = 0 ) then
  4504. z.low := z.low and not lastBitMask;
  4505. End
  4506. else
  4507. Begin
  4508. if ( sbits32 (z.low) < 0 ) then
  4509. Begin
  4510. Inc(z.high);
  4511. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4512. z.high := z.high and not 1;
  4513. End;
  4514. End;
  4515. End
  4516. else if ( roundingMode <> float_round_to_zero ) then
  4517. Begin
  4518. if ( extractFloat64Sign( z )
  4519. xor flag( roundingMode = float_round_up ) )<> 0 then
  4520. Begin
  4521. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4522. End;
  4523. End;
  4524. z.low := z.low and not roundBitsMask;
  4525. End
  4526. else
  4527. Begin
  4528. if ( aExp <= $3FE ) then
  4529. Begin
  4530. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4531. Begin
  4532. result := a;
  4533. exit;
  4534. End;
  4535. set_inexact_flag;
  4536. aSign := extractFloat64Sign( a );
  4537. case ( softfloat_rounding_mode ) of
  4538. float_round_nearest_even:
  4539. Begin
  4540. if ( ( aExp = $3FE )
  4541. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4542. ) then
  4543. Begin
  4544. packFloat64( aSign, $3FF, 0, 0, result );
  4545. exit;
  4546. End;
  4547. End;
  4548. float_round_down:
  4549. Begin
  4550. if aSign<>0 then
  4551. packFloat64( 1, $3FF, 0, 0, result )
  4552. else
  4553. packFloat64( 0, 0, 0, 0, result );
  4554. exit;
  4555. End;
  4556. float_round_up:
  4557. Begin
  4558. if aSign <> 0 then
  4559. packFloat64( 1, 0, 0, 0, result )
  4560. else
  4561. packFloat64( 0, $3FF, 0, 0, result );
  4562. exit;
  4563. End;
  4564. end;
  4565. packFloat64( aSign, 0, 0, 0, result );
  4566. exit;
  4567. End;
  4568. lastBitMask := 1;
  4569. lastBitMask := lastBitMask shl ($413 - aExp);
  4570. roundBitsMask := lastBitMask - 1;
  4571. z.low := 0;
  4572. z.high := a.high;
  4573. roundingMode := softfloat_rounding_mode;
  4574. if ( roundingMode = float_round_nearest_even ) then
  4575. Begin
  4576. z.high := z.high + lastBitMask shr 1;
  4577. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4578. Begin
  4579. z.high := z.high and not lastBitMask;
  4580. End;
  4581. End
  4582. else if ( roundingMode <> float_round_to_zero ) then
  4583. Begin
  4584. if ( extractFloat64Sign( z )
  4585. xor flag( roundingMode = float_round_up ) )<> 0 then
  4586. Begin
  4587. z.high := z.high or bits32( a.low <> 0 );
  4588. z.high := z.high + roundBitsMask;
  4589. End;
  4590. End;
  4591. z.high := z.high and not roundBitsMask;
  4592. End;
  4593. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4594. Begin
  4595. set_inexact_flag;
  4596. End;
  4597. result := z;
  4598. End;
  4599. {*
  4600. -------------------------------------------------------------------------------
  4601. Returns the result of adding the absolute values of the double-precision
  4602. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4603. before being returned. `zSign' is ignored if the result is a NaN.
  4604. The addition is performed according to the IEC/IEEE Standard for Binary
  4605. Floating-Point Arithmetic.
  4606. -------------------------------------------------------------------------------
  4607. *}
  4608. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4609. Var
  4610. aExp, bExp, zExp: int16;
  4611. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4612. expDiff: int16;
  4613. label shiftRight1;
  4614. label roundAndPack;
  4615. Begin
  4616. aSig1 := extractFloat64Frac1( a );
  4617. aSig0 := extractFloat64Frac0( a );
  4618. aExp := extractFloat64Exp( a );
  4619. bSig1 := extractFloat64Frac1( b );
  4620. bSig0 := extractFloat64Frac0( b );
  4621. bExp := extractFloat64Exp( b );
  4622. expDiff := aExp - bExp;
  4623. if ( 0 < expDiff ) then
  4624. Begin
  4625. if ( aExp = $7FF ) then
  4626. Begin
  4627. if ( aSig0 OR aSig1 ) <> 0 then
  4628. Begin
  4629. propagateFloat64NaN( a, b, out );
  4630. exit;
  4631. end;
  4632. out := a;
  4633. exit;
  4634. End;
  4635. if ( bExp = 0 ) then
  4636. Begin
  4637. Dec(expDiff);
  4638. End
  4639. else
  4640. Begin
  4641. bSig0 := bSig0 or $00100000;
  4642. End;
  4643. shift64ExtraRightJamming(
  4644. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4645. zExp := aExp;
  4646. End
  4647. else if ( expDiff < 0 ) then
  4648. Begin
  4649. if ( bExp = $7FF ) then
  4650. Begin
  4651. if ( bSig0 OR bSig1 ) <> 0 then
  4652. Begin
  4653. propagateFloat64NaN( a, b, out );
  4654. exit;
  4655. End;
  4656. packFloat64( zSign, $7FF, 0, 0, out );
  4657. exit;
  4658. End;
  4659. if ( aExp = 0 ) then
  4660. Begin
  4661. Inc(expDiff);
  4662. End
  4663. else
  4664. Begin
  4665. aSig0 := aSig0 or $00100000;
  4666. End;
  4667. shift64ExtraRightJamming(
  4668. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4669. zExp := bExp;
  4670. End
  4671. else
  4672. Begin
  4673. if ( aExp = $7FF ) then
  4674. Begin
  4675. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4676. Begin
  4677. propagateFloat64NaN( a, b, out );
  4678. exit;
  4679. End;
  4680. out := a;
  4681. exit;
  4682. End;
  4683. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4684. if ( aExp = 0 ) then
  4685. Begin
  4686. packFloat64( zSign, 0, zSig0, zSig1, out );
  4687. exit;
  4688. End;
  4689. zSig2 := 0;
  4690. zSig0 := zSig0 or $00200000;
  4691. zExp := aExp;
  4692. goto shiftRight1;
  4693. End;
  4694. aSig0 := aSig0 or $00100000;
  4695. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4696. Dec(zExp);
  4697. if ( zSig0 < $00200000 ) then
  4698. goto roundAndPack;
  4699. Inc(zExp);
  4700. shiftRight1:
  4701. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4702. roundAndPack:
  4703. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4704. End;
  4705. {*
  4706. -------------------------------------------------------------------------------
  4707. Returns the result of subtracting the absolute values of the double-
  4708. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4709. difference is negated before being returned. `zSign' is ignored if the
  4710. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4711. Standard for Binary Floating-Point Arithmetic.
  4712. -------------------------------------------------------------------------------
  4713. *}
  4714. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4715. Var
  4716. aExp, bExp, zExp: int16;
  4717. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4718. expDiff: int16;
  4719. z: float64;
  4720. label aExpBigger;
  4721. label bExpBigger;
  4722. label aBigger;
  4723. label bBigger;
  4724. label normalizeRoundAndPack;
  4725. Begin
  4726. aSig1 := extractFloat64Frac1( a );
  4727. aSig0 := extractFloat64Frac0( a );
  4728. aExp := extractFloat64Exp( a );
  4729. bSig1 := extractFloat64Frac1( b );
  4730. bSig0 := extractFloat64Frac0( b );
  4731. bExp := extractFloat64Exp( b );
  4732. expDiff := aExp - bExp;
  4733. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4734. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4735. if ( 0 < expDiff ) then goto aExpBigger;
  4736. if ( expDiff < 0 ) then goto bExpBigger;
  4737. if ( aExp = $7FF ) then
  4738. Begin
  4739. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4740. Begin
  4741. propagateFloat64NaN( a, b, out );
  4742. exit;
  4743. End;
  4744. float_raise( float_flag_invalid );
  4745. z.low := float64_default_nan_low;
  4746. z.high := float64_default_nan_high;
  4747. out := z;
  4748. exit;
  4749. End;
  4750. if ( aExp = 0 ) then
  4751. Begin
  4752. aExp := 1;
  4753. bExp := 1;
  4754. End;
  4755. if ( bSig0 < aSig0 ) then goto aBigger;
  4756. if ( aSig0 < bSig0 ) then goto bBigger;
  4757. if ( bSig1 < aSig1 ) then goto aBigger;
  4758. if ( aSig1 < bSig1 ) then goto bBigger;
  4759. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4760. exit;
  4761. bExpBigger:
  4762. if ( bExp = $7FF ) then
  4763. Begin
  4764. if ( bSig0 OR bSig1 ) <> 0 then
  4765. Begin
  4766. propagateFloat64NaN( a, b, out );
  4767. exit;
  4768. End;
  4769. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4770. exit;
  4771. End;
  4772. if ( aExp = 0 ) then
  4773. Begin
  4774. Inc(expDiff);
  4775. End
  4776. else
  4777. Begin
  4778. aSig0 := aSig0 or $40000000;
  4779. End;
  4780. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4781. bSig0 := bSig0 or $40000000;
  4782. bBigger:
  4783. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4784. zExp := bExp;
  4785. zSign := zSign xor 1;
  4786. goto normalizeRoundAndPack;
  4787. aExpBigger:
  4788. if ( aExp = $7FF ) then
  4789. Begin
  4790. if ( aSig0 OR aSig1 ) <> 0 then
  4791. Begin
  4792. propagateFloat64NaN( a, b, out );
  4793. exit;
  4794. End;
  4795. out := a;
  4796. exit;
  4797. End;
  4798. if ( bExp = 0 ) then
  4799. Begin
  4800. Dec(expDiff);
  4801. End
  4802. else
  4803. Begin
  4804. bSig0 := bSig0 or $40000000;
  4805. End;
  4806. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4807. aSig0 := aSig0 or $40000000;
  4808. aBigger:
  4809. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4810. zExp := aExp;
  4811. normalizeRoundAndPack:
  4812. Dec(zExp);
  4813. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4814. End;
  4815. {*
  4816. -------------------------------------------------------------------------------
  4817. Returns the result of adding the double-precision floating-point values `a'
  4818. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4819. Binary Floating-Point Arithmetic.
  4820. -------------------------------------------------------------------------------
  4821. *}
  4822. Function float64_add( a: float64; b : float64) : Float64;
  4823. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4824. Var
  4825. aSign, bSign: flag;
  4826. Begin
  4827. aSign := extractFloat64Sign( a );
  4828. bSign := extractFloat64Sign( b );
  4829. if ( aSign = bSign ) then
  4830. Begin
  4831. addFloat64Sigs( a, b, aSign, result );
  4832. End
  4833. else
  4834. Begin
  4835. subFloat64Sigs( a, b, aSign, result );
  4836. End;
  4837. End;
  4838. {*
  4839. -------------------------------------------------------------------------------
  4840. Returns the result of subtracting the double-precision floating-point values
  4841. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4842. for Binary Floating-Point Arithmetic.
  4843. -------------------------------------------------------------------------------
  4844. *}
  4845. Function float64_sub(a: float64; b : float64) : Float64;
  4846. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4847. Var
  4848. aSign, bSign: flag;
  4849. Begin
  4850. aSign := extractFloat64Sign( a );
  4851. bSign := extractFloat64Sign( b );
  4852. if ( aSign = bSign ) then
  4853. Begin
  4854. subFloat64Sigs( a, b, aSign, result );
  4855. End
  4856. else
  4857. Begin
  4858. addFloat64Sigs( a, b, aSign, result );
  4859. End;
  4860. End;
  4861. {*
  4862. -------------------------------------------------------------------------------
  4863. Returns the result of multiplying the double-precision floating-point values
  4864. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4865. for Binary Floating-Point Arithmetic.
  4866. -------------------------------------------------------------------------------
  4867. *}
  4868. Function float64_mul( a: float64; b:float64) : Float64;
  4869. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4870. Var
  4871. aSign, bSign, zSign: flag;
  4872. aExp, bExp, zExp: int16;
  4873. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4874. z: float64;
  4875. label invalid;
  4876. Begin
  4877. aSig1 := extractFloat64Frac1( a );
  4878. aSig0 := extractFloat64Frac0( a );
  4879. aExp := extractFloat64Exp( a );
  4880. aSign := extractFloat64Sign( a );
  4881. bSig1 := extractFloat64Frac1( b );
  4882. bSig0 := extractFloat64Frac0( b );
  4883. bExp := extractFloat64Exp( b );
  4884. bSign := extractFloat64Sign( b );
  4885. zSign := aSign xor bSign;
  4886. if ( aExp = $7FF ) then
  4887. Begin
  4888. if ( (( aSig0 OR aSig1 ) <>0)
  4889. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4890. Begin
  4891. propagateFloat64NaN( a, b, result );
  4892. exit;
  4893. End;
  4894. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4895. packFloat64( zSign, $7FF, 0, 0, result );
  4896. exit;
  4897. End;
  4898. if ( bExp = $7FF ) then
  4899. Begin
  4900. if ( bSig0 OR bSig1 )<> 0 then
  4901. Begin
  4902. propagateFloat64NaN( a, b, result );
  4903. exit;
  4904. End;
  4905. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4906. Begin
  4907. invalid:
  4908. float_raise( float_flag_invalid );
  4909. z.low := float64_default_nan_low;
  4910. z.high := float64_default_nan_high;
  4911. result := z;
  4912. exit;
  4913. End;
  4914. packFloat64( zSign, $7FF, 0, 0, result );
  4915. exit;
  4916. End;
  4917. if ( aExp = 0 ) then
  4918. Begin
  4919. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4920. Begin
  4921. packFloat64( zSign, 0, 0, 0, result );
  4922. exit;
  4923. End;
  4924. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4925. End;
  4926. if ( bExp = 0 ) then
  4927. Begin
  4928. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4929. Begin
  4930. packFloat64( zSign, 0, 0, 0, result );
  4931. exit;
  4932. End;
  4933. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4934. End;
  4935. zExp := aExp + bExp - $400;
  4936. aSig0 := aSig0 or $00100000;
  4937. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4938. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4939. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4940. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4941. if ( $00200000 <= zSig0 ) then
  4942. Begin
  4943. shift64ExtraRightJamming(
  4944. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4945. Inc(zExp);
  4946. End;
  4947. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4948. End;
  4949. {*
  4950. -------------------------------------------------------------------------------
  4951. Returns the result of dividing the double-precision floating-point value `a'
  4952. by the corresponding value `b'. The operation is performed according to the
  4953. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4954. -------------------------------------------------------------------------------
  4955. *}
  4956. Function float64_div(a: float64; b : float64) : Float64;
  4957. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4958. Var
  4959. aSign, bSign, zSign: flag;
  4960. aExp, bExp, zExp: int16;
  4961. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4962. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4963. z: float64;
  4964. label invalid;
  4965. Begin
  4966. aSig1 := extractFloat64Frac1( a );
  4967. aSig0 := extractFloat64Frac0( a );
  4968. aExp := extractFloat64Exp( a );
  4969. aSign := extractFloat64Sign( a );
  4970. bSig1 := extractFloat64Frac1( b );
  4971. bSig0 := extractFloat64Frac0( b );
  4972. bExp := extractFloat64Exp( b );
  4973. bSign := extractFloat64Sign( b );
  4974. zSign := aSign xor bSign;
  4975. if ( aExp = $7FF ) then
  4976. Begin
  4977. if ( aSig0 OR aSig1 )<> 0 then
  4978. Begin
  4979. propagateFloat64NaN( a, b, result );
  4980. exit;
  4981. end;
  4982. if ( bExp = $7FF ) then
  4983. Begin
  4984. if ( bSig0 OR bSig1 )<>0 then
  4985. Begin
  4986. propagateFloat64NaN( a, b, result );
  4987. exit;
  4988. End;
  4989. goto invalid;
  4990. End;
  4991. packFloat64( zSign, $7FF, 0, 0, result );
  4992. exit;
  4993. End;
  4994. if ( bExp = $7FF ) then
  4995. Begin
  4996. if ( bSig0 OR bSig1 )<> 0 then
  4997. Begin
  4998. propagateFloat64NaN( a, b, result );
  4999. exit;
  5000. End;
  5001. packFloat64( zSign, 0, 0, 0, result );
  5002. exit;
  5003. End;
  5004. if ( bExp = 0 ) then
  5005. Begin
  5006. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5007. Begin
  5008. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5009. Begin
  5010. invalid:
  5011. float_raise( float_flag_invalid );
  5012. z.low := float64_default_nan_low;
  5013. z.high := float64_default_nan_high;
  5014. result := z;
  5015. exit;
  5016. End;
  5017. float_raise( float_flag_divbyzero );
  5018. packFloat64( zSign, $7FF, 0, 0, result );
  5019. exit;
  5020. End;
  5021. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5022. End;
  5023. if ( aExp = 0 ) then
  5024. Begin
  5025. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5026. Begin
  5027. packFloat64( zSign, 0, 0, 0, result );
  5028. exit;
  5029. End;
  5030. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5031. End;
  5032. zExp := aExp - bExp + $3FD;
  5033. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  5034. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5035. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  5036. Begin
  5037. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  5038. Inc(zExp);
  5039. End;
  5040. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5041. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  5042. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  5043. while ( sbits32 (rem0) < 0 ) do
  5044. Begin
  5045. Dec(zSig0);
  5046. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  5047. End;
  5048. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  5049. if ( ( zSig1 and $3FF ) <= 4 ) then
  5050. Begin
  5051. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  5052. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  5053. while ( sbits32 (rem1) < 0 ) do
  5054. Begin
  5055. Dec(zSig1);
  5056. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  5057. End;
  5058. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5059. End;
  5060. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  5061. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  5062. End;
  5063. {*
  5064. -------------------------------------------------------------------------------
  5065. Returns the remainder of the double-precision floating-point value `a'
  5066. with respect to the corresponding value `b'. The operation is performed
  5067. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5068. -------------------------------------------------------------------------------
  5069. *}
  5070. Function float64_rem(a: float64; b : float64) : float64;
  5071. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  5072. Var
  5073. aSign, zSign: flag;
  5074. aExp, bExp, expDiff: int16;
  5075. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  5076. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  5077. sigMean0: sbits32;
  5078. z: float64;
  5079. label invalid;
  5080. Begin
  5081. aSig1 := extractFloat64Frac1( a );
  5082. aSig0 := extractFloat64Frac0( a );
  5083. aExp := extractFloat64Exp( a );
  5084. aSign := extractFloat64Sign( a );
  5085. bSig1 := extractFloat64Frac1( b );
  5086. bSig0 := extractFloat64Frac0( b );
  5087. bExp := extractFloat64Exp( b );
  5088. if ( aExp = $7FF ) then
  5089. Begin
  5090. if ((( aSig0 OR aSig1 )<>0)
  5091. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5092. Begin
  5093. propagateFloat64NaN( a, b, result );
  5094. exit;
  5095. End;
  5096. goto invalid;
  5097. End;
  5098. if ( bExp = $7FF ) then
  5099. Begin
  5100. if ( bSig0 OR bSig1 ) <> 0 then
  5101. Begin
  5102. propagateFloat64NaN( a, b, result );
  5103. exit;
  5104. End;
  5105. result := a;
  5106. exit;
  5107. End;
  5108. if ( bExp = 0 ) then
  5109. Begin
  5110. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5111. Begin
  5112. invalid:
  5113. float_raise( float_flag_invalid );
  5114. z.low := float64_default_nan_low;
  5115. z.high := float64_default_nan_high;
  5116. result := z;
  5117. exit;
  5118. End;
  5119. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5120. End;
  5121. if ( aExp = 0 ) then
  5122. Begin
  5123. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5124. Begin
  5125. result := a;
  5126. exit;
  5127. End;
  5128. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5129. End;
  5130. expDiff := aExp - bExp;
  5131. if ( expDiff < -1 ) then
  5132. Begin
  5133. result := a;
  5134. exit;
  5135. End;
  5136. shortShift64Left(
  5137. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5138. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5139. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5140. if ( q )<>0 then
  5141. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5142. expDiff := expDiff - 32;
  5143. while ( 0 < expDiff ) do
  5144. Begin
  5145. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5146. if 4 < q then
  5147. q:= q - 4
  5148. else
  5149. q := 0;
  5150. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5151. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5152. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5153. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5154. expDiff := expDiff - 29;
  5155. End;
  5156. if ( -32 < expDiff ) then
  5157. Begin
  5158. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5159. if 4 < q then
  5160. q := q - 4
  5161. else
  5162. q := 0;
  5163. q := q shr (- expDiff);
  5164. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5165. expDiff := expDiff + 24;
  5166. if ( expDiff < 0 ) then
  5167. Begin
  5168. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5169. End
  5170. else
  5171. Begin
  5172. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5173. End;
  5174. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5175. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5176. End
  5177. else
  5178. Begin
  5179. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5180. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5181. End;
  5182. Repeat
  5183. alternateASig0 := aSig0;
  5184. alternateASig1 := aSig1;
  5185. Inc(q);
  5186. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5187. Until not ( 0 <= sbits32 (aSig0) );
  5188. add64(
  5189. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5190. if ( ( sigMean0 < 0 )
  5191. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5192. Begin
  5193. aSig0 := alternateASig0;
  5194. aSig1 := alternateASig1;
  5195. End;
  5196. zSign := flag( sbits32 (aSig0) < 0 );
  5197. if ( zSign <> 0 ) then
  5198. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5199. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5200. End;
  5201. {*
  5202. -------------------------------------------------------------------------------
  5203. Returns the square root of the double-precision floating-point value `a'.
  5204. The operation is performed according to the IEC/IEEE Standard for Binary
  5205. Floating-Point Arithmetic.
  5206. -------------------------------------------------------------------------------
  5207. *}
  5208. function float64_sqrt( a: float64 ): float64;
  5209. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5210. Var
  5211. aSign: flag;
  5212. aExp, zExp: int16;
  5213. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5214. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5215. label invalid;
  5216. Begin
  5217. aSig1 := extractFloat64Frac1( a );
  5218. aSig0 := extractFloat64Frac0( a );
  5219. aExp := extractFloat64Exp( a );
  5220. aSign := extractFloat64Sign( a );
  5221. if ( aExp = $7FF ) then
  5222. Begin
  5223. if ( aSig0 OR aSig1 ) <> 0 then
  5224. Begin
  5225. propagateFloat64NaN( a, a, result );
  5226. exit;
  5227. End;
  5228. if ( aSign = 0) then
  5229. Begin
  5230. result := a;
  5231. exit;
  5232. End;
  5233. goto invalid;
  5234. End;
  5235. if ( aSign <> 0 ) then
  5236. Begin
  5237. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5238. Begin
  5239. result := a;
  5240. exit;
  5241. End;
  5242. invalid:
  5243. float_raise( float_flag_invalid );
  5244. result.low := float64_default_nan_low;
  5245. result.high := float64_default_nan_high;
  5246. exit;
  5247. End;
  5248. if ( aExp = 0 ) then
  5249. Begin
  5250. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5251. Begin
  5252. packFloat64( 0, 0, 0, 0, result );
  5253. exit;
  5254. End;
  5255. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5256. End;
  5257. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5258. aSig0 := aSig0 or $00100000;
  5259. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5260. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5261. if ( zSig0 = 0 ) then
  5262. zSig0 := $7FFFFFFF;
  5263. doubleZSig0 := zSig0 + zSig0;
  5264. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5265. mul32To64( zSig0, zSig0, term0, term1 );
  5266. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5267. while ( sbits32 (rem0) < 0 ) do
  5268. Begin
  5269. Dec(zSig0);
  5270. doubleZSig0 := doubleZSig0 - 2;
  5271. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5272. End;
  5273. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5274. if ( ( zSig1 and $1FF ) <= 5 ) then
  5275. Begin
  5276. if ( zSig1 = 0 ) then
  5277. zSig1 := 1;
  5278. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5279. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5280. mul32To64( zSig1, zSig1, term2, term3 );
  5281. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5282. while ( sbits32 (rem1) < 0 ) do
  5283. Begin
  5284. Dec(zSig1);
  5285. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5286. term3 := term3 or 1;
  5287. term2 := term2 or doubleZSig0;
  5288. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5289. End;
  5290. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5291. End;
  5292. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5293. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5294. End;
  5295. {*
  5296. -------------------------------------------------------------------------------
  5297. Returns 1 if the double-precision floating-point value `a' is equal to
  5298. the corresponding value `b', and 0 otherwise. The comparison is performed
  5299. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5300. -------------------------------------------------------------------------------
  5301. *}
  5302. Function float64_eq(a: float64; b: float64): flag;
  5303. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5304. Begin
  5305. if
  5306. (
  5307. ( extractFloat64Exp( a ) = $7FF )
  5308. AND
  5309. (
  5310. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5311. )
  5312. )
  5313. OR (
  5314. ( extractFloat64Exp( b ) = $7FF )
  5315. AND (
  5316. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5317. )
  5318. )
  5319. ) then
  5320. Begin
  5321. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5322. float_raise( float_flag_invalid );
  5323. float64_eq := 0;
  5324. exit;
  5325. End;
  5326. float64_eq := flag(
  5327. ( a.low = b.low )
  5328. AND ( ( a.high = b.high )
  5329. OR ( ( a.low = 0 )
  5330. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5331. ));
  5332. End;
  5333. {*
  5334. -------------------------------------------------------------------------------
  5335. Returns 1 if the double-precision floating-point value `a' is less than
  5336. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5337. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5338. Arithmetic.
  5339. -------------------------------------------------------------------------------
  5340. *}
  5341. Function float64_le(a: float64;b: float64): flag;
  5342. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5343. Var
  5344. aSign, bSign: flag;
  5345. Begin
  5346. if
  5347. (
  5348. ( extractFloat64Exp( a ) = $7FF )
  5349. AND
  5350. (
  5351. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5352. )
  5353. )
  5354. OR (
  5355. ( extractFloat64Exp( b ) = $7FF )
  5356. AND (
  5357. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5358. )
  5359. )
  5360. ) then
  5361. Begin
  5362. float_raise( float_flag_invalid );
  5363. float64_le := 0;
  5364. exit;
  5365. End;
  5366. aSign := extractFloat64Sign( a );
  5367. bSign := extractFloat64Sign( b );
  5368. if ( aSign <> bSign ) then
  5369. Begin
  5370. float64_le := flag(
  5371. (aSign <> 0)
  5372. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5373. = 0 ));
  5374. exit;
  5375. End;
  5376. if aSign <> 0 then
  5377. float64_le := le64( b.high, b.low, a.high, a.low )
  5378. else
  5379. float64_le := le64( a.high, a.low, b.high, b.low );
  5380. End;
  5381. {*
  5382. -------------------------------------------------------------------------------
  5383. Returns 1 if the double-precision floating-point value `a' is less than
  5384. the corresponding value `b', and 0 otherwise. The comparison is performed
  5385. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5386. -------------------------------------------------------------------------------
  5387. *}
  5388. Function float64_lt(a: float64;b: float64): flag;
  5389. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5390. Var
  5391. aSign, bSign: flag;
  5392. Begin
  5393. if
  5394. (
  5395. ( extractFloat64Exp( a ) = $7FF )
  5396. AND
  5397. (
  5398. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5399. )
  5400. )
  5401. OR (
  5402. ( extractFloat64Exp( b ) = $7FF )
  5403. AND (
  5404. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5405. )
  5406. )
  5407. ) then
  5408. Begin
  5409. float_raise( float_flag_invalid );
  5410. float64_lt := 0;
  5411. exit;
  5412. End;
  5413. aSign := extractFloat64Sign( a );
  5414. bSign := extractFloat64Sign( b );
  5415. if ( aSign <> bSign ) then
  5416. Begin
  5417. float64_lt := flag(
  5418. (aSign <> 0)
  5419. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5420. <> 0 ));
  5421. exit;
  5422. End;
  5423. if aSign <> 0 then
  5424. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5425. else
  5426. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5427. End;
  5428. {*
  5429. -------------------------------------------------------------------------------
  5430. Returns 1 if the double-precision floating-point value `a' is equal to
  5431. the corresponding value `b', and 0 otherwise. The invalid exception is
  5432. raised if either operand is a NaN. Otherwise, the comparison is performed
  5433. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5434. -------------------------------------------------------------------------------
  5435. *}
  5436. Function float64_eq_signaling( a: float64; b: float64): flag;
  5437. Begin
  5438. if
  5439. (
  5440. ( extractFloat64Exp( a ) = $7FF )
  5441. AND
  5442. (
  5443. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5444. )
  5445. )
  5446. OR (
  5447. ( extractFloat64Exp( b ) = $7FF )
  5448. AND (
  5449. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5450. )
  5451. )
  5452. ) then
  5453. Begin
  5454. float_raise( float_flag_invalid );
  5455. float64_eq_signaling := 0;
  5456. exit;
  5457. End;
  5458. float64_eq_signaling := flag(
  5459. ( a.low = b.low )
  5460. AND ( ( a.high = b.high )
  5461. OR ( ( a.low = 0 )
  5462. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5463. ));
  5464. End;
  5465. {*
  5466. -------------------------------------------------------------------------------
  5467. Returns 1 if the double-precision floating-point value `a' is less than or
  5468. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5469. cause an exception. Otherwise, the comparison is performed according to the
  5470. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5471. -------------------------------------------------------------------------------
  5472. *}
  5473. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5474. Var
  5475. aSign, bSign : flag;
  5476. Begin
  5477. if
  5478. (
  5479. ( extractFloat64Exp( a ) = $7FF )
  5480. AND
  5481. (
  5482. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5483. )
  5484. )
  5485. OR (
  5486. ( extractFloat64Exp( b ) = $7FF )
  5487. AND (
  5488. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5489. )
  5490. )
  5491. ) then
  5492. Begin
  5493. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5494. float_raise( float_flag_invalid );
  5495. float64_le_quiet := 0;
  5496. exit;
  5497. End;
  5498. aSign := extractFloat64Sign( a );
  5499. bSign := extractFloat64Sign( b );
  5500. if ( aSign <> bSign ) then
  5501. Begin
  5502. float64_le_quiet := flag
  5503. ((aSign <> 0)
  5504. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5505. = 0 ));
  5506. exit;
  5507. End;
  5508. if aSign <> 0 then
  5509. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5510. else
  5511. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5512. End;
  5513. {*
  5514. -------------------------------------------------------------------------------
  5515. Returns 1 if the double-precision floating-point value `a' is less than
  5516. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5517. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5518. Standard for Binary Floating-Point Arithmetic.
  5519. -------------------------------------------------------------------------------
  5520. *}
  5521. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5522. Var
  5523. aSign, bSign: flag;
  5524. Begin
  5525. if
  5526. (
  5527. ( extractFloat64Exp( a ) = $7FF )
  5528. AND
  5529. (
  5530. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5531. )
  5532. )
  5533. OR (
  5534. ( extractFloat64Exp( b ) = $7FF )
  5535. AND (
  5536. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5537. )
  5538. )
  5539. ) then
  5540. Begin
  5541. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5542. float_raise( float_flag_invalid );
  5543. float64_lt_quiet := 0;
  5544. exit;
  5545. End;
  5546. aSign := extractFloat64Sign( a );
  5547. bSign := extractFloat64Sign( b );
  5548. if ( aSign <> bSign ) then
  5549. Begin
  5550. float64_lt_quiet := flag(
  5551. (aSign<>0)
  5552. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5553. <> 0 ));
  5554. exit;
  5555. End;
  5556. If aSign <> 0 then
  5557. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5558. else
  5559. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5560. End;
  5561. {*----------------------------------------------------------------------------
  5562. | Returns the result of converting the 64-bit two's complement integer `a'
  5563. | to the single-precision floating-point format. The conversion is performed
  5564. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5565. *----------------------------------------------------------------------------*}
  5566. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5567. var
  5568. zSign : flag;
  5569. absA : uint64;
  5570. shiftCount: int8;
  5571. Begin
  5572. if ( a = 0 ) then
  5573. begin
  5574. int64_to_float32.float32 := 0;
  5575. exit;
  5576. end;
  5577. if a < 0 then
  5578. zSign := flag(TRUE)
  5579. else
  5580. zSign := flag(FALSE);
  5581. if zSign<>0 then
  5582. absA := -a
  5583. else
  5584. absA := a;
  5585. shiftCount := countLeadingZeros64( absA ) - 40;
  5586. if ( 0 <= shiftCount ) then
  5587. begin
  5588. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5589. end
  5590. else
  5591. begin
  5592. shiftCount := shiftCount + 7;
  5593. if ( shiftCount < 0 ) then
  5594. shift64RightJamming( absA, - shiftCount, absA )
  5595. else
  5596. absA := absA shl shiftCount;
  5597. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5598. end;
  5599. End;
  5600. {*----------------------------------------------------------------------------
  5601. | Returns the result of converting the 64-bit two's complement integer `a'
  5602. | to the single-precision floating-point format. The conversion is performed
  5603. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5604. | Unisgned version.
  5605. *----------------------------------------------------------------------------*}
  5606. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5607. var
  5608. absA : uint64;
  5609. shiftCount: int8;
  5610. Begin
  5611. if ( a = 0 ) then
  5612. begin
  5613. qword_to_float32.float32 := 0;
  5614. exit;
  5615. end;
  5616. absA := a;
  5617. shiftCount := countLeadingZeros64( absA ) - 40;
  5618. if ( 0 <= shiftCount ) then
  5619. begin
  5620. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5621. end
  5622. else
  5623. begin
  5624. shiftCount := shiftCount + 7;
  5625. if ( shiftCount < 0 ) then
  5626. shift64RightJamming( absA, - shiftCount, absA )
  5627. else
  5628. absA := absA shl shiftCount;
  5629. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5630. end;
  5631. End;
  5632. {*----------------------------------------------------------------------------
  5633. | Returns the result of converting the 64-bit two's complement integer `a'
  5634. | to the double-precision floating-point format. The conversion is performed
  5635. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5636. *----------------------------------------------------------------------------*}
  5637. function qword_to_float64( a: qword ): float64;
  5638. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5639. var
  5640. shiftCount: int8;
  5641. Begin
  5642. if ( a = 0 ) then
  5643. result := packFloat64( 0, 0, 0 )
  5644. else
  5645. begin
  5646. shiftCount := countLeadingZeros64(a) - 1;
  5647. { numbers with <= 53 significant bits are converted exactly }
  5648. if (shiftCount > 9) then
  5649. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5650. else if (shiftCount>=0) then
  5651. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5652. else
  5653. begin
  5654. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5655. shift64RightJamming(a, 1, a);
  5656. result := roundAndPackFloat64(0, $43d, a);
  5657. end;
  5658. end;
  5659. End;
  5660. {*----------------------------------------------------------------------------
  5661. | Returns the result of converting the 64-bit two's complement integer `a'
  5662. | to the double-precision floating-point format. The conversion is performed
  5663. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5664. *----------------------------------------------------------------------------*}
  5665. function int64_to_float64( a: int64 ): float64;
  5666. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5667. Begin
  5668. if ( a = 0 ) then
  5669. result := packFloat64( 0, 0, 0 )
  5670. else if (a = int64($8000000000000000)) then
  5671. result := packFloat64( 1, $43e, 0 )
  5672. else if (a < 0) then
  5673. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5674. else
  5675. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5676. End;
  5677. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5678. {*----------------------------------------------------------------------------
  5679. | Returns the result of converting the 64-bit two's complement integer `a'
  5680. | to the extended double-precision floating-point format. The conversion
  5681. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5682. | Arithmetic.
  5683. *----------------------------------------------------------------------------*}
  5684. function int64_to_floatx80( a: int64 ): floatx80;
  5685. var
  5686. zSign: flag;
  5687. absA: uint64;
  5688. shiftCount: int8;
  5689. begin
  5690. if ( a = 0 ) then begin
  5691. result := packFloatx80( 0, 0, 0 );
  5692. exit;
  5693. end;
  5694. zSign := ord( a < 0 );
  5695. if zSign <> 0 then absA := - a else absA := a;
  5696. shiftCount := countLeadingZeros64( absA );
  5697. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5698. end;
  5699. {*----------------------------------------------------------------------------
  5700. | Returns the result of converting the 64-bit two's complement integer `a'
  5701. | to the extended double-precision floating-point format. The conversion
  5702. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5703. | Arithmetic.
  5704. | Unsigned version.
  5705. *----------------------------------------------------------------------------*}
  5706. function qword_to_floatx80( a: qword ): floatx80;
  5707. var
  5708. absA: bits64;
  5709. shiftCount: int8;
  5710. begin
  5711. if ( a = 0 ) then begin
  5712. result := packFloatx80( 0, 0, 0 );
  5713. exit;
  5714. end;
  5715. absA := a;
  5716. shiftCount := countLeadingZeros64( absA );
  5717. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5718. end;
  5719. {$endif FPC_SOFTFLOAT_FLOATX80}
  5720. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5721. {*----------------------------------------------------------------------------
  5722. | Returns the result of converting the 64-bit two's complement integer `a' to
  5723. | the quadruple-precision floating-point format. The conversion is performed
  5724. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5725. *----------------------------------------------------------------------------*}
  5726. function int64_to_float128( a: int64 ): float128;
  5727. var
  5728. zSign: flag;
  5729. absA: uint64;
  5730. shiftCount: int8;
  5731. zExp: int32;
  5732. zSig0, zSig1: bits64;
  5733. begin
  5734. if ( a = 0 ) then begin
  5735. result := packFloat128( 0, 0, 0, 0 );
  5736. exit;
  5737. end;
  5738. zSign := ord( a < 0 );
  5739. if zSign <> 0 then absA := - a else absA := a;
  5740. shiftCount := countLeadingZeros64( absA ) + 49;
  5741. zExp := $406E - shiftCount;
  5742. if ( 64 <= shiftCount ) then begin
  5743. zSig1 := 0;
  5744. zSig0 := absA;
  5745. dec( shiftCount, 64 );
  5746. end
  5747. else begin
  5748. zSig1 := absA;
  5749. zSig0 := 0;
  5750. end;
  5751. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5752. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5753. end;
  5754. {*----------------------------------------------------------------------------
  5755. | Returns the result of converting the 64-bit two's complement integer `a' to
  5756. | the quadruple-precision floating-point format. The conversion is performed
  5757. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5758. | Unsigned version.
  5759. *----------------------------------------------------------------------------*}
  5760. function qword_to_float128( a: qword ): float128;
  5761. var
  5762. absA: bits64;
  5763. shiftCount: int8;
  5764. zExp: int32;
  5765. zSig0, zSig1: bits64;
  5766. begin
  5767. if ( a = 0 ) then begin
  5768. result := packFloat128( 0, 0, 0, 0 );
  5769. exit;
  5770. end;
  5771. absA := a;
  5772. shiftCount := countLeadingZeros64( absA ) + 49;
  5773. zExp := $406E - shiftCount;
  5774. if ( 64 <= shiftCount ) then begin
  5775. zSig1 := 0;
  5776. zSig0 := absA;
  5777. dec( shiftCount, 64 );
  5778. end
  5779. else begin
  5780. zSig1 := absA;
  5781. zSig0 := 0;
  5782. end;
  5783. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5784. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5785. end;
  5786. {$endif FPC_SOFTFLOAT_FLOAT128}
  5787. {*----------------------------------------------------------------------------
  5788. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5789. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5790. | Otherwise, returns 0.
  5791. *----------------------------------------------------------------------------*}
  5792. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5793. begin
  5794. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5795. end;
  5796. {*----------------------------------------------------------------------------
  5797. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5798. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5799. | Otherwise, returns 0.
  5800. *----------------------------------------------------------------------------*}
  5801. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5802. begin
  5803. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5804. end;
  5805. {*----------------------------------------------------------------------------
  5806. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5807. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5808. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5809. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5810. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5811. | the most-significant bit of the extra result, and the other 63 bits of the
  5812. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5813. | were all zero. This extra result is stored in the location pointed to by
  5814. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5815. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5816. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5817. | fixed-point value is shifted right by the number of bits given in `count',
  5818. | and the integer part of the result is returned at the locations pointed to
  5819. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5820. | corrupted as described above, and is returned at the location pointed to by
  5821. | `z2Ptr'.)
  5822. *----------------------------------------------------------------------------*}
  5823. procedure shift128ExtraRightJamming(
  5824. a0: bits64;
  5825. a1: bits64;
  5826. a2: bits64;
  5827. count: int16;
  5828. var z0Ptr: bits64;
  5829. var z1Ptr: bits64;
  5830. var z2Ptr: bits64);
  5831. var
  5832. z0, z1, z2: bits64;
  5833. negCount: int8;
  5834. begin
  5835. negCount := ( - count ) and 63;
  5836. if ( count = 0 ) then
  5837. begin
  5838. z2 := a2;
  5839. z1 := a1;
  5840. z0 := a0;
  5841. end
  5842. else begin
  5843. if ( count < 64 ) then
  5844. begin
  5845. z2 := a1 shl negCount;
  5846. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5847. z0 := a0 shr count;
  5848. end
  5849. else begin
  5850. if ( count = 64 ) then
  5851. begin
  5852. z2 := a1;
  5853. z1 := a0;
  5854. end
  5855. else begin
  5856. a2 := a2 or a1;
  5857. if ( count < 128 ) then
  5858. begin
  5859. z2 := a0 shl negCount;
  5860. z1 := a0 shr ( count and 63 );
  5861. end
  5862. else begin
  5863. if ( count = 128 ) then
  5864. z2 := a0
  5865. else
  5866. z2 := ord( a0 <> 0 );
  5867. z1 := 0;
  5868. end;
  5869. end;
  5870. z0 := 0;
  5871. end;
  5872. z2 := z2 or ord( a2 <> 0 );
  5873. end;
  5874. z2Ptr := z2;
  5875. z1Ptr := z1;
  5876. z0Ptr := z0;
  5877. end;
  5878. {*----------------------------------------------------------------------------
  5879. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5880. | _plus_ the number of bits given in `count'. The shifted result is at most
  5881. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5882. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5883. | shifted off is the most-significant bit of the extra result, and the other
  5884. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5885. | bits shifted off were all zero. This extra result is stored in the location
  5886. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5887. | (This routine makes more sense if `a0' and `a1' are considered to form
  5888. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5889. | point value is shifted right by the number of bits given in `count', and
  5890. | the integer part of the result is returned at the location pointed to by
  5891. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5892. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5893. *----------------------------------------------------------------------------*}
  5894. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5895. var
  5896. z0, z1: bits64;
  5897. negCount: int8;
  5898. begin
  5899. negCount := ( - count ) and 63;
  5900. if ( count = 0 ) then
  5901. begin
  5902. z1 := a1;
  5903. z0 := a0;
  5904. end
  5905. else if ( count < 64 ) then
  5906. begin
  5907. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5908. z0 := a0 shr count;
  5909. end
  5910. else begin
  5911. if ( count = 64 ) then
  5912. begin
  5913. z1 := a0 or ord( a1 <> 0 );
  5914. end
  5915. else begin
  5916. z1 := ord( ( a0 or a1 ) <> 0 );
  5917. end;
  5918. z0 := 0;
  5919. end;
  5920. z1Ptr := z1;
  5921. z0Ptr := z0;
  5922. end;
  5923. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5924. {*----------------------------------------------------------------------------
  5925. | Returns the fraction bits of the extended double-precision floating-point
  5926. | value `a'.
  5927. *----------------------------------------------------------------------------*}
  5928. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5929. begin
  5930. result:=a.low;
  5931. end;
  5932. {*----------------------------------------------------------------------------
  5933. | Returns the exponent bits of the extended double-precision floating-point
  5934. | value `a'.
  5935. *----------------------------------------------------------------------------*}
  5936. function extractFloatx80Exp(a : floatx80): int32;inline;
  5937. begin
  5938. result:=a.high and $7FFF;
  5939. end;
  5940. {*----------------------------------------------------------------------------
  5941. | Returns the sign bit of the extended double-precision floating-point value
  5942. | `a'.
  5943. *----------------------------------------------------------------------------*}
  5944. function extractFloatx80Sign(a : floatx80): flag;inline;
  5945. begin
  5946. result:=a.high shr 15;
  5947. end;
  5948. {*----------------------------------------------------------------------------
  5949. | Normalizes the subnormal extended double-precision floating-point value
  5950. | represented by the denormalized significand `aSig'. The normalized exponent
  5951. | and significand are stored at the locations pointed to by `zExpPtr' and
  5952. | `zSigPtr', respectively.
  5953. *----------------------------------------------------------------------------*}
  5954. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5955. var
  5956. shiftCount: int8;
  5957. begin
  5958. shiftCount := countLeadingZeros64( aSig );
  5959. zSigPtr := aSig shl shiftCount;
  5960. zExpPtr := 1 - shiftCount;
  5961. end;
  5962. {*----------------------------------------------------------------------------
  5963. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5964. | extended double-precision floating-point value, returning the result.
  5965. *----------------------------------------------------------------------------*}
  5966. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5967. var
  5968. z: floatx80;
  5969. begin
  5970. z.low := zSig;
  5971. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5972. result:=z;
  5973. end;
  5974. {*----------------------------------------------------------------------------
  5975. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5976. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5977. | and returns the proper extended double-precision floating-point value
  5978. | corresponding to the abstract input. Ordinarily, the abstract value is
  5979. | rounded and packed into the extended double-precision format, with the
  5980. | inexact exception raised if the abstract input cannot be represented
  5981. | exactly. However, if the abstract value is too large, the overflow and
  5982. | inexact exceptions are raised and an infinity or maximal finite value is
  5983. | returned. If the abstract value is too small, the input value is rounded to
  5984. | a subnormal number, and the underflow and inexact exceptions are raised if
  5985. | the abstract input cannot be represented exactly as a subnormal extended
  5986. | double-precision floating-point number.
  5987. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5988. | number of bits as single or double precision, respectively. Otherwise, the
  5989. | result is rounded to the full precision of the extended double-precision
  5990. | format.
  5991. | The input significand must be normalized or smaller. If the input
  5992. | significand is not normalized, `zExp' must be 0; in that case, the result
  5993. | returned is a subnormal number, and it must not require rounding. The
  5994. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5995. | Floating-Point Arithmetic.
  5996. *----------------------------------------------------------------------------*}
  5997. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5998. var
  5999. roundingMode: TFPURoundingMode;
  6000. roundNearestEven, increment, isTiny: flag;
  6001. roundIncrement, roundMask, roundBits: int64;
  6002. label
  6003. precision80, overflow;
  6004. begin
  6005. roundingMode := softfloat_rounding_mode;
  6006. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  6007. if ( roundingPrecision = 80 ) then
  6008. goto precision80;
  6009. if ( roundingPrecision = 64 ) then
  6010. begin
  6011. roundIncrement := int64( $0000000000000400 );
  6012. roundMask := int64( $00000000000007FF );
  6013. end
  6014. else if ( roundingPrecision = 32 ) then
  6015. begin
  6016. roundIncrement := int64( $0000008000000000 );
  6017. roundMask := int64( $000000FFFFFFFFFF );
  6018. end
  6019. else begin
  6020. goto precision80;
  6021. end;
  6022. zSig0 := zSig0 or ord( zSig1 <> 0 );
  6023. if ( not (roundNearestEven<>0) ) then
  6024. begin
  6025. if ( roundingMode = float_round_to_zero ) then
  6026. begin
  6027. roundIncrement := 0;
  6028. end
  6029. else begin
  6030. roundIncrement := roundMask;
  6031. if ( zSign<>0 ) then
  6032. begin
  6033. if ( roundingMode = float_round_up ) then
  6034. roundIncrement := 0;
  6035. end
  6036. else begin
  6037. if ( roundingMode = float_round_down ) then
  6038. roundIncrement := 0;
  6039. end;
  6040. end;
  6041. end;
  6042. roundBits := zSig0 and roundMask;
  6043. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6044. if ( ( $7FFE < zExp )
  6045. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6046. ) then begin
  6047. goto overflow;
  6048. end;
  6049. if ( zExp <= 0 ) then begin
  6050. isTiny := ord (
  6051. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6052. or ( zExp < 0 )
  6053. or ( zSig0 <= zSig0 + roundIncrement ) );
  6054. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6055. zExp := 0;
  6056. roundBits := zSig0 and roundMask;
  6057. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6058. if ( roundBits <> 0 ) then set_inexact_flag;
  6059. inc( zSig0, roundIncrement );
  6060. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6061. roundIncrement := roundMask + 1;
  6062. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6063. roundMask := roundMask or roundIncrement;
  6064. end;
  6065. zSig0 := zSig0 and not roundMask;
  6066. result:=packFloatx80( zSign, zExp, zSig0 );
  6067. exit;
  6068. end;
  6069. end;
  6070. if ( roundBits <> 0 ) then set_inexact_flag;
  6071. inc( zSig0, roundIncrement );
  6072. if ( zSig0 < roundIncrement ) then begin
  6073. inc(zExp);
  6074. zSig0 := bits64( $8000000000000000 );
  6075. end;
  6076. roundIncrement := roundMask + 1;
  6077. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6078. roundMask := roundMask or roundIncrement;
  6079. end;
  6080. zSig0 := zSig0 and not roundMask;
  6081. if ( zSig0 = 0 ) then zExp := 0;
  6082. result:=packFloatx80( zSign, zExp, zSig0 );
  6083. exit;
  6084. precision80:
  6085. increment := ord ( sbits64( zSig1 ) < 0 );
  6086. if ( roundNearestEven = 0 ) then begin
  6087. if ( roundingMode = float_round_to_zero ) then begin
  6088. increment := 0;
  6089. end
  6090. else begin
  6091. if ( zSign <> 0 ) then begin
  6092. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6093. end
  6094. else begin
  6095. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6096. end;
  6097. end;
  6098. end;
  6099. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6100. if ( ( $7FFE < zExp )
  6101. or ( ( zExp = $7FFE )
  6102. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6103. and ( increment <> 0 )
  6104. )
  6105. ) then begin
  6106. roundMask := 0;
  6107. overflow:
  6108. float_raise( [float_flag_overflow,float_flag_inexact] );
  6109. if ( ( roundingMode = float_round_to_zero )
  6110. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6111. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6112. ) then begin
  6113. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6114. exit;
  6115. end;
  6116. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6117. exit;
  6118. end;
  6119. if ( zExp <= 0 ) then begin
  6120. isTiny := ord(
  6121. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6122. or ( zExp < 0 )
  6123. or ( increment = 0 )
  6124. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6125. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6126. zExp := 0;
  6127. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6128. if ( zSig1 <> 0 ) then set_inexact_flag;
  6129. if ( roundNearestEven <> 0 ) then begin
  6130. increment := ord( sbits64( zSig1 ) < 0 );
  6131. end
  6132. else begin
  6133. if ( zSign <> 0 ) then begin
  6134. increment := ord( roundingMode = float_round_down ) and zSig1;
  6135. end
  6136. else begin
  6137. increment := ord( roundingMode = float_round_up ) and zSig1;
  6138. end;
  6139. end;
  6140. if ( increment <> 0 ) then begin
  6141. inc(zSig0);
  6142. zSig0 :=
  6143. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6144. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6145. end;
  6146. result:=packFloatx80( zSign, zExp, zSig0 );
  6147. exit;
  6148. end;
  6149. end;
  6150. if ( zSig1 <> 0 ) then set_inexact_flag;
  6151. if ( increment <> 0 ) then begin
  6152. inc(zSig0);
  6153. if ( zSig0 = 0 ) then begin
  6154. inc(zExp);
  6155. zSig0 := bits64( $8000000000000000 );
  6156. end
  6157. else begin
  6158. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6159. end;
  6160. end
  6161. else begin
  6162. if ( zSig0 = 0 ) then zExp := 0;
  6163. end;
  6164. result:=packFloatx80( zSign, zExp, zSig0 );
  6165. end;
  6166. {*----------------------------------------------------------------------------
  6167. | Takes an abstract floating-point value having sign `zSign', exponent
  6168. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6169. | and returns the proper extended double-precision floating-point value
  6170. | corresponding to the abstract input. This routine is just like
  6171. | `roundAndPackFloatx80' except that the input significand does not have to be
  6172. | normalized.
  6173. *----------------------------------------------------------------------------*}
  6174. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6175. var
  6176. shiftCount: int8;
  6177. begin
  6178. if ( zSig0 = 0 ) then begin
  6179. zSig0 := zSig1;
  6180. zSig1 := 0;
  6181. dec( zExp, 64 );
  6182. end;
  6183. shiftCount := countLeadingZeros64( zSig0 );
  6184. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6185. zExp := zExp - shiftCount;
  6186. result :=
  6187. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6188. end;
  6189. {*----------------------------------------------------------------------------
  6190. | Returns the result of converting the extended double-precision floating-
  6191. | point value `a' to the 32-bit two's complement integer format. The
  6192. | conversion is performed according to the IEC/IEEE Standard for Binary
  6193. | Floating-Point Arithmetic---which means in particular that the conversion
  6194. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6195. | largest positive integer is returned. Otherwise, if the conversion
  6196. | overflows, the largest integer with the same sign as `a' is returned.
  6197. *----------------------------------------------------------------------------*}
  6198. function floatx80_to_int32(a: floatx80): int32;
  6199. var
  6200. aSign: flag;
  6201. aExp, shiftCount: int32;
  6202. aSig: bits64;
  6203. begin
  6204. aSig := extractFloatx80Frac( a );
  6205. aExp := extractFloatx80Exp( a );
  6206. aSign := extractFloatx80Sign( a );
  6207. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6208. shiftCount := $4037 - aExp;
  6209. if ( shiftCount <= 0 ) then shiftCount := 1;
  6210. shift64RightJamming( aSig, shiftCount, aSig );
  6211. result := roundAndPackInt32( aSign, aSig );
  6212. end;
  6213. {*----------------------------------------------------------------------------
  6214. | Returns the result of converting the extended double-precision floating-
  6215. | point value `a' to the 32-bit two's complement integer format. The
  6216. | conversion is performed according to the IEC/IEEE Standard for Binary
  6217. | Floating-Point Arithmetic, except that the conversion is always rounded
  6218. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6219. | Otherwise, if the conversion overflows, the largest integer with the same
  6220. | sign as `a' is returned.
  6221. *----------------------------------------------------------------------------*}
  6222. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6223. var
  6224. aSign: flag;
  6225. aExp, shiftCount: int32;
  6226. aSig, savedASig: bits64;
  6227. z: int32;
  6228. label
  6229. invalid;
  6230. begin
  6231. aSig := extractFloatx80Frac( a );
  6232. aExp := extractFloatx80Exp( a );
  6233. aSign := extractFloatx80Sign( a );
  6234. if ( $401E < aExp ) then begin
  6235. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6236. goto invalid;
  6237. end
  6238. else if ( aExp < $3FFF ) then begin
  6239. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6240. result := 0;
  6241. exit;
  6242. end;
  6243. shiftCount := $403E - aExp;
  6244. savedASig := aSig;
  6245. aSig := aSig shr shiftCount;
  6246. z := aSig;
  6247. if ( aSign <> 0 ) then z := - z;
  6248. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6249. invalid:
  6250. float_raise( float_flag_invalid );
  6251. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6252. exit;
  6253. end;
  6254. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6255. set_inexact_flag;
  6256. end;
  6257. result := z;
  6258. end;
  6259. {*----------------------------------------------------------------------------
  6260. | Returns the result of converting the extended double-precision floating-
  6261. | point value `a' to the 64-bit two's complement integer format. The
  6262. | conversion is performed according to the IEC/IEEE Standard for Binary
  6263. | Floating-Point Arithmetic---which means in particular that the conversion
  6264. | is rounded according to the current rounding mode. If `a' is a NaN,
  6265. | the largest positive integer is returned. Otherwise, if the conversion
  6266. | overflows, the largest integer with the same sign as `a' is returned.
  6267. *----------------------------------------------------------------------------*}
  6268. function floatx80_to_int64(a: floatx80): int64;
  6269. var
  6270. aSign: flag;
  6271. aExp, shiftCount: int32;
  6272. aSig, aSigExtra: bits64;
  6273. begin
  6274. aSig := extractFloatx80Frac( a );
  6275. aExp := extractFloatx80Exp( a );
  6276. aSign := extractFloatx80Sign( a );
  6277. shiftCount := $403E - aExp;
  6278. if ( shiftCount <= 0 ) then begin
  6279. if ( shiftCount <> 0 ) then begin
  6280. float_raise( float_flag_invalid );
  6281. if ( ( aSign = 0 )
  6282. or ( ( aExp = $7FFF )
  6283. and ( aSig <> bits64( $8000000000000000 ) ) )
  6284. ) then begin
  6285. result := $7FFFFFFFFFFFFFFF;
  6286. exit;
  6287. end;
  6288. result := $8000000000000000;
  6289. exit;
  6290. end;
  6291. aSigExtra := 0;
  6292. end
  6293. else begin
  6294. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6295. end;
  6296. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6297. end;
  6298. {*----------------------------------------------------------------------------
  6299. | Returns the result of converting the extended double-precision floating-
  6300. | point value `a' to the 64-bit two's complement integer format. The
  6301. | conversion is performed according to the IEC/IEEE Standard for Binary
  6302. | Floating-Point Arithmetic, except that the conversion is always rounded
  6303. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6304. | Otherwise, if the conversion overflows, the largest integer with the same
  6305. | sign as `a' is returned.
  6306. *----------------------------------------------------------------------------*}
  6307. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6308. var
  6309. aSign: flag;
  6310. aExp, shiftCount: int32;
  6311. aSig: bits64;
  6312. z: int64;
  6313. begin
  6314. aSig := extractFloatx80Frac( a );
  6315. aExp := extractFloatx80Exp( a );
  6316. aSign := extractFloatx80Sign( a );
  6317. shiftCount := aExp - $403E;
  6318. if ( 0 <= shiftCount ) then begin
  6319. aSig := $7FFFFFFFFFFFFFFF;
  6320. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6321. float_raise( float_flag_invalid );
  6322. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6323. result := $7FFFFFFFFFFFFFFF;
  6324. exit;
  6325. end;
  6326. end;
  6327. result := $8000000000000000;
  6328. exit;
  6329. end
  6330. else if ( aExp < $3FFF ) then begin
  6331. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6332. result := 0;
  6333. exit;
  6334. end;
  6335. z := aSig shr ( - shiftCount );
  6336. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6337. set_inexact_flag;
  6338. end;
  6339. if ( aSign <> 0 ) then z := - z;
  6340. result := z;
  6341. end;
  6342. {*----------------------------------------------------------------------------
  6343. | The pattern for a default generated extended double-precision NaN. The
  6344. | `high' and `low' values hold the most- and least-significant bits,
  6345. | respectively.
  6346. *----------------------------------------------------------------------------*}
  6347. const
  6348. floatx80_default_nan_high = $FFFF;
  6349. floatx80_default_nan_low = bits64( $C000000000000000 );
  6350. {*----------------------------------------------------------------------------
  6351. | Returns 1 if the extended double-precision floating-point value `a' is a
  6352. | signaling NaN; otherwise returns 0.
  6353. *----------------------------------------------------------------------------*}
  6354. function floatx80_is_signaling_nan(a : floatx80): flag;
  6355. var
  6356. aLow: bits64;
  6357. begin
  6358. aLow := a.low and not $4000000000000000;
  6359. result := ord(
  6360. ( a.high and $7FFF = $7FFF )
  6361. and ( bits64( aLow shl 1 ) <> 0 )
  6362. and ( a.low = aLow ) );
  6363. end;
  6364. {*----------------------------------------------------------------------------
  6365. | Returns the result of converting the extended double-precision floating-
  6366. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6367. | invalid exception is raised.
  6368. *----------------------------------------------------------------------------*}
  6369. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6370. var
  6371. z: commonNaNT;
  6372. begin
  6373. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6374. z.sign := a.high shr 15;
  6375. z.low := 0;
  6376. z.high := a.low shl 1;
  6377. result := z;
  6378. end;
  6379. {*----------------------------------------------------------------------------
  6380. | Returns 1 if the extended double-precision floating-point value `a' is a
  6381. | NaN; otherwise returns 0.
  6382. *----------------------------------------------------------------------------*}
  6383. function floatx80_is_nan(a : floatx80 ): flag;
  6384. begin
  6385. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6386. end;
  6387. {*----------------------------------------------------------------------------
  6388. | Takes two extended double-precision floating-point values `a' and `b', one
  6389. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6390. | `b' is a signaling NaN, the invalid exception is raised.
  6391. *----------------------------------------------------------------------------*}
  6392. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6393. var
  6394. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6395. label
  6396. returnLargerSignificand;
  6397. begin
  6398. aIsNaN := floatx80_is_nan( a );
  6399. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6400. bIsNaN := floatx80_is_nan( b );
  6401. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6402. a.low := a.low or $C000000000000000;
  6403. b.low := b.low or $C000000000000000;
  6404. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6405. if aIsSignalingNaN <> 0 then begin
  6406. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6407. if bIsNaN <> 0 then result := b else result := a;
  6408. exit;
  6409. end
  6410. else if aIsNaN <>0 then begin
  6411. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6412. result := a;
  6413. exit;
  6414. end;
  6415. returnLargerSignificand:
  6416. if ( a.low < b.low ) then begin
  6417. result := b;
  6418. exit;
  6419. end;
  6420. if ( b.low < a.low ) then begin
  6421. result := a;
  6422. exit;
  6423. end;
  6424. if a.high < b.high then result := a else result := b;
  6425. exit;
  6426. end
  6427. else
  6428. result := b;
  6429. end;
  6430. {*----------------------------------------------------------------------------
  6431. | Returns the result of converting the extended double-precision floating-
  6432. | point value `a' to the single-precision floating-point format. The
  6433. | conversion is performed according to the IEC/IEEE Standard for Binary
  6434. | Floating-Point Arithmetic.
  6435. *----------------------------------------------------------------------------*}
  6436. function floatx80_to_float32(a: floatx80): float32;
  6437. var
  6438. aSign: flag;
  6439. aExp: int32;
  6440. aSig: bits64;
  6441. begin
  6442. aSig := extractFloatx80Frac( a );
  6443. aExp := extractFloatx80Exp( a );
  6444. aSign := extractFloatx80Sign( a );
  6445. if ( aExp = $7FFF ) then begin
  6446. if bits64( aSig shl 1 ) <> 0 then begin
  6447. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6448. exit;
  6449. end;
  6450. result := packFloat32( aSign, $FF, 0 );
  6451. exit;
  6452. end;
  6453. shift64RightJamming( aSig, 33, aSig );
  6454. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6455. result := roundAndPackFloat32( aSign, aExp, aSig );
  6456. end;
  6457. {*----------------------------------------------------------------------------
  6458. | Returns the result of converting the extended double-precision floating-
  6459. | point value `a' to the double-precision floating-point format. The
  6460. | conversion is performed according to the IEC/IEEE Standard for Binary
  6461. | Floating-Point Arithmetic.
  6462. *----------------------------------------------------------------------------*}
  6463. function floatx80_to_float64(a: floatx80): float64;
  6464. var
  6465. aSign: flag;
  6466. aExp: int32;
  6467. aSig, zSig: bits64;
  6468. begin
  6469. aSig := extractFloatx80Frac( a );
  6470. aExp := extractFloatx80Exp( a );
  6471. aSign := extractFloatx80Sign( a );
  6472. if ( aExp = $7FFF ) then begin
  6473. if bits64( aSig shl 1 ) <> 0 then begin
  6474. result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
  6475. exit;
  6476. end;
  6477. result := packFloat64( aSign, $7FF, 0 );
  6478. exit;
  6479. end;
  6480. shift64RightJamming( aSig, 1, zSig );
  6481. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6482. result := roundAndPackFloat64( aSign, aExp, zSig );
  6483. end;
  6484. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6485. {*----------------------------------------------------------------------------
  6486. | Returns the result of converting the extended double-precision floating-
  6487. | point value `a' to the quadruple-precision floating-point format. The
  6488. | conversion is performed according to the IEC/IEEE Standard for Binary
  6489. | Floating-Point Arithmetic.
  6490. *----------------------------------------------------------------------------*}
  6491. function floatx80_to_float128(a: floatx80): float128;
  6492. var
  6493. aSign: flag;
  6494. aExp: int16;
  6495. aSig, zSig0, zSig1: bits64;
  6496. begin
  6497. aSig := extractFloatx80Frac( a );
  6498. aExp := extractFloatx80Exp( a );
  6499. aSign := extractFloatx80Sign( a );
  6500. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6501. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6502. exit;
  6503. end;
  6504. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6505. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6506. end;
  6507. {$endif FPC_SOFTFLOAT_FLOAT128}
  6508. {*----------------------------------------------------------------------------
  6509. | Rounds the extended double-precision floating-point value `a' to an integer,
  6510. | and Returns the result as an extended quadruple-precision floating-point
  6511. | value. The operation is performed according to the IEC/IEEE Standard for
  6512. | Binary Floating-Point Arithmetic.
  6513. *----------------------------------------------------------------------------*}
  6514. function floatx80_round_to_int(a: floatx80): floatx80;
  6515. var
  6516. aSign: flag;
  6517. aExp: int32;
  6518. lastBitMask, roundBitsMask: bits64;
  6519. roundingMode: TFPURoundingMode;
  6520. z: floatx80;
  6521. begin
  6522. aExp := extractFloatx80Exp( a );
  6523. if ( $403E <= aExp ) then begin
  6524. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6525. result := propagateFloatx80NaN( a, a );
  6526. exit;
  6527. end;
  6528. result := a;
  6529. exit;
  6530. end;
  6531. if ( aExp < $3FFF ) then begin
  6532. if ( ( aExp = 0 )
  6533. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6534. result := a;
  6535. exit;
  6536. end;
  6537. set_inexact_flag;
  6538. aSign := extractFloatx80Sign( a );
  6539. case softfloat_rounding_mode of
  6540. float_round_nearest_even:
  6541. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6542. ) then begin
  6543. result :=
  6544. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6545. exit;
  6546. end;
  6547. float_round_down: begin
  6548. if aSign <> 0 then
  6549. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6550. else
  6551. result := packFloatx80( 0, 0, 0 );
  6552. exit;
  6553. end;
  6554. float_round_up: begin
  6555. if aSign <> 0 then
  6556. result := packFloatx80( 1, 0, 0 )
  6557. else
  6558. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6559. exit;
  6560. end;
  6561. end;
  6562. result := packFloatx80( aSign, 0, 0 );
  6563. exit;
  6564. end;
  6565. lastBitMask := 1;
  6566. lastBitMask := lastBitMask shl ( $403E - aExp );
  6567. roundBitsMask := lastBitMask - 1;
  6568. z := a;
  6569. roundingMode := softfloat_rounding_mode;
  6570. if ( roundingMode = float_round_nearest_even ) then begin
  6571. inc( z.low, lastBitMask shr 1 );
  6572. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6573. end
  6574. else if ( roundingMode <> float_round_to_zero ) then begin
  6575. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6576. inc( z.low, roundBitsMask );
  6577. end;
  6578. end;
  6579. z.low := z.low and not roundBitsMask;
  6580. if ( z.low = 0 ) then begin
  6581. inc(z.high);
  6582. z.low := bits64( $8000000000000000 );
  6583. end;
  6584. if ( z.low <> a.low ) then set_inexact_flag;
  6585. result := z;
  6586. end;
  6587. {*----------------------------------------------------------------------------
  6588. | Returns the result of adding the absolute values of the extended double-
  6589. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6590. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6591. | The addition is performed according to the IEC/IEEE Standard for Binary
  6592. | Floating-Point Arithmetic.
  6593. *----------------------------------------------------------------------------*}
  6594. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6595. var
  6596. aExp, bExp, zExp: int32;
  6597. aSig, bSig, zSig0, zSig1: bits64;
  6598. expDiff: int32;
  6599. label
  6600. shiftRight1, roundAndPack;
  6601. begin
  6602. aSig := extractFloatx80Frac( a );
  6603. aExp := extractFloatx80Exp( a );
  6604. bSig := extractFloatx80Frac( b );
  6605. bExp := extractFloatx80Exp( b );
  6606. expDiff := aExp - bExp;
  6607. if ( 0 < expDiff ) then begin
  6608. if ( aExp = $7FFF ) then begin
  6609. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6610. result := propagateFloatx80NaN( a, b );
  6611. exit;
  6612. end;
  6613. result := a;
  6614. exit;
  6615. end;
  6616. if ( bExp = 0 ) then dec(expDiff);
  6617. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6618. zExp := aExp;
  6619. end
  6620. else if ( expDiff < 0 ) then begin
  6621. if ( bExp = $7FFF ) then begin
  6622. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6623. result := propagateFloatx80NaN( a, b );
  6624. exit;
  6625. end;
  6626. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6627. exit;
  6628. end;
  6629. if ( aExp = 0 ) then inc(expDiff);
  6630. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6631. zExp := bExp;
  6632. end
  6633. else begin
  6634. if ( aExp = $7FFF ) then begin
  6635. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6636. result := propagateFloatx80NaN( a, b );
  6637. exit;
  6638. end;
  6639. result := a;
  6640. exit;
  6641. end;
  6642. zSig1 := 0;
  6643. zSig0 := aSig + bSig;
  6644. if ( aExp = 0 ) then begin
  6645. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6646. goto roundAndPack;
  6647. end;
  6648. zExp := aExp;
  6649. goto shiftRight1;
  6650. end;
  6651. zSig0 := aSig + bSig;
  6652. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6653. shiftRight1:
  6654. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6655. zSig0 := zSig0 or $8000000000000000;
  6656. inc(zExp);
  6657. roundAndPack:
  6658. result :=
  6659. roundAndPackFloatx80(
  6660. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6661. end;
  6662. {*----------------------------------------------------------------------------
  6663. | Returns the result of subtracting the absolute values of the extended
  6664. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6665. | difference is negated before being returned. `zSign' is ignored if the
  6666. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6667. | Standard for Binary Floating-Point Arithmetic.
  6668. *----------------------------------------------------------------------------*}
  6669. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6670. var
  6671. aExp, bExp, zExp: int32;
  6672. aSig, bSig, zSig0, zSig1: bits64;
  6673. expDiff: int32;
  6674. z: floatx80;
  6675. label
  6676. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6677. begin
  6678. aSig := extractFloatx80Frac( a );
  6679. aExp := extractFloatx80Exp( a );
  6680. bSig := extractFloatx80Frac( b );
  6681. bExp := extractFloatx80Exp( b );
  6682. expDiff := aExp - bExp;
  6683. if ( 0 < expDiff ) then goto aExpBigger;
  6684. if ( expDiff < 0 ) then goto bExpBigger;
  6685. if ( aExp = $7FFF ) then begin
  6686. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6687. result := propagateFloatx80NaN( a, b );
  6688. exit;
  6689. end;
  6690. float_raise( float_flag_invalid );
  6691. z.low := floatx80_default_nan_low;
  6692. z.high := floatx80_default_nan_high;
  6693. result := z;
  6694. exit;
  6695. end;
  6696. if ( aExp = 0 ) then begin
  6697. aExp := 1;
  6698. bExp := 1;
  6699. end;
  6700. zSig1 := 0;
  6701. if ( bSig < aSig ) then goto aBigger;
  6702. if ( aSig < bSig ) then goto bBigger;
  6703. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6704. exit;
  6705. bExpBigger:
  6706. if ( bExp = $7FFF ) then begin
  6707. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6708. result := propagateFloatx80NaN( a, b );
  6709. exit;
  6710. end;
  6711. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6712. exit;
  6713. end;
  6714. if ( aExp = 0 ) then inc(expDiff);
  6715. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6716. bBigger:
  6717. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6718. zExp := bExp;
  6719. zSign := zSign xor 1;
  6720. goto normalizeRoundAndPack;
  6721. aExpBigger:
  6722. if ( aExp = $7FFF ) then begin
  6723. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6724. result := propagateFloatx80NaN( a, b );
  6725. exit;
  6726. end;
  6727. result := a;
  6728. exit;
  6729. end;
  6730. if ( bExp = 0 ) then dec(expDiff);
  6731. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6732. aBigger:
  6733. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6734. zExp := aExp;
  6735. normalizeRoundAndPack:
  6736. result :=
  6737. normalizeRoundAndPackFloatx80(
  6738. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6739. end;
  6740. {*----------------------------------------------------------------------------
  6741. | Returns the result of adding the extended double-precision floating-point
  6742. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6743. | Standard for Binary Floating-Point Arithmetic.
  6744. *----------------------------------------------------------------------------*}
  6745. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6746. var
  6747. aSign, bSign: flag;
  6748. begin
  6749. aSign := extractFloatx80Sign( a );
  6750. bSign := extractFloatx80Sign( b );
  6751. if ( aSign = bSign ) then begin
  6752. result := addFloatx80Sigs( a, b, aSign );
  6753. end
  6754. else begin
  6755. result := subFloatx80Sigs( a, b, aSign );
  6756. end;
  6757. end;
  6758. {*----------------------------------------------------------------------------
  6759. | Returns the result of subtracting the extended double-precision floating-
  6760. | point values `a' and `b'. The operation is performed according to the
  6761. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6762. *----------------------------------------------------------------------------*}
  6763. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6764. var
  6765. aSign, bSign: flag;
  6766. begin
  6767. aSign := extractFloatx80Sign( a );
  6768. bSign := extractFloatx80Sign( b );
  6769. if ( aSign = bSign ) then begin
  6770. result := subFloatx80Sigs( a, b, aSign );
  6771. end
  6772. else begin
  6773. result := addFloatx80Sigs( a, b, aSign );
  6774. end;
  6775. end;
  6776. {*----------------------------------------------------------------------------
  6777. | Returns the result of multiplying the extended double-precision floating-
  6778. | point values `a' and `b'. The operation is performed according to the
  6779. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6780. *----------------------------------------------------------------------------*}
  6781. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6782. var
  6783. aSign, bSign, zSign: flag;
  6784. aExp, bExp, zExp: int32;
  6785. aSig, bSig, zSig0, zSig1: bits64;
  6786. z: floatx80;
  6787. label
  6788. invalid;
  6789. begin
  6790. aSig := extractFloatx80Frac( a );
  6791. aExp := extractFloatx80Exp( a );
  6792. aSign := extractFloatx80Sign( a );
  6793. bSig := extractFloatx80Frac( b );
  6794. bExp := extractFloatx80Exp( b );
  6795. bSign := extractFloatx80Sign( b );
  6796. zSign := aSign xor bSign;
  6797. if ( aExp = $7FFF ) then begin
  6798. if ( bits64( aSig shl 1 ) <> 0 )
  6799. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6800. result := propagateFloatx80NaN( a, b );
  6801. exit;
  6802. end;
  6803. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6804. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6805. exit;
  6806. end;
  6807. if ( bExp = $7FFF ) then begin
  6808. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6809. result := propagateFloatx80NaN( a, b );
  6810. exit;
  6811. end;
  6812. if ( ( aExp or aSig ) = 0 ) then begin
  6813. invalid:
  6814. float_raise( float_flag_invalid );
  6815. z.low := floatx80_default_nan_low;
  6816. z.high := floatx80_default_nan_high;
  6817. result := z;
  6818. exit;
  6819. end;
  6820. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6821. exit;
  6822. end;
  6823. if ( aExp = 0 ) then begin
  6824. if ( aSig = 0 ) then begin
  6825. result := packFloatx80( zSign, 0, 0 );
  6826. exit;
  6827. end;
  6828. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6829. end;
  6830. if ( bExp = 0 ) then begin
  6831. if ( bSig = 0 ) then begin
  6832. result := packFloatx80( zSign, 0, 0 );
  6833. exit;
  6834. end;
  6835. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6836. end;
  6837. zExp := aExp + bExp - $3FFE;
  6838. mul64To128( aSig, bSig, zSig0, zSig1 );
  6839. if 0 < sbits64( zSig0 ) then begin
  6840. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6841. dec(zExp);
  6842. end;
  6843. result :=
  6844. roundAndPackFloatx80(
  6845. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6846. end;
  6847. {*----------------------------------------------------------------------------
  6848. | Returns the result of dividing the extended double-precision floating-point
  6849. | value `a' by the corresponding value `b'. The operation is performed
  6850. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6851. *----------------------------------------------------------------------------*}
  6852. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6853. var
  6854. aSign, bSign, zSign: flag;
  6855. aExp, bExp, zExp: int32;
  6856. aSig, bSig, zSig0, zSig1: bits64;
  6857. rem0, rem1, rem2, term0, term1, term2: bits64;
  6858. z: floatx80;
  6859. label
  6860. invalid;
  6861. begin
  6862. aSig := extractFloatx80Frac( a );
  6863. aExp := extractFloatx80Exp( a );
  6864. aSign := extractFloatx80Sign( a );
  6865. bSig := extractFloatx80Frac( b );
  6866. bExp := extractFloatx80Exp( b );
  6867. bSign := extractFloatx80Sign( b );
  6868. zSign := aSign xor bSign;
  6869. if ( aExp = $7FFF ) then begin
  6870. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6871. result := propagateFloatx80NaN( a, b );
  6872. exit;
  6873. end;
  6874. if ( bExp = $7FFF ) then begin
  6875. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6876. result := propagateFloatx80NaN( a, b );
  6877. exit;
  6878. end;
  6879. goto invalid;
  6880. end;
  6881. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6882. exit;
  6883. end;
  6884. if ( bExp = $7FFF ) then begin
  6885. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6886. result := propagateFloatx80NaN( a, b );
  6887. exit;
  6888. end;
  6889. result := packFloatx80( zSign, 0, 0 );
  6890. exit;
  6891. end;
  6892. if ( bExp = 0 ) then begin
  6893. if ( bSig = 0 ) then begin
  6894. if ( ( aExp or aSig ) = 0 ) then begin
  6895. invalid:
  6896. float_raise( float_flag_invalid );
  6897. z.low := floatx80_default_nan_low;
  6898. z.high := floatx80_default_nan_high;
  6899. result := z;
  6900. exit;
  6901. end;
  6902. float_raise( float_flag_divbyzero );
  6903. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6904. exit;
  6905. end;
  6906. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6907. end;
  6908. if ( aExp = 0 ) then begin
  6909. if ( aSig = 0 ) then begin
  6910. result := packFloatx80( zSign, 0, 0 );
  6911. exit;
  6912. end;
  6913. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6914. end;
  6915. zExp := aExp - bExp + $3FFE;
  6916. rem1 := 0;
  6917. if ( bSig <= aSig ) then begin
  6918. shift128Right( aSig, 0, 1, aSig, rem1 );
  6919. inc(zExp);
  6920. end;
  6921. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6922. mul64To128( bSig, zSig0, term0, term1 );
  6923. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6924. while ( sbits64( rem0 ) < 0 ) do begin
  6925. dec(zSig0);
  6926. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6927. end;
  6928. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6929. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6930. mul64To128( bSig, zSig1, term1, term2 );
  6931. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6932. while ( sbits64( rem1 ) < 0 ) do begin
  6933. dec(zSig1);
  6934. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6935. end;
  6936. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6937. end;
  6938. result :=
  6939. roundAndPackFloatx80(
  6940. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6941. end;
  6942. {*----------------------------------------------------------------------------
  6943. | Returns the remainder of the extended double-precision floating-point value
  6944. | `a' with respect to the corresponding value `b'. The operation is performed
  6945. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6946. *----------------------------------------------------------------------------*}
  6947. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6948. var
  6949. aSign, zSign: flag;
  6950. aExp, bExp, expDiff: int32;
  6951. aSig0, aSig1, bSig: bits64;
  6952. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6953. z: floatx80;
  6954. label
  6955. invalid;
  6956. begin
  6957. aSig0 := extractFloatx80Frac( a );
  6958. aExp := extractFloatx80Exp( a );
  6959. aSign := extractFloatx80Sign( a );
  6960. bSig := extractFloatx80Frac( b );
  6961. bExp := extractFloatx80Exp( b );
  6962. if ( aExp = $7FFF ) then begin
  6963. if ( bits64( aSig0 shl 1 ) <> 0 )
  6964. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6965. result := propagateFloatx80NaN( a, b );
  6966. exit;
  6967. end;
  6968. goto invalid;
  6969. end;
  6970. if ( bExp = $7FFF ) then begin
  6971. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6972. result := propagateFloatx80NaN( a, b );
  6973. exit;
  6974. end;
  6975. result := a;
  6976. exit;
  6977. end;
  6978. if ( bExp = 0 ) then begin
  6979. if ( bSig = 0 ) then begin
  6980. invalid:
  6981. float_raise( float_flag_invalid );
  6982. z.low := floatx80_default_nan_low;
  6983. z.high := floatx80_default_nan_high;
  6984. result := z;
  6985. exit;
  6986. end;
  6987. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6988. end;
  6989. if ( aExp = 0 ) then begin
  6990. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6991. result := a;
  6992. exit;
  6993. end;
  6994. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6995. end;
  6996. bSig := bSig or $8000000000000000;
  6997. zSign := aSign;
  6998. expDiff := aExp - bExp;
  6999. aSig1 := 0;
  7000. if ( expDiff < 0 ) then begin
  7001. if ( expDiff < -1 ) then begin
  7002. result := a;
  7003. exit;
  7004. end;
  7005. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  7006. expDiff := 0;
  7007. end;
  7008. q := ord( bSig <= aSig0 );
  7009. if ( q <> 0 ) then dec( aSig0, bSig );
  7010. dec( expDiff, 64 );
  7011. while ( 0 < expDiff ) do begin
  7012. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7013. if ( 2 < q ) then q := q - 2 else q := 0;
  7014. mul64To128( bSig, q, term0, term1 );
  7015. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7016. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  7017. dec( expDiff, 62 );
  7018. end;
  7019. inc( expDiff, 64 );
  7020. if ( 0 < expDiff ) then begin
  7021. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7022. if ( 2 < q ) then q:= q - 2 else q := 0;
  7023. q := q shr ( 64 - expDiff );
  7024. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  7025. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7026. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  7027. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  7028. inc(q);
  7029. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7030. end;
  7031. end
  7032. else begin
  7033. term1 := 0;
  7034. term0 := bSig;
  7035. end;
  7036. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  7037. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7038. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7039. and ( q and 1 <> 0 ) )
  7040. then begin
  7041. aSig0 := alternateASig0;
  7042. aSig1 := alternateASig1;
  7043. zSign := ord( zSign = 0 );
  7044. end;
  7045. result :=
  7046. normalizeRoundAndPackFloatx80(
  7047. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7048. end;
  7049. {*----------------------------------------------------------------------------
  7050. | Returns the square root of the extended double-precision floating-point
  7051. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7052. | for Binary Floating-Point Arithmetic.
  7053. *----------------------------------------------------------------------------*}
  7054. function floatx80_sqrt(a: floatx80): floatx80;
  7055. var
  7056. aSign: flag;
  7057. aExp, zExp: int32;
  7058. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7059. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7060. z: floatx80;
  7061. label
  7062. invalid;
  7063. begin
  7064. aSig0 := extractFloatx80Frac( a );
  7065. aExp := extractFloatx80Exp( a );
  7066. aSign := extractFloatx80Sign( a );
  7067. if ( aExp = $7FFF ) then begin
  7068. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7069. result := propagateFloatx80NaN( a, a );
  7070. exit;
  7071. end;
  7072. if ( aSign = 0 ) then begin
  7073. result := a;
  7074. exit;
  7075. end;
  7076. goto invalid;
  7077. end;
  7078. if ( aSign <> 0 ) then begin
  7079. if ( ( aExp or aSig0 ) = 0 ) then begin
  7080. result := a;
  7081. exit;
  7082. end;
  7083. invalid:
  7084. float_raise( float_flag_invalid );
  7085. z.low := floatx80_default_nan_low;
  7086. z.high := floatx80_default_nan_high;
  7087. result := z;
  7088. exit;
  7089. end;
  7090. if ( aExp = 0 ) then begin
  7091. if ( aSig0 = 0 ) then begin
  7092. result := packFloatx80( 0, 0, 0 );
  7093. exit;
  7094. end;
  7095. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7096. end;
  7097. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  7098. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  7099. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7100. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7101. doubleZSig0 := zSig0 shl 1;
  7102. mul64To128( zSig0, zSig0, term0, term1 );
  7103. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7104. while ( sbits64( rem0 ) < 0 ) do begin
  7105. dec(zSig0);
  7106. dec( doubleZSig0, 2 );
  7107. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7108. end;
  7109. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7110. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7111. if ( zSig1 = 0 ) then zSig1 := 1;
  7112. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7113. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7114. mul64To128( zSig1, zSig1, term2, term3 );
  7115. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7116. while ( sbits64( rem1 ) < 0 ) do begin
  7117. dec(zSig1);
  7118. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7119. term3 := term3 or 1;
  7120. term2 := term2 or doubleZSig0;
  7121. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7122. end;
  7123. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7124. end;
  7125. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7126. zSig0 := zSig0 or doubleZSig0;
  7127. result :=
  7128. roundAndPackFloatx80(
  7129. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7130. end;
  7131. {*----------------------------------------------------------------------------
  7132. | Returns 1 if the extended double-precision floating-point value `a' is
  7133. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7134. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7135. | Arithmetic.
  7136. *----------------------------------------------------------------------------*}
  7137. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7138. begin
  7139. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7140. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7141. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7142. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7143. ) then begin
  7144. if ( floatx80_is_signaling_nan( a )
  7145. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7146. float_raise( float_flag_invalid );
  7147. end;
  7148. result := 0;
  7149. exit;
  7150. end;
  7151. result := ord(
  7152. ( a.low = b.low )
  7153. and ( ( a.high = b.high )
  7154. or ( ( a.low = 0 )
  7155. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7156. ) );
  7157. end;
  7158. {*----------------------------------------------------------------------------
  7159. | Returns 1 if the extended double-precision floating-point value `a' is
  7160. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7161. | comparison is performed according to the IEC/IEEE Standard for Binary
  7162. | Floating-Point Arithmetic.
  7163. *----------------------------------------------------------------------------*}
  7164. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7165. var
  7166. aSign, bSign: flag;
  7167. begin
  7168. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7169. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7170. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7171. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7172. then begin
  7173. float_raise( float_flag_invalid );
  7174. result := 0;
  7175. exit;
  7176. end;
  7177. aSign := extractFloatx80Sign( a );
  7178. bSign := extractFloatx80Sign( b );
  7179. if ( aSign <> bSign ) then begin
  7180. result := ord(
  7181. ( aSign <> 0 )
  7182. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7183. exit;
  7184. end;
  7185. if aSign<>0 then
  7186. result := le128( b.high, b.low, a.high, a.low )
  7187. else
  7188. result := le128( a.high, a.low, b.high, b.low );
  7189. end;
  7190. {*----------------------------------------------------------------------------
  7191. | Returns 1 if the extended double-precision floating-point value `a' is
  7192. | less than the corresponding value `b', and 0 otherwise. The comparison
  7193. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7194. | Arithmetic.
  7195. *----------------------------------------------------------------------------*}
  7196. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7197. var
  7198. aSign, bSign: flag;
  7199. begin
  7200. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7201. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7202. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7203. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7204. then begin
  7205. float_raise( float_flag_invalid );
  7206. result := 0;
  7207. exit;
  7208. end;
  7209. aSign := extractFloatx80Sign( a );
  7210. bSign := extractFloatx80Sign( b );
  7211. if ( aSign <> bSign ) then begin
  7212. result := ord(
  7213. ( aSign <> 0 )
  7214. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7215. exit;
  7216. end;
  7217. if aSign <> 0 then
  7218. result := lt128( b.high, b.low, a.high, a.low )
  7219. else
  7220. result := lt128( a.high, a.low, b.high, b.low );
  7221. end;
  7222. {*----------------------------------------------------------------------------
  7223. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7224. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7225. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7226. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7227. *----------------------------------------------------------------------------*}
  7228. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7229. begin
  7230. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7231. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7232. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7233. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7234. then begin
  7235. float_raise( float_flag_invalid );
  7236. result := 0;
  7237. exit;
  7238. end;
  7239. result := ord(
  7240. ( a.low = b.low )
  7241. and ( ( a.high = b.high )
  7242. or ( ( a.low = 0 )
  7243. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7244. ) );
  7245. end;
  7246. {*----------------------------------------------------------------------------
  7247. | Returns 1 if the extended double-precision floating-point value `a' is less
  7248. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7249. | do not cause an exception. Otherwise, the comparison is performed according
  7250. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7251. *----------------------------------------------------------------------------*}
  7252. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7253. var
  7254. aSign, bSign: flag;
  7255. begin
  7256. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7257. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7258. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7259. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7260. then begin
  7261. if ( floatx80_is_signaling_nan( a )
  7262. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7263. float_raise( float_flag_invalid );
  7264. end;
  7265. result := 0;
  7266. exit;
  7267. end;
  7268. aSign := extractFloatx80Sign( a );
  7269. bSign := extractFloatx80Sign( b );
  7270. if ( aSign <> bSign ) then begin
  7271. result := ord(
  7272. ( aSign <> 0 )
  7273. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7274. exit;
  7275. end;
  7276. if aSign <> 0 then
  7277. result := le128( b.high, b.low, a.high, a.low )
  7278. else
  7279. result := le128( a.high, a.low, b.high, b.low );
  7280. end;
  7281. {*----------------------------------------------------------------------------
  7282. | Returns 1 if the extended double-precision floating-point value `a' is less
  7283. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7284. | an exception. Otherwise, the comparison is performed according to the
  7285. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7286. *----------------------------------------------------------------------------*}
  7287. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7288. var
  7289. aSign, bSign: flag;
  7290. begin
  7291. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7292. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7293. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7294. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7295. then begin
  7296. if ( floatx80_is_signaling_nan( a )
  7297. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7298. float_raise( float_flag_invalid );
  7299. end;
  7300. result := 0;
  7301. exit;
  7302. end;
  7303. aSign := extractFloatx80Sign( a );
  7304. bSign := extractFloatx80Sign( b );
  7305. if ( aSign <> bSign ) then begin
  7306. result := ord(
  7307. ( aSign <> 0 )
  7308. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7309. exit;
  7310. end;
  7311. if aSign <> 0 then
  7312. result := lt128( b.high, b.low, a.high, a.low )
  7313. else
  7314. result := lt128( a.high, a.low, b.high, b.low );
  7315. end;
  7316. {$endif FPC_SOFTFLOAT_FLOATX80}
  7317. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7318. {*----------------------------------------------------------------------------
  7319. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7320. | floating-point value `a'.
  7321. *----------------------------------------------------------------------------*}
  7322. function extractFloat128Frac1(a : float128): bits64;
  7323. begin
  7324. result:=a.low;
  7325. end;
  7326. {*----------------------------------------------------------------------------
  7327. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7328. | floating-point value `a'.
  7329. *----------------------------------------------------------------------------*}
  7330. function extractFloat128Frac0(a : float128): bits64;
  7331. begin
  7332. result:=a.high and int64($0000FFFFFFFFFFFF);
  7333. end;
  7334. {*----------------------------------------------------------------------------
  7335. | Returns the exponent bits of the quadruple-precision floating-point value
  7336. | `a'.
  7337. *----------------------------------------------------------------------------*}
  7338. function extractFloat128Exp(a : float128): int32;
  7339. begin
  7340. result:=( a.high shr 48 ) and $7FFF;
  7341. end;
  7342. {*----------------------------------------------------------------------------
  7343. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7344. *----------------------------------------------------------------------------*}
  7345. function extractFloat128Sign(a : float128): flag;
  7346. begin
  7347. result:=a.high shr 63;
  7348. end;
  7349. {*----------------------------------------------------------------------------
  7350. | Normalizes the subnormal quadruple-precision floating-point value
  7351. | represented by the denormalized significand formed by the concatenation of
  7352. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7353. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7354. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7355. | least significant 64 bits of the normalized significand are stored at the
  7356. | location pointed to by `zSig1Ptr'.
  7357. *----------------------------------------------------------------------------*}
  7358. procedure normalizeFloat128Subnormal(
  7359. aSig0: bits64;
  7360. aSig1: bits64;
  7361. var zExpPtr: int32;
  7362. var zSig0Ptr: bits64;
  7363. var zSig1Ptr: bits64);
  7364. var
  7365. shiftCount: int8;
  7366. begin
  7367. if ( aSig0 = 0 ) then
  7368. begin
  7369. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7370. if ( shiftCount < 0 ) then
  7371. begin
  7372. zSig0Ptr := aSig1 shr ( - shiftCount );
  7373. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7374. end
  7375. else begin
  7376. zSig0Ptr := aSig1 shl shiftCount;
  7377. zSig1Ptr := 0;
  7378. end;
  7379. zExpPtr := - shiftCount - 63;
  7380. end
  7381. else begin
  7382. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7383. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7384. zExpPtr := 1 - shiftCount;
  7385. end;
  7386. end;
  7387. {*----------------------------------------------------------------------------
  7388. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7389. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7390. | floating-point value, returning the result. After being shifted into the
  7391. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7392. | added together to form the most significant 32 bits of the result. This
  7393. | means that any integer portion of `zSig0' will be added into the exponent.
  7394. | Since a properly normalized significand will have an integer portion equal
  7395. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7396. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7397. | significand.
  7398. *----------------------------------------------------------------------------*}
  7399. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7400. var
  7401. z: float128;
  7402. begin
  7403. z.low := zSig1;
  7404. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7405. result:=z;
  7406. end;
  7407. {*----------------------------------------------------------------------------
  7408. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7409. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7410. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7411. | corresponding to the abstract input. Ordinarily, the abstract value is
  7412. | simply rounded and packed into the quadruple-precision format, with the
  7413. | inexact exception raised if the abstract input cannot be represented
  7414. | exactly. However, if the abstract value is too large, the overflow and
  7415. | inexact exceptions are raised and an infinity or maximal finite value is
  7416. | returned. If the abstract value is too small, the input value is rounded to
  7417. | a subnormal number, and the underflow and inexact exceptions are raised if
  7418. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7419. | precision floating-point number.
  7420. | The input significand must be normalized or smaller. If the input
  7421. | significand is not normalized, `zExp' must be 0; in that case, the result
  7422. | returned is a subnormal number, and it must not require rounding. In the
  7423. | usual case that the input significand is normalized, `zExp' must be 1 less
  7424. | than the ``true'' floating-point exponent. The handling of underflow and
  7425. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7426. *----------------------------------------------------------------------------*}
  7427. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7428. var
  7429. roundingMode: TFPURoundingMode;
  7430. roundNearestEven, increment, isTiny: flag;
  7431. begin
  7432. roundingMode := softfloat_rounding_mode;
  7433. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7434. increment := ord( sbits64(zSig2) < 0 );
  7435. if ( roundNearestEven=0 ) then
  7436. begin
  7437. if ( roundingMode = float_round_to_zero ) then
  7438. begin
  7439. increment := 0;
  7440. end
  7441. else begin
  7442. if ( zSign<>0 ) then
  7443. begin
  7444. increment := ord( roundingMode = float_round_down ) and zSig2;
  7445. end
  7446. else begin
  7447. increment := ord( roundingMode = float_round_up ) and zSig2;
  7448. end;
  7449. end;
  7450. end;
  7451. if ( $7FFD <= bits32(zExp) ) then
  7452. begin
  7453. if ( ord( $7FFD < zExp )
  7454. or ( ord( zExp = $7FFD )
  7455. and eq128(
  7456. int64( $0001FFFFFFFFFFFF ),
  7457. bits64( $FFFFFFFFFFFFFFFF ),
  7458. zSig0,
  7459. zSig1
  7460. )
  7461. and increment
  7462. )
  7463. )<>0 then
  7464. begin
  7465. float_raise( [float_flag_overflow,float_flag_inexact] );
  7466. if ( ord( roundingMode = float_round_to_zero )
  7467. or ( zSign and ord( roundingMode = float_round_up ) )
  7468. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7469. )<>0 then
  7470. begin
  7471. result :=
  7472. packFloat128(
  7473. zSign,
  7474. $7FFE,
  7475. int64( $0000FFFFFFFFFFFF ),
  7476. bits64( $FFFFFFFFFFFFFFFF )
  7477. );
  7478. exit;
  7479. end;
  7480. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7481. exit;
  7482. end;
  7483. if ( zExp < 0 ) then
  7484. begin
  7485. isTiny :=
  7486. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7487. or ( zExp < -1 )
  7488. or not( increment<>0 )
  7489. or boolean(lt128(
  7490. zSig0,
  7491. zSig1,
  7492. int64( $0001FFFFFFFFFFFF ),
  7493. bits64( $FFFFFFFFFFFFFFFF )
  7494. )));
  7495. shift128ExtraRightJamming(
  7496. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7497. zExp := 0;
  7498. if ( isTiny and zSig2 )<>0 then
  7499. float_raise( float_flag_underflow );
  7500. if ( roundNearestEven<>0 ) then
  7501. begin
  7502. increment := ord( sbits64(zSig2) < 0 );
  7503. end
  7504. else begin
  7505. if ( zSign<>0 ) then
  7506. begin
  7507. increment := ord( roundingMode = float_round_down ) and zSig2;
  7508. end
  7509. else begin
  7510. increment := ord( roundingMode = float_round_up ) and zSig2;
  7511. end;
  7512. end;
  7513. end;
  7514. end;
  7515. if ( zSig2<>0 ) then
  7516. set_inexact_flag;
  7517. if ( increment<>0 ) then
  7518. begin
  7519. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7520. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7521. end
  7522. else begin
  7523. if ( ( zSig0 or zSig1 ) = 0 ) then
  7524. zExp := 0;
  7525. end;
  7526. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7527. end;
  7528. {*----------------------------------------------------------------------------
  7529. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7530. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7531. | returns the proper quadruple-precision floating-point value corresponding
  7532. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7533. | except that the input significand has fewer bits and does not have to be
  7534. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7535. | point exponent.
  7536. *----------------------------------------------------------------------------*}
  7537. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7538. var
  7539. shiftCount: int8;
  7540. zSig2: bits64;
  7541. begin
  7542. if ( zSig0 = 0 ) then
  7543. begin
  7544. zSig0 := zSig1;
  7545. zSig1 := 0;
  7546. dec(zExp, 64);
  7547. end;
  7548. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7549. if ( 0 <= shiftCount ) then
  7550. begin
  7551. zSig2 := 0;
  7552. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7553. end
  7554. else begin
  7555. shift128ExtraRightJamming(
  7556. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7557. end;
  7558. dec(zExp, shiftCount);
  7559. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7560. end;
  7561. {*----------------------------------------------------------------------------
  7562. | Returns the result of converting the quadruple-precision floating-point
  7563. | value `a' to the 32-bit two's complement integer format. The conversion
  7564. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7565. | Arithmetic---which means in particular that the conversion is rounded
  7566. | according to the current rounding mode. If `a' is a NaN, the largest
  7567. | positive integer is returned. Otherwise, if the conversion overflows, the
  7568. | largest integer with the same sign as `a' is returned.
  7569. *----------------------------------------------------------------------------*}
  7570. function float128_to_int32(a: float128): int32;
  7571. var
  7572. aSign: flag;
  7573. aExp, shiftCount: int32;
  7574. aSig0, aSig1: bits64;
  7575. begin
  7576. aSig1 := extractFloat128Frac1( a );
  7577. aSig0 := extractFloat128Frac0( a );
  7578. aExp := extractFloat128Exp( a );
  7579. aSign := extractFloat128Sign( a );
  7580. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7581. aSign := 0;
  7582. if ( aExp<>0 ) then
  7583. aSig0 := aSig0 or int64( $0001000000000000 );
  7584. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7585. shiftCount := $4028 - aExp;
  7586. if ( 0 < shiftCount ) then
  7587. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7588. result := roundAndPackInt32( aSign, aSig0 );
  7589. end;
  7590. {*----------------------------------------------------------------------------
  7591. | Returns the result of converting the quadruple-precision floating-point
  7592. | value `a' to the 32-bit two's complement integer format. The conversion
  7593. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7594. | Arithmetic, except that the conversion is always rounded toward zero. If
  7595. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7596. | conversion overflows, the largest integer with the same sign as `a' is
  7597. | returned.
  7598. *----------------------------------------------------------------------------*}
  7599. function float128_to_int32_round_to_zero(a: float128): int32;
  7600. var
  7601. aSign: flag;
  7602. aExp, shiftCount: int32;
  7603. aSig0, aSig1, savedASig: bits64;
  7604. z: int32;
  7605. label
  7606. invalid;
  7607. begin
  7608. aSig1 := extractFloat128Frac1( a );
  7609. aSig0 := extractFloat128Frac0( a );
  7610. aExp := extractFloat128Exp( a );
  7611. aSign := extractFloat128Sign( a );
  7612. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7613. if ( $401E < aExp ) then
  7614. begin
  7615. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7616. aSign := 0;
  7617. goto invalid;
  7618. end
  7619. else if ( aExp < $3FFF ) then
  7620. begin
  7621. if ( aExp or aSig0 )<>0 then
  7622. set_inexact_flag;
  7623. result := 0;
  7624. exit;
  7625. end;
  7626. aSig0 := aSig0 or int64( $0001000000000000 );
  7627. shiftCount := $402F - aExp;
  7628. savedASig := aSig0;
  7629. aSig0 := aSig0 shr shiftCount;
  7630. z := aSig0;
  7631. if ( aSign )<>0 then
  7632. z := - z;
  7633. if ( ord( z < 0 ) xor aSign )<>0 then
  7634. begin
  7635. invalid:
  7636. float_raise( float_flag_invalid );
  7637. if aSign<>0 then
  7638. result:= int32( $80000000 )
  7639. else
  7640. result:=$7FFFFFFF;
  7641. exit;
  7642. end;
  7643. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7644. begin
  7645. set_inexact_flag;
  7646. end;
  7647. result := z;
  7648. end;
  7649. {*----------------------------------------------------------------------------
  7650. | Returns the result of converting the quadruple-precision floating-point
  7651. | value `a' to the 64-bit two's complement integer format. The conversion
  7652. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7653. | Arithmetic---which means in particular that the conversion is rounded
  7654. | according to the current rounding mode. If `a' is a NaN, the largest
  7655. | positive integer is returned. Otherwise, if the conversion overflows, the
  7656. | largest integer with the same sign as `a' is returned.
  7657. *----------------------------------------------------------------------------*}
  7658. function float128_to_int64(a: float128): int64;
  7659. var
  7660. aSign: flag;
  7661. aExp, shiftCount: int32;
  7662. aSig0, aSig1: bits64;
  7663. begin
  7664. aSig1 := extractFloat128Frac1( a );
  7665. aSig0 := extractFloat128Frac0( a );
  7666. aExp := extractFloat128Exp( a );
  7667. aSign := extractFloat128Sign( a );
  7668. if ( aExp<>0 ) then
  7669. aSig0 := aSig0 or int64( $0001000000000000 );
  7670. shiftCount := $402F - aExp;
  7671. if ( shiftCount <= 0 ) then
  7672. begin
  7673. if ( $403E < aExp ) then
  7674. begin
  7675. float_raise( float_flag_invalid );
  7676. if ( (aSign=0)
  7677. or ( ( aExp = $7FFF )
  7678. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7679. )
  7680. ) then
  7681. begin
  7682. result := int64( $7FFFFFFFFFFFFFFF );
  7683. exit;
  7684. end;
  7685. result := int64( $8000000000000000 );
  7686. exit;
  7687. end;
  7688. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7689. end
  7690. else begin
  7691. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7692. end;
  7693. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7694. end;
  7695. {*----------------------------------------------------------------------------
  7696. | Returns the result of converting the quadruple-precision floating-point
  7697. | value `a' to the 64-bit two's complement integer format. The conversion
  7698. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7699. | Arithmetic, except that the conversion is always rounded toward zero.
  7700. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7701. | the conversion overflows, the largest integer with the same sign as `a' is
  7702. | returned.
  7703. *----------------------------------------------------------------------------*}
  7704. function float128_to_int64_round_to_zero(a: float128): int64;
  7705. var
  7706. aSign: flag;
  7707. aExp, shiftCount: int32;
  7708. aSig0, aSig1: bits64;
  7709. z: int64;
  7710. begin
  7711. aSig1 := extractFloat128Frac1( a );
  7712. aSig0 := extractFloat128Frac0( a );
  7713. aExp := extractFloat128Exp( a );
  7714. aSign := extractFloat128Sign( a );
  7715. if ( aExp<>0 ) then
  7716. aSig0 := aSig0 or int64( $0001000000000000 );
  7717. shiftCount := aExp - $402F;
  7718. if ( 0 < shiftCount ) then
  7719. begin
  7720. if ( $403E <= aExp ) then
  7721. begin
  7722. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7723. if ( ( a.high = bits64( $C03E000000000000 ) )
  7724. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7725. begin
  7726. if ( aSig1<>0 ) then
  7727. set_inexact_flag;
  7728. end
  7729. else begin
  7730. float_raise( float_flag_invalid );
  7731. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7732. begin
  7733. result := int64( $7FFFFFFFFFFFFFFF );
  7734. exit;
  7735. end;
  7736. end;
  7737. result := int64( $8000000000000000 );
  7738. exit;
  7739. end;
  7740. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7741. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7742. begin
  7743. set_inexact_flag;
  7744. end;
  7745. end
  7746. else begin
  7747. if ( aExp < $3FFF ) then
  7748. begin
  7749. if ( aExp or aSig0 or aSig1 )<>0 then
  7750. begin
  7751. set_inexact_flag;
  7752. end;
  7753. result := 0;
  7754. exit;
  7755. end;
  7756. z := aSig0 shr ( - shiftCount );
  7757. if ( (aSig1<>0)
  7758. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7759. begin
  7760. set_inexact_flag;
  7761. end;
  7762. end;
  7763. if ( aSign<>0 ) then
  7764. z := - z;
  7765. result := z;
  7766. end;
  7767. {*----------------------------------------------------------------------------
  7768. | Returns the result of converting the quadruple-precision floating-point
  7769. | value `a' to the single-precision floating-point format. The conversion
  7770. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7771. | Arithmetic.
  7772. *----------------------------------------------------------------------------*}
  7773. function float128_to_float32(a: float128): float32;
  7774. var
  7775. aSign: flag;
  7776. aExp: int32;
  7777. aSig0, aSig1: bits64;
  7778. zSig: bits32;
  7779. begin
  7780. aSig1 := extractFloat128Frac1( a );
  7781. aSig0 := extractFloat128Frac0( a );
  7782. aExp := extractFloat128Exp( a );
  7783. aSign := extractFloat128Sign( a );
  7784. if ( aExp = $7FFF ) then
  7785. begin
  7786. if ( aSig0 or aSig1 )<>0 then
  7787. begin
  7788. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7789. exit;
  7790. end;
  7791. result := packFloat32( aSign, $FF, 0 );
  7792. exit;
  7793. end;
  7794. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7795. shift64RightJamming( aSig0, 18, aSig0 );
  7796. zSig := aSig0;
  7797. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7798. begin
  7799. zSig := zSig or $40000000;
  7800. dec(aExp,$3F81);
  7801. end;
  7802. result := roundAndPackFloat32( aSign, aExp, zSig );
  7803. end;
  7804. {*----------------------------------------------------------------------------
  7805. | Returns the result of converting the quadruple-precision floating-point
  7806. | value `a' to the double-precision floating-point format. The conversion
  7807. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7808. | Arithmetic.
  7809. *----------------------------------------------------------------------------*}
  7810. function float128_to_float64(a: float128): float64;
  7811. var
  7812. aSign: flag;
  7813. aExp: int32;
  7814. aSig0, aSig1: bits64;
  7815. begin
  7816. aSig1 := extractFloat128Frac1( a );
  7817. aSig0 := extractFloat128Frac0( a );
  7818. aExp := extractFloat128Exp( a );
  7819. aSign := extractFloat128Sign( a );
  7820. if ( aExp = $7FFF ) then
  7821. begin
  7822. if ( aSig0 or aSig1 )<>0 then
  7823. begin
  7824. result:=commonNaNToFloat64(float128ToCommonNaN(a));
  7825. exit;
  7826. end;
  7827. result:=packFloat64( aSign, $7FF, 0);
  7828. exit;
  7829. end;
  7830. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7831. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7832. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7833. begin
  7834. aSig0 := aSig0 or int64( $4000000000000000 );
  7835. dec(aExp,$3C01);
  7836. end;
  7837. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7838. end;
  7839. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7840. {*----------------------------------------------------------------------------
  7841. | Returns the result of converting the quadruple-precision floating-point
  7842. | value `a' to the extended double-precision floating-point format. The
  7843. | conversion is performed according to the IEC/IEEE Standard for Binary
  7844. | Floating-Point Arithmetic.
  7845. *----------------------------------------------------------------------------*}
  7846. function float128_to_floatx80(a: float128): floatx80;
  7847. var
  7848. aSign: flag;
  7849. aExp: int32;
  7850. aSig0, aSig1: bits64;
  7851. begin
  7852. aSig1 := extractFloat128Frac1( a );
  7853. aSig0 := extractFloat128Frac0( a );
  7854. aExp := extractFloat128Exp( a );
  7855. aSign := extractFloat128Sign( a );
  7856. if ( aExp = $7FFF ) then begin
  7857. if ( aSig0 or aSig1 <> 0 ) then begin
  7858. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7859. exit;
  7860. end;
  7861. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7862. exit;
  7863. end;
  7864. if ( aExp = 0 ) then begin
  7865. if ( ( aSig0 or aSig1 ) = 0 ) then
  7866. begin
  7867. result := packFloatx80( aSign, 0, 0 );
  7868. exit;
  7869. end;
  7870. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7871. end
  7872. else begin
  7873. aSig0 := aSig0 or int64( $0001000000000000 );
  7874. end;
  7875. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7876. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7877. end;
  7878. {$endif FPC_SOFTFLOAT_FLOATX80}
  7879. {*----------------------------------------------------------------------------
  7880. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7881. | Returns the result as a quadruple-precision floating-point value. The
  7882. | operation is performed according to the IEC/IEEE Standard for Binary
  7883. | Floating-Point Arithmetic.
  7884. *----------------------------------------------------------------------------*}
  7885. function float128_round_to_int(a: float128): float128;
  7886. var
  7887. aSign: flag;
  7888. aExp: int32;
  7889. lastBitMask, roundBitsMask: bits64;
  7890. roundingMode: TFPURoundingMode;
  7891. z: float128;
  7892. begin
  7893. aExp := extractFloat128Exp( a );
  7894. if ( $402F <= aExp ) then
  7895. begin
  7896. if ( $406F <= aExp ) then
  7897. begin
  7898. if ( ( aExp = $7FFF )
  7899. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7900. ) then
  7901. begin
  7902. result := propagateFloat128NaN( a, a );
  7903. exit;
  7904. end;
  7905. result := a;
  7906. exit;
  7907. end;
  7908. lastBitMask := 1;
  7909. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7910. roundBitsMask := lastBitMask - 1;
  7911. z := a;
  7912. roundingMode := softfloat_rounding_mode;
  7913. if ( roundingMode = float_round_nearest_even ) then
  7914. begin
  7915. if ( lastBitMask )<>0 then
  7916. begin
  7917. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7918. if ( ( z.low and roundBitsMask ) = 0 ) then
  7919. z.low := z.low and not(lastBitMask);
  7920. end
  7921. else begin
  7922. if ( sbits64(z.low) < 0 ) then
  7923. begin
  7924. inc(z.high);
  7925. if ( bits64( z.low shl 1 ) = 0 ) then
  7926. z.high := z.high and not bits64( 1 );
  7927. end;
  7928. end;
  7929. end
  7930. else if ( roundingMode <> float_round_to_zero ) then
  7931. begin
  7932. if ( extractFloat128Sign( z )
  7933. xor ord( roundingMode = float_round_up ) )<>0 then
  7934. begin
  7935. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7936. end;
  7937. end;
  7938. z.low := z.low and not(roundBitsMask);
  7939. end
  7940. else begin
  7941. if ( aExp < $3FFF ) then
  7942. begin
  7943. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7944. begin
  7945. result := a;
  7946. exit;
  7947. end;
  7948. set_inexact_flag;
  7949. aSign := extractFloat128Sign( a );
  7950. case softfloat_rounding_mode of
  7951. float_round_nearest_even:
  7952. if ( ( aExp = $3FFE )
  7953. and ( (extractFloat128Frac0( a )<>0)
  7954. or (extractFloat128Frac1( a )<>0) )
  7955. ) then begin
  7956. begin
  7957. result := packFloat128( aSign, $3FFF, 0, 0 );
  7958. exit;
  7959. end;
  7960. end;
  7961. float_round_down:
  7962. begin
  7963. if aSign<>0 then
  7964. result:=packFloat128( 1, $3FFF, 0, 0 )
  7965. else
  7966. result:=packFloat128( 0, 0, 0, 0 );
  7967. exit;
  7968. end;
  7969. float_round_up:
  7970. begin
  7971. if aSign<>0 then
  7972. result := packFloat128( 1, 0, 0, 0 )
  7973. else
  7974. result:=packFloat128( 0, $3FFF, 0, 0 );
  7975. exit;
  7976. end;
  7977. end;
  7978. result := packFloat128( aSign, 0, 0, 0 );
  7979. exit;
  7980. end;
  7981. lastBitMask := 1;
  7982. lastBitMask := lastBitMask shl ($402F - aExp);
  7983. roundBitsMask := lastBitMask - 1;
  7984. z.low := 0;
  7985. z.high := a.high;
  7986. roundingMode := softfloat_rounding_mode;
  7987. if ( roundingMode = float_round_nearest_even ) then begin
  7988. inc(z.high,lastBitMask shr 1);
  7989. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7990. z.high := z.high and not(lastBitMask);
  7991. end;
  7992. end
  7993. else if ( roundingMode <> float_round_to_zero ) then begin
  7994. if ( (extractFloat128Sign( z )<>0)
  7995. xor ( roundingMode = float_round_up ) ) then begin
  7996. z.high := z.high or ord( a.low <> 0 );
  7997. z.high := z.high+roundBitsMask;
  7998. end;
  7999. end;
  8000. z.high := z.high and not(roundBitsMask);
  8001. end;
  8002. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  8003. set_inexact_flag;
  8004. end;
  8005. result := z;
  8006. end;
  8007. {*----------------------------------------------------------------------------
  8008. | Returns the result of adding the absolute values of the quadruple-precision
  8009. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  8010. | before being returned. `zSign' is ignored if the result is a NaN.
  8011. | The addition is performed according to the IEC/IEEE Standard for Binary
  8012. | Floating-Point Arithmetic.
  8013. *----------------------------------------------------------------------------*}
  8014. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  8015. var
  8016. aExp, bExp, zExp: int32;
  8017. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8018. expDiff: int32;
  8019. label
  8020. shiftRight1,roundAndPack;
  8021. begin
  8022. aSig1 := extractFloat128Frac1( a );
  8023. aSig0 := extractFloat128Frac0( a );
  8024. aExp := extractFloat128Exp( a );
  8025. bSig1 := extractFloat128Frac1( b );
  8026. bSig0 := extractFloat128Frac0( b );
  8027. bExp := extractFloat128Exp( b );
  8028. expDiff := aExp - bExp;
  8029. if ( 0 < expDiff ) then begin
  8030. if ( aExp = $7FFF ) then begin
  8031. if ( aSig0 or aSig1 )<>0 then
  8032. begin
  8033. result := propagateFloat128NaN( a, b );
  8034. exit;
  8035. end;
  8036. result := a;
  8037. exit;
  8038. end;
  8039. if ( bExp = 0 ) then begin
  8040. dec(expDiff);
  8041. end
  8042. else begin
  8043. bSig0 := bSig0 or int64( $0001000000000000 );
  8044. end;
  8045. shift128ExtraRightJamming(
  8046. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8047. zExp := aExp;
  8048. end
  8049. else if ( expDiff < 0 ) then begin
  8050. if ( bExp = $7FFF ) then begin
  8051. if ( bSig0 or bSig1 )<>0 then
  8052. begin
  8053. result := propagateFloat128NaN( a, b );
  8054. exit;
  8055. end;
  8056. result := packFloat128( zSign, $7FFF, 0, 0 );
  8057. exit;
  8058. end;
  8059. if ( aExp = 0 ) then begin
  8060. inc(expDiff);
  8061. end
  8062. else begin
  8063. aSig0 := aSig0 or int64( $0001000000000000 );
  8064. end;
  8065. shift128ExtraRightJamming(
  8066. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8067. zExp := bExp;
  8068. end
  8069. else begin
  8070. if ( aExp = $7FFF ) then begin
  8071. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8072. result := propagateFloat128NaN( a, b );
  8073. exit;
  8074. end;
  8075. result := a;
  8076. exit;
  8077. end;
  8078. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8079. if ( aExp = 0 ) then
  8080. begin
  8081. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8082. exit;
  8083. end;
  8084. zSig2 := 0;
  8085. zSig0 := zSig0 or int64( $0002000000000000 );
  8086. zExp := aExp;
  8087. goto shiftRight1;
  8088. end;
  8089. aSig0 := aSig0 or int64( $0001000000000000 );
  8090. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8091. dec(zExp);
  8092. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8093. inc(zExp);
  8094. shiftRight1:
  8095. shift128ExtraRightJamming(
  8096. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8097. roundAndPack:
  8098. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8099. end;
  8100. {*----------------------------------------------------------------------------
  8101. | Returns the result of subtracting the absolute values of the quadruple-
  8102. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8103. | difference is negated before being returned. `zSign' is ignored if the
  8104. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8105. | Standard for Binary Floating-Point Arithmetic.
  8106. *----------------------------------------------------------------------------*}
  8107. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8108. var
  8109. aExp, bExp, zExp: int32;
  8110. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8111. expDiff: int32;
  8112. z: float128;
  8113. label
  8114. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8115. begin
  8116. aSig1 := extractFloat128Frac1( a );
  8117. aSig0 := extractFloat128Frac0( a );
  8118. aExp := extractFloat128Exp( a );
  8119. bSig1 := extractFloat128Frac1( b );
  8120. bSig0 := extractFloat128Frac0( b );
  8121. bExp := extractFloat128Exp( b );
  8122. expDiff := aExp - bExp;
  8123. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8124. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8125. if ( 0 < expDiff ) then goto aExpBigger;
  8126. if ( expDiff < 0 ) then goto bExpBigger;
  8127. if ( aExp = $7FFF ) then begin
  8128. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8129. result := propagateFloat128NaN( a, b );
  8130. exit;
  8131. end;
  8132. float_raise( float_flag_invalid );
  8133. z.low := float128_default_nan_low;
  8134. z.high := float128_default_nan_high;
  8135. result := z;
  8136. exit;
  8137. end;
  8138. if ( aExp = 0 ) then begin
  8139. aExp := 1;
  8140. bExp := 1;
  8141. end;
  8142. if ( bSig0 < aSig0 ) then goto aBigger;
  8143. if ( aSig0 < bSig0 ) then goto bBigger;
  8144. if ( bSig1 < aSig1 ) then goto aBigger;
  8145. if ( aSig1 < bSig1 ) then goto bBigger;
  8146. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8147. exit;
  8148. bExpBigger:
  8149. if ( bExp = $7FFF ) then begin
  8150. if ( bSig0 or bSig1 )<>0 then
  8151. begin
  8152. result := propagateFloat128NaN( a, b );
  8153. exit;
  8154. end;
  8155. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8156. exit;
  8157. end;
  8158. if ( aExp = 0 ) then begin
  8159. inc(expDiff);
  8160. end
  8161. else begin
  8162. aSig0 := aSig0 or int64( $4000000000000000 );
  8163. end;
  8164. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8165. bSig0 := bSig0 or int64( $4000000000000000 );
  8166. bBigger:
  8167. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8168. zExp := bExp;
  8169. zSign := zSign xor 1;
  8170. goto normalizeRoundAndPack;
  8171. aExpBigger:
  8172. if ( aExp = $7FFF ) then begin
  8173. if ( aSig0 or aSig1 )<>0 then
  8174. begin
  8175. result := propagateFloat128NaN( a, b );
  8176. exit;
  8177. end;
  8178. result := a;
  8179. exit;
  8180. end;
  8181. if ( bExp = 0 ) then begin
  8182. dec(expDiff);
  8183. end
  8184. else begin
  8185. bSig0 := bSig0 or int64( $4000000000000000 );
  8186. end;
  8187. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8188. aSig0 := aSig0 or int64( $4000000000000000 );
  8189. aBigger:
  8190. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8191. zExp := aExp;
  8192. normalizeRoundAndPack:
  8193. dec(zExp);
  8194. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8195. end;
  8196. {*----------------------------------------------------------------------------
  8197. | Returns the result of adding the quadruple-precision floating-point values
  8198. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8199. | for Binary Floating-Point Arithmetic.
  8200. *----------------------------------------------------------------------------*}
  8201. function float128_add(a: float128; b: float128): float128;
  8202. var
  8203. aSign, bSign: flag;
  8204. begin
  8205. aSign := extractFloat128Sign( a );
  8206. bSign := extractFloat128Sign( b );
  8207. if ( aSign = bSign ) then begin
  8208. result := addFloat128Sigs( a, b, aSign );
  8209. end
  8210. else begin
  8211. result := subFloat128Sigs( a, b, aSign );
  8212. end;
  8213. end;
  8214. {*----------------------------------------------------------------------------
  8215. | Returns the result of subtracting the quadruple-precision floating-point
  8216. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8217. | Standard for Binary Floating-Point Arithmetic.
  8218. *----------------------------------------------------------------------------*}
  8219. function float128_sub(a: float128; b: float128): float128;
  8220. var
  8221. aSign, bSign: flag;
  8222. begin
  8223. aSign := extractFloat128Sign( a );
  8224. bSign := extractFloat128Sign( b );
  8225. if ( aSign = bSign ) then begin
  8226. result := subFloat128Sigs( a, b, aSign );
  8227. end
  8228. else begin
  8229. result := addFloat128Sigs( a, b, aSign );
  8230. end;
  8231. end;
  8232. {*----------------------------------------------------------------------------
  8233. | Returns the result of multiplying the quadruple-precision floating-point
  8234. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8235. | Standard for Binary Floating-Point Arithmetic.
  8236. *----------------------------------------------------------------------------*}
  8237. function float128_mul(a: float128; b: float128): float128;
  8238. var
  8239. aSign, bSign, zSign: flag;
  8240. aExp, bExp, zExp: int32;
  8241. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8242. z: float128;
  8243. label
  8244. invalid;
  8245. begin
  8246. aSig1 := extractFloat128Frac1( a );
  8247. aSig0 := extractFloat128Frac0( a );
  8248. aExp := extractFloat128Exp( a );
  8249. aSign := extractFloat128Sign( a );
  8250. bSig1 := extractFloat128Frac1( b );
  8251. bSig0 := extractFloat128Frac0( b );
  8252. bExp := extractFloat128Exp( b );
  8253. bSign := extractFloat128Sign( b );
  8254. zSign := aSign xor bSign;
  8255. if ( aExp = $7FFF ) then begin
  8256. if ( (( aSig0 or aSig1 )<>0)
  8257. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8258. result := propagateFloat128NaN( a, b );
  8259. exit;
  8260. end;
  8261. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8262. result := packFloat128( zSign, $7FFF, 0, 0 );
  8263. exit;
  8264. end;
  8265. if ( bExp = $7FFF ) then begin
  8266. if ( bSig0 or bSig1 )<>0 then
  8267. begin
  8268. result := propagateFloat128NaN( a, b );
  8269. exit;
  8270. end;
  8271. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8272. invalid:
  8273. float_raise( float_flag_invalid );
  8274. z.low := float128_default_nan_low;
  8275. z.high := float128_default_nan_high;
  8276. result := z;
  8277. exit;
  8278. end;
  8279. result := packFloat128( zSign, $7FFF, 0, 0 );
  8280. exit;
  8281. end;
  8282. if ( aExp = 0 ) then begin
  8283. if ( ( aSig0 or aSig1 ) = 0 ) then
  8284. begin
  8285. result := packFloat128( zSign, 0, 0, 0 );
  8286. exit;
  8287. end;
  8288. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8289. end;
  8290. if ( bExp = 0 ) then begin
  8291. if ( ( bSig0 or bSig1 ) = 0 ) then
  8292. begin
  8293. result := packFloat128( zSign, 0, 0, 0 );
  8294. exit;
  8295. end;
  8296. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8297. end;
  8298. zExp := aExp + bExp - $4000;
  8299. aSig0 := aSig0 or int64( $0001000000000000 );
  8300. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8301. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8302. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8303. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8304. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8305. shift128ExtraRightJamming(
  8306. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8307. inc(zExp);
  8308. end;
  8309. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8310. end;
  8311. {*----------------------------------------------------------------------------
  8312. | Returns the result of dividing the quadruple-precision floating-point value
  8313. | `a' by the corresponding value `b'. The operation is performed according to
  8314. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8315. *----------------------------------------------------------------------------*}
  8316. function float128_div(a: float128; b: float128): float128;
  8317. var
  8318. aSign, bSign, zSign: flag;
  8319. aExp, bExp, zExp: int32;
  8320. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8321. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8322. z: float128;
  8323. label
  8324. invalid;
  8325. begin
  8326. aSig1 := extractFloat128Frac1( a );
  8327. aSig0 := extractFloat128Frac0( a );
  8328. aExp := extractFloat128Exp( a );
  8329. aSign := extractFloat128Sign( a );
  8330. bSig1 := extractFloat128Frac1( b );
  8331. bSig0 := extractFloat128Frac0( b );
  8332. bExp := extractFloat128Exp( b );
  8333. bSign := extractFloat128Sign( b );
  8334. zSign := aSign xor bSign;
  8335. if ( aExp = $7FFF ) then begin
  8336. if ( aSig0 or aSig1 )<>0 then
  8337. begin
  8338. result := propagateFloat128NaN( a, b );
  8339. exit;
  8340. end;
  8341. if ( bExp = $7FFF ) then begin
  8342. if ( bSig0 or bSig1 )<>0 then
  8343. begin
  8344. result := propagateFloat128NaN( a, b );
  8345. exit;
  8346. end;
  8347. goto invalid;
  8348. end;
  8349. result := packFloat128( zSign, $7FFF, 0, 0 );
  8350. exit;
  8351. end;
  8352. if ( bExp = $7FFF ) then begin
  8353. if ( bSig0 or bSig1 )<>0 then
  8354. begin
  8355. result := propagateFloat128NaN( a, b );
  8356. exit;
  8357. end;
  8358. result := packFloat128( zSign, 0, 0, 0 );
  8359. exit;
  8360. end;
  8361. if ( bExp = 0 ) then begin
  8362. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8363. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8364. invalid:
  8365. float_raise( float_flag_invalid );
  8366. z.low := float128_default_nan_low;
  8367. z.high := float128_default_nan_high;
  8368. result := z;
  8369. exit;
  8370. end;
  8371. float_raise( float_flag_divbyzero );
  8372. result := packFloat128( zSign, $7FFF, 0, 0 );
  8373. exit;
  8374. end;
  8375. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8376. end;
  8377. if ( aExp = 0 ) then begin
  8378. if ( ( aSig0 or aSig1 ) = 0 ) then
  8379. begin
  8380. result := packFloat128( zSign, 0, 0, 0 );
  8381. exit;
  8382. end;
  8383. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8384. end;
  8385. zExp := aExp - bExp + $3FFD;
  8386. shortShift128Left(
  8387. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8388. shortShift128Left(
  8389. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8390. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8391. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8392. inc(zExp);
  8393. end;
  8394. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8395. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8396. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8397. while ( sbits64(rem0) < 0 ) do begin
  8398. dec(zSig0);
  8399. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8400. end;
  8401. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8402. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8403. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8404. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8405. while ( sbits64(rem1) < 0 ) do begin
  8406. dec(zSig1);
  8407. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8408. end;
  8409. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8410. end;
  8411. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8412. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8413. end;
  8414. {*----------------------------------------------------------------------------
  8415. | Returns the remainder of the quadruple-precision floating-point value `a'
  8416. | with respect to the corresponding value `b'. The operation is performed
  8417. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8418. *----------------------------------------------------------------------------*}
  8419. function float128_rem(a: float128; b: float128): float128;
  8420. var
  8421. aSign, zSign: flag;
  8422. aExp, bExp, expDiff: int32;
  8423. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8424. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8425. sigMean0: sbits64;
  8426. z: float128;
  8427. label
  8428. invalid;
  8429. begin
  8430. aSig1 := extractFloat128Frac1( a );
  8431. aSig0 := extractFloat128Frac0( a );
  8432. aExp := extractFloat128Exp( a );
  8433. aSign := extractFloat128Sign( a );
  8434. bSig1 := extractFloat128Frac1( b );
  8435. bSig0 := extractFloat128Frac0( b );
  8436. bExp := extractFloat128Exp( b );
  8437. if ( aExp = $7FFF ) then begin
  8438. if ( (( aSig0 or aSig1 )<>0)
  8439. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8440. result := propagateFloat128NaN( a, b );
  8441. exit;
  8442. end;
  8443. goto invalid;
  8444. end;
  8445. if ( bExp = $7FFF ) then begin
  8446. if ( bSig0 or bSig1 )<>0 then
  8447. begin
  8448. result := propagateFloat128NaN( a, b );
  8449. exit;
  8450. end;
  8451. result := a;
  8452. exit;
  8453. end;
  8454. if ( bExp = 0 ) then begin
  8455. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8456. invalid:
  8457. float_raise( float_flag_invalid );
  8458. z.low := float128_default_nan_low;
  8459. z.high := float128_default_nan_high;
  8460. result := z;
  8461. exit;
  8462. end;
  8463. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8464. end;
  8465. if ( aExp = 0 ) then begin
  8466. if ( ( aSig0 or aSig1 ) = 0 ) then
  8467. begin
  8468. result := a;
  8469. exit;
  8470. end;
  8471. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8472. end;
  8473. expDiff := aExp - bExp;
  8474. if ( expDiff < -1 ) then
  8475. begin
  8476. result := a;
  8477. exit;
  8478. end;
  8479. shortShift128Left(
  8480. aSig0 or int64( $0001000000000000 ),
  8481. aSig1,
  8482. 15 - ord( expDiff < 0 ),
  8483. aSig0,
  8484. aSig1
  8485. );
  8486. shortShift128Left(
  8487. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8488. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8489. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8490. dec(expDiff,64);
  8491. while ( 0 < expDiff ) do begin
  8492. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8493. if ( 4 < q ) then
  8494. q := q - 4
  8495. else
  8496. q := 0;
  8497. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8498. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8499. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8500. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8501. dec(expDiff,61);
  8502. end;
  8503. if ( -64 < expDiff ) then begin
  8504. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8505. if ( 4 < q ) then
  8506. q := q - 4
  8507. else
  8508. q := 0;
  8509. q := q shr (- expDiff);
  8510. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8511. inc(expDiff,52);
  8512. if ( expDiff < 0 ) then begin
  8513. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8514. end
  8515. else begin
  8516. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8517. end;
  8518. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8519. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8520. end
  8521. else begin
  8522. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8523. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8524. end;
  8525. repeat
  8526. alternateASig0 := aSig0;
  8527. alternateASig1 := aSig1;
  8528. inc(q);
  8529. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8530. until not( 0 <= sbits64(aSig0) );
  8531. add128(
  8532. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8533. if ( ( sigMean0 < 0 )
  8534. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8535. aSig0 := alternateASig0;
  8536. aSig1 := alternateASig1;
  8537. end;
  8538. zSign := ord( sbits64(aSig0) < 0 );
  8539. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8540. result :=
  8541. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8542. end;
  8543. {*----------------------------------------------------------------------------
  8544. | Returns the square root of the quadruple-precision floating-point value `a'.
  8545. | The operation is performed according to the IEC/IEEE Standard for Binary
  8546. | Floating-Point Arithmetic.
  8547. *----------------------------------------------------------------------------*}
  8548. function float128_sqrt(a: float128): float128;
  8549. var
  8550. aSign: flag;
  8551. aExp, zExp: int32;
  8552. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8553. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8554. z: float128;
  8555. label
  8556. invalid;
  8557. begin
  8558. aSig1 := extractFloat128Frac1( a );
  8559. aSig0 := extractFloat128Frac0( a );
  8560. aExp := extractFloat128Exp( a );
  8561. aSign := extractFloat128Sign( a );
  8562. if ( aExp = $7FFF ) then begin
  8563. if ( aSig0 or aSig1 )<>0 then
  8564. begin
  8565. result := propagateFloat128NaN( a, a );
  8566. exit;
  8567. end;
  8568. if ( aSign=0 ) then
  8569. begin
  8570. result := a;
  8571. exit;
  8572. end;
  8573. goto invalid;
  8574. end;
  8575. if ( aSign<>0 ) then begin
  8576. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8577. begin
  8578. result := a;
  8579. exit;
  8580. end;
  8581. invalid:
  8582. float_raise( float_flag_invalid );
  8583. z.low := float128_default_nan_low;
  8584. z.high := float128_default_nan_high;
  8585. result := z;
  8586. exit;
  8587. end;
  8588. if ( aExp = 0 ) then begin
  8589. if ( ( aSig0 or aSig1 ) = 0 ) then
  8590. begin
  8591. result := packFloat128( 0, 0, 0, 0 );
  8592. exit;
  8593. end;
  8594. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8595. end;
  8596. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8597. aSig0 := aSig0 or int64( $0001000000000000 );
  8598. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8599. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8600. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8601. doubleZSig0 := zSig0 shl 1;
  8602. mul64To128( zSig0, zSig0, term0, term1 );
  8603. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8604. while ( sbits64(rem0) < 0 ) do begin
  8605. dec(zSig0);
  8606. dec(doubleZSig0,2);
  8607. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8608. end;
  8609. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8610. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8611. if ( zSig1 = 0 ) then zSig1 := 1;
  8612. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8613. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8614. mul64To128( zSig1, zSig1, term2, term3 );
  8615. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8616. while ( sbits64(rem1) < 0 ) do begin
  8617. dec(zSig1);
  8618. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8619. term3 := term3 or 1;
  8620. term2 := term2 or doubleZSig0;
  8621. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8622. end;
  8623. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8624. end;
  8625. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8626. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8627. end;
  8628. {*----------------------------------------------------------------------------
  8629. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8630. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8631. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8632. *----------------------------------------------------------------------------*}
  8633. function float128_eq(a: float128; b: float128): flag;
  8634. begin
  8635. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8636. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8637. or ( ( extractFloat128Exp( b ) = $7FFF )
  8638. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8639. ) then begin
  8640. if ( (float128_is_signaling_nan( a )<>0)
  8641. or (float128_is_signaling_nan( b )<>0) ) then begin
  8642. float_raise( float_flag_invalid );
  8643. end;
  8644. result := 0;
  8645. exit;
  8646. end;
  8647. result := ord(
  8648. ( a.low = b.low )
  8649. and ( ( a.high = b.high )
  8650. or ( ( a.low = 0 )
  8651. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8652. ));
  8653. end;
  8654. {*----------------------------------------------------------------------------
  8655. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8656. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8657. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8658. | Arithmetic.
  8659. *----------------------------------------------------------------------------*}
  8660. function float128_le(a: float128; b: float128): flag;
  8661. var
  8662. aSign, bSign: flag;
  8663. begin
  8664. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8665. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8666. or ( ( extractFloat128Exp( b ) = $7FFF )
  8667. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8668. ) then begin
  8669. float_raise( float_flag_invalid );
  8670. result := 0;
  8671. exit;
  8672. end;
  8673. aSign := extractFloat128Sign( a );
  8674. bSign := extractFloat128Sign( b );
  8675. if ( aSign <> bSign ) then begin
  8676. result := ord(
  8677. (aSign<>0)
  8678. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8679. = 0 ));
  8680. exit;
  8681. end;
  8682. if aSign<>0 then
  8683. result := le128( b.high, b.low, a.high, a.low )
  8684. else
  8685. result := le128( a.high, a.low, b.high, b.low );
  8686. end;
  8687. {*----------------------------------------------------------------------------
  8688. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8689. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8690. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8691. *----------------------------------------------------------------------------*}
  8692. function float128_lt(a: float128; b: float128): flag;
  8693. var
  8694. aSign, bSign: flag;
  8695. begin
  8696. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8697. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8698. or ( ( extractFloat128Exp( b ) = $7FFF )
  8699. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8700. ) then begin
  8701. float_raise( float_flag_invalid );
  8702. result := 0;
  8703. exit;
  8704. end;
  8705. aSign := extractFloat128Sign( a );
  8706. bSign := extractFloat128Sign( b );
  8707. if ( aSign <> bSign ) then begin
  8708. result := ord(
  8709. (aSign<>0)
  8710. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8711. <> 0 ));
  8712. exit;
  8713. end;
  8714. if aSign<>0 then
  8715. result := lt128( b.high, b.low, a.high, a.low )
  8716. else
  8717. result := lt128( a.high, a.low, b.high, b.low );
  8718. end;
  8719. {*----------------------------------------------------------------------------
  8720. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8721. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8722. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8723. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8724. *----------------------------------------------------------------------------*}
  8725. function float128_eq_signaling(a: float128; b: float128): flag;
  8726. begin
  8727. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8728. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8729. or ( ( extractFloat128Exp( b ) = $7FFF )
  8730. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8731. ) then begin
  8732. float_raise( float_flag_invalid );
  8733. result := 0;
  8734. exit;
  8735. end;
  8736. result := ord(
  8737. ( a.low = b.low )
  8738. and ( ( a.high = b.high )
  8739. or ( ( a.low = 0 )
  8740. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8741. ));
  8742. end;
  8743. {*----------------------------------------------------------------------------
  8744. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8745. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8746. | cause an exception. Otherwise, the comparison is performed according to the
  8747. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8748. *----------------------------------------------------------------------------*}
  8749. function float128_le_quiet(a: float128; b: float128): flag;
  8750. var
  8751. aSign, bSign: flag;
  8752. begin
  8753. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8754. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8755. or ( ( extractFloat128Exp( b ) = $7FFF )
  8756. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8757. ) then begin
  8758. if ( (float128_is_signaling_nan( a )<>0)
  8759. or (float128_is_signaling_nan( b )<>0) ) then begin
  8760. float_raise( float_flag_invalid );
  8761. end;
  8762. result := 0;
  8763. exit;
  8764. end;
  8765. aSign := extractFloat128Sign( a );
  8766. bSign := extractFloat128Sign( b );
  8767. if ( aSign <> bSign ) then begin
  8768. result := ord(
  8769. (aSign<>0)
  8770. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8771. = 0 ));
  8772. exit;
  8773. end;
  8774. if aSign<>0 then
  8775. result := le128( b.high, b.low, a.high, a.low )
  8776. else
  8777. result := le128( a.high, a.low, b.high, b.low );
  8778. end;
  8779. {*----------------------------------------------------------------------------
  8780. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8781. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8782. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8783. | Standard for Binary Floating-Point Arithmetic.
  8784. *----------------------------------------------------------------------------*}
  8785. function float128_lt_quiet(a: float128; b: float128): flag;
  8786. var
  8787. aSign, bSign: flag;
  8788. begin
  8789. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8790. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8791. or ( ( extractFloat128Exp( b ) = $7FFF )
  8792. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8793. ) then begin
  8794. if ( (float128_is_signaling_nan( a )<>0)
  8795. or (float128_is_signaling_nan( b )<>0) ) then begin
  8796. float_raise( float_flag_invalid );
  8797. end;
  8798. result := 0;
  8799. exit;
  8800. end;
  8801. aSign := extractFloat128Sign( a );
  8802. bSign := extractFloat128Sign( b );
  8803. if ( aSign <> bSign ) then begin
  8804. result := ord(
  8805. (aSign<>0)
  8806. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8807. <> 0 ));
  8808. exit;
  8809. end;
  8810. if aSign<>0 then
  8811. result:=lt128( b.high, b.low, a.high, a.low )
  8812. else
  8813. result:=lt128( a.high, a.low, b.high, b.low );
  8814. end;
  8815. {----------------------------------------------------------------------------
  8816. | Returns the result of converting the double-precision floating-point value
  8817. | `a' to the quadruple-precision floating-point format. The conversion is
  8818. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8819. | Arithmetic.
  8820. *----------------------------------------------------------------------------}
  8821. function float64_to_float128( a : float64) : float128;
  8822. var
  8823. aSign : flag;
  8824. aExp : int16;
  8825. aSig, zSig0, zSig1 : bits64;
  8826. begin
  8827. aSig := extractFloat64Frac( a );
  8828. aExp := extractFloat64Exp( a );
  8829. aSign := extractFloat64Sign( a );
  8830. if ( aExp = $7FF ) then begin
  8831. if ( aSig<>0 ) then begin
  8832. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8833. exit;
  8834. end;
  8835. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8836. exit;
  8837. end;
  8838. if ( aExp = 0 ) then begin
  8839. if ( aSig = 0 ) then
  8840. begin
  8841. result:=packFloat128( aSign, 0, 0, 0 );
  8842. exit;
  8843. end;
  8844. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8845. dec(aExp);
  8846. end;
  8847. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8848. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8849. end;
  8850. {$endif FPC_SOFTFLOAT_FLOAT128}
  8851. {$endif not(defined(fpc_softfpu_interface))}
  8852. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8853. end.
  8854. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}