softfpu.pp 324 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959596059615962596359645965596659675968596959705971597259735974597559765977597859795980598159825983598459855986598759885989599059915992599359945995599659975998599960006001600260036004600560066007600860096010601160126013601460156016601760186019602060216022602360246025602660276028602960306031603260336034603560366037603860396040604160426043604460456046604760486049605060516052605360546055605660576058605960606061606260636064606560666067606860696070607160726073607460756076607760786079608060816082608360846085608660876088608960906091609260936094609560966097609860996100610161026103610461056106610761086109611061116112611361146115611661176118611961206121612261236124612561266127612861296130613161326133613461356136613761386139614061416142614361446145614661476148614961506151615261536154615561566157615861596160616161626163616461656166616761686169617061716172617361746175617661776178617961806181618261836184618561866187618861896190619161926193619461956196619761986199620062016202620362046205620662076208620962106211621262136214621562166217621862196220622162226223622462256226622762286229623062316232623362346235623662376238623962406241624262436244624562466247624862496250625162526253625462556256625762586259626062616262626362646265626662676268626962706271627262736274627562766277627862796280628162826283628462856286628762886289629062916292629362946295629662976298629963006301630263036304630563066307630863096310631163126313631463156316631763186319632063216322632363246325632663276328632963306331633263336334633563366337633863396340634163426343634463456346634763486349635063516352635363546355635663576358635963606361636263636364636563666367636863696370637163726373637463756376637763786379638063816382638363846385638663876388638963906391639263936394639563966397639863996400640164026403640464056406640764086409641064116412641364146415641664176418641964206421642264236424642564266427642864296430643164326433643464356436643764386439644064416442644364446445644664476448644964506451645264536454645564566457645864596460646164626463646464656466646764686469647064716472647364746475647664776478647964806481648264836484648564866487648864896490649164926493649464956496649764986499650065016502650365046505650665076508650965106511651265136514651565166517651865196520652165226523652465256526652765286529653065316532653365346535653665376538653965406541654265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676667766786679668066816682668366846685668666876688668966906691669266936694669566966697669866996700670167026703670467056706670767086709671067116712671367146715671667176718671967206721672267236724672567266727672867296730673167326733673467356736673767386739674067416742674367446745674667476748674967506751675267536754675567566757675867596760676167626763676467656766676767686769677067716772677367746775677667776778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727772787279728072817282728372847285728672877288728972907291729272937294729572967297729872997300730173027303730473057306730773087309731073117312731373147315731673177318731973207321732273237324732573267327732873297330733173327333733473357336733773387339734073417342734373447345734673477348734973507351735273537354735573567357735873597360736173627363736473657366736773687369737073717372737373747375737673777378737973807381738273837384738573867387738873897390739173927393739473957396739773987399740074017402740374047405740674077408740974107411741274137414741574167417741874197420742174227423742474257426742774287429743074317432743374347435743674377438743974407441744274437444744574467447744874497450745174527453745474557456745774587459746074617462746374647465746674677468746974707471747274737474747574767477747874797480748174827483748474857486748774887489749074917492749374947495749674977498749975007501750275037504750575067507750875097510751175127513751475157516751775187519752075217522752375247525752675277528752975307531753275337534753575367537753875397540754175427543754475457546754775487549755075517552755375547555755675577558755975607561756275637564756575667567756875697570757175727573757475757576757775787579758075817582758375847585758675877588758975907591759275937594759575967597759875997600760176027603760476057606760776087609761076117612761376147615761676177618761976207621762276237624762576267627762876297630763176327633763476357636763776387639764076417642764376447645764676477648764976507651765276537654765576567657765876597660766176627663766476657666766776687669767076717672767376747675767676777678767976807681768276837684768576867687768876897690769176927693769476957696769776987699770077017702770377047705770677077708770977107711771277137714771577167717771877197720772177227723772477257726772777287729773077317732773377347735773677377738773977407741774277437744774577467747774877497750775177527753775477557756775777587759776077617762776377647765776677677768776977707771777277737774777577767777777877797780778177827783778477857786778777887789779077917792779377947795779677977798779978007801780278037804780578067807780878097810781178127813781478157816781778187819782078217822782378247825782678277828782978307831783278337834783578367837783878397840784178427843784478457846784778487849785078517852785378547855785678577858785978607861786278637864786578667867786878697870787178727873787478757876787778787879788078817882788378847885788678877888788978907891789278937894789578967897789878997900790179027903790479057906790779087909791079117912791379147915791679177918791979207921792279237924792579267927792879297930793179327933793479357936793779387939794079417942794379447945794679477948794979507951795279537954795579567957795879597960796179627963796479657966796779687969797079717972797379747975797679777978797979807981798279837984798579867987798879897990799179927993799479957996799779987999800080018002800380048005800680078008800980108011801280138014801580168017801880198020802180228023802480258026802780288029803080318032803380348035803680378038803980408041804280438044804580468047804880498050805180528053805480558056805780588059806080618062806380648065806680678068806980708071807280738074807580768077807880798080808180828083808480858086808780888089809080918092809380948095809680978098809981008101810281038104810581068107810881098110811181128113811481158116811781188119812081218122812381248125812681278128812981308131813281338134813581368137813881398140814181428143814481458146814781488149815081518152815381548155815681578158815981608161816281638164816581668167816881698170817181728173817481758176817781788179818081818182818381848185818681878188818981908191819281938194819581968197819881998200820182028203820482058206820782088209821082118212821382148215821682178218821982208221822282238224822582268227822882298230823182328233823482358236823782388239824082418242824382448245824682478248824982508251825282538254825582568257825882598260826182628263826482658266826782688269827082718272827382748275827682778278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877787788779878087818782878387848785878687878788878987908791879287938794879587968797879887998800880188028803880488058806880788088809881088118812881388148815881688178818881988208821882288238824882588268827882888298830883188328833883488358836883788388839884088418842884388448845884688478848884988508851885288538854885588568857885888598860886188628863886488658866886788688869887088718872887388748875887688778878887988808881888288838884888588868887888888898890889188928893889488958896889788988899890089018902890389048905890689078908890989108911891289138914891589168917891889198920892189228923892489258926892789288929893089318932893389348935893689378938893989408941894289438944894589468947894889498950895189528953895489558956895789588959896089618962896389648965896689678968896989708971897289738974897589768977897889798980898189828983898489858986898789888989899089918992899389948995899689978998899990009001900290039004900590069007900890099010901190129013901490159016901790189019902090219022902390249025902690279028902990309031903290339034903590369037903890399040904190429043904490459046904790489049905090519052905390549055905690579058905990609061906290639064906590669067906890699070907190729073907490759076907790789079908090819082908390849085908690879088908990909091909290939094909590969097909890999100910191029103910491059106910791089109911091119112911391149115911691179118911991209121912291239124912591269127912891299130913191329133913491359136913791389139914091419142914391449145914691479148914991509151915291539154915591569157915891599160916191629163916491659166916791689169917091719172917391749175917691779178917991809181918291839184918591869187918891899190919191929193919491959196919791989199920092019202920392049205920692079208920992109211921292139214921592169217921892199220922192229223922492259226922792289229923092319232923392349235923692379238923992409241924292439244924592469247924892499250925192529253925492559256925792589259926092619262926392649265926692679268926992709271927292739274927592769277927892799280928192829283928492859286928792889289929092919292929392949295929692979298929993009301930293039304930593069307930893099310931193129313931493159316931793189319932093219322932393249325932693279328932993309331933293339334933593369337
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. {$macro on}
  69. {$define compilerproc:=stdcall }
  70. interface
  71. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  72. {$if not(defined(fpc_softfpu_implementation))}
  73. {
  74. -------------------------------------------------------------------------------
  75. Software IEC/IEEE floating-point types.
  76. -------------------------------------------------------------------------------
  77. }
  78. TYPE
  79. float32 = longword;
  80. {$define FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. float64 = record
  103. case byte of
  104. 1: (low,high : bits32);
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 2: (dummy : double);
  109. end;
  110. int64rec = record
  111. case byte of
  112. 1: (low,high : bits32);
  113. // force the record to be aligned like a double
  114. // else *_to_double will fail for cpus like sparc
  115. // and avoid expensive unpacking/packing operations
  116. 2: (dummy : int64);
  117. end;
  118. floatx80 = record
  119. case byte of
  120. 1: (low : qword;high : word);
  121. // force the record to be aligned like a double
  122. // else *_to_double will fail for cpus like sparc
  123. // and avoid expensive unpacking/packing operations
  124. 2: (dummy : extended);
  125. end;
  126. float128 = record
  127. case byte of
  128. 1: (low,high : qword);
  129. // force the record to be aligned like a double
  130. // else *_to_double will fail for cpus like sparc
  131. // and avoid expensive unpacking/packing operations
  132. 2: (dummy : qword);
  133. end;
  134. {$else}
  135. float64 = record
  136. case byte of
  137. 1: (high,low : bits32);
  138. // force the record to be aligned like a double
  139. // else *_to_double will fail for cpus like sparc
  140. 2: (dummy : double);
  141. end;
  142. int64rec = record
  143. case byte of
  144. 1: (high,low : bits32);
  145. // force the record to be aligned like a double
  146. // else *_to_double will fail for cpus like sparc
  147. // and avoid expensive unpacking/packing operations
  148. 2: (dummy : int64);
  149. end;
  150. floatx80 = record
  151. case byte of
  152. 1: (high : word;low : qword);
  153. // force the record to be aligned like a double
  154. // else *_to_double will fail for cpus like sparc
  155. // and avoid expensive unpacking/packing operations
  156. 2: (dummy : qword);
  157. end;
  158. float128 = record
  159. case byte of
  160. 1: (high : qword;low : qword);
  161. // force the record to be aligned like a double
  162. // else *_to_double will fail for cpus like sparc
  163. // and avoid expensive unpacking/packing operations
  164. 2: (dummy : qword);
  165. end;
  166. {$endif}
  167. {$define FPC_SYSTEM_HAS_float64}
  168. {*
  169. -------------------------------------------------------------------------------
  170. Returns 1 if the double-precision floating-point value `a' is less than
  171. the corresponding value `b', and 0 otherwise. The comparison is performed
  172. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  173. -------------------------------------------------------------------------------
  174. *}
  175. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  176. {*
  177. -------------------------------------------------------------------------------
  178. Returns 1 if the double-precision floating-point value `a' is less than
  179. or equal to the corresponding value `b', and 0 otherwise. The comparison
  180. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  181. Arithmetic.
  182. -------------------------------------------------------------------------------
  183. *}
  184. Function float64_le(a: float64;b: float64): flag; compilerproc;
  185. {*
  186. -------------------------------------------------------------------------------
  187. Returns 1 if the double-precision floating-point value `a' is equal to
  188. the corresponding value `b', and 0 otherwise. The comparison is performed
  189. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  190. -------------------------------------------------------------------------------
  191. *}
  192. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  193. {*
  194. -------------------------------------------------------------------------------
  195. Returns the square root of the double-precision floating-point value `a'.
  196. The operation is performed according to the IEC/IEEE Standard for Binary
  197. Floating-Point Arithmetic.
  198. -------------------------------------------------------------------------------
  199. *}
  200. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  201. {*
  202. -------------------------------------------------------------------------------
  203. Returns the remainder of the double-precision floating-point value `a'
  204. with respect to the corresponding value `b'. The operation is performed
  205. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  206. -------------------------------------------------------------------------------
  207. *}
  208. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  209. {*
  210. -------------------------------------------------------------------------------
  211. Returns the result of dividing the double-precision floating-point value `a'
  212. by the corresponding value `b'. The operation is performed according to the
  213. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  214. -------------------------------------------------------------------------------
  215. *}
  216. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  217. {*
  218. -------------------------------------------------------------------------------
  219. Returns the result of multiplying the double-precision floating-point values
  220. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  221. for Binary Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Returns the result of subtracting the double-precision floating-point values
  228. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  229. for Binary Floating-Point Arithmetic.
  230. -------------------------------------------------------------------------------
  231. *}
  232. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  233. {*
  234. -------------------------------------------------------------------------------
  235. Returns the result of adding the double-precision floating-point values `a'
  236. and `b'. The operation is performed according to the IEC/IEEE Standard for
  237. Binary Floating-Point Arithmetic.
  238. -------------------------------------------------------------------------------
  239. *}
  240. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  241. {*
  242. -------------------------------------------------------------------------------
  243. Rounds the double-precision floating-point value `a' to an integer,
  244. and returns the result as a double-precision floating-point value. The
  245. operation is performed according to the IEC/IEEE Standard for Binary
  246. Floating-Point Arithmetic.
  247. -------------------------------------------------------------------------------
  248. *}
  249. Function float64_round_to_int(a: float64) : float64; compilerproc;
  250. {*
  251. -------------------------------------------------------------------------------
  252. Returns the result of converting the double-precision floating-point value
  253. `a' to the single-precision floating-point format. The conversion is
  254. performed according to the IEC/IEEE Standard for Binary Floating-Point
  255. Arithmetic.
  256. -------------------------------------------------------------------------------
  257. *}
  258. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  259. {*
  260. -------------------------------------------------------------------------------
  261. Returns the result of converting the double-precision floating-point value
  262. `a' to the 32-bit two's complement integer format. The conversion is
  263. performed according to the IEC/IEEE Standard for Binary Floating-Point
  264. Arithmetic, except that the conversion is always rounded toward zero.
  265. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  266. the conversion overflows, the largest integer with the same sign as `a' is
  267. returned.
  268. -------------------------------------------------------------------------------
  269. *}
  270. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  271. {*
  272. -------------------------------------------------------------------------------
  273. Returns the result of converting the double-precision floating-point value
  274. `a' to the 32-bit two's complement integer format. The conversion is
  275. performed according to the IEC/IEEE Standard for Binary Floating-Point
  276. Arithmetic---which means in particular that the conversion is rounded
  277. according to the current rounding mode. If `a' is a NaN, the largest
  278. positive integer is returned. Otherwise, if the conversion overflows, the
  279. largest integer with the same sign as `a' is returned.
  280. -------------------------------------------------------------------------------
  281. *}
  282. Function float64_to_int32(a: float64): int32; compilerproc;
  283. {*
  284. -------------------------------------------------------------------------------
  285. Returns 1 if the single-precision floating-point value `a' is less than
  286. the corresponding value `b', and 0 otherwise. The comparison is performed
  287. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  288. -------------------------------------------------------------------------------
  289. *}
  290. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  291. {*
  292. -------------------------------------------------------------------------------
  293. Returns 1 if the single-precision floating-point value `a' is less than
  294. or equal to the corresponding value `b', and 0 otherwise. The comparison
  295. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  296. Arithmetic.
  297. -------------------------------------------------------------------------------
  298. *}
  299. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  300. {*
  301. -------------------------------------------------------------------------------
  302. Returns 1 if the single-precision floating-point value `a' is equal to
  303. the corresponding value `b', and 0 otherwise. The comparison is performed
  304. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  305. -------------------------------------------------------------------------------
  306. *}
  307. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  308. {*
  309. -------------------------------------------------------------------------------
  310. Returns the square root of the single-precision floating-point value `a'.
  311. The operation is performed according to the IEC/IEEE Standard for Binary
  312. Floating-Point Arithmetic.
  313. -------------------------------------------------------------------------------
  314. *}
  315. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  316. {*
  317. -------------------------------------------------------------------------------
  318. Returns the remainder of the single-precision floating-point value `a'
  319. with respect to the corresponding value `b'. The operation is performed
  320. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  321. -------------------------------------------------------------------------------
  322. *}
  323. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  324. {*
  325. -------------------------------------------------------------------------------
  326. Returns the result of dividing the single-precision floating-point value `a'
  327. by the corresponding value `b'. The operation is performed according to the
  328. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  329. -------------------------------------------------------------------------------
  330. *}
  331. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  332. {*
  333. -------------------------------------------------------------------------------
  334. Returns the result of multiplying the single-precision floating-point values
  335. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  336. for Binary Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Returns the result of subtracting the single-precision floating-point values
  343. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  344. for Binary Floating-Point Arithmetic.
  345. -------------------------------------------------------------------------------
  346. *}
  347. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  348. {*
  349. -------------------------------------------------------------------------------
  350. Returns the result of adding the single-precision floating-point values `a'
  351. and `b'. The operation is performed according to the IEC/IEEE Standard for
  352. Binary Floating-Point Arithmetic.
  353. -------------------------------------------------------------------------------
  354. *}
  355. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  356. {*
  357. -------------------------------------------------------------------------------
  358. Rounds the single-precision floating-point value `a' to an integer,
  359. and returns the result as a single-precision floating-point value. The
  360. operation is performed according to the IEC/IEEE Standard for Binary
  361. Floating-Point Arithmetic.
  362. -------------------------------------------------------------------------------
  363. *}
  364. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  365. {*
  366. -------------------------------------------------------------------------------
  367. Returns the result of converting the single-precision floating-point value
  368. `a' to the double-precision floating-point format. The conversion is
  369. performed according to the IEC/IEEE Standard for Binary Floating-Point
  370. Arithmetic.
  371. -------------------------------------------------------------------------------
  372. *}
  373. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  374. {*
  375. -------------------------------------------------------------------------------
  376. Returns the result of converting the single-precision floating-point value
  377. `a' to the 32-bit two's complement integer format. The conversion is
  378. performed according to the IEC/IEEE Standard for Binary Floating-Point
  379. Arithmetic, except that the conversion is always rounded toward zero.
  380. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  381. the conversion overflows, the largest integer with the same sign as `a' is
  382. returned.
  383. -------------------------------------------------------------------------------
  384. *}
  385. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  386. {*
  387. -------------------------------------------------------------------------------
  388. Returns the result of converting the single-precision floating-point value
  389. `a' to the 32-bit two's complement integer format. The conversion is
  390. performed according to the IEC/IEEE Standard for Binary Floating-Point
  391. Arithmetic---which means in particular that the conversion is rounded
  392. according to the current rounding mode. If `a' is a NaN, the largest
  393. positive integer is returned. Otherwise, if the conversion overflows, the
  394. largest integer with the same sign as `a' is returned.
  395. -------------------------------------------------------------------------------
  396. *}
  397. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  398. {*
  399. -------------------------------------------------------------------------------
  400. Returns the result of converting the 32-bit two's complement integer `a' to
  401. the double-precision floating-point format. The conversion is performed
  402. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  403. -------------------------------------------------------------------------------
  404. *}
  405. Function int32_to_float64( a: int32) : float64; compilerproc;
  406. {*
  407. -------------------------------------------------------------------------------
  408. Returns the result of converting the 32-bit two's complement integer `a' to
  409. the single-precision floating-point format. The conversion is performed
  410. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  411. -------------------------------------------------------------------------------
  412. *}
  413. Function int32_to_float32( a: int32): float32rec; compilerproc;
  414. {*----------------------------------------------------------------------------
  415. | Returns the result of converting the 64-bit two's complement integer `a'
  416. | to the double-precision floating-point format. The conversion is performed
  417. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  418. *----------------------------------------------------------------------------*}
  419. Function int64_to_float64( a: int64 ): float64; compilerproc;
  420. Function qword_to_float64( a: qword ): float64; compilerproc;
  421. {*----------------------------------------------------------------------------
  422. | Returns the result of converting the 64-bit two's complement integer `a'
  423. | to the single-precision floating-point format. The conversion is performed
  424. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  425. *----------------------------------------------------------------------------*}
  426. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  427. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  428. // +++
  429. function float32_to_int64( a: float32 ): int64;
  430. function float32_to_int64_round_to_zero( a: float32 ): int64;
  431. function float32_eq_signaling( a: float32; b: float32) : flag;
  432. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  433. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  434. function float32_is_signaling_nan( a : float32 ): flag;
  435. function float32_is_nan( a : float32 ): flag;
  436. function float64_to_int64( a: float64 ): int64;
  437. function float64_to_int64_round_to_zero( a: float64 ): int64;
  438. function float64_eq_signaling( a: float64; b: float64): flag;
  439. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  440. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  441. function float64_is_signaling_nan( a : float64 ): flag;
  442. function float64_is_nan( a : float64 ): flag;
  443. // ===
  444. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  445. {*----------------------------------------------------------------------------
  446. | Extended double-precision rounding precision
  447. *----------------------------------------------------------------------------*}
  448. var // threadvar!?
  449. floatx80_rounding_precision : int8 = 80;
  450. function int32_to_floatx80( a: int32 ): floatx80;
  451. function int64_to_floatx80( a: int64 ): floatx80;
  452. function qword_to_floatx80( a: qword ): floatx80;
  453. function float32_to_floatx80( a: float32 ): floatx80;
  454. function float64_to_floatx80( a: float64 ): floatx80;
  455. function floatx80_to_int32( a: floatx80 ): int32;
  456. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  457. function floatx80_to_int64( a: floatx80 ): int64;
  458. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  459. function floatx80_to_float32( a: floatx80 ): float32;
  460. function floatx80_to_float64( a: floatx80 ): float64;
  461. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  462. function floatx80_to_float128( a: floatx80 ): float128;
  463. {$endif FPC_SOFTFLOAT_FLOAT128}
  464. function floatx80_round_to_int( a: floatx80 ): floatx80;
  465. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  466. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  467. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  468. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  469. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  470. function floatx80_sqrt( a: floatx80 ): floatx80;
  471. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  472. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  473. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  474. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  475. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  476. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  477. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  478. function floatx80_is_nan(a : floatx80 ): flag;
  479. {$endif FPC_SOFTFLOAT_FLOATX80}
  480. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  481. function int32_to_float128( a: int32 ): float128;
  482. function int64_to_float128( a: int64 ): float128;
  483. function qword_to_float128( a: qword ): float128;
  484. function float32_to_float128( a: float32 ): float128;
  485. function float128_is_nan( a : float128): flag;
  486. function float128_is_signaling_nan( a : float128): flag;
  487. function float128_to_int32(a: float128): int32;
  488. function float128_to_int32_round_to_zero(a: float128): int32;
  489. function float128_to_int64(a: float128): int64;
  490. function float128_to_int64_round_to_zero(a: float128): int64;
  491. function float128_to_float32(a: float128): float32;
  492. function float128_to_float64(a: float128): float64;
  493. function float64_to_float128( a : float64) : float128;
  494. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  495. function float128_to_floatx80(a: float128): floatx80;
  496. {$endif FPC_SOFTFLOAT_FLOATX80}
  497. function float128_round_to_int(a: float128): float128;
  498. function float128_add(a: float128; b: float128): float128;
  499. function float128_sub(a: float128; b: float128): float128;
  500. function float128_mul(a: float128; b: float128): float128;
  501. function float128_div(a: float128; b: float128): float128;
  502. function float128_rem(a: float128; b: float128): float128;
  503. function float128_sqrt(a: float128): float128;
  504. function float128_eq(a: float128; b: float128): flag;
  505. function float128_le(a: float128; b: float128): flag;
  506. function float128_lt(a: float128; b: float128): flag;
  507. function float128_eq_signaling(a: float128; b: float128): flag;
  508. function float128_le_quiet(a: float128; b: float128): flag;
  509. function float128_lt_quiet(a: float128; b: float128): flag;
  510. {$endif FPC_SOFTFLOAT_FLOAT128}
  511. CONST
  512. {-------------------------------------------------------------------------------
  513. Software IEC/IEEE floating-point underflow tininess-detection mode.
  514. -------------------------------------------------------------------------------
  515. *}
  516. float_tininess_after_rounding = 0;
  517. float_tininess_before_rounding = 1;
  518. {*
  519. -------------------------------------------------------------------------------
  520. Underflow tininess-detection mode, statically initialized to default value.
  521. (The declaration in `softfloat.h' must match the `int8' type here.)
  522. -------------------------------------------------------------------------------
  523. *}
  524. var // threadvar!?
  525. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  526. {$endif not(defined(fpc_softfpu_implementation))}
  527. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  528. implementation
  529. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  530. {$if not(defined(fpc_softfpu_interface))}
  531. (*****************************************************************************)
  532. (*----------------------------------------------------------------------------*)
  533. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  534. (* division and square root approximations. (Can be specialized to target if *)
  535. (* desired.) *)
  536. (* ---------------------------------------------------------------------------*)
  537. (*****************************************************************************)
  538. {*----------------------------------------------------------------------------
  539. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  540. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  541. | input. If `zSign' is 1, the input is negated before being converted to an
  542. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  543. | is simply rounded to an integer, with the inexact exception raised if the
  544. | input cannot be represented exactly as an integer. However, if the fixed-
  545. | point input is too large, the invalid exception is raised and the largest
  546. | positive or negative integer is returned.
  547. *----------------------------------------------------------------------------*}
  548. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  549. var
  550. roundingMode: int8;
  551. roundNearestEven: flag;
  552. roundIncrement, roundBits: int8;
  553. z: int32;
  554. begin
  555. roundingMode := softfloat_rounding_mode;
  556. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  557. roundIncrement := $40;
  558. if ( roundNearestEven=0 ) then
  559. begin
  560. if ( roundingMode = float_round_to_zero ) then
  561. begin
  562. roundIncrement := 0;
  563. end
  564. else begin
  565. roundIncrement := $7F;
  566. if ( zSign<>0 ) then
  567. begin
  568. if ( roundingMode = float_round_up ) then
  569. roundIncrement := 0;
  570. end
  571. else begin
  572. if ( roundingMode = float_round_down ) then
  573. roundIncrement := 0;
  574. end;
  575. end;
  576. end;
  577. roundBits := absZ and $7F;
  578. absZ := ( absZ + roundIncrement ) shr 7;
  579. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  580. z := absZ;
  581. if ( zSign<>0 ) then
  582. z := - z;
  583. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  584. begin
  585. float_raise( float_flag_invalid );
  586. if zSign<>0 then
  587. result:=sbits32($80000000)
  588. else
  589. result:=$7FFFFFFF;
  590. exit;
  591. end;
  592. if ( roundBits<>0 ) then
  593. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  594. result:=z;
  595. end;
  596. {*----------------------------------------------------------------------------
  597. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  598. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  599. | and returns the properly rounded 64-bit integer corresponding to the input.
  600. | If `zSign' is 1, the input is negated before being converted to an integer.
  601. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  602. | the inexact exception raised if the input cannot be represented exactly as
  603. | an integer. However, if the fixed-point input is too large, the invalid
  604. | exception is raised and the largest positive or negative integer is
  605. | returned.
  606. *----------------------------------------------------------------------------*}
  607. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  608. var
  609. roundingMode: int8;
  610. roundNearestEven, increment: flag;
  611. z: int64;
  612. label
  613. overflow;
  614. begin
  615. roundingMode := softfloat_rounding_mode;
  616. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  617. increment := ord( sbits64(absZ1) < 0 );
  618. if ( roundNearestEven=0 ) then
  619. begin
  620. if ( roundingMode = float_round_to_zero ) then
  621. begin
  622. increment := 0;
  623. end
  624. else begin
  625. if ( zSign<>0 ) then
  626. begin
  627. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  628. end
  629. else begin
  630. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  631. end;
  632. end;
  633. end;
  634. if ( increment<>0 ) then
  635. begin
  636. inc(absZ0);
  637. if ( absZ0 = 0 ) then
  638. goto overflow;
  639. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  640. end;
  641. z := absZ0;
  642. if ( zSign<>0 ) then
  643. z := - z;
  644. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  645. begin
  646. overflow:
  647. float_raise( float_flag_invalid );
  648. if zSign<>0 then
  649. result:=int64($8000000000000000)
  650. else
  651. result:=int64($7FFFFFFFFFFFFFFF);
  652. exit;
  653. end;
  654. if ( absZ1<>0 ) then
  655. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  656. result:=z;
  657. end;
  658. {*
  659. -------------------------------------------------------------------------------
  660. Shifts `a' right by the number of bits given in `count'. If any nonzero
  661. bits are shifted off, they are ``jammed'' into the least significant bit of
  662. the result by setting the least significant bit to 1. The value of `count'
  663. can be arbitrarily large; in particular, if `count' is greater than 32, the
  664. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  665. The result is stored in the location pointed to by `zPtr'.
  666. -------------------------------------------------------------------------------
  667. *}
  668. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  669. var
  670. z: Bits32;
  671. Begin
  672. if ( count = 0 ) then
  673. z := a
  674. else
  675. if ( count < 32 ) then
  676. Begin
  677. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  678. End
  679. else
  680. Begin
  681. z := bits32( a <> 0 );
  682. End;
  683. zPtr := z;
  684. End;
  685. {*----------------------------------------------------------------------------
  686. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  687. | number of bits given in `count'. Any bits shifted off are lost. The value
  688. | of `count' can be arbitrarily large; in particular, if `count' is greater
  689. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  690. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  691. *----------------------------------------------------------------------------*}
  692. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  693. var
  694. z0, z1: bits64;
  695. negCount: int8;
  696. begin
  697. negCount := ( - count ) and 63;
  698. if ( count = 0 ) then
  699. begin
  700. z1 := a1;
  701. z0 := a0;
  702. end
  703. else if ( count < 64 ) then
  704. begin
  705. z1 := ( a0 shl negCount ) or ( a1 shr count );
  706. z0 := a0 shr count;
  707. end
  708. else
  709. begin
  710. if ( count < 128 ) then
  711. z1 := a0 shr ( count and 63 )
  712. else
  713. z1 := 0;
  714. z0 := 0;
  715. end;
  716. z1Ptr := z1;
  717. z0Ptr := z0;
  718. end;
  719. {*----------------------------------------------------------------------------
  720. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  721. | number of bits given in `count'. If any nonzero bits are shifted off, they
  722. | are ``jammed'' into the least significant bit of the result by setting the
  723. | least significant bit to 1. The value of `count' can be arbitrarily large;
  724. | in particular, if `count' is greater than 128, the result will be either
  725. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  726. | nonzero. The result is broken into two 64-bit pieces which are stored at
  727. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  728. *----------------------------------------------------------------------------*}
  729. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  730. var
  731. z0,z1 : bits64;
  732. negCount : int8;
  733. begin
  734. negCount := ( - count ) and 63;
  735. if ( count = 0 ) then begin
  736. z1 := a1;
  737. z0 := a0;
  738. end
  739. else if ( count < 64 ) then begin
  740. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  741. z0 := a0>>count;
  742. end
  743. else begin
  744. if ( count = 64 ) then begin
  745. z1 := a0 or ord( a1 <> 0 );
  746. end
  747. else if ( count < 128 ) then begin
  748. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  749. end
  750. else begin
  751. z1 := ord( ( a0 or a1 ) <> 0 );
  752. end;
  753. z0 := 0;
  754. end;
  755. z1Ptr := z1;
  756. z0Ptr := z0;
  757. end;
  758. {*
  759. -------------------------------------------------------------------------------
  760. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  761. number of bits given in `count'. Any bits shifted off are lost. The value
  762. of `count' can be arbitrarily large; in particular, if `count' is greater
  763. than 64, the result will be 0. The result is broken into two 32-bit pieces
  764. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  765. -------------------------------------------------------------------------------
  766. *}
  767. Procedure
  768. shift64Right(
  769. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  770. Var
  771. z0, z1: bits32;
  772. negCount : int8;
  773. Begin
  774. negCount := ( - count ) AND 31;
  775. if ( count = 0 ) then
  776. Begin
  777. z1 := a1;
  778. z0 := a0;
  779. End
  780. else if ( count < 32 ) then
  781. Begin
  782. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  783. z0 := a0 shr count;
  784. End
  785. else
  786. Begin
  787. if (count < 64) then
  788. z1 := ( a0 shr ( count AND 31 ) )
  789. else
  790. z1 := 0;
  791. z0 := 0;
  792. End;
  793. z1Ptr := z1;
  794. z0Ptr := z0;
  795. End;
  796. {*
  797. -------------------------------------------------------------------------------
  798. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  799. number of bits given in `count'. If any nonzero bits are shifted off, they
  800. are ``jammed'' into the least significant bit of the result by setting the
  801. least significant bit to 1. The value of `count' can be arbitrarily large;
  802. in particular, if `count' is greater than 64, the result will be either 0
  803. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  804. nonzero. The result is broken into two 32-bit pieces which are stored at
  805. the locations pointed to by `z0Ptr' and `z1Ptr'.
  806. -------------------------------------------------------------------------------
  807. *}
  808. Procedure
  809. shift64RightJamming(
  810. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  811. VAR
  812. z0, z1 : bits32;
  813. negCount : int8;
  814. Begin
  815. negCount := ( - count ) AND 31;
  816. if ( count = 0 ) then
  817. Begin
  818. z1 := a1;
  819. z0 := a0;
  820. End
  821. else
  822. if ( count < 32 ) then
  823. Begin
  824. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  825. z0 := a0 shr count;
  826. End
  827. else
  828. Begin
  829. if ( count = 32 ) then
  830. Begin
  831. z1 := a0 OR bits32( a1 <> 0 );
  832. End
  833. else
  834. if ( count < 64 ) Then
  835. Begin
  836. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  837. End
  838. else
  839. Begin
  840. z1 := bits32( ( a0 OR a1 ) <> 0 );
  841. End;
  842. z0 := 0;
  843. End;
  844. z1Ptr := z1;
  845. z0Ptr := z0;
  846. End;
  847. {*----------------------------------------------------------------------------
  848. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  849. | bits are shifted off, they are ``jammed'' into the least significant bit of
  850. | the result by setting the least significant bit to 1. The value of `count'
  851. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  852. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  853. | The result is stored in the location pointed to by `zPtr'.
  854. *----------------------------------------------------------------------------*}
  855. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  856. var
  857. z: bits64;
  858. begin
  859. if ( count = 0 ) then
  860. begin
  861. z := a;
  862. end
  863. else if ( count < 64 ) then
  864. begin
  865. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  866. end
  867. else
  868. begin
  869. z := ord( a <> 0 );
  870. end;
  871. zPtr := z;
  872. end;
  873. {$if not defined(shift64ExtraRightJamming)}
  874. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  875. overload;
  876. forward;
  877. {$endif}
  878. {*
  879. -------------------------------------------------------------------------------
  880. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  881. by 32 _plus_ the number of bits given in `count'. The shifted result is
  882. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  883. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  884. off form a third 32-bit result as follows: The _last_ bit shifted off is
  885. the most-significant bit of the extra result, and the other 31 bits of the
  886. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  887. were all zero. This extra result is stored in the location pointed to by
  888. `z2Ptr'. The value of `count' can be arbitrarily large.
  889. (This routine makes more sense if `a0', `a1', and `a2' are considered
  890. to form a fixed-point value with binary point between `a1' and `a2'. This
  891. fixed-point value is shifted right by the number of bits given in `count',
  892. and the integer part of the result is returned at the locations pointed to
  893. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  894. corrupted as described above, and is returned at the location pointed to by
  895. `z2Ptr'.)
  896. -------------------------------------------------------------------------------
  897. }
  898. Procedure
  899. shift64ExtraRightJamming(
  900. a0: bits32;
  901. a1: bits32;
  902. a2: bits32;
  903. count: int16;
  904. VAR z0Ptr: bits32;
  905. VAR z1Ptr: bits32;
  906. VAR z2Ptr: bits32
  907. ); overload;
  908. Var
  909. z0, z1, z2: bits32;
  910. negCount : int8;
  911. Begin
  912. negCount := ( - count ) AND 31;
  913. if ( count = 0 ) then
  914. Begin
  915. z2 := a2;
  916. z1 := a1;
  917. z0 := a0;
  918. End
  919. else
  920. Begin
  921. if ( count < 32 ) Then
  922. Begin
  923. z2 := a1 shl negCount;
  924. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  925. z0 := a0 shr count;
  926. End
  927. else
  928. Begin
  929. if ( count = 32 ) then
  930. Begin
  931. z2 := a1;
  932. z1 := a0;
  933. End
  934. else
  935. Begin
  936. a2 := a2 or a1;
  937. if ( count < 64 ) then
  938. Begin
  939. z2 := a0 shl negCount;
  940. z1 := a0 shr ( count AND 31 );
  941. End
  942. else
  943. Begin
  944. if count = 64 then
  945. z2 := a0
  946. else
  947. z2 := bits32(a0 <> 0);
  948. z1 := 0;
  949. End;
  950. End;
  951. z0 := 0;
  952. End;
  953. z2 := z2 or bits32( a2 <> 0 );
  954. End;
  955. z2Ptr := z2;
  956. z1Ptr := z1;
  957. z0Ptr := z0;
  958. End;
  959. {*
  960. -------------------------------------------------------------------------------
  961. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  962. number of bits given in `count'. Any bits shifted off are lost. The value
  963. of `count' must be less than 32. The result is broken into two 32-bit
  964. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  965. -------------------------------------------------------------------------------
  966. *}
  967. Procedure
  968. shortShift64Left(
  969. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  970. Begin
  971. z1Ptr := a1 shl count;
  972. if count = 0 then
  973. z0Ptr := a0
  974. else
  975. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  976. End;
  977. {*
  978. -------------------------------------------------------------------------------
  979. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  980. by the number of bits given in `count'. Any bits shifted off are lost.
  981. The value of `count' must be less than 32. The result is broken into three
  982. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  983. `z1Ptr', and `z2Ptr'.
  984. -------------------------------------------------------------------------------
  985. *}
  986. Procedure
  987. shortShift96Left(
  988. a0: bits32;
  989. a1: bits32;
  990. a2: bits32;
  991. count: int16;
  992. VAR z0Ptr: bits32;
  993. VAR z1Ptr: bits32;
  994. VAR z2Ptr: bits32
  995. );
  996. Var
  997. z0, z1, z2: bits32;
  998. negCount: int8;
  999. Begin
  1000. z2 := a2 shl count;
  1001. z1 := a1 shl count;
  1002. z0 := a0 shl count;
  1003. if ( 0 < count ) then
  1004. Begin
  1005. negCount := ( ( - count ) AND 31 );
  1006. z1 := z1 or (a2 shr negCount);
  1007. z0 := z0 or (a1 shr negCount);
  1008. End;
  1009. z2Ptr := z2;
  1010. z1Ptr := z1;
  1011. z0Ptr := z0;
  1012. End;
  1013. {*----------------------------------------------------------------------------
  1014. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1015. | number of bits given in `count'. Any bits shifted off are lost. The value
  1016. | of `count' must be less than 64. The result is broken into two 64-bit
  1017. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1018. *----------------------------------------------------------------------------*}
  1019. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1020. begin
  1021. z1Ptr := a1 shl count;
  1022. if count=0 then
  1023. z0Ptr:=a0
  1024. else
  1025. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1026. end;
  1027. {*
  1028. -------------------------------------------------------------------------------
  1029. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1030. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1031. any carry out is lost. The result is broken into two 32-bit pieces which
  1032. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1033. -------------------------------------------------------------------------------
  1034. *}
  1035. Procedure
  1036. add64(
  1037. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  1038. Var
  1039. z1: bits32;
  1040. Begin
  1041. z1 := a1 + b1;
  1042. z1Ptr := z1;
  1043. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1044. End;
  1045. {*
  1046. -------------------------------------------------------------------------------
  1047. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1048. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1049. modulo 2^96, so any carry out is lost. The result is broken into three
  1050. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1051. `z1Ptr', and `z2Ptr'.
  1052. -------------------------------------------------------------------------------
  1053. *}
  1054. Procedure
  1055. add96(
  1056. a0: bits32;
  1057. a1: bits32;
  1058. a2: bits32;
  1059. b0: bits32;
  1060. b1: bits32;
  1061. b2: bits32;
  1062. VAR z0Ptr: bits32;
  1063. VAR z1Ptr: bits32;
  1064. VAR z2Ptr: bits32
  1065. );
  1066. var
  1067. z0, z1, z2: bits32;
  1068. carry0, carry1: int8;
  1069. Begin
  1070. z2 := a2 + b2;
  1071. carry1 := int8( z2 < a2 );
  1072. z1 := a1 + b1;
  1073. carry0 := int8( z1 < a1 );
  1074. z0 := a0 + b0;
  1075. z1 := z1 + carry1;
  1076. z0 := z0 + bits32( z1 < carry1 );
  1077. z0 := z0 + carry0;
  1078. z2Ptr := z2;
  1079. z1Ptr := z1;
  1080. z0Ptr := z0;
  1081. End;
  1082. {*----------------------------------------------------------------------------
  1083. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1084. | by the number of bits given in `count'. Any bits shifted off are lost.
  1085. | The value of `count' must be less than 64. The result is broken into three
  1086. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1087. | `z1Ptr', and `z2Ptr'.
  1088. *----------------------------------------------------------------------------*}
  1089. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1090. var
  1091. z0, z1, z2 : bits64;
  1092. negCount : int8;
  1093. begin
  1094. z2 := a2 shl count;
  1095. z1 := a1 shl count;
  1096. z0 := a0 shl count;
  1097. if ( 0 < count ) then
  1098. begin
  1099. negCount := ( ( - count ) and 63 );
  1100. z1 := z1 or (a2 shr negCount);
  1101. z0 := z0 or (a1 shr negCount);
  1102. end;
  1103. z2Ptr := z2;
  1104. z1Ptr := z1;
  1105. z0Ptr := z0;
  1106. end;
  1107. {*----------------------------------------------------------------------------
  1108. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1109. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1110. | any carry out is lost. The result is broken into two 64-bit pieces which
  1111. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1112. *----------------------------------------------------------------------------*}
  1113. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1114. var
  1115. z1 : bits64;
  1116. begin
  1117. z1 := a1 + b1;
  1118. z1Ptr := z1;
  1119. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1120. end;
  1121. {*----------------------------------------------------------------------------
  1122. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1123. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1124. | modulo 2^192, so any carry out is lost. The result is broken into three
  1125. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1126. | `z1Ptr', and `z2Ptr'.
  1127. *----------------------------------------------------------------------------*}
  1128. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1129. var
  1130. z0, z1, z2 : bits64;
  1131. carry0, carry1 : int8;
  1132. begin
  1133. z2 := a2 + b2;
  1134. carry1 := ord( z2 < a2 );
  1135. z1 := a1 + b1;
  1136. carry0 := ord( z1 < a1 );
  1137. z0 := a0 + b0;
  1138. inc(z1, carry1);
  1139. inc(z0, ord( z1 < carry1 ));
  1140. inc(z0, carry0);
  1141. z2Ptr := z2;
  1142. z1Ptr := z1;
  1143. z0Ptr := z0;
  1144. end;
  1145. {*
  1146. -------------------------------------------------------------------------------
  1147. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1148. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1149. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1150. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1151. `z1Ptr'.
  1152. -------------------------------------------------------------------------------
  1153. *}
  1154. Procedure
  1155. sub64(
  1156. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1157. Begin
  1158. z1Ptr := a1 - b1;
  1159. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1160. End;
  1161. {*
  1162. -------------------------------------------------------------------------------
  1163. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1164. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1165. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1166. into three 32-bit pieces which are stored at the locations pointed to by
  1167. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1168. -------------------------------------------------------------------------------
  1169. *}
  1170. Procedure
  1171. sub96(
  1172. a0:bits32;
  1173. a1:bits32;
  1174. a2:bits32;
  1175. b0:bits32;
  1176. b1:bits32;
  1177. b2:bits32;
  1178. VAR z0Ptr:bits32;
  1179. VAR z1Ptr:bits32;
  1180. VAR z2Ptr:bits32
  1181. );
  1182. Var
  1183. z0, z1, z2: bits32;
  1184. borrow0, borrow1: int8;
  1185. Begin
  1186. z2 := a2 - b2;
  1187. borrow1 := int8( a2 < b2 );
  1188. z1 := a1 - b1;
  1189. borrow0 := int8( a1 < b1 );
  1190. z0 := a0 - b0;
  1191. z0 := z0 - bits32( z1 < borrow1 );
  1192. z1 := z1 - borrow1;
  1193. z0 := z0 -borrow0;
  1194. z2Ptr := z2;
  1195. z1Ptr := z1;
  1196. z0Ptr := z0;
  1197. End;
  1198. {*----------------------------------------------------------------------------
  1199. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1200. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1201. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1202. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1203. | `z1Ptr'.
  1204. *----------------------------------------------------------------------------*}
  1205. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1206. begin
  1207. z1Ptr := a1 - b1;
  1208. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1209. end;
  1210. {*----------------------------------------------------------------------------
  1211. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1212. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1213. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1214. | result is broken into three 64-bit pieces which are stored at the locations
  1215. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1216. *----------------------------------------------------------------------------*}
  1217. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1218. var
  1219. z0, z1, z2 : bits64;
  1220. borrow0, borrow1 : int8;
  1221. begin
  1222. z2 := a2 - b2;
  1223. borrow1 := ord( a2 < b2 );
  1224. z1 := a1 - b1;
  1225. borrow0 := ord( a1 < b1 );
  1226. z0 := a0 - b0;
  1227. dec(z0, ord( z1 < borrow1 ));
  1228. dec(z1, borrow1);
  1229. dec(z0, borrow0);
  1230. z2Ptr := z2;
  1231. z1Ptr := z1;
  1232. z0Ptr := z0;
  1233. end;
  1234. {*
  1235. -------------------------------------------------------------------------------
  1236. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1237. into two 32-bit pieces which are stored at the locations pointed to by
  1238. `z0Ptr' and `z1Ptr'.
  1239. -------------------------------------------------------------------------------
  1240. *}
  1241. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1242. :bits32 );
  1243. Var
  1244. aHigh, aLow, bHigh, bLow: bits16;
  1245. z0, zMiddleA, zMiddleB, z1: bits32;
  1246. Begin
  1247. aLow := a and $ffff;
  1248. aHigh := a shr 16;
  1249. bLow := b and $ffff;
  1250. bHigh := b shr 16;
  1251. z1 := ( bits32( aLow) ) * bLow;
  1252. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1253. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1254. z0 := ( bits32 (aHigh) ) * bHigh;
  1255. zMiddleA := zMiddleA + zMiddleB;
  1256. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1257. zMiddleA := zmiddleA shl 16;
  1258. z1 := z1 + zMiddleA;
  1259. z0 := z0 + bits32( z1 < zMiddleA );
  1260. z1Ptr := z1;
  1261. z0Ptr := z0;
  1262. End;
  1263. {*
  1264. -------------------------------------------------------------------------------
  1265. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1266. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1267. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1268. `z2Ptr'.
  1269. -------------------------------------------------------------------------------
  1270. *}
  1271. Procedure
  1272. mul64By32To96(
  1273. a0:bits32;
  1274. a1:bits32;
  1275. b:bits32;
  1276. VAR z0Ptr:bits32;
  1277. VAR z1Ptr:bits32;
  1278. VAR z2Ptr:bits32
  1279. );
  1280. Var
  1281. z0, z1, z2, more1: bits32;
  1282. Begin
  1283. mul32To64( a1, b, z1, z2 );
  1284. mul32To64( a0, b, z0, more1 );
  1285. add64( z0, more1, 0, z1, z0, z1 );
  1286. z2Ptr := z2;
  1287. z1Ptr := z1;
  1288. z0Ptr := z0;
  1289. End;
  1290. {*
  1291. -------------------------------------------------------------------------------
  1292. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1293. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1294. product. The product is broken into four 32-bit pieces which are stored at
  1295. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1296. -------------------------------------------------------------------------------
  1297. *}
  1298. Procedure
  1299. mul64To128(
  1300. a0:bits32;
  1301. a1:bits32;
  1302. b0:bits32;
  1303. b1:bits32;
  1304. VAR z0Ptr:bits32;
  1305. VAR z1Ptr:bits32;
  1306. VAR z2Ptr:bits32;
  1307. VAR z3Ptr:bits32
  1308. );
  1309. Var
  1310. z0, z1, z2, z3: bits32;
  1311. more1, more2: bits32;
  1312. Begin
  1313. mul32To64( a1, b1, z2, z3 );
  1314. mul32To64( a1, b0, z1, more2 );
  1315. add64( z1, more2, 0, z2, z1, z2 );
  1316. mul32To64( a0, b0, z0, more1 );
  1317. add64( z0, more1, 0, z1, z0, z1 );
  1318. mul32To64( a0, b1, more1, more2 );
  1319. add64( more1, more2, 0, z2, more1, z2 );
  1320. add64( z0, z1, 0, more1, z0, z1 );
  1321. z3Ptr := z3;
  1322. z2Ptr := z2;
  1323. z1Ptr := z1;
  1324. z0Ptr := z0;
  1325. End;
  1326. {*----------------------------------------------------------------------------
  1327. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1328. | into two 64-bit pieces which are stored at the locations pointed to by
  1329. | `z0Ptr' and `z1Ptr'.
  1330. *----------------------------------------------------------------------------*}
  1331. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1332. var
  1333. aHigh, aLow, bHigh, bLow : bits32;
  1334. z0, zMiddleA, zMiddleB, z1 : bits64;
  1335. begin
  1336. aLow := a;
  1337. aHigh := a shr 32;
  1338. bLow := b;
  1339. bHigh := b shr 32;
  1340. z1 := ( bits64(aLow) ) * bLow;
  1341. zMiddleA := ( bits64( aLow )) * bHigh;
  1342. zMiddleB := ( bits64( aHigh )) * bLow;
  1343. z0 := ( bits64(aHigh) ) * bHigh;
  1344. inc(zMiddleA, zMiddleB);
  1345. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1346. zMiddleA := zMiddleA shl 32;
  1347. inc(z1, zMiddleA);
  1348. inc(z0, ord( z1 < zMiddleA ));
  1349. z1Ptr := z1;
  1350. z0Ptr := z0;
  1351. end;
  1352. {*----------------------------------------------------------------------------
  1353. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1354. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1355. | product. The product is broken into four 64-bit pieces which are stored at
  1356. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1357. *----------------------------------------------------------------------------*}
  1358. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1359. var
  1360. z0,z1,z2,z3,more1,more2 : bits64;
  1361. begin
  1362. mul64To128( a1, b1, z2, z3 );
  1363. mul64To128( a1, b0, z1, more2 );
  1364. add128( z1, more2, 0, z2, z1, z2 );
  1365. mul64To128( a0, b0, z0, more1 );
  1366. add128( z0, more1, 0, z1, z0, z1 );
  1367. mul64To128( a0, b1, more1, more2 );
  1368. add128( more1, more2, 0, z2, more1, z2 );
  1369. add128( z0, z1, 0, more1, z0, z1 );
  1370. z3Ptr := z3;
  1371. z2Ptr := z2;
  1372. z1Ptr := z1;
  1373. z0Ptr := z0;
  1374. end;
  1375. {*----------------------------------------------------------------------------
  1376. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1377. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1378. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1379. | `z2Ptr'.
  1380. *----------------------------------------------------------------------------*}
  1381. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1382. var
  1383. z0, z1, z2, more1 : bits64;
  1384. begin
  1385. mul64To128( a1, b, z1, z2 );
  1386. mul64To128( a0, b, z0, more1 );
  1387. add128( z0, more1, 0, z1, z0, z1 );
  1388. z2Ptr := z2;
  1389. z1Ptr := z1;
  1390. z0Ptr := z0;
  1391. end;
  1392. {*----------------------------------------------------------------------------
  1393. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1394. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1395. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1396. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1397. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1398. | unsigned integer is returned.
  1399. *----------------------------------------------------------------------------*}
  1400. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1401. var
  1402. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1403. begin
  1404. if ( b <= a0 ) then
  1405. begin
  1406. result:=qword( $FFFFFFFFFFFFFFFF );
  1407. exit;
  1408. end;
  1409. b0 := b shr 32;
  1410. if ( b0 shl 32 <= a0 ) then
  1411. z:=qword( $FFFFFFFF00000000 )
  1412. else
  1413. z:=( a0 div b0 ) shl 32;
  1414. mul64To128( b, z, term0, term1 );
  1415. sub128( a0, a1, term0, term1, rem0, rem1 );
  1416. while ( ( sbits64(rem0) ) < 0 ) do begin
  1417. dec(z,qword( $100000000 ));
  1418. b1 := b shl 32;
  1419. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1420. end;
  1421. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1422. if ( b0 shl 32 <= rem0 ) then
  1423. z:=z or $FFFFFFFF
  1424. else
  1425. z:=z or rem0 div b0;
  1426. result:=z;
  1427. end;
  1428. {*
  1429. -------------------------------------------------------------------------------
  1430. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1431. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1432. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1433. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1434. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1435. unsigned integer is returned.
  1436. -------------------------------------------------------------------------------
  1437. *}
  1438. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1439. Var
  1440. b0, b1: bits32;
  1441. rem0, rem1, term0, term1: bits32;
  1442. z: bits32;
  1443. Begin
  1444. if ( b <= a0 ) then
  1445. Begin
  1446. estimateDiv64To32 := $FFFFFFFF;
  1447. exit;
  1448. End;
  1449. b0 := b shr 16;
  1450. if ( b0 shl 16 <= a0 ) then
  1451. z:= $FFFF0000
  1452. else
  1453. z:= ( a0 div b0 ) shl 16;
  1454. mul32To64( b, z, term0, term1 );
  1455. sub64( a0, a1, term0, term1, rem0, rem1 );
  1456. while ( ( sbits32 (rem0) ) < 0 ) do
  1457. Begin
  1458. z := z - $10000;
  1459. b1 := b shl 16;
  1460. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1461. End;
  1462. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1463. if ( b0 shl 16 <= rem0 ) then
  1464. z := z or $FFFF
  1465. else
  1466. z := z or (rem0 div b0);
  1467. estimateDiv64To32 := z;
  1468. End;
  1469. {*
  1470. -------------------------------------------------------------------------------
  1471. Returns an approximation to the square root of the 32-bit significand given
  1472. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1473. `aExp' (the least significant bit) is 1, the integer returned approximates
  1474. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1475. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1476. case, the approximation returned lies strictly within +/-2 of the exact
  1477. value.
  1478. -------------------------------------------------------------------------------
  1479. *}
  1480. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1481. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1482. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1483. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1484. );
  1485. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1486. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1487. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1488. );
  1489. Var
  1490. index: int8;
  1491. z: bits32;
  1492. Begin
  1493. index := ( a shr 27 ) AND 15;
  1494. if ( aExp AND 1 ) <> 0 then
  1495. Begin
  1496. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1497. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1498. a := a shr 1;
  1499. End
  1500. else
  1501. Begin
  1502. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1503. z := a div z + z;
  1504. if ( $20000 <= z ) then
  1505. z := $FFFF8000
  1506. else
  1507. z := ( z shl 15 );
  1508. if ( z <= a ) then
  1509. Begin
  1510. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1511. exit;
  1512. End;
  1513. End;
  1514. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1515. End;
  1516. {*
  1517. -------------------------------------------------------------------------------
  1518. Returns the number of leading 0 bits before the most-significant 1 bit of
  1519. `a'. If `a' is zero, 32 is returned.
  1520. -------------------------------------------------------------------------------
  1521. *}
  1522. Function countLeadingZeros32( a:bits32 ): int8;
  1523. const countLeadingZerosHigh:array[0..255] of int8 = (
  1524. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1525. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1526. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1527. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1528. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1529. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1530. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1531. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1532. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1533. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1534. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1535. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1536. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1537. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1538. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1539. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1540. );
  1541. Var
  1542. shiftCount: int8;
  1543. Begin
  1544. shiftCount := 0;
  1545. if ( a < $10000 ) then
  1546. Begin
  1547. shiftCount := shiftcount + 16;
  1548. a := a shl 16;
  1549. End;
  1550. if ( a < $1000000 ) then
  1551. Begin
  1552. shiftCount := shiftcount + 8;
  1553. a := a shl 8;
  1554. end;
  1555. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1556. countLeadingZeros32:= shiftCount;
  1557. End;
  1558. {*----------------------------------------------------------------------------
  1559. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1560. | `a'. If `a' is zero, 64 is returned.
  1561. *----------------------------------------------------------------------------*}
  1562. function countLeadingZeros64( a : bits64): int8;
  1563. var
  1564. shiftcount : int8;
  1565. Begin
  1566. shiftCount := 0;
  1567. if ( a < bits64(bits64(1) shl 32 )) then
  1568. shiftCount := shiftcount + 32
  1569. else
  1570. a := a shr 32;
  1571. shiftCount := shiftCount + countLeadingZeros32( a );
  1572. countLeadingZeros64:= shiftCount;
  1573. End;
  1574. {*
  1575. -------------------------------------------------------------------------------
  1576. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1577. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1578. returns 0.
  1579. -------------------------------------------------------------------------------
  1580. *}
  1581. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1582. Begin
  1583. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1584. End;
  1585. {*
  1586. -------------------------------------------------------------------------------
  1587. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1588. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1589. Otherwise, returns 0.
  1590. -------------------------------------------------------------------------------
  1591. *}
  1592. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1593. Begin
  1594. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1595. End;
  1596. {*
  1597. -------------------------------------------------------------------------------
  1598. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1599. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1600. returns 0.
  1601. -------------------------------------------------------------------------------
  1602. *}
  1603. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1604. Begin
  1605. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1606. End;
  1607. {*
  1608. -------------------------------------------------------------------------------
  1609. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1610. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1611. returns 0.
  1612. -------------------------------------------------------------------------------
  1613. *}
  1614. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1615. Begin
  1616. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1617. End;
  1618. const
  1619. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1620. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1621. (*****************************************************************************)
  1622. (* End Low-Level arithmetic *)
  1623. (*****************************************************************************)
  1624. {*
  1625. -------------------------------------------------------------------------------
  1626. Functions and definitions to determine: (1) whether tininess for underflow
  1627. is detected before or after rounding by default, (2) what (if anything)
  1628. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1629. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1630. are propagated from function inputs to output. These details are ENDIAN
  1631. specific
  1632. -------------------------------------------------------------------------------
  1633. *}
  1634. {$IFDEF ENDIAN_LITTLE}
  1635. {*
  1636. -------------------------------------------------------------------------------
  1637. Internal canonical NaN format.
  1638. -------------------------------------------------------------------------------
  1639. *}
  1640. TYPE
  1641. commonNaNT = record
  1642. high, low : bits32;
  1643. sign: flag;
  1644. end;
  1645. {*
  1646. -------------------------------------------------------------------------------
  1647. The pattern for a default generated single-precision NaN.
  1648. -------------------------------------------------------------------------------
  1649. *}
  1650. const float32_default_nan = $FFC00000;
  1651. {*
  1652. -------------------------------------------------------------------------------
  1653. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1654. otherwise returns 0.
  1655. -------------------------------------------------------------------------------
  1656. *}
  1657. Function float32_is_nan( a : float32 ): flag;
  1658. Begin
  1659. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1660. End;
  1661. {*
  1662. -------------------------------------------------------------------------------
  1663. Returns 1 if the single-precision floating-point value `a' is a signaling
  1664. NaN; otherwise returns 0.
  1665. -------------------------------------------------------------------------------
  1666. *}
  1667. Function float32_is_signaling_nan( a : float32 ): flag;
  1668. Begin
  1669. float32_is_signaling_nan := flag
  1670. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1671. End;
  1672. {*
  1673. -------------------------------------------------------------------------------
  1674. Returns the result of converting the single-precision floating-point NaN
  1675. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1676. exception is raised.
  1677. -------------------------------------------------------------------------------
  1678. *}
  1679. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1680. var
  1681. z : commonNaNT ;
  1682. Begin
  1683. if ( float32_is_signaling_nan( a ) <> 0) then
  1684. float_raise( float_flag_invalid );
  1685. z.sign := a shr 31;
  1686. z.low := 0;
  1687. z.high := a shl 9;
  1688. c := z;
  1689. End;
  1690. {*
  1691. -------------------------------------------------------------------------------
  1692. Returns the result of converting the canonical NaN `a' to the single-
  1693. precision floating-point format.
  1694. -------------------------------------------------------------------------------
  1695. *}
  1696. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1697. Begin
  1698. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1699. End;
  1700. {*
  1701. -------------------------------------------------------------------------------
  1702. Takes two single-precision floating-point values `a' and `b', one of which
  1703. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1704. signaling NaN, the invalid exception is raised.
  1705. -------------------------------------------------------------------------------
  1706. *}
  1707. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1708. Var
  1709. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1710. label returnLargerSignificand;
  1711. Begin
  1712. aIsNaN := float32_is_nan( a );
  1713. aIsSignalingNaN := float32_is_signaling_nan( a );
  1714. bIsNaN := float32_is_nan( b );
  1715. bIsSignalingNaN := float32_is_signaling_nan( b );
  1716. a := a or $00400000;
  1717. b := b or $00400000;
  1718. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1719. float_raise( float_flag_invalid );
  1720. if ( aIsSignalingNaN )<> 0 then
  1721. Begin
  1722. if ( bIsSignalingNaN ) <> 0 then
  1723. goto returnLargerSignificand;
  1724. if bIsNan <> 0 then
  1725. propagateFloat32NaN := b
  1726. else
  1727. propagateFloat32NaN := a;
  1728. exit;
  1729. End
  1730. else if ( aIsNaN <> 0) then
  1731. Begin
  1732. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1733. Begin
  1734. propagateFloat32NaN := a;
  1735. exit;
  1736. End;
  1737. returnLargerSignificand:
  1738. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1739. Begin
  1740. propagateFloat32NaN := b;
  1741. exit;
  1742. End;
  1743. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1744. Begin
  1745. propagateFloat32NaN := a;
  1746. End;
  1747. if a < b then
  1748. propagateFloat32NaN := a
  1749. else
  1750. propagateFloat32NaN := b;
  1751. exit;
  1752. End
  1753. else
  1754. Begin
  1755. propagateFloat32NaN := b;
  1756. exit;
  1757. End;
  1758. End;
  1759. {*
  1760. -------------------------------------------------------------------------------
  1761. The pattern for a default generated double-precision NaN. The `high' and
  1762. `low' values hold the most- and least-significant bits, respectively.
  1763. -------------------------------------------------------------------------------
  1764. *}
  1765. const
  1766. float64_default_nan_high = $FFF80000;
  1767. float64_default_nan_low = $00000000;
  1768. {*
  1769. -------------------------------------------------------------------------------
  1770. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1771. otherwise returns 0.
  1772. -------------------------------------------------------------------------------
  1773. *}
  1774. Function float64_is_nan( a : float64 ) : flag;
  1775. Begin
  1776. float64_is_nan :=
  1777. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1778. and ( a.low or ( a.high and $000FFFFF ) );
  1779. End;
  1780. {*
  1781. -------------------------------------------------------------------------------
  1782. Returns 1 if the double-precision floating-point value `a' is a signaling
  1783. NaN; otherwise returns 0.
  1784. -------------------------------------------------------------------------------
  1785. *}
  1786. Function float64_is_signaling_nan( a : float64 ): flag;
  1787. Begin
  1788. float64_is_signaling_nan :=
  1789. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1790. and ( a.low or ( a.high and $0007FFFF ) );
  1791. End;
  1792. {*
  1793. -------------------------------------------------------------------------------
  1794. Returns the result of converting the double-precision floating-point NaN
  1795. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1796. exception is raised.
  1797. -------------------------------------------------------------------------------
  1798. *}
  1799. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1800. Var
  1801. z : commonNaNT;
  1802. Begin
  1803. if ( float64_is_signaling_nan( a )<>0 ) then
  1804. float_raise( float_flag_invalid );
  1805. z.sign := a.high shr 31;
  1806. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1807. c := z;
  1808. End;
  1809. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1810. Var
  1811. z : commonNaNT;
  1812. Begin
  1813. if ( float64_is_signaling_nan( a )<>0 ) then
  1814. float_raise( float_flag_invalid );
  1815. z.sign := a.high shr 31;
  1816. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1817. result := z;
  1818. End;
  1819. {*
  1820. -------------------------------------------------------------------------------
  1821. Returns the result of converting the canonical NaN `a' to the double-
  1822. precision floating-point format.
  1823. -------------------------------------------------------------------------------
  1824. *}
  1825. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1826. Var
  1827. z: float64;
  1828. Begin
  1829. shift64Right( a.high, a.low, 12, z.high, z.low );
  1830. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1831. c := z;
  1832. End;
  1833. {*
  1834. -------------------------------------------------------------------------------
  1835. Takes two double-precision floating-point values `a' and `b', one of which
  1836. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1837. signaling NaN, the invalid exception is raised.
  1838. -------------------------------------------------------------------------------
  1839. *}
  1840. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1841. Var
  1842. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1843. label returnLargerSignificand;
  1844. Begin
  1845. aIsNaN := float64_is_nan( a );
  1846. aIsSignalingNaN := float64_is_signaling_nan( a );
  1847. bIsNaN := float64_is_nan( b );
  1848. bIsSignalingNaN := float64_is_signaling_nan( b );
  1849. a.high := a.high or $00080000;
  1850. b.high := b.high or $00080000;
  1851. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1852. float_raise( float_flag_invalid );
  1853. if ( aIsSignalingNaN )<>0 then
  1854. Begin
  1855. if ( bIsSignalingNaN )<>0 then
  1856. goto returnLargerSignificand;
  1857. if bIsNan <> 0 then
  1858. c := b
  1859. else
  1860. c := a;
  1861. exit;
  1862. End
  1863. else if ( aIsNaN )<> 0 then
  1864. Begin
  1865. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1866. Begin
  1867. c := a;
  1868. exit;
  1869. End;
  1870. returnLargerSignificand:
  1871. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1872. Begin
  1873. c := b;
  1874. exit;
  1875. End;
  1876. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1877. Begin
  1878. c := a;
  1879. exit;
  1880. End;
  1881. if a.high < b.high then
  1882. c := a
  1883. else
  1884. c := b;
  1885. exit;
  1886. End
  1887. else
  1888. Begin
  1889. c := b;
  1890. exit;
  1891. End;
  1892. End;
  1893. {*----------------------------------------------------------------------------
  1894. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1895. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1896. | returns 0.
  1897. *----------------------------------------------------------------------------*}
  1898. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1899. begin
  1900. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1901. end;
  1902. {*----------------------------------------------------------------------------
  1903. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1904. | otherwise returns 0.
  1905. *----------------------------------------------------------------------------*}
  1906. function float128_is_nan( a : float128): flag;
  1907. begin
  1908. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1909. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1910. end;
  1911. {*----------------------------------------------------------------------------
  1912. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1913. | signaling NaN; otherwise returns 0.
  1914. *----------------------------------------------------------------------------*}
  1915. function float128_is_signaling_nan( a : float128): flag;
  1916. begin
  1917. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1918. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1919. end;
  1920. {*----------------------------------------------------------------------------
  1921. | Returns the result of converting the quadruple-precision floating-point NaN
  1922. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1923. | exception is raised.
  1924. *----------------------------------------------------------------------------*}
  1925. function float128ToCommonNaN( a : float128): commonNaNT;
  1926. var
  1927. z: commonNaNT;
  1928. qhigh,qlow : qword;
  1929. begin
  1930. if ( float128_is_signaling_nan( a )<>0) then
  1931. float_raise( float_flag_invalid );
  1932. z.sign := a.high shr 63;
  1933. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1934. z.high:=qhigh shr 32;
  1935. z.low:=qhigh and $ffffffff;
  1936. result:=z;
  1937. end;
  1938. {*----------------------------------------------------------------------------
  1939. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1940. | precision floating-point format.
  1941. *----------------------------------------------------------------------------*}
  1942. function commonNaNToFloat128( a : commonNaNT): float128;
  1943. var
  1944. z: float128;
  1945. begin
  1946. shift128Right( a.high, a.low, 16, z.high, z.low );
  1947. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1948. result:=z;
  1949. end;
  1950. {*----------------------------------------------------------------------------
  1951. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1952. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1953. | `b' is a signaling NaN, the invalid exception is raised.
  1954. *----------------------------------------------------------------------------*}
  1955. function propagateFloat128NaN( a: float128; b : float128): float128;
  1956. var
  1957. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1958. label
  1959. returnLargerSignificand;
  1960. begin
  1961. aIsNaN := float128_is_nan( a );
  1962. aIsSignalingNaN := float128_is_signaling_nan( a );
  1963. bIsNaN := float128_is_nan( b );
  1964. bIsSignalingNaN := float128_is_signaling_nan( b );
  1965. a.high := a.high or int64( $0000800000000000 );
  1966. b.high := b.high or int64( $0000800000000000 );
  1967. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1968. float_raise( float_flag_invalid );
  1969. if ( aIsSignalingNaN )<>0 then
  1970. begin
  1971. if ( bIsSignalingNaN )<>0 then
  1972. goto returnLargerSignificand;
  1973. if bIsNaN<>0 then
  1974. result := b
  1975. else
  1976. result := a;
  1977. exit;
  1978. end
  1979. else if ( aIsNaN )<>0 then
  1980. begin
  1981. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1982. begin
  1983. result := a;
  1984. exit;
  1985. end;
  1986. returnLargerSignificand:
  1987. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1988. begin
  1989. result := b;
  1990. exit;
  1991. end;
  1992. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1993. begin
  1994. result := a;
  1995. exit
  1996. end;
  1997. if ( a.high < b.high ) then
  1998. result := a
  1999. else
  2000. result := b;
  2001. exit;
  2002. end
  2003. else
  2004. result:=b;
  2005. end;
  2006. {$ELSE}
  2007. { Big endian code }
  2008. (*----------------------------------------------------------------------------
  2009. | Internal canonical NaN format.
  2010. *----------------------------------------------------------------------------*)
  2011. type
  2012. commonNANT = record
  2013. high, low : bits32;
  2014. sign : flag;
  2015. end;
  2016. (*----------------------------------------------------------------------------
  2017. | The pattern for a default generated single-precision NaN.
  2018. *----------------------------------------------------------------------------*)
  2019. const float32_default_nan = $7FFFFFFF;
  2020. (*----------------------------------------------------------------------------
  2021. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2022. | otherwise returns 0.
  2023. *----------------------------------------------------------------------------*)
  2024. function float32_is_nan(a: float32): flag;
  2025. begin
  2026. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2027. end;
  2028. (*----------------------------------------------------------------------------
  2029. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2030. | NaN; otherwise returns 0.
  2031. *----------------------------------------------------------------------------*)
  2032. function float32_is_signaling_nan(a: float32):flag;
  2033. begin
  2034. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2035. end;
  2036. (*----------------------------------------------------------------------------
  2037. | Returns the result of converting the single-precision floating-point NaN
  2038. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2039. | exception is raised.
  2040. *----------------------------------------------------------------------------*)
  2041. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  2042. var
  2043. z: commonNANT;
  2044. begin
  2045. if float32_is_signaling_nan(a)<>0 then
  2046. float_raise(float_flag_invalid);
  2047. z.sign := a shr 31;
  2048. z.low := 0;
  2049. z.high := a shl 9;
  2050. c:=z;
  2051. end;
  2052. (*----------------------------------------------------------------------------
  2053. | Returns the result of converting the canonical NaN `a' to the single-
  2054. | precision floating-point format.
  2055. *----------------------------------------------------------------------------*)
  2056. function CommonNanToFloat32(a : CommonNaNT): float32;
  2057. begin
  2058. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2059. end;
  2060. (*----------------------------------------------------------------------------
  2061. | Takes two single-precision floating-point values `a' and `b', one of which
  2062. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2063. | signaling NaN, the invalid exception is raised.
  2064. *----------------------------------------------------------------------------*)
  2065. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2066. var
  2067. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2068. begin
  2069. aIsNaN := float32_is_nan( a );
  2070. aIsSignalingNaN := float32_is_signaling_nan( a );
  2071. bIsNaN := float32_is_nan( b );
  2072. bIsSignalingNaN := float32_is_signaling_nan( b );
  2073. a := a or $00400000;
  2074. b := b or $00400000;
  2075. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2076. float_raise( float_flag_invalid );
  2077. if bIsSignalingNaN<>0 then
  2078. propagateFloat32Nan := b
  2079. else if aIsSignalingNan<>0 then
  2080. propagateFloat32Nan := a
  2081. else if bIsNan<>0 then
  2082. propagateFloat32Nan := b
  2083. else
  2084. propagateFloat32Nan := a;
  2085. end;
  2086. (*----------------------------------------------------------------------------
  2087. | The pattern for a default generated double-precision NaN. The `high' and
  2088. | `low' values hold the most- and least-significant bits, respectively.
  2089. *----------------------------------------------------------------------------*)
  2090. const
  2091. float64_default_nan_high = $7FFFFFFF;
  2092. float64_default_nan_low = $FFFFFFFF;
  2093. (*----------------------------------------------------------------------------
  2094. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2095. | otherwise returns 0.
  2096. *----------------------------------------------------------------------------*)
  2097. function float64_is_nan(a: float64): flag;
  2098. begin
  2099. float64_is_nan := flag (
  2100. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2101. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2102. end;
  2103. (*----------------------------------------------------------------------------
  2104. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2105. | NaN; otherwise returns 0.
  2106. *----------------------------------------------------------------------------*)
  2107. function float64_is_signaling_nan( a:float64): flag;
  2108. begin
  2109. float64_is_signaling_nan := flag(
  2110. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2111. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2112. end;
  2113. (*----------------------------------------------------------------------------
  2114. | Returns the result of converting the double-precision floating-point NaN
  2115. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2116. | exception is raised.
  2117. *----------------------------------------------------------------------------*)
  2118. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2119. var
  2120. z : commonNaNT;
  2121. begin
  2122. if ( float64_is_signaling_nan( a )<>0 ) then
  2123. float_raise( float_flag_invalid );
  2124. z.sign := a.high shr 31;
  2125. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2126. c:=z;
  2127. end;
  2128. (*----------------------------------------------------------------------------
  2129. | Returns the result of converting the canonical NaN `a' to the double-
  2130. | precision floating-point format.
  2131. *----------------------------------------------------------------------------*)
  2132. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2133. var
  2134. z: float64;
  2135. begin
  2136. shift64Right( a.high, a.low, 12, z.high, z.low );
  2137. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2138. c:=z;
  2139. end;
  2140. (*----------------------------------------------------------------------------
  2141. | Takes two double-precision floating-point values `a' and `b', one of which
  2142. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2143. | signaling NaN, the invalid exception is raised.
  2144. *----------------------------------------------------------------------------*)
  2145. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2146. var
  2147. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2148. begin
  2149. aIsNaN := float64_is_nan( a );
  2150. aIsSignalingNaN := float64_is_signaling_nan( a );
  2151. bIsNaN := float64_is_nan( b );
  2152. bIsSignalingNaN := float64_is_signaling_nan( b );
  2153. a.high := a.high or $00080000;
  2154. b.high := b.high or $00080000;
  2155. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2156. float_raise( float_flag_invalid );
  2157. if bIsSignalingNaN<>0 then
  2158. c := b
  2159. else if aIsSignalingNan<>0 then
  2160. c := a
  2161. else if bIsNan<>0 then
  2162. c := b
  2163. else
  2164. c := a;
  2165. end;
  2166. {$ENDIF}
  2167. (****************************************************************************)
  2168. (* END ENDIAN SPECIFIC CODE *)
  2169. (****************************************************************************)
  2170. {*
  2171. -------------------------------------------------------------------------------
  2172. Returns the fraction bits of the single-precision floating-point value `a'.
  2173. -------------------------------------------------------------------------------
  2174. *}
  2175. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2176. Begin
  2177. ExtractFloat32Frac := A AND $007FFFFF;
  2178. End;
  2179. {*
  2180. -------------------------------------------------------------------------------
  2181. Returns the exponent bits of the single-precision floating-point value `a'.
  2182. -------------------------------------------------------------------------------
  2183. *}
  2184. Function extractFloat32Exp( a: float32 ): Int16;
  2185. Begin
  2186. extractFloat32Exp := (a shr 23) AND $FF;
  2187. End;
  2188. {*
  2189. -------------------------------------------------------------------------------
  2190. Returns the sign bit of the single-precision floating-point value `a'.
  2191. -------------------------------------------------------------------------------
  2192. *}
  2193. Function extractFloat32Sign( a: float32 ): Flag;
  2194. Begin
  2195. extractFloat32Sign := a shr 31;
  2196. End;
  2197. {*
  2198. -------------------------------------------------------------------------------
  2199. Normalizes the subnormal single-precision floating-point value represented
  2200. by the denormalized significand `aSig'. The normalized exponent and
  2201. significand are stored at the locations pointed to by `zExpPtr' and
  2202. `zSigPtr', respectively.
  2203. -------------------------------------------------------------------------------
  2204. *}
  2205. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2206. Var
  2207. ShiftCount : BYTE;
  2208. Begin
  2209. shiftCount := countLeadingZeros32( aSig ) - 8;
  2210. zSigPtr := aSig shl shiftCount;
  2211. zExpPtr := 1 - shiftCount;
  2212. End;
  2213. {*
  2214. -------------------------------------------------------------------------------
  2215. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2216. single-precision floating-point value, returning the result. After being
  2217. shifted into the proper positions, the three fields are simply added
  2218. together to form the result. This means that any integer portion of `zSig'
  2219. will be added into the exponent. Since a properly normalized significand
  2220. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2221. than the desired result exponent whenever `zSig' is a complete, normalized
  2222. significand.
  2223. -------------------------------------------------------------------------------
  2224. *}
  2225. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2226. Begin
  2227. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2228. + zSig;
  2229. End;
  2230. {*
  2231. -------------------------------------------------------------------------------
  2232. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2233. and significand `zSig', and returns the proper single-precision floating-
  2234. point value corresponding to the abstract input. Ordinarily, the abstract
  2235. value is simply rounded and packed into the single-precision format, with
  2236. the inexact exception raised if the abstract input cannot be represented
  2237. exactly. However, if the abstract value is too large, the overflow and
  2238. inexact exceptions are raised and an infinity or maximal finite value is
  2239. returned. If the abstract value is too small, the input value is rounded to
  2240. a subnormal number, and the underflow and inexact exceptions are raised if
  2241. the abstract input cannot be represented exactly as a subnormal single-
  2242. precision floating-point number.
  2243. The input significand `zSig' has its binary point between bits 30
  2244. and 29, which is 7 bits to the left of the usual location. This shifted
  2245. significand must be normalized or smaller. If `zSig' is not normalized,
  2246. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2247. and it must not require rounding. In the usual case that `zSig' is
  2248. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2249. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2250. Binary Floating-Point Arithmetic.
  2251. -------------------------------------------------------------------------------
  2252. *}
  2253. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2254. Var
  2255. roundingMode : BYTE;
  2256. roundNearestEven : Flag;
  2257. roundIncrement, roundBits : BYTE;
  2258. IsTiny : Flag;
  2259. Begin
  2260. roundingMode := softfloat_rounding_mode;
  2261. if (roundingMode = float_round_nearest_even) then
  2262. Begin
  2263. roundNearestEven := Flag(TRUE);
  2264. end
  2265. else
  2266. roundNearestEven := Flag(FALSE);
  2267. roundIncrement := $40;
  2268. if ( Boolean(roundNearestEven) = FALSE) then
  2269. Begin
  2270. if ( roundingMode = float_round_to_zero ) Then
  2271. Begin
  2272. roundIncrement := 0;
  2273. End
  2274. else
  2275. Begin
  2276. roundIncrement := $7F;
  2277. if ( zSign <> 0 ) then
  2278. Begin
  2279. if roundingMode = float_round_up then roundIncrement := 0;
  2280. End
  2281. else
  2282. Begin
  2283. if roundingMode = float_round_down then roundIncrement := 0;
  2284. End;
  2285. End
  2286. End;
  2287. roundBits := zSig AND $7F;
  2288. if ($FD <= bits16 (zExp) ) then
  2289. Begin
  2290. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2291. Begin
  2292. float_raise( float_flag_overflow OR float_flag_inexact );
  2293. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2294. exit;
  2295. End;
  2296. if ( zExp < 0 ) then
  2297. Begin
  2298. isTiny :=
  2299. flag(( softfloat_detect_tininess = float_tininess_before_rounding )
  2300. OR ( zExp < -1 )
  2301. OR ( (zSig + roundIncrement) < $80000000 ));
  2302. shift32RightJamming( zSig, - zExp, zSig );
  2303. zExp := 0;
  2304. roundBits := zSig AND $7F;
  2305. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2306. float_raise( float_flag_underflow );
  2307. End;
  2308. End;
  2309. if ( roundBits )<> 0 then
  2310. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2311. zSig := ( zSig + roundIncrement ) shr 7;
  2312. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2313. if ( zSig = 0 ) then zExp := 0;
  2314. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2315. exit;
  2316. End;
  2317. {*
  2318. -------------------------------------------------------------------------------
  2319. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2320. and significand `zSig', and returns the proper single-precision floating-
  2321. point value corresponding to the abstract input. This routine is just like
  2322. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2323. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2324. floating-point exponent.
  2325. -------------------------------------------------------------------------------
  2326. *}
  2327. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2328. Var
  2329. ShiftCount : int8;
  2330. Begin
  2331. shiftCount := countLeadingZeros32( zSig ) - 1;
  2332. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2333. End;
  2334. {*
  2335. -------------------------------------------------------------------------------
  2336. Returns the most-significant 20 fraction bits of the double-precision
  2337. floating-point value `a'.
  2338. -------------------------------------------------------------------------------
  2339. *}
  2340. Function extractFloat64Frac0(a: float64): bits32;
  2341. Begin
  2342. extractFloat64Frac0 := a.high and $000FFFFF;
  2343. End;
  2344. {*
  2345. -------------------------------------------------------------------------------
  2346. Returns the least-significant 32 fraction bits of the double-precision
  2347. floating-point value `a'.
  2348. -------------------------------------------------------------------------------
  2349. *}
  2350. Function extractFloat64Frac1(a: float64): bits32;
  2351. Begin
  2352. extractFloat64Frac1 := a.low;
  2353. End;
  2354. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2355. Function extractFloat64Frac(a: float64): bits64;
  2356. Begin
  2357. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2358. End;
  2359. {*
  2360. -------------------------------------------------------------------------------
  2361. Returns the exponent bits of the double-precision floating-point value `a'.
  2362. -------------------------------------------------------------------------------
  2363. *}
  2364. Function extractFloat64Exp(a: float64): int16;
  2365. Begin
  2366. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2367. End;
  2368. {*
  2369. -------------------------------------------------------------------------------
  2370. Returns the sign bit of the double-precision floating-point value `a'.
  2371. -------------------------------------------------------------------------------
  2372. *}
  2373. Function extractFloat64Sign(a: float64) : flag;
  2374. Begin
  2375. extractFloat64Sign := a.high shr 31;
  2376. End;
  2377. {*
  2378. -------------------------------------------------------------------------------
  2379. Normalizes the subnormal double-precision floating-point value represented
  2380. by the denormalized significand formed by the concatenation of `aSig0' and
  2381. `aSig1'. The normalized exponent is stored at the location pointed to by
  2382. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2383. stored at the location pointed to by `zSig0Ptr', and the least significant
  2384. 32 bits of the normalized significand are stored at the location pointed to
  2385. by `zSig1Ptr'.
  2386. -------------------------------------------------------------------------------
  2387. *}
  2388. Procedure normalizeFloat64Subnormal(
  2389. aSig0: bits32;
  2390. aSig1: bits32;
  2391. VAR zExpPtr : Int16;
  2392. VAR zSig0Ptr : Bits32;
  2393. VAR zSig1Ptr : Bits32
  2394. );
  2395. Var
  2396. ShiftCount : Int8;
  2397. Begin
  2398. if ( aSig0 = 0 ) then
  2399. Begin
  2400. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2401. if ( shiftCount < 0 ) then
  2402. Begin
  2403. zSig0Ptr := aSig1 shr ( - shiftCount );
  2404. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2405. End
  2406. else
  2407. Begin
  2408. zSig0Ptr := aSig1 shl shiftCount;
  2409. zSig1Ptr := 0;
  2410. End;
  2411. zExpPtr := - shiftCount - 31;
  2412. End
  2413. else
  2414. Begin
  2415. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2416. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2417. zExpPtr := 1 - shiftCount;
  2418. End;
  2419. End;
  2420. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2421. var
  2422. shiftCount : int8;
  2423. begin
  2424. shiftCount := countLeadingZeros64( aSig ) - 11;
  2425. zSigPtr := aSig shl shiftCount;
  2426. zExpPtr := 1 - shiftCount;
  2427. end;
  2428. {*
  2429. -------------------------------------------------------------------------------
  2430. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2431. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2432. point value, returning the result. After being shifted into the proper
  2433. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2434. together to form the most significant 32 bits of the result. This means
  2435. that any integer portion of `zSig0' will be added into the exponent. Since
  2436. a properly normalized significand will have an integer portion equal to 1,
  2437. the `zExp' input should be 1 less than the desired result exponent whenever
  2438. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2439. -------------------------------------------------------------------------------
  2440. *}
  2441. Procedure
  2442. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2443. var
  2444. z: Float64;
  2445. Begin
  2446. z.low := zSig1;
  2447. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2448. c := z;
  2449. End;
  2450. {*----------------------------------------------------------------------------
  2451. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2452. | double-precision floating-point value, returning the result. After being
  2453. | shifted into the proper positions, the three fields are simply added
  2454. | together to form the result. This means that any integer portion of `zSig'
  2455. | will be added into the exponent. Since a properly normalized significand
  2456. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2457. | than the desired result exponent whenever `zSig' is a complete, normalized
  2458. | significand.
  2459. *----------------------------------------------------------------------------*}
  2460. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2461. begin
  2462. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2463. end;
  2464. {*
  2465. -------------------------------------------------------------------------------
  2466. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2467. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2468. and `zSig2', and returns the proper double-precision floating-point value
  2469. corresponding to the abstract input. Ordinarily, the abstract value is
  2470. simply rounded and packed into the double-precision format, with the inexact
  2471. exception raised if the abstract input cannot be represented exactly.
  2472. However, if the abstract value is too large, the overflow and inexact
  2473. exceptions are raised and an infinity or maximal finite value is returned.
  2474. If the abstract value is too small, the input value is rounded to a
  2475. subnormal number, and the underflow and inexact exceptions are raised if the
  2476. abstract input cannot be represented exactly as a subnormal double-precision
  2477. floating-point number.
  2478. The input significand must be normalized or smaller. If the input
  2479. significand is not normalized, `zExp' must be 0; in that case, the result
  2480. returned is a subnormal number, and it must not require rounding. In the
  2481. usual case that the input significand is normalized, `zExp' must be 1 less
  2482. than the ``true'' floating-point exponent. The handling of underflow and
  2483. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2484. -------------------------------------------------------------------------------
  2485. *}
  2486. Procedure
  2487. roundAndPackFloat64(
  2488. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2489. Var
  2490. roundingMode : Int8;
  2491. roundNearestEven, increment, isTiny : Flag;
  2492. Begin
  2493. roundingMode := softfloat_rounding_mode;
  2494. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2495. increment := flag( sbits32 (zSig2) < 0 );
  2496. if ( roundNearestEven = flag(FALSE) ) then
  2497. Begin
  2498. if ( roundingMode = float_round_to_zero ) then
  2499. increment := 0
  2500. else
  2501. Begin
  2502. if ( zSign )<> 0 then
  2503. Begin
  2504. increment := flag( roundingMode = float_round_down ) and zSig2;
  2505. End
  2506. else
  2507. Begin
  2508. increment := flag( roundingMode = float_round_up ) and zSig2;
  2509. End
  2510. End
  2511. End;
  2512. if ( $7FD <= bits16 (zExp) ) then
  2513. Begin
  2514. if (( $7FD < zExp )
  2515. or (( zExp = $7FD )
  2516. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2517. and (increment<>0)
  2518. )
  2519. ) then
  2520. Begin
  2521. float_raise( float_flag_overflow OR float_flag_inexact );
  2522. if (( roundingMode = float_round_to_zero )
  2523. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2524. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2525. ) then
  2526. Begin
  2527. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2528. exit;
  2529. End;
  2530. packFloat64( zSign, $7FF, 0, 0, c );
  2531. exit;
  2532. End;
  2533. if ( zExp < 0 ) then
  2534. Begin
  2535. isTiny :=
  2536. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2537. or flag( zExp < -1 )
  2538. or flag(increment = 0)
  2539. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2540. shift64ExtraRightJamming(
  2541. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2542. zExp := 0;
  2543. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2544. if ( roundNearestEven )<>0 then
  2545. Begin
  2546. increment := flag( sbits32 (zSig2) < 0 );
  2547. End
  2548. else
  2549. Begin
  2550. if ( zSign )<>0 then
  2551. Begin
  2552. increment := flag( roundingMode = float_round_down ) and zSig2;
  2553. End
  2554. else
  2555. Begin
  2556. increment := flag( roundingMode = float_round_up ) and zSig2;
  2557. End
  2558. End;
  2559. End;
  2560. End;
  2561. if ( zSig2 )<>0 then
  2562. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2563. if ( increment )<>0 then
  2564. Begin
  2565. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2566. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2567. End
  2568. else
  2569. Begin
  2570. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2571. End;
  2572. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2573. End;
  2574. {*----------------------------------------------------------------------------
  2575. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2576. | and significand `zSig', and returns the proper double-precision floating-
  2577. | point value corresponding to the abstract input. Ordinarily, the abstract
  2578. | value is simply rounded and packed into the double-precision format, with
  2579. | the inexact exception raised if the abstract input cannot be represented
  2580. | exactly. However, if the abstract value is too large, the overflow and
  2581. | inexact exceptions are raised and an infinity or maximal finite value is
  2582. | returned. If the abstract value is too small, the input value is rounded
  2583. | to a subnormal number, and the underflow and inexact exceptions are raised
  2584. | if the abstract input cannot be represented exactly as a subnormal double-
  2585. | precision floating-point number.
  2586. | The input significand `zSig' has its binary point between bits 62
  2587. | and 61, which is 10 bits to the left of the usual location. This shifted
  2588. | significand must be normalized or smaller. If `zSig' is not normalized,
  2589. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2590. | and it must not require rounding. In the usual case that `zSig' is
  2591. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2592. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2593. | Binary Floating-Point Arithmetic.
  2594. *----------------------------------------------------------------------------*}
  2595. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2596. var
  2597. roundingMode: int8;
  2598. roundNearestEven: flag;
  2599. roundIncrement, roundBits: int16;
  2600. isTiny: flag;
  2601. begin
  2602. roundingMode := softfloat_rounding_mode;
  2603. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2604. roundIncrement := $200;
  2605. if ( roundNearestEven=0 ) then
  2606. begin
  2607. if ( roundingMode = float_round_to_zero ) then
  2608. begin
  2609. roundIncrement := 0;
  2610. end
  2611. else begin
  2612. roundIncrement := $3FF;
  2613. if ( zSign<>0 ) then
  2614. begin
  2615. if ( roundingMode = float_round_up ) then
  2616. roundIncrement := 0;
  2617. end
  2618. else begin
  2619. if ( roundingMode = float_round_down ) then
  2620. roundIncrement := 0;
  2621. end
  2622. end
  2623. end;
  2624. roundBits := zSig and $3FF;
  2625. if ( $7FD <= bits16(zExp) ) then
  2626. begin
  2627. if ( ( $7FD < zExp )
  2628. or ( ( zExp = $7FD )
  2629. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2630. ) then
  2631. begin
  2632. float_raise( float_flag_overflow or float_flag_inexact );
  2633. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2634. exit;
  2635. end;
  2636. if ( zExp < 0 ) then
  2637. begin
  2638. isTiny := ord(
  2639. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2640. or ( zExp < -1 )
  2641. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2642. shift64RightJamming( zSig, - zExp, zSig );
  2643. zExp := 0;
  2644. roundBits := zSig and $3FF;
  2645. if ( isTiny and roundBits )<>0 then
  2646. float_raise( float_flag_underflow );
  2647. end
  2648. end;
  2649. if ( roundBits<>0 ) then
  2650. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2651. zSig := ( zSig + roundIncrement ) shr 10;
  2652. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2653. if ( zSig = 0 ) then
  2654. zExp := 0;
  2655. result:=packFloat64( zSign, zExp, zSig );
  2656. end;
  2657. {*
  2658. -------------------------------------------------------------------------------
  2659. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2660. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2661. returns the proper double-precision floating-point value corresponding
  2662. to the abstract input. This routine is just like `roundAndPackFloat64'
  2663. except that the input significand has fewer bits and does not have to be
  2664. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2665. point exponent.
  2666. -------------------------------------------------------------------------------
  2667. *}
  2668. Procedure
  2669. normalizeRoundAndPackFloat64(
  2670. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2671. Var
  2672. shiftCount : int8;
  2673. zSig2 : bits32;
  2674. Begin
  2675. if ( zSig0 = 0 ) then
  2676. Begin
  2677. zSig0 := zSig1;
  2678. zSig1 := 0;
  2679. zExp := zExp -32;
  2680. End;
  2681. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2682. if ( 0 <= shiftCount ) then
  2683. Begin
  2684. zSig2 := 0;
  2685. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2686. End
  2687. else
  2688. Begin
  2689. shift64ExtraRightJamming
  2690. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2691. End;
  2692. zExp := zExp - shiftCount;
  2693. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2694. End;
  2695. {*
  2696. -------------------------------------------------------------------------------
  2697. Returns the result of converting the 32-bit two's complement integer `a' to
  2698. the single-precision floating-point format. The conversion is performed
  2699. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2700. -------------------------------------------------------------------------------
  2701. *}
  2702. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2703. Var
  2704. zSign : Flag;
  2705. Begin
  2706. if ( a = 0 ) then
  2707. Begin
  2708. int32_to_float32.float32 := 0;
  2709. exit;
  2710. End;
  2711. if ( a = sbits32 ($80000000) ) then
  2712. Begin
  2713. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2714. exit;
  2715. end;
  2716. zSign := flag( a < 0 );
  2717. If zSign<>0 then
  2718. a := -a;
  2719. int32_to_float32.float32:=
  2720. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2721. End;
  2722. {*
  2723. -------------------------------------------------------------------------------
  2724. Returns the result of converting the 32-bit two's complement integer `a' to
  2725. the double-precision floating-point format. The conversion is performed
  2726. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2727. -------------------------------------------------------------------------------
  2728. *}
  2729. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2730. var
  2731. zSign : flag;
  2732. absA : bits32;
  2733. shiftCount : int8;
  2734. zSig0, zSig1 : bits32;
  2735. Begin
  2736. if ( a = 0 ) then
  2737. Begin
  2738. packFloat64( 0, 0, 0, 0, result );
  2739. exit;
  2740. end;
  2741. zSign := flag( a < 0 );
  2742. if ZSign<>0 then
  2743. AbsA := -a
  2744. else
  2745. AbsA := a;
  2746. shiftCount := countLeadingZeros32( absA ) - 11;
  2747. if ( 0 <= shiftCount ) then
  2748. Begin
  2749. zSig0 := absA shl shiftCount;
  2750. zSig1 := 0;
  2751. End
  2752. else
  2753. Begin
  2754. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2755. End;
  2756. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2757. End;
  2758. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2759. {$if not defined(packFloatx80)}
  2760. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2761. forward;
  2762. {$endif}
  2763. {*----------------------------------------------------------------------------
  2764. | Returns the result of converting the 32-bit two's complement integer `a'
  2765. | to the extended double-precision floating-point format. The conversion
  2766. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2767. | Arithmetic.
  2768. *----------------------------------------------------------------------------*}
  2769. function int32_to_floatx80( a: int32 ): floatx80;
  2770. var
  2771. zSign: flag;
  2772. absA: uint32;
  2773. shiftCount: int8;
  2774. zSig: bits64;
  2775. begin
  2776. if ( a = 0 ) then begin
  2777. result := packFloatx80( 0, 0, 0 );
  2778. exit;
  2779. end;
  2780. zSign := ord( a < 0 );
  2781. if zSign <> 0 then absA := - a else absA := a;
  2782. shiftCount := countLeadingZeros32( absA ) + 32;
  2783. zSig := absA;
  2784. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2785. end;
  2786. {$endif FPC_SOFTFLOAT_FLOATX80}
  2787. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2788. {$if not defined(packFloat128)}
  2789. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2790. forward;
  2791. {$endif}
  2792. {*----------------------------------------------------------------------------
  2793. | Returns the result of converting the 32-bit two's complement integer `a' to
  2794. | the quadruple-precision floating-point format. The conversion is performed
  2795. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2796. *----------------------------------------------------------------------------*}
  2797. function int32_to_float128( a: int32 ): float128;
  2798. var
  2799. zSign: flag;
  2800. absA: uint32;
  2801. shiftCount: int8;
  2802. zSig0: bits64;
  2803. begin
  2804. if ( a = 0 ) then begin
  2805. result := packFloat128( 0, 0, 0, 0 );
  2806. exit;
  2807. end;
  2808. zSign := ord( a < 0 );
  2809. if zSign <> 0 then absA := - a else absA := a;
  2810. shiftCount := countLeadingZeros32( absA ) + 17;
  2811. zSig0 := absA;
  2812. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2813. end;
  2814. {$endif FPC_SOFTFLOAT_FLOAT128}
  2815. {*
  2816. -------------------------------------------------------------------------------
  2817. Returns the result of converting the single-precision floating-point value
  2818. `a' to the 32-bit two's complement integer format. The conversion is
  2819. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2820. Arithmetic---which means in particular that the conversion is rounded
  2821. according to the current rounding mode. If `a' is a NaN, the largest
  2822. positive integer is returned. Otherwise, if the conversion overflows, the
  2823. largest integer with the same sign as `a' is returned.
  2824. -------------------------------------------------------------------------------
  2825. *}
  2826. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2827. Var
  2828. aSign: flag;
  2829. aExp, shiftCount: int16;
  2830. aSig, aSigExtra: bits32;
  2831. z: int32;
  2832. roundingMode: int8;
  2833. Begin
  2834. aSig := extractFloat32Frac( a.float32 );
  2835. aExp := extractFloat32Exp( a.float32 );
  2836. aSign := extractFloat32Sign( a.float32 );
  2837. shiftCount := aExp - $96;
  2838. if ( 0 <= shiftCount ) then
  2839. Begin
  2840. if ( $9E <= aExp ) then
  2841. Begin
  2842. if ( a.float32 <> $CF000000 ) then
  2843. Begin
  2844. float_raise( float_flag_invalid );
  2845. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2846. Begin
  2847. float32_to_int32 := $7FFFFFFF;
  2848. exit;
  2849. End;
  2850. End;
  2851. float32_to_int32 := sbits32 ($80000000);
  2852. exit;
  2853. End;
  2854. z := ( aSig or $00800000 ) shl shiftCount;
  2855. if ( aSign<>0 ) then z := - z;
  2856. End
  2857. else
  2858. Begin
  2859. if ( aExp < $7E ) then
  2860. Begin
  2861. aSigExtra := aExp OR aSig;
  2862. z := 0;
  2863. End
  2864. else
  2865. Begin
  2866. aSig := aSig OR $00800000;
  2867. aSigExtra := aSig shl ( shiftCount and 31 );
  2868. z := aSig shr ( - shiftCount );
  2869. End;
  2870. if ( aSigExtra<>0 ) then
  2871. softfloat_exception_flags := softfloat_exception_flags
  2872. or float_flag_inexact;
  2873. roundingMode := softfloat_rounding_mode;
  2874. if ( roundingMode = float_round_nearest_even ) then
  2875. Begin
  2876. if ( sbits32 (aSigExtra) < 0 ) then
  2877. Begin
  2878. Inc(z);
  2879. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2880. z := z and not 1;
  2881. End;
  2882. if ( aSign<>0 ) then
  2883. z := - z;
  2884. End
  2885. else
  2886. Begin
  2887. aSigExtra := flag( aSigExtra <> 0 );
  2888. if ( aSign<>0 ) then
  2889. Begin
  2890. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2891. z := - z;
  2892. End
  2893. else
  2894. Begin
  2895. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2896. End
  2897. End;
  2898. End;
  2899. float32_to_int32 := z;
  2900. End;
  2901. {*
  2902. -------------------------------------------------------------------------------
  2903. Returns the result of converting the single-precision floating-point value
  2904. `a' to the 32-bit two's complement integer format. The conversion is
  2905. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2906. Arithmetic, except that the conversion is always rounded toward zero.
  2907. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2908. the conversion overflows, the largest integer with the same sign as `a' is
  2909. returned.
  2910. -------------------------------------------------------------------------------
  2911. *}
  2912. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2913. Var
  2914. aSign : flag;
  2915. aExp, shiftCount : int16;
  2916. aSig : bits32;
  2917. z : int32;
  2918. Begin
  2919. aSig := extractFloat32Frac( a.float32 );
  2920. aExp := extractFloat32Exp( a.float32 );
  2921. aSign := extractFloat32Sign( a.float32 );
  2922. shiftCount := aExp - $9E;
  2923. if ( 0 <= shiftCount ) then
  2924. Begin
  2925. if ( a.float32 <> $CF000000 ) then
  2926. Begin
  2927. float_raise( float_flag_invalid );
  2928. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2929. Begin
  2930. float32_to_int32_round_to_zero := $7FFFFFFF;
  2931. exit;
  2932. end;
  2933. End;
  2934. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2935. exit;
  2936. End
  2937. else
  2938. if ( aExp <= $7E ) then
  2939. Begin
  2940. if ( aExp or aSig )<>0 then
  2941. softfloat_exception_flags :=
  2942. softfloat_exception_flags or float_flag_inexact;
  2943. float32_to_int32_round_to_zero := 0;
  2944. exit;
  2945. End;
  2946. aSig := ( aSig or $00800000 ) shl 8;
  2947. z := aSig shr ( - shiftCount );
  2948. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2949. Begin
  2950. softfloat_exception_flags :=
  2951. softfloat_exception_flags or float_flag_inexact;
  2952. End;
  2953. if ( aSign<>0 ) then z := - z;
  2954. float32_to_int32_round_to_zero := z;
  2955. End;
  2956. {*----------------------------------------------------------------------------
  2957. | Returns the result of converting the single-precision floating-point value
  2958. | `a' to the 64-bit two's complement integer format. The conversion is
  2959. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2960. | Arithmetic---which means in particular that the conversion is rounded
  2961. | according to the current rounding mode. If `a' is a NaN, the largest
  2962. | positive integer is returned. Otherwise, if the conversion overflows, the
  2963. | largest integer with the same sign as `a' is returned.
  2964. *----------------------------------------------------------------------------*}
  2965. function float32_to_int64( a: float32 ): int64;
  2966. var
  2967. aSign: flag;
  2968. aExp, shiftCount: int16;
  2969. aSig: bits32;
  2970. aSig64, aSigExtra: bits64;
  2971. begin
  2972. aSig := extractFloat32Frac( a );
  2973. aExp := extractFloat32Exp( a );
  2974. aSign := extractFloat32Sign( a );
  2975. shiftCount := $BE - aExp;
  2976. if ( shiftCount < 0 ) then begin
  2977. float_raise( float_flag_invalid );
  2978. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  2979. result := $7FFFFFFFFFFFFFFF;
  2980. exit;
  2981. end;
  2982. result := $8000000000000000;
  2983. exit;
  2984. end;
  2985. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  2986. aSig64 := aSig;
  2987. aSig64 := aSig64 shl 40;
  2988. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  2989. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  2990. end;
  2991. {*----------------------------------------------------------------------------
  2992. | Returns the result of converting the single-precision floating-point value
  2993. | `a' to the 64-bit two's complement integer format. The conversion is
  2994. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2995. | Arithmetic, except that the conversion is always rounded toward zero. If
  2996. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  2997. | conversion overflows, the largest integer with the same sign as `a' is
  2998. | returned.
  2999. *----------------------------------------------------------------------------*}
  3000. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3001. var
  3002. aSign: flag;
  3003. aExp, shiftCount: int16;
  3004. aSig: bits32;
  3005. aSig64: bits64;
  3006. z: int64;
  3007. begin
  3008. aSig := extractFloat32Frac( a );
  3009. aExp := extractFloat32Exp( a );
  3010. aSign := extractFloat32Sign( a );
  3011. shiftCount := aExp - $BE;
  3012. if ( 0 <= shiftCount ) then begin
  3013. if ( a <> $DF000000 ) then begin
  3014. float_raise( float_flag_invalid );
  3015. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3016. result := $7FFFFFFFFFFFFFFF;
  3017. exit;
  3018. end;
  3019. end;
  3020. result := $8000000000000000;
  3021. exit;
  3022. end
  3023. else if ( aExp <= $7E ) then begin
  3024. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3025. result := 0;
  3026. exit;
  3027. end;
  3028. aSig64 := aSig or $00800000;
  3029. aSig64 := aSig64 shl 40;
  3030. z := aSig64 shr ( - shiftCount );
  3031. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3032. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3033. if ( aSign <> 0 ) then z := - z;
  3034. result := z;
  3035. end;
  3036. {*
  3037. -------------------------------------------------------------------------------
  3038. Returns the result of converting the single-precision floating-point value
  3039. `a' to the double-precision floating-point format. The conversion is
  3040. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3041. Arithmetic.
  3042. -------------------------------------------------------------------------------
  3043. *}
  3044. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  3045. Var
  3046. aSign : flag;
  3047. aExp : int16;
  3048. aSig, zSig0, zSig1: bits32;
  3049. tmp : CommonNanT;
  3050. Begin
  3051. aSig := extractFloat32Frac( a.float32 );
  3052. aExp := extractFloat32Exp( a.float32 );
  3053. aSign := extractFloat32Sign( a.float32 );
  3054. if ( aExp = $FF ) then
  3055. Begin
  3056. if ( aSig<>0 ) then
  3057. Begin
  3058. float32ToCommonNaN(a.float32, tmp);
  3059. commonNaNToFloat64(tmp , result);
  3060. exit;
  3061. End;
  3062. packFloat64( aSign, $7FF, 0, 0, result);
  3063. exit;
  3064. End;
  3065. if ( aExp = 0 ) then
  3066. Begin
  3067. if ( aSig = 0 ) then
  3068. Begin
  3069. packFloat64( aSign, 0, 0, 0, result );
  3070. exit;
  3071. end;
  3072. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3073. Dec(aExp);
  3074. End;
  3075. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3076. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3077. End;
  3078. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3079. {*----------------------------------------------------------------------------
  3080. | Returns the result of converting the canonical NaN `a' to the extended
  3081. | double-precision floating-point format.
  3082. *----------------------------------------------------------------------------*}
  3083. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3084. var
  3085. z : floatx80;
  3086. begin
  3087. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3088. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3089. result := z;
  3090. end;
  3091. {*----------------------------------------------------------------------------
  3092. | Returns the result of converting the single-precision floating-point value
  3093. | `a' to the extended double-precision floating-point format. The conversion
  3094. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3095. | Arithmetic.
  3096. *----------------------------------------------------------------------------*}
  3097. function float32_to_floatx80( a: float32 ): floatx80;
  3098. var
  3099. aSign: flag;
  3100. aExp: int16;
  3101. aSig: bits32;
  3102. tmp: commonNaNT;
  3103. begin
  3104. aSig := extractFloat32Frac( a );
  3105. aExp := extractFloat32Exp( a );
  3106. aSign := extractFloat32Sign( a );
  3107. if ( aExp = $FF ) then begin
  3108. if ( aSig <> 0 ) then begin
  3109. float32ToCommonNaN( a, tmp );
  3110. result := commonNaNToFloatx80( tmp );
  3111. exit;
  3112. end;
  3113. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3114. exit;
  3115. end;
  3116. if ( aExp = 0 ) then begin
  3117. if ( aSig = 0 ) then begin
  3118. result := packFloatx80( aSign, 0, 0 );
  3119. exit;
  3120. end;
  3121. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3122. end;
  3123. aSig := aSig or $00800000;
  3124. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3125. end;
  3126. {$endif FPC_SOFTFLOAT_FLOATX80}
  3127. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3128. {*----------------------------------------------------------------------------
  3129. | Returns the result of converting the single-precision floating-point value
  3130. | `a' to the double-precision floating-point format. The conversion is
  3131. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3132. | Arithmetic.
  3133. *----------------------------------------------------------------------------*}
  3134. function float32_to_float128( a: float32 ): float128;
  3135. var
  3136. aSign: flag;
  3137. aExp: int16;
  3138. aSig: bits32;
  3139. tmp: commonNaNT;
  3140. begin
  3141. aSig := extractFloat32Frac( a );
  3142. aExp := extractFloat32Exp( a );
  3143. aSign := extractFloat32Sign( a );
  3144. if ( aExp = $FF ) then begin
  3145. if ( aSig <> 0 ) then begin
  3146. float32ToCommonNaN( a, tmp );
  3147. result := commonNaNToFloat128( tmp );
  3148. exit;
  3149. end;
  3150. result := packFloat128( aSign, $7FFF, 0, 0 );
  3151. exit;
  3152. end;
  3153. if ( aExp = 0 ) then begin
  3154. if ( aSig = 0 ) then begin
  3155. result := packFloat128( aSign, 0, 0, 0 );
  3156. exit;
  3157. end;
  3158. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3159. dec( aExp );
  3160. end;
  3161. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3162. end;
  3163. {$endif FPC_SOFTFLOAT_FLOAT128}
  3164. {*
  3165. -------------------------------------------------------------------------------
  3166. Rounds the single-precision floating-point value `a' to an integer,
  3167. and returns the result as a single-precision floating-point value. The
  3168. operation is performed according to the IEC/IEEE Standard for Binary
  3169. Floating-Point Arithmetic.
  3170. -------------------------------------------------------------------------------
  3171. *}
  3172. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  3173. Var
  3174. aSign: flag;
  3175. aExp: int16;
  3176. lastBitMask, roundBitsMask: bits32;
  3177. roundingMode: int8;
  3178. z: float32;
  3179. Begin
  3180. aExp := extractFloat32Exp( a.float32 );
  3181. if ( $96 <= aExp ) then
  3182. Begin
  3183. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3184. Begin
  3185. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3186. exit;
  3187. End;
  3188. float32_round_to_int:=a;
  3189. exit;
  3190. End;
  3191. if ( aExp <= $7E ) then
  3192. Begin
  3193. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3194. Begin
  3195. float32_round_to_int:=a;
  3196. exit;
  3197. end;
  3198. softfloat_exception_flags
  3199. := softfloat_exception_flags OR float_flag_inexact;
  3200. aSign := extractFloat32Sign( a.float32 );
  3201. case ( softfloat_rounding_mode ) of
  3202. float_round_nearest_even:
  3203. Begin
  3204. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3205. Begin
  3206. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3207. exit;
  3208. End;
  3209. End;
  3210. float_round_down:
  3211. Begin
  3212. if aSign <> 0 then
  3213. float32_round_to_int.float32 := $BF800000
  3214. else
  3215. float32_round_to_int.float32 := 0;
  3216. exit;
  3217. End;
  3218. float_round_up:
  3219. Begin
  3220. if aSign <> 0 then
  3221. float32_round_to_int.float32 := $80000000
  3222. else
  3223. float32_round_to_int.float32 := $3F800000;
  3224. exit;
  3225. End;
  3226. end;
  3227. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3228. exit;
  3229. End;
  3230. lastBitMask := 1;
  3231. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3232. lastBitMask := lastBitMask shl ($96 - aExp);
  3233. roundBitsMask := lastBitMask - 1;
  3234. z := a.float32;
  3235. roundingMode := softfloat_rounding_mode;
  3236. if ( roundingMode = float_round_nearest_even ) then
  3237. Begin
  3238. z := z + (lastBitMask shr 1);
  3239. if ( ( z and roundBitsMask ) = 0 ) then
  3240. z := z and not lastBitMask;
  3241. End
  3242. else if ( roundingMode <> float_round_to_zero ) then
  3243. Begin
  3244. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3245. Begin
  3246. z := z + roundBitsMask;
  3247. End;
  3248. End;
  3249. z := z and not roundBitsMask;
  3250. if ( z <> a.float32 ) then
  3251. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3252. float32_round_to_int.float32 := z;
  3253. End;
  3254. {*
  3255. -------------------------------------------------------------------------------
  3256. Returns the result of adding the absolute values of the single-precision
  3257. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3258. before being returned. `zSign' is ignored if the result is a NaN.
  3259. The addition is performed according to the IEC/IEEE Standard for Binary
  3260. Floating-Point Arithmetic.
  3261. -------------------------------------------------------------------------------
  3262. *}
  3263. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3264. Var
  3265. aExp, bExp, zExp: int16;
  3266. aSig, bSig, zSig: bits32;
  3267. expDiff: int16;
  3268. label roundAndPack;
  3269. Begin
  3270. aSig:=extractFloat32Frac( a );
  3271. aExp:=extractFloat32Exp( a );
  3272. bSig:=extractFloat32Frac( b );
  3273. bExp := extractFloat32Exp( b );
  3274. expDiff := aExp - bExp;
  3275. aSig := aSig shl 6;
  3276. bSig := bSig shl 6;
  3277. if ( 0 < expDiff ) then
  3278. Begin
  3279. if ( aExp = $FF ) then
  3280. Begin
  3281. if ( aSig <> 0) then
  3282. Begin
  3283. addFloat32Sigs := propagateFloat32NaN( a, b );
  3284. exit;
  3285. End;
  3286. addFloat32Sigs := a;
  3287. exit;
  3288. End;
  3289. if ( bExp = 0 ) then
  3290. Begin
  3291. Dec(expDiff);
  3292. End
  3293. else
  3294. Begin
  3295. bSig := bSig or $20000000;
  3296. End;
  3297. shift32RightJamming( bSig, expDiff, bSig );
  3298. zExp := aExp;
  3299. End
  3300. else
  3301. If ( expDiff < 0 ) then
  3302. Begin
  3303. if ( bExp = $FF ) then
  3304. Begin
  3305. if ( bSig<>0 ) then
  3306. Begin
  3307. addFloat32Sigs := propagateFloat32NaN( a, b );
  3308. exit;
  3309. end;
  3310. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3311. exit;
  3312. End;
  3313. if ( aExp = 0 ) then
  3314. Begin
  3315. Inc(expDiff);
  3316. End
  3317. else
  3318. Begin
  3319. aSig := aSig OR $20000000;
  3320. End;
  3321. shift32RightJamming( aSig, - expDiff, aSig );
  3322. zExp := bExp;
  3323. End
  3324. else
  3325. Begin
  3326. if ( aExp = $FF ) then
  3327. Begin
  3328. if ( aSig OR bSig )<> 0 then
  3329. Begin
  3330. addFloat32Sigs := propagateFloat32NaN( a, b );
  3331. exit;
  3332. end;
  3333. addFloat32Sigs := a;
  3334. exit;
  3335. End;
  3336. if ( aExp = 0 ) then
  3337. Begin
  3338. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3339. exit;
  3340. end;
  3341. zSig := $40000000 + aSig + bSig;
  3342. zExp := aExp;
  3343. goto roundAndPack;
  3344. End;
  3345. aSig := aSig OR $20000000;
  3346. zSig := ( aSig + bSig ) shl 1;
  3347. Dec(zExp);
  3348. if ( sbits32 (zSig) < 0 ) then
  3349. Begin
  3350. zSig := aSig + bSig;
  3351. Inc(zExp);
  3352. End;
  3353. roundAndPack:
  3354. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3355. End;
  3356. {*
  3357. -------------------------------------------------------------------------------
  3358. Returns the result of subtracting the absolute values of the single-
  3359. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3360. difference is negated before being returned. `zSign' is ignored if the
  3361. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3362. Standard for Binary Floating-Point Arithmetic.
  3363. -------------------------------------------------------------------------------
  3364. *}
  3365. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3366. Var
  3367. aExp, bExp, zExp: int16;
  3368. aSig, bSig, zSig: bits32;
  3369. expDiff : int16;
  3370. label aExpBigger;
  3371. label bExpBigger;
  3372. label aBigger;
  3373. label bBigger;
  3374. label normalizeRoundAndPack;
  3375. Begin
  3376. aSig := extractFloat32Frac( a );
  3377. aExp := extractFloat32Exp( a );
  3378. bSig := extractFloat32Frac( b );
  3379. bExp := extractFloat32Exp( b );
  3380. expDiff := aExp - bExp;
  3381. aSig := aSig shl 7;
  3382. bSig := bSig shl 7;
  3383. if ( 0 < expDiff ) then goto aExpBigger;
  3384. if ( expDiff < 0 ) then goto bExpBigger;
  3385. if ( aExp = $FF ) then
  3386. Begin
  3387. if ( aSig OR bSig )<> 0 then
  3388. Begin
  3389. subFloat32Sigs := propagateFloat32NaN( a, b );
  3390. exit;
  3391. End;
  3392. float_raise( float_flag_invalid );
  3393. subFloat32Sigs := float32_default_nan;
  3394. exit;
  3395. End;
  3396. if ( aExp = 0 ) then
  3397. Begin
  3398. aExp := 1;
  3399. bExp := 1;
  3400. End;
  3401. if ( bSig < aSig ) Then goto aBigger;
  3402. if ( aSig < bSig ) Then goto bBigger;
  3403. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3404. exit;
  3405. bExpBigger:
  3406. if ( bExp = $FF ) then
  3407. Begin
  3408. if ( bSig<>0 ) then
  3409. Begin
  3410. subFloat32Sigs := propagateFloat32NaN( a, b );
  3411. exit;
  3412. End;
  3413. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3414. exit;
  3415. End;
  3416. if ( aExp = 0 ) then
  3417. Begin
  3418. Inc(expDiff);
  3419. End
  3420. else
  3421. Begin
  3422. aSig := aSig OR $40000000;
  3423. End;
  3424. shift32RightJamming( aSig, - expDiff, aSig );
  3425. bSig := bSig OR $40000000;
  3426. bBigger:
  3427. zSig := bSig - aSig;
  3428. zExp := bExp;
  3429. zSign := zSign xor 1;
  3430. goto normalizeRoundAndPack;
  3431. aExpBigger:
  3432. if ( aExp = $FF ) then
  3433. Begin
  3434. if ( aSig <> 0) then
  3435. Begin
  3436. subFloat32Sigs := propagateFloat32NaN( a, b );
  3437. exit;
  3438. End;
  3439. subFloat32Sigs := a;
  3440. exit;
  3441. End;
  3442. if ( bExp = 0 ) then
  3443. Begin
  3444. Dec(expDiff);
  3445. End
  3446. else
  3447. Begin
  3448. bSig := bSig OR $40000000;
  3449. End;
  3450. shift32RightJamming( bSig, expDiff, bSig );
  3451. aSig := aSig OR $40000000;
  3452. aBigger:
  3453. zSig := aSig - bSig;
  3454. zExp := aExp;
  3455. normalizeRoundAndPack:
  3456. Dec(zExp);
  3457. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3458. End;
  3459. {*
  3460. -------------------------------------------------------------------------------
  3461. Returns the result of adding the single-precision floating-point values `a'
  3462. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3463. Binary Floating-Point Arithmetic.
  3464. -------------------------------------------------------------------------------
  3465. *}
  3466. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3467. Var
  3468. aSign, bSign: Flag;
  3469. Begin
  3470. aSign := extractFloat32Sign( a.float32 );
  3471. bSign := extractFloat32Sign( b.float32 );
  3472. if ( aSign = bSign ) then
  3473. Begin
  3474. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3475. End
  3476. else
  3477. Begin
  3478. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3479. End;
  3480. End;
  3481. {*
  3482. -------------------------------------------------------------------------------
  3483. Returns the result of subtracting the single-precision floating-point values
  3484. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3485. for Binary Floating-Point Arithmetic.
  3486. -------------------------------------------------------------------------------
  3487. *}
  3488. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3489. Var
  3490. aSign, bSign: flag;
  3491. Begin
  3492. aSign := extractFloat32Sign( a.float32 );
  3493. bSign := extractFloat32Sign( b.float32 );
  3494. if ( aSign = bSign ) then
  3495. Begin
  3496. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3497. End
  3498. else
  3499. Begin
  3500. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3501. End;
  3502. End;
  3503. {*
  3504. -------------------------------------------------------------------------------
  3505. Returns the result of multiplying the single-precision floating-point values
  3506. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3507. for Binary Floating-Point Arithmetic.
  3508. -------------------------------------------------------------------------------
  3509. *}
  3510. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3511. Var
  3512. aSign, bSign, zSign: flag;
  3513. aExp, bExp, zExp : int16;
  3514. aSig, bSig, zSig0, zSig1: bits32;
  3515. Begin
  3516. aSig := extractFloat32Frac( a.float32 );
  3517. aExp := extractFloat32Exp( a.float32 );
  3518. aSign := extractFloat32Sign( a.float32 );
  3519. bSig := extractFloat32Frac( b.float32 );
  3520. bExp := extractFloat32Exp( b.float32 );
  3521. bSign := extractFloat32Sign( b.float32 );
  3522. zSign := aSign xor bSign;
  3523. if ( aExp = $FF ) then
  3524. Begin
  3525. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3526. Begin
  3527. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3528. exit;
  3529. End;
  3530. if ( ( bExp OR bSig ) = 0 ) then
  3531. Begin
  3532. float_raise( float_flag_invalid );
  3533. float32_mul.float32 := float32_default_nan;
  3534. exit;
  3535. End;
  3536. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3537. exit;
  3538. End;
  3539. if ( bExp = $FF ) then
  3540. Begin
  3541. if ( bSig <> 0 ) then
  3542. Begin
  3543. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3544. exit;
  3545. End;
  3546. if ( ( aExp OR aSig ) = 0 ) then
  3547. Begin
  3548. float_raise( float_flag_invalid );
  3549. float32_mul.float32 := float32_default_nan;
  3550. exit;
  3551. End;
  3552. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3553. exit;
  3554. End;
  3555. if ( aExp = 0 ) then
  3556. Begin
  3557. if ( aSig = 0 ) then
  3558. Begin
  3559. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3560. exit;
  3561. End;
  3562. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3563. End;
  3564. if ( bExp = 0 ) then
  3565. Begin
  3566. if ( bSig = 0 ) then
  3567. Begin
  3568. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3569. exit;
  3570. End;
  3571. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3572. End;
  3573. zExp := aExp + bExp - $7F;
  3574. aSig := ( aSig OR $00800000 ) shl 7;
  3575. bSig := ( bSig OR $00800000 ) shl 8;
  3576. mul32To64( aSig, bSig, zSig0, zSig1 );
  3577. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3578. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3579. Begin
  3580. zSig0 := zSig0 shl 1;
  3581. Dec(zExp);
  3582. End;
  3583. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3584. End;
  3585. {*
  3586. -------------------------------------------------------------------------------
  3587. Returns the result of dividing the single-precision floating-point value `a'
  3588. by the corresponding value `b'. The operation is performed according to the
  3589. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3590. -------------------------------------------------------------------------------
  3591. *}
  3592. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3593. Var
  3594. aSign, bSign, zSign: flag;
  3595. aExp, bExp, zExp: int16;
  3596. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3597. Begin
  3598. aSig := extractFloat32Frac( a.float32 );
  3599. aExp := extractFloat32Exp( a.float32 );
  3600. aSign := extractFloat32Sign( a.float32 );
  3601. bSig := extractFloat32Frac( b.float32 );
  3602. bExp := extractFloat32Exp( b.float32 );
  3603. bSign := extractFloat32Sign( b.float32 );
  3604. zSign := aSign xor bSign;
  3605. if ( aExp = $FF ) then
  3606. Begin
  3607. if ( aSig <> 0 ) then
  3608. Begin
  3609. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3610. exit;
  3611. End;
  3612. if ( bExp = $FF ) then
  3613. Begin
  3614. if ( bSig <> 0) then
  3615. Begin
  3616. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3617. exit;
  3618. End;
  3619. float_raise( float_flag_invalid );
  3620. float32_div.float32 := float32_default_nan;
  3621. exit;
  3622. End;
  3623. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3624. exit;
  3625. End;
  3626. if ( bExp = $FF ) then
  3627. Begin
  3628. if ( bSig <> 0) then
  3629. Begin
  3630. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3631. exit;
  3632. End;
  3633. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3634. exit;
  3635. End;
  3636. if ( bExp = 0 ) Then
  3637. Begin
  3638. if ( bSig = 0 ) Then
  3639. Begin
  3640. if ( ( aExp OR aSig ) = 0 ) then
  3641. Begin
  3642. float_raise( float_flag_invalid );
  3643. float32_div.float32 := float32_default_nan;
  3644. exit;
  3645. End;
  3646. float_raise( float_flag_divbyzero );
  3647. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3648. exit;
  3649. End;
  3650. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3651. End;
  3652. if ( aExp = 0 ) Then
  3653. Begin
  3654. if ( aSig = 0 ) Then
  3655. Begin
  3656. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3657. exit;
  3658. End;
  3659. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3660. End;
  3661. zExp := aExp - bExp + $7D;
  3662. aSig := ( aSig OR $00800000 ) shl 7;
  3663. bSig := ( bSig OR $00800000 ) shl 8;
  3664. if ( bSig <= ( aSig + aSig ) ) then
  3665. Begin
  3666. aSig := aSig shr 1;
  3667. Inc(zExp);
  3668. End;
  3669. zSig := estimateDiv64To32( aSig, 0, bSig );
  3670. if ( ( zSig and $3F ) <= 2 ) then
  3671. Begin
  3672. mul32To64( bSig, zSig, term0, term1 );
  3673. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3674. while ( sbits32 (rem0) < 0 ) do
  3675. Begin
  3676. Dec(zSig);
  3677. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3678. End;
  3679. zSig := zSig or bits32( rem1 <> 0 );
  3680. End;
  3681. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3682. End;
  3683. {*
  3684. -------------------------------------------------------------------------------
  3685. Returns the remainder of the single-precision floating-point value `a'
  3686. with respect to the corresponding value `b'. The operation is performed
  3687. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3688. -------------------------------------------------------------------------------
  3689. *}
  3690. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3691. Var
  3692. aSign, zSign: flag;
  3693. aExp, bExp, expDiff: int16;
  3694. aSig, bSig, q, alternateASig: bits32;
  3695. sigMean: sbits32;
  3696. Begin
  3697. aSig := extractFloat32Frac( a.float32 );
  3698. aExp := extractFloat32Exp( a.float32 );
  3699. aSign := extractFloat32Sign( a.float32 );
  3700. bSig := extractFloat32Frac( b.float32 );
  3701. bExp := extractFloat32Exp( b.float32 );
  3702. if ( aExp = $FF ) then
  3703. Begin
  3704. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3705. Begin
  3706. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3707. exit;
  3708. End;
  3709. float_raise( float_flag_invalid );
  3710. float32_rem.float32 := float32_default_nan;
  3711. exit;
  3712. End;
  3713. if ( bExp = $FF ) then
  3714. Begin
  3715. if ( bSig <> 0 ) then
  3716. Begin
  3717. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3718. exit;
  3719. End;
  3720. float32_rem := a;
  3721. exit;
  3722. End;
  3723. if ( bExp = 0 ) then
  3724. Begin
  3725. if ( bSig = 0 ) then
  3726. Begin
  3727. float_raise( float_flag_invalid );
  3728. float32_rem.float32 := float32_default_nan;
  3729. exit;
  3730. End;
  3731. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3732. End;
  3733. if ( aExp = 0 ) then
  3734. Begin
  3735. if ( aSig = 0 ) then
  3736. Begin
  3737. float32_rem := a;
  3738. exit;
  3739. End;
  3740. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3741. End;
  3742. expDiff := aExp - bExp;
  3743. aSig := ( aSig OR $00800000 ) shl 8;
  3744. bSig := ( bSig OR $00800000 ) shl 8;
  3745. if ( expDiff < 0 ) then
  3746. Begin
  3747. if ( expDiff < -1 ) then
  3748. Begin
  3749. float32_rem := a;
  3750. exit;
  3751. End;
  3752. aSig := aSig shr 1;
  3753. End;
  3754. q := bits32( bSig <= aSig );
  3755. if ( q <> 0) then
  3756. aSig := aSig - bSig;
  3757. expDiff := expDiff - 32;
  3758. while ( 0 < expDiff ) do
  3759. Begin
  3760. q := estimateDiv64To32( aSig, 0, bSig );
  3761. if (2 < q) then
  3762. q := q - 2
  3763. else
  3764. q := 0;
  3765. aSig := - ( ( bSig shr 2 ) * q );
  3766. expDiff := expDiff - 30;
  3767. End;
  3768. expDiff := expDiff + 32;
  3769. if ( 0 < expDiff ) then
  3770. Begin
  3771. q := estimateDiv64To32( aSig, 0, bSig );
  3772. if (2 < q) then
  3773. q := q - 2
  3774. else
  3775. q := 0;
  3776. q := q shr (32 - expDiff);
  3777. bSig := bSig shr 2;
  3778. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3779. End
  3780. else
  3781. Begin
  3782. aSig := aSig shr 2;
  3783. bSig := bSig shr 2;
  3784. End;
  3785. Repeat
  3786. alternateASig := aSig;
  3787. Inc(q);
  3788. aSig := aSig - bSig;
  3789. Until not ( 0 <= sbits32 (aSig) );
  3790. sigMean := aSig + alternateASig;
  3791. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3792. Begin
  3793. aSig := alternateASig;
  3794. End;
  3795. zSign := flag( sbits32 (aSig) < 0 );
  3796. if ( zSign<>0 ) then
  3797. aSig := - aSig;
  3798. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3799. End;
  3800. {*
  3801. -------------------------------------------------------------------------------
  3802. Returns the square root of the single-precision floating-point value `a'.
  3803. The operation is performed according to the IEC/IEEE Standard for Binary
  3804. Floating-Point Arithmetic.
  3805. -------------------------------------------------------------------------------
  3806. *}
  3807. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3808. Var
  3809. aSign : flag;
  3810. aExp, zExp : int16;
  3811. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3812. label roundAndPack;
  3813. Begin
  3814. aSig := extractFloat32Frac( a.float32 );
  3815. aExp := extractFloat32Exp( a.float32 );
  3816. aSign := extractFloat32Sign( a.float32 );
  3817. if ( aExp = $FF ) then
  3818. Begin
  3819. if ( aSig <> 0) then
  3820. Begin
  3821. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3822. exit;
  3823. End;
  3824. if ( aSign = 0) then
  3825. Begin
  3826. float32_sqrt := a;
  3827. exit;
  3828. End;
  3829. float_raise( float_flag_invalid );
  3830. float32_sqrt.float32 := float32_default_nan;
  3831. exit;
  3832. End;
  3833. if ( aSign <> 0) then
  3834. Begin
  3835. if ( ( aExp OR aSig ) = 0 ) then
  3836. Begin
  3837. float32_sqrt := a;
  3838. exit;
  3839. End;
  3840. float_raise( float_flag_invalid );
  3841. float32_sqrt.float32 := float32_default_nan;
  3842. exit;
  3843. End;
  3844. if ( aExp = 0 ) then
  3845. Begin
  3846. if ( aSig = 0 ) then
  3847. Begin
  3848. float32_sqrt.float32 := 0;
  3849. exit;
  3850. End;
  3851. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3852. End;
  3853. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3854. aSig := ( aSig OR $00800000 ) shl 8;
  3855. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3856. if ( ( zSig and $7F ) <= 5 ) then
  3857. Begin
  3858. if ( zSig < 2 ) then
  3859. Begin
  3860. zSig := $7FFFFFFF;
  3861. goto roundAndPack;
  3862. End
  3863. else
  3864. Begin
  3865. aSig := aSig shr (aExp and 1);
  3866. mul32To64( zSig, zSig, term0, term1 );
  3867. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3868. while ( sbits32 (rem0) < 0 ) do
  3869. Begin
  3870. Dec(zSig);
  3871. shortShift64Left( 0, zSig, 1, term0, term1 );
  3872. term1 := term1 or 1;
  3873. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3874. End;
  3875. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3876. End;
  3877. End;
  3878. shift32RightJamming( zSig, 1, zSig );
  3879. roundAndPack:
  3880. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3881. End;
  3882. {*
  3883. -------------------------------------------------------------------------------
  3884. Returns 1 if the single-precision floating-point value `a' is equal to
  3885. the corresponding value `b', and 0 otherwise. The comparison is performed
  3886. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3887. -------------------------------------------------------------------------------
  3888. *}
  3889. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3890. Begin
  3891. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3892. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3893. ) then
  3894. Begin
  3895. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3896. Begin
  3897. float_raise( float_flag_invalid );
  3898. End;
  3899. float32_eq := 0;
  3900. exit;
  3901. End;
  3902. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3903. End;
  3904. {*
  3905. -------------------------------------------------------------------------------
  3906. Returns 1 if the single-precision floating-point value `a' is less than
  3907. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3908. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3909. Arithmetic.
  3910. -------------------------------------------------------------------------------
  3911. *}
  3912. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3913. var
  3914. aSign, bSign: flag;
  3915. Begin
  3916. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3917. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3918. ) then
  3919. Begin
  3920. float_raise( float_flag_invalid );
  3921. float32_le := 0;
  3922. exit;
  3923. End;
  3924. aSign := extractFloat32Sign( a.float32 );
  3925. bSign := extractFloat32Sign( b.float32 );
  3926. if ( aSign <> bSign ) then
  3927. Begin
  3928. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3929. exit;
  3930. End;
  3931. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3932. End;
  3933. {*
  3934. -------------------------------------------------------------------------------
  3935. Returns 1 if the single-precision floating-point value `a' is less than
  3936. the corresponding value `b', and 0 otherwise. The comparison is performed
  3937. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3938. -------------------------------------------------------------------------------
  3939. *}
  3940. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3941. var
  3942. aSign, bSign: flag;
  3943. Begin
  3944. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3945. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3946. ) then
  3947. Begin
  3948. float_raise( float_flag_invalid );
  3949. float32_lt :=0;
  3950. exit;
  3951. End;
  3952. aSign := extractFloat32Sign( a.float32 );
  3953. bSign := extractFloat32Sign( b.float32 );
  3954. if ( aSign <> bSign ) then
  3955. Begin
  3956. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3957. exit;
  3958. End;
  3959. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3960. End;
  3961. {*
  3962. -------------------------------------------------------------------------------
  3963. Returns 1 if the single-precision floating-point value `a' is equal to
  3964. the corresponding value `b', and 0 otherwise. The invalid exception is
  3965. raised if either operand is a NaN. Otherwise, the comparison is performed
  3966. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3967. -------------------------------------------------------------------------------
  3968. *}
  3969. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3970. Begin
  3971. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3972. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3973. ) then
  3974. Begin
  3975. float_raise( float_flag_invalid );
  3976. float32_eq_signaling := 0;
  3977. exit;
  3978. End;
  3979. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3980. End;
  3981. {*
  3982. -------------------------------------------------------------------------------
  3983. Returns 1 if the single-precision floating-point value `a' is less than or
  3984. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3985. cause an exception. Otherwise, the comparison is performed according to the
  3986. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3987. -------------------------------------------------------------------------------
  3988. *}
  3989. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3990. Var
  3991. aSign, bSign: flag;
  3992. Begin
  3993. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3994. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3995. ) then
  3996. Begin
  3997. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3998. Begin
  3999. float_raise( float_flag_invalid );
  4000. End;
  4001. float32_le_quiet := 0;
  4002. exit;
  4003. End;
  4004. aSign := extractFloat32Sign( a );
  4005. bSign := extractFloat32Sign( b );
  4006. if ( aSign <> bSign ) then
  4007. Begin
  4008. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4009. exit;
  4010. End;
  4011. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4012. End;
  4013. {*
  4014. -------------------------------------------------------------------------------
  4015. Returns 1 if the single-precision floating-point value `a' is less than
  4016. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4017. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4018. Standard for Binary Floating-Point Arithmetic.
  4019. -------------------------------------------------------------------------------
  4020. *}
  4021. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4022. Var
  4023. aSign, bSign: flag;
  4024. Begin
  4025. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4026. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4027. ) then
  4028. Begin
  4029. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4030. Begin
  4031. float_raise( float_flag_invalid );
  4032. End;
  4033. float32_lt_quiet := 0;
  4034. exit;
  4035. End;
  4036. aSign := extractFloat32Sign( a );
  4037. bSign := extractFloat32Sign( b );
  4038. if ( aSign <> bSign ) then
  4039. Begin
  4040. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4041. exit;
  4042. End;
  4043. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4044. End;
  4045. {*
  4046. -------------------------------------------------------------------------------
  4047. Returns the result of converting the double-precision floating-point value
  4048. `a' to the 32-bit two's complement integer format. The conversion is
  4049. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4050. Arithmetic---which means in particular that the conversion is rounded
  4051. according to the current rounding mode. If `a' is a NaN, the largest
  4052. positive integer is returned. Otherwise, if the conversion overflows, the
  4053. largest integer with the same sign as `a' is returned.
  4054. -------------------------------------------------------------------------------
  4055. *}
  4056. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4057. var
  4058. aSign: flag;
  4059. aExp, shiftCount: int16;
  4060. aSig0, aSig1, absZ, aSigExtra: bits32;
  4061. z: int32;
  4062. roundingMode: int8;
  4063. label invalid;
  4064. Begin
  4065. aSig1 := extractFloat64Frac1( a );
  4066. aSig0 := extractFloat64Frac0( a );
  4067. aExp := extractFloat64Exp( a );
  4068. aSign := extractFloat64Sign( a );
  4069. shiftCount := aExp - $413;
  4070. if ( 0 <= shiftCount ) then
  4071. Begin
  4072. if ( $41E < aExp ) then
  4073. Begin
  4074. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4075. aSign := 0;
  4076. goto invalid;
  4077. End;
  4078. shortShift64Left(
  4079. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4080. if ( $80000000 < absZ ) then
  4081. goto invalid;
  4082. End
  4083. else
  4084. Begin
  4085. aSig1 := flag( aSig1 <> 0 );
  4086. if ( aExp < $3FE ) then
  4087. Begin
  4088. aSigExtra := aExp OR aSig0 OR aSig1;
  4089. absZ := 0;
  4090. End
  4091. else
  4092. Begin
  4093. aSig0 := aSig0 OR $00100000;
  4094. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4095. absZ := aSig0 shr ( - shiftCount );
  4096. End;
  4097. End;
  4098. roundingMode := softfloat_rounding_mode;
  4099. if ( roundingMode = float_round_nearest_even ) then
  4100. Begin
  4101. if ( sbits32(aSigExtra) < 0 ) then
  4102. Begin
  4103. Inc(absZ);
  4104. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4105. absZ := absZ and not 1;
  4106. End;
  4107. if aSign <> 0 then
  4108. z := - absZ
  4109. else
  4110. z := absZ;
  4111. End
  4112. else
  4113. Begin
  4114. aSigExtra := bits32( aSigExtra <> 0 );
  4115. if ( aSign <> 0) then
  4116. Begin
  4117. z := - ( absZ
  4118. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4119. End
  4120. else
  4121. Begin
  4122. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4123. End
  4124. End;
  4125. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4126. Begin
  4127. invalid:
  4128. float_raise( float_flag_invalid );
  4129. if (aSign <> 0 ) then
  4130. float64_to_int32 := sbits32 ($80000000)
  4131. else
  4132. float64_to_int32 := $7FFFFFFF;
  4133. exit;
  4134. End;
  4135. if ( aSigExtra <> 0) then
  4136. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4137. float64_to_int32 := z;
  4138. End;
  4139. {*
  4140. -------------------------------------------------------------------------------
  4141. Returns the result of converting the double-precision floating-point value
  4142. `a' to the 32-bit two's complement integer format. The conversion is
  4143. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4144. Arithmetic, except that the conversion is always rounded toward zero.
  4145. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4146. the conversion overflows, the largest integer with the same sign as `a' is
  4147. returned.
  4148. -------------------------------------------------------------------------------
  4149. *}
  4150. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4151. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4152. Var
  4153. aSign: flag;
  4154. aExp, shiftCount: int16;
  4155. aSig0, aSig1, absZ, aSigExtra: bits32;
  4156. z: int32;
  4157. label invalid;
  4158. Begin
  4159. aSig1 := extractFloat64Frac1( a );
  4160. aSig0 := extractFloat64Frac0( a );
  4161. aExp := extractFloat64Exp( a );
  4162. aSign := extractFloat64Sign( a );
  4163. shiftCount := aExp - $413;
  4164. if ( 0 <= shiftCount ) then
  4165. Begin
  4166. if ( $41E < aExp ) then
  4167. Begin
  4168. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4169. aSign := 0;
  4170. goto invalid;
  4171. End;
  4172. shortShift64Left(
  4173. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4174. End
  4175. else
  4176. Begin
  4177. if ( aExp < $3FF ) then
  4178. Begin
  4179. if ( aExp OR aSig0 OR aSig1 )<>0 then
  4180. Begin
  4181. softfloat_exception_flags :=
  4182. softfloat_exception_flags or float_flag_inexact;
  4183. End;
  4184. float64_to_int32_round_to_zero := 0;
  4185. exit;
  4186. End;
  4187. aSig0 := aSig0 or $00100000;
  4188. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4189. absZ := aSig0 shr ( - shiftCount );
  4190. End;
  4191. if aSign <> 0 then
  4192. z := - absZ
  4193. else
  4194. z := absZ;
  4195. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4196. Begin
  4197. invalid:
  4198. float_raise( float_flag_invalid );
  4199. if (aSign <> 0) then
  4200. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4201. else
  4202. float64_to_int32_round_to_zero := $7FFFFFFF;
  4203. exit;
  4204. End;
  4205. if ( aSigExtra <> 0) then
  4206. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4207. float64_to_int32_round_to_zero := z;
  4208. End;
  4209. {*----------------------------------------------------------------------------
  4210. | Returns the result of converting the double-precision floating-point value
  4211. | `a' to the 64-bit two's complement integer format. The conversion is
  4212. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4213. | Arithmetic---which means in particular that the conversion is rounded
  4214. | according to the current rounding mode. If `a' is a NaN, the largest
  4215. | positive integer is returned. Otherwise, if the conversion overflows, the
  4216. | largest integer with the same sign as `a' is returned.
  4217. *----------------------------------------------------------------------------*}
  4218. function float64_to_int64( a: float64 ): int64;
  4219. var
  4220. aSign: flag;
  4221. aExp, shiftCount: int16;
  4222. aSig, aSigExtra: bits64;
  4223. begin
  4224. aSig := extractFloat64Frac( a );
  4225. aExp := extractFloat64Exp( a );
  4226. aSign := extractFloat64Sign( a );
  4227. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4228. shiftCount := $433 - aExp;
  4229. if ( shiftCount <= 0 ) then begin
  4230. if ( $43E < aExp ) then begin
  4231. float_raise( float_flag_invalid );
  4232. if ( ( aSign = 0 )
  4233. or ( ( aExp = $7FF )
  4234. and ( aSig <> $0010000000000000 ) )
  4235. ) then begin
  4236. result := $7FFFFFFFFFFFFFFF;
  4237. exit;
  4238. end;
  4239. result := $8000000000000000;
  4240. exit;
  4241. end;
  4242. aSigExtra := 0;
  4243. aSig := aSig shl ( - shiftCount );
  4244. end
  4245. else
  4246. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4247. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4248. end;
  4249. {*----------------------------------------------------------------------------
  4250. | Returns the result of converting the double-precision floating-point value
  4251. | `a' to the 64-bit two's complement integer format. The conversion is
  4252. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4253. | Arithmetic, except that the conversion is always rounded toward zero.
  4254. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4255. | the conversion overflows, the largest integer with the same sign as `a' is
  4256. | returned.
  4257. *----------------------------------------------------------------------------*}
  4258. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4259. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4260. var
  4261. aSign: flag;
  4262. aExp, shiftCount: int16;
  4263. aSig: bits64;
  4264. z: int64;
  4265. begin
  4266. aSig := extractFloat64Frac( a );
  4267. aExp := extractFloat64Exp( a );
  4268. aSign := extractFloat64Sign( a );
  4269. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4270. shiftCount := aExp - $433;
  4271. if ( 0 <= shiftCount ) then begin
  4272. if ( $43E <= aExp ) then begin
  4273. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4274. float_raise( float_flag_invalid );
  4275. if ( ( aSign = 0 )
  4276. or ( ( aExp = $7FF )
  4277. and ( aSig <> $0010000000000000 ) )
  4278. ) then begin
  4279. result := $7FFFFFFFFFFFFFFF;
  4280. exit;
  4281. end;
  4282. end;
  4283. result := $8000000000000000;
  4284. exit;
  4285. end;
  4286. z := aSig shl shiftCount;
  4287. end
  4288. else begin
  4289. if ( aExp < $3FE ) then begin
  4290. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4291. result := 0;
  4292. exit;
  4293. end;
  4294. z := aSig shr ( - shiftCount );
  4295. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4296. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  4297. end;
  4298. if ( aSign <> 0 ) then z := - z;
  4299. result := z;
  4300. end;
  4301. {*
  4302. -------------------------------------------------------------------------------
  4303. Returns the result of converting the double-precision floating-point value
  4304. `a' to the single-precision floating-point format. The conversion is
  4305. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4306. Arithmetic.
  4307. -------------------------------------------------------------------------------
  4308. *}
  4309. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  4310. Var
  4311. aSign: flag;
  4312. aExp: int16;
  4313. aSig0, aSig1, zSig: bits32;
  4314. allZero: bits32;
  4315. tmp : CommonNanT;
  4316. Begin
  4317. aSig1 := extractFloat64Frac1( a );
  4318. aSig0 := extractFloat64Frac0( a );
  4319. aExp := extractFloat64Exp( a );
  4320. aSign := extractFloat64Sign( a );
  4321. if ( aExp = $7FF ) then
  4322. Begin
  4323. if ( aSig0 OR aSig1 ) <> 0 then
  4324. Begin
  4325. float64ToCommonNaN( a, tmp );
  4326. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4327. exit;
  4328. End;
  4329. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4330. exit;
  4331. End;
  4332. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4333. if ( aExp <> 0) then
  4334. zSig := zSig OR $40000000;
  4335. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4336. End;
  4337. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4338. {*----------------------------------------------------------------------------
  4339. | Returns the result of converting the double-precision floating-point value
  4340. | `a' to the extended double-precision floating-point format. The conversion
  4341. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4342. | Arithmetic.
  4343. *----------------------------------------------------------------------------*}
  4344. function float64_to_floatx80( a: float64 ): floatx80;
  4345. var
  4346. aSign: flag;
  4347. aExp: int16;
  4348. aSig: bits64;
  4349. begin
  4350. aSig := extractFloat64Frac( a );
  4351. aExp := extractFloat64Exp( a );
  4352. aSign := extractFloat64Sign( a );
  4353. if ( aExp = $7FF ) then begin
  4354. if ( aSig <> 0 ) then begin
  4355. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4356. exit;
  4357. end;
  4358. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4359. exit;
  4360. end;
  4361. if ( aExp = 0 ) then begin
  4362. if ( aSig = 0 ) then begin
  4363. result := packFloatx80( aSign, 0, 0 );
  4364. exit;
  4365. end;
  4366. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4367. end;
  4368. result :=
  4369. packFloatx80(
  4370. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4371. end;
  4372. {$endif FPC_SOFTFLOAT_FLOATX80}
  4373. {*
  4374. -------------------------------------------------------------------------------
  4375. Rounds the double-precision floating-point value `a' to an integer,
  4376. and returns the result as a double-precision floating-point value. The
  4377. operation is performed according to the IEC/IEEE Standard for Binary
  4378. Floating-Point Arithmetic.
  4379. -------------------------------------------------------------------------------
  4380. *}
  4381. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4382. Var
  4383. aSign: flag;
  4384. aExp: int16;
  4385. lastBitMask, roundBitsMask: bits32;
  4386. roundingMode: int8;
  4387. z: float64;
  4388. Begin
  4389. aExp := extractFloat64Exp( a );
  4390. if ( $413 <= aExp ) then
  4391. Begin
  4392. if ( $433 <= aExp ) then
  4393. Begin
  4394. if ( ( aExp = $7FF )
  4395. AND
  4396. (
  4397. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4398. ) <>0)
  4399. ) then
  4400. Begin
  4401. propagateFloat64NaN( a, a, result );
  4402. exit;
  4403. End;
  4404. result := a;
  4405. exit;
  4406. End;
  4407. lastBitMask := 1;
  4408. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4409. roundBitsMask := lastBitMask - 1;
  4410. z := a;
  4411. roundingMode := softfloat_rounding_mode;
  4412. if ( roundingMode = float_round_nearest_even ) then
  4413. Begin
  4414. if ( lastBitMask <> 0) then
  4415. Begin
  4416. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4417. if ( ( z.low and roundBitsMask ) = 0 ) then
  4418. z.low := z.low and not lastBitMask;
  4419. End
  4420. else
  4421. Begin
  4422. if ( sbits32 (z.low) < 0 ) then
  4423. Begin
  4424. Inc(z.high);
  4425. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4426. z.high := z.high and not 1;
  4427. End;
  4428. End;
  4429. End
  4430. else if ( roundingMode <> float_round_to_zero ) then
  4431. Begin
  4432. if ( extractFloat64Sign( z )
  4433. xor flag( roundingMode = float_round_up ) )<> 0 then
  4434. Begin
  4435. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4436. End;
  4437. End;
  4438. z.low := z.low and not roundBitsMask;
  4439. End
  4440. else
  4441. Begin
  4442. if ( aExp <= $3FE ) then
  4443. Begin
  4444. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4445. Begin
  4446. result := a;
  4447. exit;
  4448. End;
  4449. softfloat_exception_flags := softfloat_exception_flags or
  4450. float_flag_inexact;
  4451. aSign := extractFloat64Sign( a );
  4452. case ( softfloat_rounding_mode ) of
  4453. float_round_nearest_even:
  4454. Begin
  4455. if ( ( aExp = $3FE )
  4456. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4457. ) then
  4458. Begin
  4459. packFloat64( aSign, $3FF, 0, 0, result );
  4460. exit;
  4461. End;
  4462. End;
  4463. float_round_down:
  4464. Begin
  4465. if aSign<>0 then
  4466. packFloat64( 1, $3FF, 0, 0, result )
  4467. else
  4468. packFloat64( 0, 0, 0, 0, result );
  4469. exit;
  4470. End;
  4471. float_round_up:
  4472. Begin
  4473. if aSign <> 0 then
  4474. packFloat64( 1, 0, 0, 0, result )
  4475. else
  4476. packFloat64( 0, $3FF, 0, 0, result );
  4477. exit;
  4478. End;
  4479. end;
  4480. packFloat64( aSign, 0, 0, 0, result );
  4481. exit;
  4482. End;
  4483. lastBitMask := 1;
  4484. lastBitMask := lastBitMask shl ($413 - aExp);
  4485. roundBitsMask := lastBitMask - 1;
  4486. z.low := 0;
  4487. z.high := a.high;
  4488. roundingMode := softfloat_rounding_mode;
  4489. if ( roundingMode = float_round_nearest_even ) then
  4490. Begin
  4491. z.high := z.high + lastBitMask shr 1;
  4492. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4493. Begin
  4494. z.high := z.high and not lastBitMask;
  4495. End;
  4496. End
  4497. else if ( roundingMode <> float_round_to_zero ) then
  4498. Begin
  4499. if ( extractFloat64Sign( z )
  4500. xor flag( roundingMode = float_round_up ) )<> 0 then
  4501. Begin
  4502. z.high := z.high or bits32( a.low <> 0 );
  4503. z.high := z.high + roundBitsMask;
  4504. End;
  4505. End;
  4506. z.high := z.high and not roundBitsMask;
  4507. End;
  4508. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4509. Begin
  4510. softfloat_exception_flags :=
  4511. softfloat_exception_flags or float_flag_inexact;
  4512. End;
  4513. result := z;
  4514. End;
  4515. {*
  4516. -------------------------------------------------------------------------------
  4517. Returns the result of adding the absolute values of the double-precision
  4518. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4519. before being returned. `zSign' is ignored if the result is a NaN.
  4520. The addition is performed according to the IEC/IEEE Standard for Binary
  4521. Floating-Point Arithmetic.
  4522. -------------------------------------------------------------------------------
  4523. *}
  4524. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4525. Var
  4526. aExp, bExp, zExp: int16;
  4527. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4528. expDiff: int16;
  4529. label shiftRight1;
  4530. label roundAndPack;
  4531. Begin
  4532. aSig1 := extractFloat64Frac1( a );
  4533. aSig0 := extractFloat64Frac0( a );
  4534. aExp := extractFloat64Exp( a );
  4535. bSig1 := extractFloat64Frac1( b );
  4536. bSig0 := extractFloat64Frac0( b );
  4537. bExp := extractFloat64Exp( b );
  4538. expDiff := aExp - bExp;
  4539. if ( 0 < expDiff ) then
  4540. Begin
  4541. if ( aExp = $7FF ) then
  4542. Begin
  4543. if ( aSig0 OR aSig1 ) <> 0 then
  4544. Begin
  4545. propagateFloat64NaN( a, b, out );
  4546. exit;
  4547. end;
  4548. out := a;
  4549. exit;
  4550. End;
  4551. if ( bExp = 0 ) then
  4552. Begin
  4553. Dec(expDiff);
  4554. End
  4555. else
  4556. Begin
  4557. bSig0 := bSig0 or $00100000;
  4558. End;
  4559. shift64ExtraRightJamming(
  4560. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4561. zExp := aExp;
  4562. End
  4563. else if ( expDiff < 0 ) then
  4564. Begin
  4565. if ( bExp = $7FF ) then
  4566. Begin
  4567. if ( bSig0 OR bSig1 ) <> 0 then
  4568. Begin
  4569. propagateFloat64NaN( a, b, out );
  4570. exit;
  4571. End;
  4572. packFloat64( zSign, $7FF, 0, 0, out );
  4573. exit;
  4574. End;
  4575. if ( aExp = 0 ) then
  4576. Begin
  4577. Inc(expDiff);
  4578. End
  4579. else
  4580. Begin
  4581. aSig0 := aSig0 or $00100000;
  4582. End;
  4583. shift64ExtraRightJamming(
  4584. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4585. zExp := bExp;
  4586. End
  4587. else
  4588. Begin
  4589. if ( aExp = $7FF ) then
  4590. Begin
  4591. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4592. Begin
  4593. propagateFloat64NaN( a, b, out );
  4594. exit;
  4595. End;
  4596. out := a;
  4597. exit;
  4598. End;
  4599. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4600. if ( aExp = 0 ) then
  4601. Begin
  4602. packFloat64( zSign, 0, zSig0, zSig1, out );
  4603. exit;
  4604. End;
  4605. zSig2 := 0;
  4606. zSig0 := zSig0 or $00200000;
  4607. zExp := aExp;
  4608. goto shiftRight1;
  4609. End;
  4610. aSig0 := aSig0 or $00100000;
  4611. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4612. Dec(zExp);
  4613. if ( zSig0 < $00200000 ) then
  4614. goto roundAndPack;
  4615. Inc(zExp);
  4616. shiftRight1:
  4617. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4618. roundAndPack:
  4619. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4620. End;
  4621. {*
  4622. -------------------------------------------------------------------------------
  4623. Returns the result of subtracting the absolute values of the double-
  4624. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4625. difference is negated before being returned. `zSign' is ignored if the
  4626. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4627. Standard for Binary Floating-Point Arithmetic.
  4628. -------------------------------------------------------------------------------
  4629. *}
  4630. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4631. Var
  4632. aExp, bExp, zExp: int16;
  4633. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4634. expDiff: int16;
  4635. z: float64;
  4636. label aExpBigger;
  4637. label bExpBigger;
  4638. label aBigger;
  4639. label bBigger;
  4640. label normalizeRoundAndPack;
  4641. Begin
  4642. aSig1 := extractFloat64Frac1( a );
  4643. aSig0 := extractFloat64Frac0( a );
  4644. aExp := extractFloat64Exp( a );
  4645. bSig1 := extractFloat64Frac1( b );
  4646. bSig0 := extractFloat64Frac0( b );
  4647. bExp := extractFloat64Exp( b );
  4648. expDiff := aExp - bExp;
  4649. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4650. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4651. if ( 0 < expDiff ) then goto aExpBigger;
  4652. if ( expDiff < 0 ) then goto bExpBigger;
  4653. if ( aExp = $7FF ) then
  4654. Begin
  4655. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4656. Begin
  4657. propagateFloat64NaN( a, b, out );
  4658. exit;
  4659. End;
  4660. float_raise( float_flag_invalid );
  4661. z.low := float64_default_nan_low;
  4662. z.high := float64_default_nan_high;
  4663. out := z;
  4664. exit;
  4665. End;
  4666. if ( aExp = 0 ) then
  4667. Begin
  4668. aExp := 1;
  4669. bExp := 1;
  4670. End;
  4671. if ( bSig0 < aSig0 ) then goto aBigger;
  4672. if ( aSig0 < bSig0 ) then goto bBigger;
  4673. if ( bSig1 < aSig1 ) then goto aBigger;
  4674. if ( aSig1 < bSig1 ) then goto bBigger;
  4675. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4676. exit;
  4677. bExpBigger:
  4678. if ( bExp = $7FF ) then
  4679. Begin
  4680. if ( bSig0 OR bSig1 ) <> 0 then
  4681. Begin
  4682. propagateFloat64NaN( a, b, out );
  4683. exit;
  4684. End;
  4685. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4686. exit;
  4687. End;
  4688. if ( aExp = 0 ) then
  4689. Begin
  4690. Inc(expDiff);
  4691. End
  4692. else
  4693. Begin
  4694. aSig0 := aSig0 or $40000000;
  4695. End;
  4696. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4697. bSig0 := bSig0 or $40000000;
  4698. bBigger:
  4699. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4700. zExp := bExp;
  4701. zSign := zSign xor 1;
  4702. goto normalizeRoundAndPack;
  4703. aExpBigger:
  4704. if ( aExp = $7FF ) then
  4705. Begin
  4706. if ( aSig0 OR aSig1 ) <> 0 then
  4707. Begin
  4708. propagateFloat64NaN( a, b, out );
  4709. exit;
  4710. End;
  4711. out := a;
  4712. exit;
  4713. End;
  4714. if ( bExp = 0 ) then
  4715. Begin
  4716. Dec(expDiff);
  4717. End
  4718. else
  4719. Begin
  4720. bSig0 := bSig0 or $40000000;
  4721. End;
  4722. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4723. aSig0 := aSig0 or $40000000;
  4724. aBigger:
  4725. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4726. zExp := aExp;
  4727. normalizeRoundAndPack:
  4728. Dec(zExp);
  4729. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4730. End;
  4731. {*
  4732. -------------------------------------------------------------------------------
  4733. Returns the result of adding the double-precision floating-point values `a'
  4734. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4735. Binary Floating-Point Arithmetic.
  4736. -------------------------------------------------------------------------------
  4737. *}
  4738. Function float64_add( a: float64; b : float64) : Float64;
  4739. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4740. Var
  4741. aSign, bSign: flag;
  4742. Begin
  4743. aSign := extractFloat64Sign( a );
  4744. bSign := extractFloat64Sign( b );
  4745. if ( aSign = bSign ) then
  4746. Begin
  4747. addFloat64Sigs( a, b, aSign, result );
  4748. End
  4749. else
  4750. Begin
  4751. subFloat64Sigs( a, b, aSign, result );
  4752. End;
  4753. End;
  4754. {*
  4755. -------------------------------------------------------------------------------
  4756. Returns the result of subtracting the double-precision floating-point values
  4757. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4758. for Binary Floating-Point Arithmetic.
  4759. -------------------------------------------------------------------------------
  4760. *}
  4761. Function float64_sub(a: float64; b : float64) : Float64;
  4762. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4763. Var
  4764. aSign, bSign: flag;
  4765. Begin
  4766. aSign := extractFloat64Sign( a );
  4767. bSign := extractFloat64Sign( b );
  4768. if ( aSign = bSign ) then
  4769. Begin
  4770. subFloat64Sigs( a, b, aSign, result );
  4771. End
  4772. else
  4773. Begin
  4774. addFloat64Sigs( a, b, aSign, result );
  4775. End;
  4776. End;
  4777. {*
  4778. -------------------------------------------------------------------------------
  4779. Returns the result of multiplying the double-precision floating-point values
  4780. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4781. for Binary Floating-Point Arithmetic.
  4782. -------------------------------------------------------------------------------
  4783. *}
  4784. Function float64_mul( a: float64; b:float64) : Float64;
  4785. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4786. Var
  4787. aSign, bSign, zSign: flag;
  4788. aExp, bExp, zExp: int16;
  4789. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4790. z: float64;
  4791. label invalid;
  4792. Begin
  4793. aSig1 := extractFloat64Frac1( a );
  4794. aSig0 := extractFloat64Frac0( a );
  4795. aExp := extractFloat64Exp( a );
  4796. aSign := extractFloat64Sign( a );
  4797. bSig1 := extractFloat64Frac1( b );
  4798. bSig0 := extractFloat64Frac0( b );
  4799. bExp := extractFloat64Exp( b );
  4800. bSign := extractFloat64Sign( b );
  4801. zSign := aSign xor bSign;
  4802. if ( aExp = $7FF ) then
  4803. Begin
  4804. if ( (( aSig0 OR aSig1 ) <>0)
  4805. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4806. Begin
  4807. propagateFloat64NaN( a, b, result );
  4808. exit;
  4809. End;
  4810. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4811. packFloat64( zSign, $7FF, 0, 0, result );
  4812. exit;
  4813. End;
  4814. if ( bExp = $7FF ) then
  4815. Begin
  4816. if ( bSig0 OR bSig1 )<> 0 then
  4817. Begin
  4818. propagateFloat64NaN( a, b, result );
  4819. exit;
  4820. End;
  4821. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4822. Begin
  4823. invalid:
  4824. float_raise( float_flag_invalid );
  4825. z.low := float64_default_nan_low;
  4826. z.high := float64_default_nan_high;
  4827. result := z;
  4828. exit;
  4829. End;
  4830. packFloat64( zSign, $7FF, 0, 0, result );
  4831. exit;
  4832. End;
  4833. if ( aExp = 0 ) then
  4834. Begin
  4835. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4836. Begin
  4837. packFloat64( zSign, 0, 0, 0, result );
  4838. exit;
  4839. End;
  4840. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4841. End;
  4842. if ( bExp = 0 ) then
  4843. Begin
  4844. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4845. Begin
  4846. packFloat64( zSign, 0, 0, 0, result );
  4847. exit;
  4848. End;
  4849. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4850. End;
  4851. zExp := aExp + bExp - $400;
  4852. aSig0 := aSig0 or $00100000;
  4853. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4854. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4855. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4856. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4857. if ( $00200000 <= zSig0 ) then
  4858. Begin
  4859. shift64ExtraRightJamming(
  4860. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4861. Inc(zExp);
  4862. End;
  4863. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4864. End;
  4865. {*
  4866. -------------------------------------------------------------------------------
  4867. Returns the result of dividing the double-precision floating-point value `a'
  4868. by the corresponding value `b'. The operation is performed according to the
  4869. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4870. -------------------------------------------------------------------------------
  4871. *}
  4872. Function float64_div(a: float64; b : float64) : Float64;
  4873. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4874. Var
  4875. aSign, bSign, zSign: flag;
  4876. aExp, bExp, zExp: int16;
  4877. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4878. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4879. z: float64;
  4880. label invalid;
  4881. Begin
  4882. aSig1 := extractFloat64Frac1( a );
  4883. aSig0 := extractFloat64Frac0( a );
  4884. aExp := extractFloat64Exp( a );
  4885. aSign := extractFloat64Sign( a );
  4886. bSig1 := extractFloat64Frac1( b );
  4887. bSig0 := extractFloat64Frac0( b );
  4888. bExp := extractFloat64Exp( b );
  4889. bSign := extractFloat64Sign( b );
  4890. zSign := aSign xor bSign;
  4891. if ( aExp = $7FF ) then
  4892. Begin
  4893. if ( aSig0 OR aSig1 )<> 0 then
  4894. Begin
  4895. propagateFloat64NaN( a, b, result );
  4896. exit;
  4897. end;
  4898. if ( bExp = $7FF ) then
  4899. Begin
  4900. if ( bSig0 OR bSig1 )<>0 then
  4901. Begin
  4902. propagateFloat64NaN( a, b, result );
  4903. exit;
  4904. End;
  4905. goto invalid;
  4906. End;
  4907. packFloat64( zSign, $7FF, 0, 0, result );
  4908. exit;
  4909. End;
  4910. if ( bExp = $7FF ) then
  4911. Begin
  4912. if ( bSig0 OR bSig1 )<> 0 then
  4913. Begin
  4914. propagateFloat64NaN( a, b, result );
  4915. exit;
  4916. End;
  4917. packFloat64( zSign, 0, 0, 0, result );
  4918. exit;
  4919. End;
  4920. if ( bExp = 0 ) then
  4921. Begin
  4922. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4923. Begin
  4924. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4925. Begin
  4926. invalid:
  4927. float_raise( float_flag_invalid );
  4928. z.low := float64_default_nan_low;
  4929. z.high := float64_default_nan_high;
  4930. result := z;
  4931. exit;
  4932. End;
  4933. float_raise( float_flag_divbyzero );
  4934. packFloat64( zSign, $7FF, 0, 0, result );
  4935. exit;
  4936. End;
  4937. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4938. End;
  4939. if ( aExp = 0 ) then
  4940. Begin
  4941. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4942. Begin
  4943. packFloat64( zSign, 0, 0, 0, result );
  4944. exit;
  4945. End;
  4946. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4947. End;
  4948. zExp := aExp - bExp + $3FD;
  4949. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4950. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4951. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4952. Begin
  4953. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4954. Inc(zExp);
  4955. End;
  4956. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4957. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4958. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4959. while ( sbits32 (rem0) < 0 ) do
  4960. Begin
  4961. Dec(zSig0);
  4962. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4963. End;
  4964. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4965. if ( ( zSig1 and $3FF ) <= 4 ) then
  4966. Begin
  4967. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4968. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4969. while ( sbits32 (rem1) < 0 ) do
  4970. Begin
  4971. Dec(zSig1);
  4972. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4973. End;
  4974. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4975. End;
  4976. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4977. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4978. End;
  4979. {*
  4980. -------------------------------------------------------------------------------
  4981. Returns the remainder of the double-precision floating-point value `a'
  4982. with respect to the corresponding value `b'. The operation is performed
  4983. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4984. -------------------------------------------------------------------------------
  4985. *}
  4986. Function float64_rem(a: float64; b : float64) : float64;
  4987. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4988. Var
  4989. aSign, zSign: flag;
  4990. aExp, bExp, expDiff: int16;
  4991. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4992. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4993. sigMean0: sbits32;
  4994. z: float64;
  4995. label invalid;
  4996. Begin
  4997. aSig1 := extractFloat64Frac1( a );
  4998. aSig0 := extractFloat64Frac0( a );
  4999. aExp := extractFloat64Exp( a );
  5000. aSign := extractFloat64Sign( a );
  5001. bSig1 := extractFloat64Frac1( b );
  5002. bSig0 := extractFloat64Frac0( b );
  5003. bExp := extractFloat64Exp( b );
  5004. if ( aExp = $7FF ) then
  5005. Begin
  5006. if ((( aSig0 OR aSig1 )<>0)
  5007. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5008. Begin
  5009. propagateFloat64NaN( a, b, result );
  5010. exit;
  5011. End;
  5012. goto invalid;
  5013. End;
  5014. if ( bExp = $7FF ) then
  5015. Begin
  5016. if ( bSig0 OR bSig1 ) <> 0 then
  5017. Begin
  5018. propagateFloat64NaN( a, b, result );
  5019. exit;
  5020. End;
  5021. result := a;
  5022. exit;
  5023. End;
  5024. if ( bExp = 0 ) then
  5025. Begin
  5026. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5027. Begin
  5028. invalid:
  5029. float_raise( float_flag_invalid );
  5030. z.low := float64_default_nan_low;
  5031. z.high := float64_default_nan_high;
  5032. result := z;
  5033. exit;
  5034. End;
  5035. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5036. End;
  5037. if ( aExp = 0 ) then
  5038. Begin
  5039. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5040. Begin
  5041. result := a;
  5042. exit;
  5043. End;
  5044. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5045. End;
  5046. expDiff := aExp - bExp;
  5047. if ( expDiff < -1 ) then
  5048. Begin
  5049. result := a;
  5050. exit;
  5051. End;
  5052. shortShift64Left(
  5053. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5054. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5055. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5056. if ( q )<>0 then
  5057. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5058. expDiff := expDiff - 32;
  5059. while ( 0 < expDiff ) do
  5060. Begin
  5061. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5062. if 4 < q then
  5063. q:= q - 4
  5064. else
  5065. q := 0;
  5066. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5067. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5068. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5069. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5070. expDiff := expDiff - 29;
  5071. End;
  5072. if ( -32 < expDiff ) then
  5073. Begin
  5074. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5075. if 4 < q then
  5076. q := q - 4
  5077. else
  5078. q := 0;
  5079. q := q shr (- expDiff);
  5080. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5081. expDiff := expDiff + 24;
  5082. if ( expDiff < 0 ) then
  5083. Begin
  5084. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5085. End
  5086. else
  5087. Begin
  5088. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5089. End;
  5090. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5091. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5092. End
  5093. else
  5094. Begin
  5095. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5096. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5097. End;
  5098. Repeat
  5099. alternateASig0 := aSig0;
  5100. alternateASig1 := aSig1;
  5101. Inc(q);
  5102. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5103. Until not ( 0 <= sbits32 (aSig0) );
  5104. add64(
  5105. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5106. if ( ( sigMean0 < 0 )
  5107. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5108. Begin
  5109. aSig0 := alternateASig0;
  5110. aSig1 := alternateASig1;
  5111. End;
  5112. zSign := flag( sbits32 (aSig0) < 0 );
  5113. if ( zSign <> 0 ) then
  5114. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5115. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5116. End;
  5117. {*
  5118. -------------------------------------------------------------------------------
  5119. Returns the square root of the double-precision floating-point value `a'.
  5120. The operation is performed according to the IEC/IEEE Standard for Binary
  5121. Floating-Point Arithmetic.
  5122. -------------------------------------------------------------------------------
  5123. *}
  5124. Procedure float64_sqrt( a: float64; var out: float64 );
  5125. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5126. Var
  5127. aSign: flag;
  5128. aExp, zExp: int16;
  5129. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5130. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5131. z: float64;
  5132. label invalid;
  5133. Begin
  5134. aSig1 := extractFloat64Frac1( a );
  5135. aSig0 := extractFloat64Frac0( a );
  5136. aExp := extractFloat64Exp( a );
  5137. aSign := extractFloat64Sign( a );
  5138. if ( aExp = $7FF ) then
  5139. Begin
  5140. if ( aSig0 OR aSig1 ) <> 0 then
  5141. Begin
  5142. propagateFloat64NaN( a, a, out );
  5143. exit;
  5144. End;
  5145. if ( aSign = 0) then
  5146. Begin
  5147. out := a;
  5148. exit;
  5149. End;
  5150. goto invalid;
  5151. End;
  5152. if ( aSign <> 0 ) then
  5153. Begin
  5154. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  5155. Begin
  5156. out := a;
  5157. exit;
  5158. End;
  5159. invalid:
  5160. float_raise( float_flag_invalid );
  5161. z.low := float64_default_nan_low;
  5162. z.high := float64_default_nan_high;
  5163. out := z;
  5164. exit;
  5165. End;
  5166. if ( aExp = 0 ) then
  5167. Begin
  5168. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5169. Begin
  5170. packFloat64( 0, 0, 0, 0, out );
  5171. exit;
  5172. End;
  5173. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5174. End;
  5175. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5176. aSig0 := aSig0 or $00100000;
  5177. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5178. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5179. if ( zSig0 = 0 ) then
  5180. zSig0 := $7FFFFFFF;
  5181. doubleZSig0 := zSig0 + zSig0;
  5182. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5183. mul32To64( zSig0, zSig0, term0, term1 );
  5184. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5185. while ( sbits32 (rem0) < 0 ) do
  5186. Begin
  5187. Dec(zSig0);
  5188. doubleZSig0 := doubleZSig0 - 2;
  5189. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5190. End;
  5191. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5192. if ( ( zSig1 and $1FF ) <= 5 ) then
  5193. Begin
  5194. if ( zSig1 = 0 ) then
  5195. zSig1 := 1;
  5196. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5197. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5198. mul32To64( zSig1, zSig1, term2, term3 );
  5199. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5200. while ( sbits32 (rem1) < 0 ) do
  5201. Begin
  5202. Dec(zSig1);
  5203. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5204. term3 := term3 or 1;
  5205. term2 := term2 or doubleZSig0;
  5206. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5207. End;
  5208. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5209. End;
  5210. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5211. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  5212. End;
  5213. {*
  5214. -------------------------------------------------------------------------------
  5215. Returns 1 if the double-precision floating-point value `a' is equal to
  5216. the corresponding value `b', and 0 otherwise. The comparison is performed
  5217. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5218. -------------------------------------------------------------------------------
  5219. *}
  5220. Function float64_eq(a: float64; b: float64): flag;
  5221. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5222. Begin
  5223. if
  5224. (
  5225. ( extractFloat64Exp( a ) = $7FF )
  5226. AND
  5227. (
  5228. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5229. )
  5230. )
  5231. OR (
  5232. ( extractFloat64Exp( b ) = $7FF )
  5233. AND (
  5234. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5235. )
  5236. )
  5237. ) then
  5238. Begin
  5239. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5240. float_raise( float_flag_invalid );
  5241. float64_eq := 0;
  5242. exit;
  5243. End;
  5244. float64_eq := flag(
  5245. ( a.low = b.low )
  5246. AND ( ( a.high = b.high )
  5247. OR ( ( a.low = 0 )
  5248. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5249. ));
  5250. End;
  5251. {*
  5252. -------------------------------------------------------------------------------
  5253. Returns 1 if the double-precision floating-point value `a' is less than
  5254. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5255. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5256. Arithmetic.
  5257. -------------------------------------------------------------------------------
  5258. *}
  5259. Function float64_le(a: float64;b: float64): flag;
  5260. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5261. Var
  5262. aSign, bSign: flag;
  5263. Begin
  5264. if
  5265. (
  5266. ( extractFloat64Exp( a ) = $7FF )
  5267. AND
  5268. (
  5269. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5270. )
  5271. )
  5272. OR (
  5273. ( extractFloat64Exp( b ) = $7FF )
  5274. AND (
  5275. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5276. )
  5277. )
  5278. ) then
  5279. Begin
  5280. float_raise( float_flag_invalid );
  5281. float64_le := 0;
  5282. exit;
  5283. End;
  5284. aSign := extractFloat64Sign( a );
  5285. bSign := extractFloat64Sign( b );
  5286. if ( aSign <> bSign ) then
  5287. Begin
  5288. float64_le := flag(
  5289. (aSign <> 0)
  5290. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5291. = 0 ));
  5292. exit;
  5293. End;
  5294. if aSign <> 0 then
  5295. float64_le := le64( b.high, b.low, a.high, a.low )
  5296. else
  5297. float64_le := le64( a.high, a.low, b.high, b.low );
  5298. End;
  5299. {*
  5300. -------------------------------------------------------------------------------
  5301. Returns 1 if the double-precision floating-point value `a' is less than
  5302. the corresponding value `b', and 0 otherwise. The comparison is performed
  5303. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5304. -------------------------------------------------------------------------------
  5305. *}
  5306. Function float64_lt(a: float64;b: float64): flag;
  5307. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5308. Var
  5309. aSign, bSign: flag;
  5310. Begin
  5311. if
  5312. (
  5313. ( extractFloat64Exp( a ) = $7FF )
  5314. AND
  5315. (
  5316. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5317. )
  5318. )
  5319. OR (
  5320. ( extractFloat64Exp( b ) = $7FF )
  5321. AND (
  5322. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5323. )
  5324. )
  5325. ) then
  5326. Begin
  5327. float_raise( float_flag_invalid );
  5328. float64_lt := 0;
  5329. exit;
  5330. End;
  5331. aSign := extractFloat64Sign( a );
  5332. bSign := extractFloat64Sign( b );
  5333. if ( aSign <> bSign ) then
  5334. Begin
  5335. float64_lt := flag(
  5336. (aSign <> 0)
  5337. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5338. <> 0 ));
  5339. exit;
  5340. End;
  5341. if aSign <> 0 then
  5342. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5343. else
  5344. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5345. End;
  5346. {*
  5347. -------------------------------------------------------------------------------
  5348. Returns 1 if the double-precision floating-point value `a' is equal to
  5349. the corresponding value `b', and 0 otherwise. The invalid exception is
  5350. raised if either operand is a NaN. Otherwise, the comparison is performed
  5351. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5352. -------------------------------------------------------------------------------
  5353. *}
  5354. Function float64_eq_signaling( a: float64; b: float64): flag;
  5355. Begin
  5356. if
  5357. (
  5358. ( extractFloat64Exp( a ) = $7FF )
  5359. AND
  5360. (
  5361. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5362. )
  5363. )
  5364. OR (
  5365. ( extractFloat64Exp( b ) = $7FF )
  5366. AND (
  5367. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5368. )
  5369. )
  5370. ) then
  5371. Begin
  5372. float_raise( float_flag_invalid );
  5373. float64_eq_signaling := 0;
  5374. exit;
  5375. End;
  5376. float64_eq_signaling := flag(
  5377. ( a.low = b.low )
  5378. AND ( ( a.high = b.high )
  5379. OR ( ( a.low = 0 )
  5380. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5381. ));
  5382. End;
  5383. {*
  5384. -------------------------------------------------------------------------------
  5385. Returns 1 if the double-precision floating-point value `a' is less than or
  5386. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5387. cause an exception. Otherwise, the comparison is performed according to the
  5388. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5389. -------------------------------------------------------------------------------
  5390. *}
  5391. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5392. Var
  5393. aSign, bSign : flag;
  5394. Begin
  5395. if
  5396. (
  5397. ( extractFloat64Exp( a ) = $7FF )
  5398. AND
  5399. (
  5400. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5401. )
  5402. )
  5403. OR (
  5404. ( extractFloat64Exp( b ) = $7FF )
  5405. AND (
  5406. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5407. )
  5408. )
  5409. ) then
  5410. Begin
  5411. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5412. float_raise( float_flag_invalid );
  5413. float64_le_quiet := 0;
  5414. exit;
  5415. End;
  5416. aSign := extractFloat64Sign( a );
  5417. bSign := extractFloat64Sign( b );
  5418. if ( aSign <> bSign ) then
  5419. Begin
  5420. float64_le_quiet := flag
  5421. ((aSign <> 0)
  5422. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5423. = 0 ));
  5424. exit;
  5425. End;
  5426. if aSign <> 0 then
  5427. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5428. else
  5429. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5430. End;
  5431. {*
  5432. -------------------------------------------------------------------------------
  5433. Returns 1 if the double-precision floating-point value `a' is less than
  5434. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5435. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5436. Standard for Binary Floating-Point Arithmetic.
  5437. -------------------------------------------------------------------------------
  5438. *}
  5439. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5440. Var
  5441. aSign, bSign: flag;
  5442. Begin
  5443. if
  5444. (
  5445. ( extractFloat64Exp( a ) = $7FF )
  5446. AND
  5447. (
  5448. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5449. )
  5450. )
  5451. OR (
  5452. ( extractFloat64Exp( b ) = $7FF )
  5453. AND (
  5454. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5455. )
  5456. )
  5457. ) then
  5458. Begin
  5459. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5460. float_raise( float_flag_invalid );
  5461. float64_lt_quiet := 0;
  5462. exit;
  5463. End;
  5464. aSign := extractFloat64Sign( a );
  5465. bSign := extractFloat64Sign( b );
  5466. if ( aSign <> bSign ) then
  5467. Begin
  5468. float64_lt_quiet := flag(
  5469. (aSign<>0)
  5470. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5471. <> 0 ));
  5472. exit;
  5473. End;
  5474. If aSign <> 0 then
  5475. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5476. else
  5477. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5478. End;
  5479. {*----------------------------------------------------------------------------
  5480. | Returns the result of converting the 64-bit two's complement integer `a'
  5481. | to the single-precision floating-point format. The conversion is performed
  5482. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5483. *----------------------------------------------------------------------------*}
  5484. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5485. var
  5486. zSign : flag;
  5487. absA : uint64;
  5488. shiftCount: int8;
  5489. intval : int64rec;
  5490. Begin
  5491. if ( a = 0 ) then
  5492. begin
  5493. int64_to_float32.float32 := 0;
  5494. exit;
  5495. end;
  5496. if a < 0 then
  5497. zSign := flag(TRUE)
  5498. else
  5499. zSign := flag(FALSE);
  5500. if zSign<>0 then
  5501. absA := -a
  5502. else
  5503. absA := a;
  5504. shiftCount := countLeadingZeros64( absA ) - 40;
  5505. if ( 0 <= shiftCount ) then
  5506. begin
  5507. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5508. end
  5509. else
  5510. begin
  5511. shiftCount := shiftCount + 7;
  5512. if ( shiftCount < 0 ) then
  5513. begin
  5514. intval.low := int64rec(AbsA).low;
  5515. intval.high := int64rec(AbsA).high;
  5516. shift64RightJamming( intval.high, intval.low, - shiftCount,
  5517. intval.high, intval.low);
  5518. int64rec(absA).low := intval.low;
  5519. int64rec(absA).high := intval.high;
  5520. end
  5521. else
  5522. absA := absA shl shiftCount;
  5523. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5524. end;
  5525. End;
  5526. {*----------------------------------------------------------------------------
  5527. | Returns the result of converting the 64-bit two's complement integer `a'
  5528. | to the single-precision floating-point format. The conversion is performed
  5529. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5530. | Unisgned version.
  5531. *----------------------------------------------------------------------------*}
  5532. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5533. var
  5534. zSign : flag;
  5535. absA : uint64;
  5536. shiftCount: int8;
  5537. intval : int64rec;
  5538. Begin
  5539. if ( a = 0 ) then
  5540. begin
  5541. qword_to_float32.float32 := 0;
  5542. exit;
  5543. end;
  5544. zSign := flag(FALSE);
  5545. absA := a;
  5546. shiftCount := countLeadingZeros64( absA ) - 40;
  5547. if ( 0 <= shiftCount ) then
  5548. begin
  5549. qword_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5550. end
  5551. else
  5552. begin
  5553. shiftCount := shiftCount + 7;
  5554. if ( shiftCount < 0 ) then
  5555. begin
  5556. intval.low := int64rec(AbsA).low;
  5557. intval.high := int64rec(AbsA).high;
  5558. shift64RightJamming( intval.high, intval.low, - shiftCount,
  5559. intval.high, intval.low);
  5560. int64rec(absA).low := intval.low;
  5561. int64rec(absA).high := intval.high;
  5562. end
  5563. else
  5564. absA := absA shl shiftCount;
  5565. qword_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5566. end;
  5567. End;
  5568. {*----------------------------------------------------------------------------
  5569. | Returns the result of converting the 64-bit two's complement integer `a'
  5570. | to the double-precision floating-point format. The conversion is performed
  5571. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5572. *----------------------------------------------------------------------------*}
  5573. function qword_to_float64( a: qword ): float64;
  5574. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5575. var
  5576. zSign : flag;
  5577. float_result : float64;
  5578. AbsA : bits64;
  5579. shiftcount : int8;
  5580. zSig0, zSig1 : bits32;
  5581. Begin
  5582. if ( a = 0 ) then
  5583. Begin
  5584. packFloat64( 0, 0, 0, 0, result );
  5585. exit;
  5586. end;
  5587. zSign := flag(FALSE);
  5588. AbsA := a;
  5589. shiftCount := countLeadingZeros64( absA ) - 11;
  5590. if ( 0 <= shiftCount ) then
  5591. Begin
  5592. absA := absA shl shiftcount;
  5593. zSig0:=int64rec(absA).high;
  5594. zSig1:=int64rec(absA).low;
  5595. End
  5596. else
  5597. Begin
  5598. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5599. - shiftCount, zSig0, zSig1 );
  5600. End;
  5601. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5602. qword_to_float64:= float_result;
  5603. End;
  5604. {*----------------------------------------------------------------------------
  5605. | Returns the result of converting the 64-bit two's complement integer `a'
  5606. | to the double-precision floating-point format. The conversion is performed
  5607. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5608. *----------------------------------------------------------------------------*}
  5609. function int64_to_float64( a: int64 ): float64;
  5610. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5611. var
  5612. zSign : flag;
  5613. float_result : float64;
  5614. AbsA : bits64;
  5615. shiftcount : int8;
  5616. zSig0, zSig1 : bits32;
  5617. Begin
  5618. if ( a = 0 ) then
  5619. Begin
  5620. packFloat64( 0, 0, 0, 0, result );
  5621. exit;
  5622. end;
  5623. zSign := flag( a < 0 );
  5624. if ZSign<>0 then
  5625. AbsA := -a
  5626. else
  5627. AbsA := a;
  5628. shiftCount := countLeadingZeros64( absA ) - 11;
  5629. if ( 0 <= shiftCount ) then
  5630. Begin
  5631. absA := absA shl shiftcount;
  5632. zSig0:=int64rec(absA).high;
  5633. zSig1:=int64rec(absA).low;
  5634. End
  5635. else
  5636. Begin
  5637. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5638. - shiftCount, zSig0, zSig1 );
  5639. End;
  5640. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5641. int64_to_float64:= float_result;
  5642. End;
  5643. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5644. {*----------------------------------------------------------------------------
  5645. | Returns the result of converting the 64-bit two's complement integer `a'
  5646. | to the extended double-precision floating-point format. The conversion
  5647. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5648. | Arithmetic.
  5649. *----------------------------------------------------------------------------*}
  5650. function int64_to_floatx80( a: int64 ): floatx80;
  5651. var
  5652. zSign: flag;
  5653. absA: uint64;
  5654. shiftCount: int8;
  5655. begin
  5656. if ( a = 0 ) then begin
  5657. result := packFloatx80( 0, 0, 0 );
  5658. exit;
  5659. end;
  5660. zSign := ord( a < 0 );
  5661. if zSign <> 0 then absA := - a else absA := a;
  5662. shiftCount := countLeadingZeros64( absA );
  5663. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5664. end;
  5665. {*----------------------------------------------------------------------------
  5666. | Returns the result of converting the 64-bit two's complement integer `a'
  5667. | to the extended double-precision floating-point format. The conversion
  5668. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5669. | Arithmetic.
  5670. | Unsigned version.
  5671. *----------------------------------------------------------------------------*}
  5672. function qword_to_floatx80( a: qword ): floatx80;
  5673. var
  5674. absA: bits64;
  5675. shiftCount: int8;
  5676. begin
  5677. if ( a = 0 ) then begin
  5678. result := packFloatx80( 0, 0, 0 );
  5679. exit;
  5680. end;
  5681. absA := a;
  5682. shiftCount := countLeadingZeros64( absA );
  5683. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5684. end;
  5685. {$endif FPC_SOFTFLOAT_FLOATX80}
  5686. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5687. {*----------------------------------------------------------------------------
  5688. | Returns the result of converting the 64-bit two's complement integer `a' to
  5689. | the quadruple-precision floating-point format. The conversion is performed
  5690. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5691. *----------------------------------------------------------------------------*}
  5692. function int64_to_float128( a: int64 ): float128;
  5693. var
  5694. zSign: flag;
  5695. absA: uint64;
  5696. shiftCount: int8;
  5697. zExp: int32;
  5698. zSig0, zSig1: bits64;
  5699. begin
  5700. if ( a = 0 ) then begin
  5701. result := packFloat128( 0, 0, 0, 0 );
  5702. exit;
  5703. end;
  5704. zSign := ord( a < 0 );
  5705. if zSign <> 0 then absA := - a else absA := a;
  5706. shiftCount := countLeadingZeros64( absA ) + 49;
  5707. zExp := $406E - shiftCount;
  5708. if ( 64 <= shiftCount ) then begin
  5709. zSig1 := 0;
  5710. zSig0 := absA;
  5711. dec( shiftCount, 64 );
  5712. end
  5713. else begin
  5714. zSig1 := absA;
  5715. zSig0 := 0;
  5716. end;
  5717. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5718. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5719. end;
  5720. {*----------------------------------------------------------------------------
  5721. | Returns the result of converting the 64-bit two's complement integer `a' to
  5722. | the quadruple-precision floating-point format. The conversion is performed
  5723. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5724. | Unsigned version.
  5725. *----------------------------------------------------------------------------*}
  5726. function qword_to_float128( a: qword ): float128;
  5727. var
  5728. absA: bits64;
  5729. shiftCount: int8;
  5730. zExp: int32;
  5731. zSig0, zSig1: bits64;
  5732. begin
  5733. if ( a = 0 ) then begin
  5734. result := packFloat128( 0, 0, 0, 0 );
  5735. exit;
  5736. end;
  5737. absA := a;
  5738. shiftCount := countLeadingZeros64( absA ) + 49;
  5739. zExp := $406E - shiftCount;
  5740. if ( 64 <= shiftCount ) then begin
  5741. zSig1 := 0;
  5742. zSig0 := absA;
  5743. dec( shiftCount, 64 );
  5744. end
  5745. else begin
  5746. zSig1 := absA;
  5747. zSig0 := 0;
  5748. end;
  5749. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5750. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5751. end;
  5752. {$endif FPC_SOFTFLOAT_FLOAT128}
  5753. {*----------------------------------------------------------------------------
  5754. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5755. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5756. | Otherwise, returns 0.
  5757. *----------------------------------------------------------------------------*}
  5758. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5759. begin
  5760. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5761. end;
  5762. {*----------------------------------------------------------------------------
  5763. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5764. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5765. | Otherwise, returns 0.
  5766. *----------------------------------------------------------------------------*}
  5767. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5768. begin
  5769. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5770. end;
  5771. {*----------------------------------------------------------------------------
  5772. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5773. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5774. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5775. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5776. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5777. | the most-significant bit of the extra result, and the other 63 bits of the
  5778. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5779. | were all zero. This extra result is stored in the location pointed to by
  5780. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5781. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5782. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5783. | fixed-point value is shifted right by the number of bits given in `count',
  5784. | and the integer part of the result is returned at the locations pointed to
  5785. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5786. | corrupted as described above, and is returned at the location pointed to by
  5787. | `z2Ptr'.)
  5788. *----------------------------------------------------------------------------*}
  5789. procedure shift128ExtraRightJamming(
  5790. a0: bits64;
  5791. a1: bits64;
  5792. a2: bits64;
  5793. count: int16;
  5794. var z0Ptr: bits64;
  5795. var z1Ptr: bits64;
  5796. var z2Ptr: bits64);
  5797. var
  5798. z0, z1, z2: bits64;
  5799. negCount: int8;
  5800. begin
  5801. negCount := ( - count ) and 63;
  5802. if ( count = 0 ) then
  5803. begin
  5804. z2 := a2;
  5805. z1 := a1;
  5806. z0 := a0;
  5807. end
  5808. else begin
  5809. if ( count < 64 ) then
  5810. begin
  5811. z2 := a1 shl negCount;
  5812. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5813. z0 := a0 shr count;
  5814. end
  5815. else begin
  5816. if ( count = 64 ) then
  5817. begin
  5818. z2 := a1;
  5819. z1 := a0;
  5820. end
  5821. else begin
  5822. a2 := a2 or a1;
  5823. if ( count < 128 ) then
  5824. begin
  5825. z2 := a0 shl negCount;
  5826. z1 := a0 shr ( count and 63 );
  5827. end
  5828. else begin
  5829. if ( count = 128 ) then
  5830. z2 := a0
  5831. else
  5832. z2 := ord( a0 <> 0 );
  5833. z1 := 0;
  5834. end;
  5835. end;
  5836. z0 := 0;
  5837. end;
  5838. z2 := z2 or ord( a2 <> 0 );
  5839. end;
  5840. z2Ptr := z2;
  5841. z1Ptr := z1;
  5842. z0Ptr := z0;
  5843. end;
  5844. {*----------------------------------------------------------------------------
  5845. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5846. | _plus_ the number of bits given in `count'. The shifted result is at most
  5847. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5848. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5849. | shifted off is the most-significant bit of the extra result, and the other
  5850. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5851. | bits shifted off were all zero. This extra result is stored in the location
  5852. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5853. | (This routine makes more sense if `a0' and `a1' are considered to form
  5854. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5855. | point value is shifted right by the number of bits given in `count', and
  5856. | the integer part of the result is returned at the location pointed to by
  5857. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5858. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5859. *----------------------------------------------------------------------------*}
  5860. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5861. var
  5862. z0, z1: bits64;
  5863. negCount: int8;
  5864. begin
  5865. negCount := ( - count ) and 63;
  5866. if ( count = 0 ) then
  5867. begin
  5868. z1 := a1;
  5869. z0 := a0;
  5870. end
  5871. else if ( count < 64 ) then
  5872. begin
  5873. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5874. z0 := a0 shr count;
  5875. end
  5876. else begin
  5877. if ( count = 64 ) then
  5878. begin
  5879. z1 := a0 or ord( a1 <> 0 );
  5880. end
  5881. else begin
  5882. z1 := ord( ( a0 or a1 ) <> 0 );
  5883. end;
  5884. z0 := 0;
  5885. end;
  5886. z1Ptr := z1;
  5887. z0Ptr := z0;
  5888. end;
  5889. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5890. {*----------------------------------------------------------------------------
  5891. | Returns the fraction bits of the extended double-precision floating-point
  5892. | value `a'.
  5893. *----------------------------------------------------------------------------*}
  5894. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5895. begin
  5896. result:=a.low;
  5897. end;
  5898. {*----------------------------------------------------------------------------
  5899. | Returns the exponent bits of the extended double-precision floating-point
  5900. | value `a'.
  5901. *----------------------------------------------------------------------------*}
  5902. function extractFloatx80Exp(a : floatx80): int32;inline;
  5903. begin
  5904. result:=a.high and $7FFF;
  5905. end;
  5906. {*----------------------------------------------------------------------------
  5907. | Returns the sign bit of the extended double-precision floating-point value
  5908. | `a'.
  5909. *----------------------------------------------------------------------------*}
  5910. function extractFloatx80Sign(a : floatx80): flag;inline;
  5911. begin
  5912. result:=a.high shr 15;
  5913. end;
  5914. {*----------------------------------------------------------------------------
  5915. | Normalizes the subnormal extended double-precision floating-point value
  5916. | represented by the denormalized significand `aSig'. The normalized exponent
  5917. | and significand are stored at the locations pointed to by `zExpPtr' and
  5918. | `zSigPtr', respectively.
  5919. *----------------------------------------------------------------------------*}
  5920. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5921. var
  5922. shiftCount: int8;
  5923. begin
  5924. shiftCount := countLeadingZeros64( aSig );
  5925. zSigPtr := aSig shl shiftCount;
  5926. zExpPtr := 1 - shiftCount;
  5927. end;
  5928. {*----------------------------------------------------------------------------
  5929. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5930. | extended double-precision floating-point value, returning the result.
  5931. *----------------------------------------------------------------------------*}
  5932. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5933. var
  5934. z: floatx80;
  5935. begin
  5936. z.low := zSig;
  5937. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5938. result:=z;
  5939. end;
  5940. {*----------------------------------------------------------------------------
  5941. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5942. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5943. | and returns the proper extended double-precision floating-point value
  5944. | corresponding to the abstract input. Ordinarily, the abstract value is
  5945. | rounded and packed into the extended double-precision format, with the
  5946. | inexact exception raised if the abstract input cannot be represented
  5947. | exactly. However, if the abstract value is too large, the overflow and
  5948. | inexact exceptions are raised and an infinity or maximal finite value is
  5949. | returned. If the abstract value is too small, the input value is rounded to
  5950. | a subnormal number, and the underflow and inexact exceptions are raised if
  5951. | the abstract input cannot be represented exactly as a subnormal extended
  5952. | double-precision floating-point number.
  5953. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5954. | number of bits as single or double precision, respectively. Otherwise, the
  5955. | result is rounded to the full precision of the extended double-precision
  5956. | format.
  5957. | The input significand must be normalized or smaller. If the input
  5958. | significand is not normalized, `zExp' must be 0; in that case, the result
  5959. | returned is a subnormal number, and it must not require rounding. The
  5960. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5961. | Floating-Point Arithmetic.
  5962. *----------------------------------------------------------------------------*}
  5963. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5964. var
  5965. roundingMode: int8;
  5966. roundNearestEven, increment, isTiny: flag;
  5967. roundIncrement, roundMask, roundBits: int64;
  5968. label
  5969. precision80, overflow;
  5970. begin
  5971. roundingMode := softfloat_rounding_mode;
  5972. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5973. if ( roundingPrecision = 80 ) then
  5974. goto precision80;
  5975. if ( roundingPrecision = 64 ) then
  5976. begin
  5977. roundIncrement := int64( $0000000000000400 );
  5978. roundMask := int64( $00000000000007FF );
  5979. end
  5980. else if ( roundingPrecision = 32 ) then
  5981. begin
  5982. roundIncrement := int64( $0000008000000000 );
  5983. roundMask := int64( $000000FFFFFFFFFF );
  5984. end
  5985. else begin
  5986. goto precision80;
  5987. end;
  5988. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5989. if ( not (roundNearestEven<>0) ) then
  5990. begin
  5991. if ( roundingMode = float_round_to_zero ) then
  5992. begin
  5993. roundIncrement := 0;
  5994. end
  5995. else begin
  5996. roundIncrement := roundMask;
  5997. if ( zSign<>0 ) then
  5998. begin
  5999. if ( roundingMode = float_round_up ) then
  6000. roundIncrement := 0;
  6001. end
  6002. else begin
  6003. if ( roundingMode = float_round_down ) then
  6004. roundIncrement := 0;
  6005. end;
  6006. end;
  6007. end;
  6008. roundBits := zSig0 and roundMask;
  6009. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6010. if ( ( $7FFE < zExp )
  6011. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6012. ) then begin
  6013. goto overflow;
  6014. end;
  6015. if ( zExp <= 0 ) then begin
  6016. isTiny := ord (
  6017. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6018. or ( zExp < 0 )
  6019. or ( zSig0 <= zSig0 + roundIncrement ) );
  6020. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6021. zExp := 0;
  6022. roundBits := zSig0 and roundMask;
  6023. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6024. if ( roundBits <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6025. inc( zSig0, roundIncrement );
  6026. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6027. roundIncrement := roundMask + 1;
  6028. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6029. roundMask := roundMask or roundIncrement;
  6030. end;
  6031. zSig0 := zSig0 and not roundMask;
  6032. result:=packFloatx80( zSign, zExp, zSig0 );
  6033. exit;
  6034. end;
  6035. end;
  6036. if ( roundBits <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6037. inc( zSig0, roundIncrement );
  6038. if ( zSig0 < roundIncrement ) then begin
  6039. inc(zExp);
  6040. zSig0 := bits64( $8000000000000000 );
  6041. end;
  6042. roundIncrement := roundMask + 1;
  6043. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6044. roundMask := roundMask or roundIncrement;
  6045. end;
  6046. zSig0 := zSig0 and not roundMask;
  6047. if ( zSig0 = 0 ) then zExp := 0;
  6048. result:=packFloatx80( zSign, zExp, zSig0 );
  6049. exit;
  6050. precision80:
  6051. increment := ord ( sbits64( zSig1 ) < 0 );
  6052. if ( roundNearestEven = 0 ) then begin
  6053. if ( roundingMode = float_round_to_zero ) then begin
  6054. increment := 0;
  6055. end
  6056. else begin
  6057. if ( zSign <> 0 ) then begin
  6058. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6059. end
  6060. else begin
  6061. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6062. end;
  6063. end;
  6064. end;
  6065. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6066. if ( ( $7FFE < zExp )
  6067. or ( ( zExp = $7FFE )
  6068. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6069. and ( increment <> 0 )
  6070. )
  6071. ) then begin
  6072. roundMask := 0;
  6073. overflow:
  6074. float_raise( float_flag_overflow or float_flag_inexact );
  6075. if ( ( roundingMode = float_round_to_zero )
  6076. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6077. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6078. ) then begin
  6079. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6080. exit;
  6081. end;
  6082. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6083. exit;
  6084. end;
  6085. if ( zExp <= 0 ) then begin
  6086. isTiny := ord(
  6087. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6088. or ( zExp < 0 )
  6089. or ( increment = 0 )
  6090. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6091. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6092. zExp := 0;
  6093. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6094. if ( zSig1 <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6095. if ( roundNearestEven <> 0 ) then begin
  6096. increment := ord( sbits64( zSig1 ) < 0 );
  6097. end
  6098. else begin
  6099. if ( zSign <> 0 ) then begin
  6100. increment := ord( roundingMode = float_round_down ) and zSig1;
  6101. end
  6102. else begin
  6103. increment := ord( roundingMode = float_round_up ) and zSig1;
  6104. end;
  6105. end;
  6106. if ( increment <> 0 ) then begin
  6107. inc(zSig0);
  6108. zSig0 :=
  6109. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6110. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6111. end;
  6112. result:=packFloatx80( zSign, zExp, zSig0 );
  6113. exit;
  6114. end;
  6115. end;
  6116. if ( zSig1 <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6117. if ( increment <> 0 ) then begin
  6118. inc(zSig0);
  6119. if ( zSig0 = 0 ) then begin
  6120. inc(zExp);
  6121. zSig0 := bits64( $8000000000000000 );
  6122. end
  6123. else begin
  6124. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6125. end;
  6126. end
  6127. else begin
  6128. if ( zSig0 = 0 ) then zExp := 0;
  6129. end;
  6130. result:=packFloatx80( zSign, zExp, zSig0 );
  6131. end;
  6132. {*----------------------------------------------------------------------------
  6133. | Takes an abstract floating-point value having sign `zSign', exponent
  6134. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6135. | and returns the proper extended double-precision floating-point value
  6136. | corresponding to the abstract input. This routine is just like
  6137. | `roundAndPackFloatx80' except that the input significand does not have to be
  6138. | normalized.
  6139. *----------------------------------------------------------------------------*}
  6140. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6141. var
  6142. shiftCount: int8;
  6143. begin
  6144. if ( zSig0 = 0 ) then begin
  6145. zSig0 := zSig1;
  6146. zSig1 := 0;
  6147. dec( zExp, 64 );
  6148. end;
  6149. shiftCount := countLeadingZeros64( zSig0 );
  6150. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6151. zExp := zExp - shiftCount;
  6152. result :=
  6153. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6154. end;
  6155. {*----------------------------------------------------------------------------
  6156. | Returns the result of converting the extended double-precision floating-
  6157. | point value `a' to the 32-bit two's complement integer format. The
  6158. | conversion is performed according to the IEC/IEEE Standard for Binary
  6159. | Floating-Point Arithmetic---which means in particular that the conversion
  6160. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6161. | largest positive integer is returned. Otherwise, if the conversion
  6162. | overflows, the largest integer with the same sign as `a' is returned.
  6163. *----------------------------------------------------------------------------*}
  6164. function floatx80_to_int32(a: floatx80): int32;
  6165. var
  6166. aSign: flag;
  6167. aExp, shiftCount: int32;
  6168. aSig: bits64;
  6169. begin
  6170. aSig := extractFloatx80Frac( a );
  6171. aExp := extractFloatx80Exp( a );
  6172. aSign := extractFloatx80Sign( a );
  6173. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6174. shiftCount := $4037 - aExp;
  6175. if ( shiftCount <= 0 ) then shiftCount := 1;
  6176. shift64RightJamming( aSig, shiftCount, aSig );
  6177. result := roundAndPackInt32( aSign, aSig );
  6178. end;
  6179. {*----------------------------------------------------------------------------
  6180. | Returns the result of converting the extended double-precision floating-
  6181. | point value `a' to the 32-bit two's complement integer format. The
  6182. | conversion is performed according to the IEC/IEEE Standard for Binary
  6183. | Floating-Point Arithmetic, except that the conversion is always rounded
  6184. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6185. | Otherwise, if the conversion overflows, the largest integer with the same
  6186. | sign as `a' is returned.
  6187. *----------------------------------------------------------------------------*}
  6188. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6189. var
  6190. aSign: flag;
  6191. aExp, shiftCount: int32;
  6192. aSig, savedASig: bits64;
  6193. z: int32;
  6194. label
  6195. invalid;
  6196. begin
  6197. aSig := extractFloatx80Frac( a );
  6198. aExp := extractFloatx80Exp( a );
  6199. aSign := extractFloatx80Sign( a );
  6200. if ( $401E < aExp ) then begin
  6201. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6202. goto invalid;
  6203. end
  6204. else if ( aExp < $3FFF ) then begin
  6205. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6206. result := 0;
  6207. exit;
  6208. end;
  6209. shiftCount := $403E - aExp;
  6210. savedASig := aSig;
  6211. aSig := aSig shr shiftCount;
  6212. z := aSig;
  6213. if ( aSign <> 0 ) then z := - z;
  6214. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6215. invalid:
  6216. float_raise( float_flag_invalid );
  6217. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6218. exit;
  6219. end;
  6220. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6221. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6222. end;
  6223. result := z;
  6224. end;
  6225. {*----------------------------------------------------------------------------
  6226. | Returns the result of converting the extended double-precision floating-
  6227. | point value `a' to the 64-bit two's complement integer format. The
  6228. | conversion is performed according to the IEC/IEEE Standard for Binary
  6229. | Floating-Point Arithmetic---which means in particular that the conversion
  6230. | is rounded according to the current rounding mode. If `a' is a NaN,
  6231. | the largest positive integer is returned. Otherwise, if the conversion
  6232. | overflows, the largest integer with the same sign as `a' is returned.
  6233. *----------------------------------------------------------------------------*}
  6234. function floatx80_to_int64(a: floatx80): int64;
  6235. var
  6236. aSign: flag;
  6237. aExp, shiftCount: int32;
  6238. aSig, aSigExtra: bits64;
  6239. begin
  6240. aSig := extractFloatx80Frac( a );
  6241. aExp := extractFloatx80Exp( a );
  6242. aSign := extractFloatx80Sign( a );
  6243. shiftCount := $403E - aExp;
  6244. if ( shiftCount <= 0 ) then begin
  6245. if ( shiftCount <> 0 ) then begin
  6246. float_raise( float_flag_invalid );
  6247. if ( ( aSign = 0 )
  6248. or ( ( aExp = $7FFF )
  6249. and ( aSig <> bits64( $8000000000000000 ) ) )
  6250. ) then begin
  6251. result := $7FFFFFFFFFFFFFFF;
  6252. exit;
  6253. end;
  6254. result := $8000000000000000;
  6255. exit;
  6256. end;
  6257. aSigExtra := 0;
  6258. end
  6259. else begin
  6260. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6261. end;
  6262. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6263. end;
  6264. {*----------------------------------------------------------------------------
  6265. | Returns the result of converting the extended double-precision floating-
  6266. | point value `a' to the 64-bit two's complement integer format. The
  6267. | conversion is performed according to the IEC/IEEE Standard for Binary
  6268. | Floating-Point Arithmetic, except that the conversion is always rounded
  6269. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6270. | Otherwise, if the conversion overflows, the largest integer with the same
  6271. | sign as `a' is returned.
  6272. *----------------------------------------------------------------------------*}
  6273. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6274. var
  6275. aSign: flag;
  6276. aExp, shiftCount: int32;
  6277. aSig: bits64;
  6278. z: int64;
  6279. begin
  6280. aSig := extractFloatx80Frac( a );
  6281. aExp := extractFloatx80Exp( a );
  6282. aSign := extractFloatx80Sign( a );
  6283. shiftCount := aExp - $403E;
  6284. if ( 0 <= shiftCount ) then begin
  6285. aSig := $7FFFFFFFFFFFFFFF;
  6286. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6287. float_raise( float_flag_invalid );
  6288. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6289. result := $7FFFFFFFFFFFFFFF;
  6290. exit;
  6291. end;
  6292. end;
  6293. result := $8000000000000000;
  6294. exit;
  6295. end
  6296. else if ( aExp < $3FFF ) then begin
  6297. if ( aExp or aSig <> 0 ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6298. result := 0;
  6299. exit;
  6300. end;
  6301. z := aSig shr ( - shiftCount );
  6302. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6303. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6304. end;
  6305. if ( aSign <> 0 ) then z := - z;
  6306. result := z;
  6307. end;
  6308. {*----------------------------------------------------------------------------
  6309. | The pattern for a default generated extended double-precision NaN. The
  6310. | `high' and `low' values hold the most- and least-significant bits,
  6311. | respectively.
  6312. *----------------------------------------------------------------------------*}
  6313. const
  6314. floatx80_default_nan_high = $FFFF;
  6315. floatx80_default_nan_low = bits64( $C000000000000000 );
  6316. {*----------------------------------------------------------------------------
  6317. | Returns 1 if the extended double-precision floating-point value `a' is a
  6318. | signaling NaN; otherwise returns 0.
  6319. *----------------------------------------------------------------------------*}
  6320. function floatx80_is_signaling_nan(a : floatx80): flag;
  6321. var
  6322. aLow: bits64;
  6323. begin
  6324. aLow := a.low and not $4000000000000000;
  6325. result := ord(
  6326. ( a.high and $7FFF = $7FFF )
  6327. and ( bits64( aLow shl 1 ) <> 0 )
  6328. and ( a.low = aLow ) );
  6329. end;
  6330. {*----------------------------------------------------------------------------
  6331. | Returns the result of converting the extended double-precision floating-
  6332. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6333. | invalid exception is raised.
  6334. *----------------------------------------------------------------------------*}
  6335. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6336. var
  6337. z: commonNaNT;
  6338. begin
  6339. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6340. z.sign := a.high shr 15;
  6341. z.low := 0;
  6342. z.high := a.low shl 1;
  6343. result := z;
  6344. end;
  6345. {*----------------------------------------------------------------------------
  6346. | Returns 1 if the extended double-precision floating-point value `a' is a
  6347. | NaN; otherwise returns 0.
  6348. *----------------------------------------------------------------------------*}
  6349. function floatx80_is_nan(a : floatx80 ): flag;
  6350. begin
  6351. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low<<1 ) <> 0 ) );
  6352. end;
  6353. {*----------------------------------------------------------------------------
  6354. | Takes two extended double-precision floating-point values `a' and `b', one
  6355. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6356. | `b' is a signaling NaN, the invalid exception is raised.
  6357. *----------------------------------------------------------------------------*}
  6358. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6359. var
  6360. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6361. label
  6362. returnLargerSignificand;
  6363. begin
  6364. aIsNaN := floatx80_is_nan( a );
  6365. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6366. bIsNaN := floatx80_is_nan( b );
  6367. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6368. a.low := a.low or $C000000000000000;
  6369. b.low := b.low or $C000000000000000;
  6370. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6371. if aIsSignalingNaN <> 0 then begin
  6372. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6373. if bIsNaN <> 0 then result := b else result := a;
  6374. exit;
  6375. end
  6376. else if aIsNaN <>0 then begin
  6377. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6378. result := a;
  6379. exit;
  6380. end;
  6381. returnLargerSignificand:
  6382. if ( a.low < b.low ) then begin
  6383. result := b;
  6384. exit;
  6385. end;
  6386. if ( b.low < a.low ) then begin
  6387. result := a;
  6388. exit;
  6389. end;
  6390. if a.high < b.high then result := a else result := b;
  6391. exit;
  6392. end
  6393. else
  6394. result := b;
  6395. end;
  6396. {*----------------------------------------------------------------------------
  6397. | Returns the result of converting the extended double-precision floating-
  6398. | point value `a' to the single-precision floating-point format. The
  6399. | conversion is performed according to the IEC/IEEE Standard for Binary
  6400. | Floating-Point Arithmetic.
  6401. *----------------------------------------------------------------------------*}
  6402. function floatx80_to_float32(a: floatx80): float32;
  6403. var
  6404. aSign: flag;
  6405. aExp: int32;
  6406. aSig: bits64;
  6407. begin
  6408. aSig := extractFloatx80Frac( a );
  6409. aExp := extractFloatx80Exp( a );
  6410. aSign := extractFloatx80Sign( a );
  6411. if ( aExp = $7FFF ) then begin
  6412. if bits64( aSig shl 1 ) <> 0 then begin
  6413. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6414. exit;
  6415. end;
  6416. result := packFloat32( aSign, $FF, 0 );
  6417. exit;
  6418. end;
  6419. shift64RightJamming( aSig, 33, aSig );
  6420. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6421. result := roundAndPackFloat32( aSign, aExp, aSig );
  6422. end;
  6423. {*----------------------------------------------------------------------------
  6424. | Returns the result of converting the extended double-precision floating-
  6425. | point value `a' to the double-precision floating-point format. The
  6426. | conversion is performed according to the IEC/IEEE Standard for Binary
  6427. | Floating-Point Arithmetic.
  6428. *----------------------------------------------------------------------------*}
  6429. function floatx80_to_float64(a: floatx80): float64;
  6430. var
  6431. aSign: flag;
  6432. aExp: int32;
  6433. aSig, zSig: bits64;
  6434. begin
  6435. aSig := extractFloatx80Frac( a );
  6436. aExp := extractFloatx80Exp( a );
  6437. aSign := extractFloatx80Sign( a );
  6438. if ( aExp = $7FFF ) then begin
  6439. if bits64( aSig shl 1 ) <> 0 then begin
  6440. commonNaNToFloat64( floatx80ToCommonNaN( a ), result );
  6441. exit;
  6442. end;
  6443. result := packFloat64( aSign, $7FF, 0 );
  6444. exit;
  6445. end;
  6446. shift64RightJamming( aSig, 1, zSig );
  6447. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6448. result := roundAndPackFloat64( aSign, aExp, zSig );
  6449. end;
  6450. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6451. {*----------------------------------------------------------------------------
  6452. | Returns the result of converting the extended double-precision floating-
  6453. | point value `a' to the quadruple-precision floating-point format. The
  6454. | conversion is performed according to the IEC/IEEE Standard for Binary
  6455. | Floating-Point Arithmetic.
  6456. *----------------------------------------------------------------------------*}
  6457. function floatx80_to_float128(a: floatx80): float128;
  6458. var
  6459. aSign: flag;
  6460. aExp: int16;
  6461. aSig, zSig0, zSig1: bits64;
  6462. begin
  6463. aSig := extractFloatx80Frac( a );
  6464. aExp := extractFloatx80Exp( a );
  6465. aSign := extractFloatx80Sign( a );
  6466. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6467. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6468. exit;
  6469. end;
  6470. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6471. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6472. end;
  6473. {$endif FPC_SOFTFLOAT_FLOAT128}
  6474. {*----------------------------------------------------------------------------
  6475. | Rounds the extended double-precision floating-point value `a' to an integer,
  6476. | and Returns the result as an extended quadruple-precision floating-point
  6477. | value. The operation is performed according to the IEC/IEEE Standard for
  6478. | Binary Floating-Point Arithmetic.
  6479. *----------------------------------------------------------------------------*}
  6480. function floatx80_round_to_int(a: floatx80): floatx80;
  6481. var
  6482. aSign: flag;
  6483. aExp: int32;
  6484. lastBitMask, roundBitsMask: bits64;
  6485. roundingMode: int8;
  6486. z: floatx80;
  6487. begin
  6488. aExp := extractFloatx80Exp( a );
  6489. if ( $403E <= aExp ) then begin
  6490. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6491. result := propagateFloatx80NaN( a, a );
  6492. exit;
  6493. end;
  6494. result := a;
  6495. exit;
  6496. end;
  6497. if ( aExp < $3FFF ) then begin
  6498. if ( ( aExp = 0 )
  6499. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6500. result := a;
  6501. exit;
  6502. end;
  6503. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6504. aSign := extractFloatx80Sign( a );
  6505. case softfloat_rounding_mode of
  6506. float_round_nearest_even:
  6507. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6508. ) then begin
  6509. result :=
  6510. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6511. exit;
  6512. end;
  6513. float_round_down: begin
  6514. if aSign <> 0 then
  6515. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6516. else
  6517. result := packFloatx80( 0, 0, 0 );
  6518. exit;
  6519. end;
  6520. float_round_up: begin
  6521. if aSign <> 0 then
  6522. result := packFloatx80( 1, 0, 0 )
  6523. else
  6524. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6525. exit;
  6526. end;
  6527. end;
  6528. result := packFloatx80( aSign, 0, 0 );
  6529. exit;
  6530. end;
  6531. lastBitMask := 1;
  6532. lastBitMask := lastBitMask shl ( $403E - aExp );
  6533. roundBitsMask := lastBitMask - 1;
  6534. z := a;
  6535. roundingMode := softfloat_rounding_mode;
  6536. if ( roundingMode = float_round_nearest_even ) then begin
  6537. inc( z.low, lastBitMask shr 1 );
  6538. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6539. end
  6540. else if ( roundingMode <> float_round_to_zero ) then begin
  6541. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6542. inc( z.low, roundBitsMask );
  6543. end;
  6544. end;
  6545. z.low := z.low and not roundBitsMask;
  6546. if ( z.low = 0 ) then begin
  6547. inc(z.high);
  6548. z.low := bits64( $8000000000000000 );
  6549. end;
  6550. if ( z.low <> a.low ) then softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6551. result := z;
  6552. end;
  6553. {*----------------------------------------------------------------------------
  6554. | Returns the result of adding the absolute values of the extended double-
  6555. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6556. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6557. | The addition is performed according to the IEC/IEEE Standard for Binary
  6558. | Floating-Point Arithmetic.
  6559. *----------------------------------------------------------------------------*}
  6560. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6561. var
  6562. aExp, bExp, zExp: int32;
  6563. aSig, bSig, zSig0, zSig1: bits64;
  6564. expDiff: int32;
  6565. label
  6566. shiftRight1, roundAndPack;
  6567. begin
  6568. aSig := extractFloatx80Frac( a );
  6569. aExp := extractFloatx80Exp( a );
  6570. bSig := extractFloatx80Frac( b );
  6571. bExp := extractFloatx80Exp( b );
  6572. expDiff := aExp - bExp;
  6573. if ( 0 < expDiff ) then begin
  6574. if ( aExp = $7FFF ) then begin
  6575. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6576. result := propagateFloatx80NaN( a, b );
  6577. exit;
  6578. end;
  6579. result := a;
  6580. exit;
  6581. end;
  6582. if ( bExp = 0 ) then dec(expDiff);
  6583. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6584. zExp := aExp;
  6585. end
  6586. else if ( expDiff < 0 ) then begin
  6587. if ( bExp = $7FFF ) then begin
  6588. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6589. result := propagateFloatx80NaN( a, b );
  6590. exit;
  6591. end;
  6592. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6593. exit;
  6594. end;
  6595. if ( aExp = 0 ) then inc(expDiff);
  6596. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6597. zExp := bExp;
  6598. end
  6599. else begin
  6600. if ( aExp = $7FFF ) then begin
  6601. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6602. result := propagateFloatx80NaN( a, b );
  6603. exit;
  6604. end;
  6605. result := a;
  6606. exit;
  6607. end;
  6608. zSig1 := 0;
  6609. zSig0 := aSig + bSig;
  6610. if ( aExp = 0 ) then begin
  6611. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6612. goto roundAndPack;
  6613. end;
  6614. zExp := aExp;
  6615. goto shiftRight1;
  6616. end;
  6617. zSig0 := aSig + bSig;
  6618. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6619. shiftRight1:
  6620. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6621. zSig0 := zSig0 or $8000000000000000;
  6622. inc(zExp);
  6623. roundAndPack:
  6624. result :=
  6625. roundAndPackFloatx80(
  6626. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6627. end;
  6628. {*----------------------------------------------------------------------------
  6629. | Returns the result of subtracting the absolute values of the extended
  6630. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6631. | difference is negated before being returned. `zSign' is ignored if the
  6632. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6633. | Standard for Binary Floating-Point Arithmetic.
  6634. *----------------------------------------------------------------------------*}
  6635. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6636. var
  6637. aExp, bExp, zExp: int32;
  6638. aSig, bSig, zSig0, zSig1: bits64;
  6639. expDiff: int32;
  6640. z: floatx80;
  6641. label
  6642. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6643. begin
  6644. aSig := extractFloatx80Frac( a );
  6645. aExp := extractFloatx80Exp( a );
  6646. bSig := extractFloatx80Frac( b );
  6647. bExp := extractFloatx80Exp( b );
  6648. expDiff := aExp - bExp;
  6649. if ( 0 < expDiff ) then goto aExpBigger;
  6650. if ( expDiff < 0 ) then goto bExpBigger;
  6651. if ( aExp = $7FFF ) then begin
  6652. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6653. result := propagateFloatx80NaN( a, b );
  6654. exit;
  6655. end;
  6656. float_raise( float_flag_invalid );
  6657. z.low := floatx80_default_nan_low;
  6658. z.high := floatx80_default_nan_high;
  6659. result := z;
  6660. exit;
  6661. end;
  6662. if ( aExp = 0 ) then begin
  6663. aExp := 1;
  6664. bExp := 1;
  6665. end;
  6666. zSig1 := 0;
  6667. if ( bSig < aSig ) then goto aBigger;
  6668. if ( aSig < bSig ) then goto bBigger;
  6669. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6670. exit;
  6671. bExpBigger:
  6672. if ( bExp = $7FFF ) then begin
  6673. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6674. result := propagateFloatx80NaN( a, b );
  6675. exit;
  6676. end;
  6677. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6678. exit;
  6679. end;
  6680. if ( aExp = 0 ) then inc(expDiff);
  6681. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6682. bBigger:
  6683. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6684. zExp := bExp;
  6685. zSign := zSign xor 1;
  6686. goto normalizeRoundAndPack;
  6687. aExpBigger:
  6688. if ( aExp = $7FFF ) then begin
  6689. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6690. result := propagateFloatx80NaN( a, b );
  6691. exit;
  6692. end;
  6693. result := a;
  6694. exit;
  6695. end;
  6696. if ( bExp = 0 ) then dec(expDiff);
  6697. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6698. aBigger:
  6699. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6700. zExp := aExp;
  6701. normalizeRoundAndPack:
  6702. result :=
  6703. normalizeRoundAndPackFloatx80(
  6704. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6705. end;
  6706. {*----------------------------------------------------------------------------
  6707. | Returns the result of adding the extended double-precision floating-point
  6708. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6709. | Standard for Binary Floating-Point Arithmetic.
  6710. *----------------------------------------------------------------------------*}
  6711. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6712. var
  6713. aSign, bSign: flag;
  6714. begin
  6715. aSign := extractFloatx80Sign( a );
  6716. bSign := extractFloatx80Sign( b );
  6717. if ( aSign = bSign ) then begin
  6718. result := addFloatx80Sigs( a, b, aSign );
  6719. end
  6720. else begin
  6721. result := subFloatx80Sigs( a, b, aSign );
  6722. end;
  6723. end;
  6724. {*----------------------------------------------------------------------------
  6725. | Returns the result of subtracting the extended double-precision floating-
  6726. | point values `a' and `b'. The operation is performed according to the
  6727. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6728. *----------------------------------------------------------------------------*}
  6729. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6730. var
  6731. aSign, bSign: flag;
  6732. begin
  6733. aSign := extractFloatx80Sign( a );
  6734. bSign := extractFloatx80Sign( b );
  6735. if ( aSign = bSign ) then begin
  6736. result := subFloatx80Sigs( a, b, aSign );
  6737. end
  6738. else begin
  6739. result := addFloatx80Sigs( a, b, aSign );
  6740. end;
  6741. end;
  6742. {*----------------------------------------------------------------------------
  6743. | Returns the result of multiplying the extended double-precision floating-
  6744. | point values `a' and `b'. The operation is performed according to the
  6745. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6746. *----------------------------------------------------------------------------*}
  6747. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6748. var
  6749. aSign, bSign, zSign: flag;
  6750. aExp, bExp, zExp: int32;
  6751. aSig, bSig, zSig0, zSig1: bits64;
  6752. z: floatx80;
  6753. label
  6754. invalid;
  6755. begin
  6756. aSig := extractFloatx80Frac( a );
  6757. aExp := extractFloatx80Exp( a );
  6758. aSign := extractFloatx80Sign( a );
  6759. bSig := extractFloatx80Frac( b );
  6760. bExp := extractFloatx80Exp( b );
  6761. bSign := extractFloatx80Sign( b );
  6762. zSign := aSign xor bSign;
  6763. if ( aExp = $7FFF ) then begin
  6764. if ( bits64( aSig shl 1 ) <> 0 )
  6765. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6766. result := propagateFloatx80NaN( a, b );
  6767. exit;
  6768. end;
  6769. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6770. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6771. exit;
  6772. end;
  6773. if ( bExp = $7FFF ) then begin
  6774. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6775. result := propagateFloatx80NaN( a, b );
  6776. exit;
  6777. end;
  6778. if ( ( aExp or aSig ) = 0 ) then begin
  6779. invalid:
  6780. float_raise( float_flag_invalid );
  6781. z.low := floatx80_default_nan_low;
  6782. z.high := floatx80_default_nan_high;
  6783. result := z;
  6784. exit;
  6785. end;
  6786. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6787. exit;
  6788. end;
  6789. if ( aExp = 0 ) then begin
  6790. if ( aSig = 0 ) then begin
  6791. result := packFloatx80( zSign, 0, 0 );
  6792. exit;
  6793. end;
  6794. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6795. end;
  6796. if ( bExp = 0 ) then begin
  6797. if ( bSig = 0 ) then begin
  6798. result := packFloatx80( zSign, 0, 0 );
  6799. exit;
  6800. end;
  6801. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6802. end;
  6803. zExp := aExp + bExp - $3FFE;
  6804. mul64To128( aSig, bSig, zSig0, zSig1 );
  6805. if 0 < sbits64( zSig0 ) then begin
  6806. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6807. dec(zExp);
  6808. end;
  6809. result :=
  6810. roundAndPackFloatx80(
  6811. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6812. end;
  6813. {*----------------------------------------------------------------------------
  6814. | Returns the result of dividing the extended double-precision floating-point
  6815. | value `a' by the corresponding value `b'. The operation is performed
  6816. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6817. *----------------------------------------------------------------------------*}
  6818. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6819. var
  6820. aSign, bSign, zSign: flag;
  6821. aExp, bExp, zExp: int32;
  6822. aSig, bSig, zSig0, zSig1: bits64;
  6823. rem0, rem1, rem2, term0, term1, term2: bits64;
  6824. z: floatx80;
  6825. label
  6826. invalid;
  6827. begin
  6828. aSig := extractFloatx80Frac( a );
  6829. aExp := extractFloatx80Exp( a );
  6830. aSign := extractFloatx80Sign( a );
  6831. bSig := extractFloatx80Frac( b );
  6832. bExp := extractFloatx80Exp( b );
  6833. bSign := extractFloatx80Sign( b );
  6834. zSign := aSign xor bSign;
  6835. if ( aExp = $7FFF ) then begin
  6836. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6837. result := propagateFloatx80NaN( a, b );
  6838. exit;
  6839. end;
  6840. if ( bExp = $7FFF ) then begin
  6841. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6842. result := propagateFloatx80NaN( a, b );
  6843. exit;
  6844. end;
  6845. goto invalid;
  6846. end;
  6847. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6848. exit;
  6849. end;
  6850. if ( bExp = $7FFF ) then begin
  6851. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6852. result := propagateFloatx80NaN( a, b );
  6853. exit;
  6854. end;
  6855. result := packFloatx80( zSign, 0, 0 );
  6856. exit;
  6857. end;
  6858. if ( bExp = 0 ) then begin
  6859. if ( bSig = 0 ) then begin
  6860. if ( ( aExp or aSig ) = 0 ) then begin
  6861. invalid:
  6862. float_raise( float_flag_invalid );
  6863. z.low := floatx80_default_nan_low;
  6864. z.high := floatx80_default_nan_high;
  6865. result := z;
  6866. exit;
  6867. end;
  6868. float_raise( float_flag_divbyzero );
  6869. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6870. exit;
  6871. end;
  6872. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6873. end;
  6874. if ( aExp = 0 ) then begin
  6875. if ( aSig = 0 ) then begin
  6876. result := packFloatx80( zSign, 0, 0 );
  6877. exit;
  6878. end;
  6879. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6880. end;
  6881. zExp := aExp - bExp + $3FFE;
  6882. rem1 := 0;
  6883. if ( bSig <= aSig ) then begin
  6884. shift128Right( aSig, 0, 1, aSig, rem1 );
  6885. inc(zExp);
  6886. end;
  6887. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6888. mul64To128( bSig, zSig0, term0, term1 );
  6889. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6890. while ( sbits64( rem0 ) < 0 ) do begin
  6891. dec(zSig0);
  6892. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6893. end;
  6894. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6895. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6896. mul64To128( bSig, zSig1, term1, term2 );
  6897. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6898. while ( sbits64( rem1 ) < 0 ) do begin
  6899. dec(zSig1);
  6900. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6901. end;
  6902. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6903. end;
  6904. result :=
  6905. roundAndPackFloatx80(
  6906. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6907. end;
  6908. {*----------------------------------------------------------------------------
  6909. | Returns the remainder of the extended double-precision floating-point value
  6910. | `a' with respect to the corresponding value `b'. The operation is performed
  6911. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6912. *----------------------------------------------------------------------------*}
  6913. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6914. var
  6915. aSign, zSign: flag;
  6916. aExp, bExp, expDiff: int32;
  6917. aSig0, aSig1, bSig: bits64;
  6918. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6919. z: floatx80;
  6920. label
  6921. invalid;
  6922. begin
  6923. aSig0 := extractFloatx80Frac( a );
  6924. aExp := extractFloatx80Exp( a );
  6925. aSign := extractFloatx80Sign( a );
  6926. bSig := extractFloatx80Frac( b );
  6927. bExp := extractFloatx80Exp( b );
  6928. if ( aExp = $7FFF ) then begin
  6929. if ( bits64( aSig0 shl 1 ) <> 0 )
  6930. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6931. result := propagateFloatx80NaN( a, b );
  6932. exit;
  6933. end;
  6934. goto invalid;
  6935. end;
  6936. if ( bExp = $7FFF ) then begin
  6937. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6938. result := propagateFloatx80NaN( a, b );
  6939. exit;
  6940. end;
  6941. result := a;
  6942. exit;
  6943. end;
  6944. if ( bExp = 0 ) then begin
  6945. if ( bSig = 0 ) then begin
  6946. invalid:
  6947. float_raise( float_flag_invalid );
  6948. z.low := floatx80_default_nan_low;
  6949. z.high := floatx80_default_nan_high;
  6950. result := z;
  6951. exit;
  6952. end;
  6953. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6954. end;
  6955. if ( aExp = 0 ) then begin
  6956. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6957. result := a;
  6958. exit;
  6959. end;
  6960. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6961. end;
  6962. bSig := bSig or $8000000000000000;
  6963. zSign := aSign;
  6964. expDiff := aExp - bExp;
  6965. aSig1 := 0;
  6966. if ( expDiff < 0 ) then begin
  6967. if ( expDiff < -1 ) then begin
  6968. result := a;
  6969. exit;
  6970. end;
  6971. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6972. expDiff := 0;
  6973. end;
  6974. q := ord( bSig <= aSig0 );
  6975. if ( q <> 0 ) then dec( aSig0, bSig );
  6976. dec( expDiff, 64 );
  6977. while ( 0 < expDiff ) do begin
  6978. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6979. if ( 2 < q ) then q := q - 2 else q := 0;
  6980. mul64To128( bSig, q, term0, term1 );
  6981. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6982. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6983. dec( expDiff, 62 );
  6984. end;
  6985. inc( expDiff, 64 );
  6986. if ( 0 < expDiff ) then begin
  6987. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6988. if ( 2 < q ) then q:= q - 2 else q := 0;
  6989. q := q shr ( 64 - expDiff );
  6990. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6991. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6992. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6993. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  6994. inc(q);
  6995. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6996. end;
  6997. end
  6998. else begin
  6999. term1 := 0;
  7000. term0 := bSig;
  7001. end;
  7002. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  7003. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7004. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7005. and ( q and 1 <> 0 ) )
  7006. then begin
  7007. aSig0 := alternateASig0;
  7008. aSig1 := alternateASig1;
  7009. zSign := ord( zSign = 0 );
  7010. end;
  7011. result :=
  7012. normalizeRoundAndPackFloatx80(
  7013. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7014. end;
  7015. {*----------------------------------------------------------------------------
  7016. | Returns the square root of the extended double-precision floating-point
  7017. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7018. | for Binary Floating-Point Arithmetic.
  7019. *----------------------------------------------------------------------------*}
  7020. function floatx80_sqrt(a: floatx80): floatx80;
  7021. var
  7022. aSign: flag;
  7023. aExp, zExp: int32;
  7024. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7025. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7026. z: floatx80;
  7027. label
  7028. invalid;
  7029. begin
  7030. aSig0 := extractFloatx80Frac( a );
  7031. aExp := extractFloatx80Exp( a );
  7032. aSign := extractFloatx80Sign( a );
  7033. if ( aExp = $7FFF ) then begin
  7034. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7035. result := propagateFloatx80NaN( a, a );
  7036. exit;
  7037. end;
  7038. if ( aSign = 0 ) then begin
  7039. result := a;
  7040. exit;
  7041. end;
  7042. goto invalid;
  7043. end;
  7044. if ( aSign <> 0 ) then begin
  7045. if ( ( aExp or aSig0 ) = 0 ) then begin
  7046. result := a;
  7047. exit;
  7048. end;
  7049. invalid:
  7050. float_raise( float_flag_invalid );
  7051. z.low := floatx80_default_nan_low;
  7052. z.high := floatx80_default_nan_high;
  7053. result := z;
  7054. exit;
  7055. end;
  7056. if ( aExp = 0 ) then begin
  7057. if ( aSig0 = 0 ) then begin
  7058. result := packFloatx80( 0, 0, 0 );
  7059. exit;
  7060. end;
  7061. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7062. end;
  7063. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  7064. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  7065. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7066. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7067. doubleZSig0 := zSig0 shl 1;
  7068. mul64To128( zSig0, zSig0, term0, term1 );
  7069. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7070. while ( sbits64( rem0 ) < 0 ) do begin
  7071. dec(zSig0);
  7072. dec( doubleZSig0, 2 );
  7073. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  7074. end;
  7075. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7076. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7077. if ( zSig1 = 0 ) then zSig1 := 1;
  7078. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7079. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7080. mul64To128( zSig1, zSig1, term2, term3 );
  7081. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7082. while ( sbits64( rem1 ) < 0 ) do begin
  7083. dec(zSig1);
  7084. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7085. term3 := term3 or 1;
  7086. term2 := term2 or doubleZSig0;
  7087. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7088. end;
  7089. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7090. end;
  7091. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7092. zSig0 := zSig0 or doubleZSig0;
  7093. result :=
  7094. roundAndPackFloatx80(
  7095. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7096. end;
  7097. {*----------------------------------------------------------------------------
  7098. | Returns 1 if the extended double-precision floating-point value `a' is
  7099. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7100. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7101. | Arithmetic.
  7102. *----------------------------------------------------------------------------*}
  7103. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7104. begin
  7105. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7106. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7107. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7108. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7109. ) then begin
  7110. if ( floatx80_is_signaling_nan( a )
  7111. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7112. float_raise( float_flag_invalid );
  7113. end;
  7114. result := 0;
  7115. exit;
  7116. end;
  7117. result := ord(
  7118. ( a.low = b.low )
  7119. and ( ( a.high = b.high )
  7120. or ( ( a.low = 0 )
  7121. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7122. ) );
  7123. end;
  7124. {*----------------------------------------------------------------------------
  7125. | Returns 1 if the extended double-precision floating-point value `a' is
  7126. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7127. | comparison is performed according to the IEC/IEEE Standard for Binary
  7128. | Floating-Point Arithmetic.
  7129. *----------------------------------------------------------------------------*}
  7130. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7131. var
  7132. aSign, bSign: flag;
  7133. begin
  7134. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7135. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7136. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7137. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7138. then begin
  7139. float_raise( float_flag_invalid );
  7140. result := 0;
  7141. exit;
  7142. end;
  7143. aSign := extractFloatx80Sign( a );
  7144. bSign := extractFloatx80Sign( b );
  7145. if ( aSign <> bSign ) then begin
  7146. result := ord(
  7147. ( aSign <> 0 )
  7148. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7149. exit;
  7150. end;
  7151. if aSign<>0 then
  7152. result := le128( b.high, b.low, a.high, a.low )
  7153. else
  7154. result := le128( a.high, a.low, b.high, b.low );
  7155. end;
  7156. {*----------------------------------------------------------------------------
  7157. | Returns 1 if the extended double-precision floating-point value `a' is
  7158. | less than the corresponding value `b', and 0 otherwise. The comparison
  7159. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7160. | Arithmetic.
  7161. *----------------------------------------------------------------------------*}
  7162. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7163. var
  7164. aSign, bSign: flag;
  7165. begin
  7166. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7167. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7168. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7169. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7170. then begin
  7171. float_raise( float_flag_invalid );
  7172. result := 0;
  7173. exit;
  7174. end;
  7175. aSign := extractFloatx80Sign( a );
  7176. bSign := extractFloatx80Sign( b );
  7177. if ( aSign <> bSign ) then begin
  7178. result := ord(
  7179. ( aSign <> 0 )
  7180. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7181. exit;
  7182. end;
  7183. if aSign <> 0 then
  7184. result := lt128( b.high, b.low, a.high, a.low )
  7185. else
  7186. result := lt128( a.high, a.low, b.high, b.low );
  7187. end;
  7188. {*----------------------------------------------------------------------------
  7189. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7190. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7191. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7192. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7193. *----------------------------------------------------------------------------*}
  7194. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7195. begin
  7196. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7197. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7198. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7199. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7200. then begin
  7201. float_raise( float_flag_invalid );
  7202. result := 0;
  7203. exit;
  7204. end;
  7205. result := ord(
  7206. ( a.low = b.low )
  7207. and ( ( a.high = b.high )
  7208. or ( ( a.low = 0 )
  7209. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7210. ) );
  7211. end;
  7212. {*----------------------------------------------------------------------------
  7213. | Returns 1 if the extended double-precision floating-point value `a' is less
  7214. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7215. | do not cause an exception. Otherwise, the comparison is performed according
  7216. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7217. *----------------------------------------------------------------------------*}
  7218. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7219. var
  7220. aSign, bSign: flag;
  7221. begin
  7222. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7223. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7224. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7225. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7226. then begin
  7227. if ( floatx80_is_signaling_nan( a )
  7228. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7229. float_raise( float_flag_invalid );
  7230. end;
  7231. result := 0;
  7232. exit;
  7233. end;
  7234. aSign := extractFloatx80Sign( a );
  7235. bSign := extractFloatx80Sign( b );
  7236. if ( aSign <> bSign ) then begin
  7237. result := ord(
  7238. ( aSign <> 0 )
  7239. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7240. exit;
  7241. end;
  7242. if aSign <> 0 then
  7243. result := le128( b.high, b.low, a.high, a.low )
  7244. else
  7245. result := le128( a.high, a.low, b.high, b.low );
  7246. end;
  7247. {*----------------------------------------------------------------------------
  7248. | Returns 1 if the extended double-precision floating-point value `a' is less
  7249. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7250. | an exception. Otherwise, the comparison is performed according to the
  7251. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7252. *----------------------------------------------------------------------------*}
  7253. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7254. var
  7255. aSign, bSign: flag;
  7256. begin
  7257. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7258. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7259. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7260. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7261. then begin
  7262. if ( floatx80_is_signaling_nan( a )
  7263. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7264. float_raise( float_flag_invalid );
  7265. end;
  7266. result := 0;
  7267. exit;
  7268. end;
  7269. aSign := extractFloatx80Sign( a );
  7270. bSign := extractFloatx80Sign( b );
  7271. if ( aSign <> bSign ) then begin
  7272. result := ord(
  7273. ( aSign <> 0 )
  7274. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7275. exit;
  7276. end;
  7277. if aSign <> 0 then
  7278. result := lt128( b.high, b.low, a.high, a.low )
  7279. else
  7280. result := lt128( a.high, a.low, b.high, b.low );
  7281. end;
  7282. {$endif FPC_SOFTFLOAT_FLOATX80}
  7283. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7284. {*----------------------------------------------------------------------------
  7285. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7286. | floating-point value `a'.
  7287. *----------------------------------------------------------------------------*}
  7288. function extractFloat128Frac1(a : float128): bits64;
  7289. begin
  7290. result:=a.low;
  7291. end;
  7292. {*----------------------------------------------------------------------------
  7293. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7294. | floating-point value `a'.
  7295. *----------------------------------------------------------------------------*}
  7296. function extractFloat128Frac0(a : float128): bits64;
  7297. begin
  7298. result:=a.high and int64($0000FFFFFFFFFFFF);
  7299. end;
  7300. {*----------------------------------------------------------------------------
  7301. | Returns the exponent bits of the quadruple-precision floating-point value
  7302. | `a'.
  7303. *----------------------------------------------------------------------------*}
  7304. function extractFloat128Exp(a : float128): int32;
  7305. begin
  7306. result:=( a.high shr 48 ) and $7FFF;
  7307. end;
  7308. {*----------------------------------------------------------------------------
  7309. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7310. *----------------------------------------------------------------------------*}
  7311. function extractFloat128Sign(a : float128): flag;
  7312. begin
  7313. result:=a.high shr 63;
  7314. end;
  7315. {*----------------------------------------------------------------------------
  7316. | Normalizes the subnormal quadruple-precision floating-point value
  7317. | represented by the denormalized significand formed by the concatenation of
  7318. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7319. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7320. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7321. | least significant 64 bits of the normalized significand are stored at the
  7322. | location pointed to by `zSig1Ptr'.
  7323. *----------------------------------------------------------------------------*}
  7324. procedure normalizeFloat128Subnormal(
  7325. aSig0: bits64;
  7326. aSig1: bits64;
  7327. var zExpPtr: int32;
  7328. var zSig0Ptr: bits64;
  7329. var zSig1Ptr: bits64);
  7330. var
  7331. shiftCount: int8;
  7332. begin
  7333. if ( aSig0 = 0 ) then
  7334. begin
  7335. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7336. if ( shiftCount < 0 ) then
  7337. begin
  7338. zSig0Ptr := aSig1 shr ( - shiftCount );
  7339. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7340. end
  7341. else begin
  7342. zSig0Ptr := aSig1 shl shiftCount;
  7343. zSig1Ptr := 0;
  7344. end;
  7345. zExpPtr := - shiftCount - 63;
  7346. end
  7347. else begin
  7348. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7349. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7350. zExpPtr := 1 - shiftCount;
  7351. end;
  7352. end;
  7353. {*----------------------------------------------------------------------------
  7354. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7355. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7356. | floating-point value, returning the result. After being shifted into the
  7357. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7358. | added together to form the most significant 32 bits of the result. This
  7359. | means that any integer portion of `zSig0' will be added into the exponent.
  7360. | Since a properly normalized significand will have an integer portion equal
  7361. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7362. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7363. | significand.
  7364. *----------------------------------------------------------------------------*}
  7365. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7366. var
  7367. z: float128;
  7368. begin
  7369. z.low := zSig1;
  7370. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7371. result:=z;
  7372. end;
  7373. {*----------------------------------------------------------------------------
  7374. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7375. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7376. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7377. | corresponding to the abstract input. Ordinarily, the abstract value is
  7378. | simply rounded and packed into the quadruple-precision format, with the
  7379. | inexact exception raised if the abstract input cannot be represented
  7380. | exactly. However, if the abstract value is too large, the overflow and
  7381. | inexact exceptions are raised and an infinity or maximal finite value is
  7382. | returned. If the abstract value is too small, the input value is rounded to
  7383. | a subnormal number, and the underflow and inexact exceptions are raised if
  7384. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7385. | precision floating-point number.
  7386. | The input significand must be normalized or smaller. If the input
  7387. | significand is not normalized, `zExp' must be 0; in that case, the result
  7388. | returned is a subnormal number, and it must not require rounding. In the
  7389. | usual case that the input significand is normalized, `zExp' must be 1 less
  7390. | than the ``true'' floating-point exponent. The handling of underflow and
  7391. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7392. *----------------------------------------------------------------------------*}
  7393. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7394. var
  7395. roundingMode: int8;
  7396. roundNearestEven, increment, isTiny: flag;
  7397. begin
  7398. roundingMode := softfloat_rounding_mode;
  7399. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7400. increment := ord( sbits64(zSig2) < 0 );
  7401. if ( roundNearestEven=0 ) then
  7402. begin
  7403. if ( roundingMode = float_round_to_zero ) then
  7404. begin
  7405. increment := 0;
  7406. end
  7407. else begin
  7408. if ( zSign<>0 ) then
  7409. begin
  7410. increment := ord( roundingMode = float_round_down ) and zSig2;
  7411. end
  7412. else begin
  7413. increment := ord( roundingMode = float_round_up ) and zSig2;
  7414. end;
  7415. end;
  7416. end;
  7417. if ( $7FFD <= bits32(zExp) ) then
  7418. begin
  7419. if ( ord( $7FFD < zExp )
  7420. or ( ord( zExp = $7FFD )
  7421. and eq128(
  7422. int64( $0001FFFFFFFFFFFF ),
  7423. bits64( $FFFFFFFFFFFFFFFF ),
  7424. zSig0,
  7425. zSig1
  7426. )
  7427. and increment
  7428. )
  7429. )<>0 then
  7430. begin
  7431. float_raise( float_flag_overflow or float_flag_inexact );
  7432. if ( ord( roundingMode = float_round_to_zero )
  7433. or ( zSign and ord( roundingMode = float_round_up ) )
  7434. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7435. )<>0 then
  7436. begin
  7437. result :=
  7438. packFloat128(
  7439. zSign,
  7440. $7FFE,
  7441. int64( $0000FFFFFFFFFFFF ),
  7442. bits64( $FFFFFFFFFFFFFFFF )
  7443. );
  7444. exit;
  7445. end;
  7446. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7447. exit;
  7448. end;
  7449. if ( zExp < 0 ) then
  7450. begin
  7451. isTiny :=
  7452. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7453. or ( zExp < -1 )
  7454. or not( increment<>0 )
  7455. or boolean(lt128(
  7456. zSig0,
  7457. zSig1,
  7458. int64( $0001FFFFFFFFFFFF ),
  7459. bits64( $FFFFFFFFFFFFFFFF )
  7460. )));
  7461. shift128ExtraRightJamming(
  7462. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7463. zExp := 0;
  7464. if ( isTiny and zSig2 )<>0 then
  7465. float_raise( float_flag_underflow );
  7466. if ( roundNearestEven<>0 ) then
  7467. begin
  7468. increment := ord( sbits64(zSig2) < 0 );
  7469. end
  7470. else begin
  7471. if ( zSign<>0 ) then
  7472. begin
  7473. increment := ord( roundingMode = float_round_down ) and zSig2;
  7474. end
  7475. else begin
  7476. increment := ord( roundingMode = float_round_up ) and zSig2;
  7477. end;
  7478. end;
  7479. end;
  7480. end;
  7481. if ( zSig2<>0 ) then
  7482. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7483. if ( increment<>0 ) then
  7484. begin
  7485. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7486. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7487. end
  7488. else begin
  7489. if ( ( zSig0 or zSig1 ) = 0 ) then
  7490. zExp := 0;
  7491. end;
  7492. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7493. end;
  7494. {*----------------------------------------------------------------------------
  7495. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7496. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7497. | returns the proper quadruple-precision floating-point value corresponding
  7498. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7499. | except that the input significand has fewer bits and does not have to be
  7500. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7501. | point exponent.
  7502. *----------------------------------------------------------------------------*}
  7503. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7504. var
  7505. shiftCount: int8;
  7506. zSig2: bits64;
  7507. begin
  7508. if ( zSig0 = 0 ) then
  7509. begin
  7510. zSig0 := zSig1;
  7511. zSig1 := 0;
  7512. dec(zExp, 64);
  7513. end;
  7514. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7515. if ( 0 <= shiftCount ) then
  7516. begin
  7517. zSig2 := 0;
  7518. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7519. end
  7520. else begin
  7521. shift128ExtraRightJamming(
  7522. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7523. end;
  7524. dec(zExp, shiftCount);
  7525. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7526. end;
  7527. {*----------------------------------------------------------------------------
  7528. | Returns the result of converting the quadruple-precision floating-point
  7529. | value `a' to the 32-bit two's complement integer format. The conversion
  7530. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7531. | Arithmetic---which means in particular that the conversion is rounded
  7532. | according to the current rounding mode. If `a' is a NaN, the largest
  7533. | positive integer is returned. Otherwise, if the conversion overflows, the
  7534. | largest integer with the same sign as `a' is returned.
  7535. *----------------------------------------------------------------------------*}
  7536. function float128_to_int32(a: float128): int32;
  7537. var
  7538. aSign: flag;
  7539. aExp, shiftCount: int32;
  7540. aSig0, aSig1: bits64;
  7541. begin
  7542. aSig1 := extractFloat128Frac1( a );
  7543. aSig0 := extractFloat128Frac0( a );
  7544. aExp := extractFloat128Exp( a );
  7545. aSign := extractFloat128Sign( a );
  7546. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7547. aSign := 0;
  7548. if ( aExp<>0 ) then
  7549. aSig0 := aSig0 or int64( $0001000000000000 );
  7550. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7551. shiftCount := $4028 - aExp;
  7552. if ( 0 < shiftCount ) then
  7553. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7554. result := roundAndPackInt32( aSign, aSig0 );
  7555. end;
  7556. {*----------------------------------------------------------------------------
  7557. | Returns the result of converting the quadruple-precision floating-point
  7558. | value `a' to the 32-bit two's complement integer format. The conversion
  7559. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7560. | Arithmetic, except that the conversion is always rounded toward zero. If
  7561. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7562. | conversion overflows, the largest integer with the same sign as `a' is
  7563. | returned.
  7564. *----------------------------------------------------------------------------*}
  7565. function float128_to_int32_round_to_zero(a: float128): int32;
  7566. var
  7567. aSign: flag;
  7568. aExp, shiftCount: int32;
  7569. aSig0, aSig1, savedASig: bits64;
  7570. z: int32;
  7571. label
  7572. invalid;
  7573. begin
  7574. aSig1 := extractFloat128Frac1( a );
  7575. aSig0 := extractFloat128Frac0( a );
  7576. aExp := extractFloat128Exp( a );
  7577. aSign := extractFloat128Sign( a );
  7578. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7579. if ( $401E < aExp ) then
  7580. begin
  7581. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7582. aSign := 0;
  7583. goto invalid;
  7584. end
  7585. else if ( aExp < $3FFF ) then
  7586. begin
  7587. if ( aExp or aSig0 )<>0 then
  7588. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7589. result := 0;
  7590. exit;
  7591. end;
  7592. aSig0 := aSig0 or int64( $0001000000000000 );
  7593. shiftCount := $402F - aExp;
  7594. savedASig := aSig0;
  7595. aSig0 := aSig0 shr shiftCount;
  7596. z := aSig0;
  7597. if ( aSign )<>0 then
  7598. z := - z;
  7599. if ( ord( z < 0 ) xor aSign )<>0 then
  7600. begin
  7601. invalid:
  7602. float_raise( float_flag_invalid );
  7603. if aSign<>0 then
  7604. result:= int32( $80000000 )
  7605. else
  7606. result:=$7FFFFFFF;
  7607. exit;
  7608. end;
  7609. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7610. begin
  7611. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7612. end;
  7613. result := z;
  7614. end;
  7615. {*----------------------------------------------------------------------------
  7616. | Returns the result of converting the quadruple-precision floating-point
  7617. | value `a' to the 64-bit two's complement integer format. The conversion
  7618. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7619. | Arithmetic---which means in particular that the conversion is rounded
  7620. | according to the current rounding mode. If `a' is a NaN, the largest
  7621. | positive integer is returned. Otherwise, if the conversion overflows, the
  7622. | largest integer with the same sign as `a' is returned.
  7623. *----------------------------------------------------------------------------*}
  7624. function float128_to_int64(a: float128): int64;
  7625. var
  7626. aSign: flag;
  7627. aExp, shiftCount: int32;
  7628. aSig0, aSig1: bits64;
  7629. begin
  7630. aSig1 := extractFloat128Frac1( a );
  7631. aSig0 := extractFloat128Frac0( a );
  7632. aExp := extractFloat128Exp( a );
  7633. aSign := extractFloat128Sign( a );
  7634. if ( aExp<>0 ) then
  7635. aSig0 := aSig0 or int64( $0001000000000000 );
  7636. shiftCount := $402F - aExp;
  7637. if ( shiftCount <= 0 ) then
  7638. begin
  7639. if ( $403E < aExp ) then
  7640. begin
  7641. float_raise( float_flag_invalid );
  7642. if ( (aSign=0)
  7643. or ( ( aExp = $7FFF )
  7644. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7645. )
  7646. ) then
  7647. begin
  7648. result := int64( $7FFFFFFFFFFFFFFF );
  7649. exit;
  7650. end;
  7651. result := int64( $8000000000000000 );
  7652. exit;
  7653. end;
  7654. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7655. end
  7656. else begin
  7657. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7658. end;
  7659. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7660. end;
  7661. {*----------------------------------------------------------------------------
  7662. | Returns the result of converting the quadruple-precision floating-point
  7663. | value `a' to the 64-bit two's complement integer format. The conversion
  7664. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7665. | Arithmetic, except that the conversion is always rounded toward zero.
  7666. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7667. | the conversion overflows, the largest integer with the same sign as `a' is
  7668. | returned.
  7669. *----------------------------------------------------------------------------*}
  7670. function float128_to_int64_round_to_zero(a: float128): int64;
  7671. var
  7672. aSign: flag;
  7673. aExp, shiftCount: int32;
  7674. aSig0, aSig1: bits64;
  7675. z: int64;
  7676. begin
  7677. aSig1 := extractFloat128Frac1( a );
  7678. aSig0 := extractFloat128Frac0( a );
  7679. aExp := extractFloat128Exp( a );
  7680. aSign := extractFloat128Sign( a );
  7681. if ( aExp<>0 ) then
  7682. aSig0 := aSig0 or int64( $0001000000000000 );
  7683. shiftCount := aExp - $402F;
  7684. if ( 0 < shiftCount ) then
  7685. begin
  7686. if ( $403E <= aExp ) then
  7687. begin
  7688. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7689. if ( ( a.high = bits64( $C03E000000000000 ) )
  7690. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7691. begin
  7692. if ( aSig1<>0 ) then
  7693. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7694. end
  7695. else begin
  7696. float_raise( float_flag_invalid );
  7697. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7698. begin
  7699. result := int64( $7FFFFFFFFFFFFFFF );
  7700. exit;
  7701. end;
  7702. end;
  7703. result := int64( $8000000000000000 );
  7704. exit;
  7705. end;
  7706. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  7707. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7708. begin
  7709. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7710. end;
  7711. end
  7712. else begin
  7713. if ( aExp < $3FFF ) then
  7714. begin
  7715. if ( aExp or aSig0 or aSig1 )<>0 then
  7716. begin
  7717. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7718. end;
  7719. result := 0;
  7720. exit;
  7721. end;
  7722. z := aSig0 shr ( - shiftCount );
  7723. if ( (aSig1<>0)
  7724. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7725. begin
  7726. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7727. end;
  7728. end;
  7729. if ( aSign<>0 ) then
  7730. z := - z;
  7731. result := z;
  7732. end;
  7733. {*----------------------------------------------------------------------------
  7734. | Returns the result of converting the quadruple-precision floating-point
  7735. | value `a' to the single-precision floating-point format. The conversion
  7736. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7737. | Arithmetic.
  7738. *----------------------------------------------------------------------------*}
  7739. function float128_to_float32(a: float128): float32;
  7740. var
  7741. aSign: flag;
  7742. aExp: int32;
  7743. aSig0, aSig1: bits64;
  7744. zSig: bits32;
  7745. begin
  7746. aSig1 := extractFloat128Frac1( a );
  7747. aSig0 := extractFloat128Frac0( a );
  7748. aExp := extractFloat128Exp( a );
  7749. aSign := extractFloat128Sign( a );
  7750. if ( aExp = $7FFF ) then
  7751. begin
  7752. if ( aSig0 or aSig1 )<>0 then
  7753. begin
  7754. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7755. exit;
  7756. end;
  7757. result := packFloat32( aSign, $FF, 0 );
  7758. exit;
  7759. end;
  7760. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7761. shift64RightJamming( aSig0, 18, aSig0 );
  7762. zSig := aSig0;
  7763. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7764. begin
  7765. zSig := zSig or $40000000;
  7766. dec(aExp,$3F81);
  7767. end;
  7768. result := roundAndPackFloat32( aSign, aExp, zSig );
  7769. end;
  7770. {*----------------------------------------------------------------------------
  7771. | Returns the result of converting the quadruple-precision floating-point
  7772. | value `a' to the double-precision floating-point format. The conversion
  7773. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7774. | Arithmetic.
  7775. *----------------------------------------------------------------------------*}
  7776. function float128_to_float64(a: float128): float64;
  7777. var
  7778. aSign: flag;
  7779. aExp: int32;
  7780. aSig0, aSig1: bits64;
  7781. begin
  7782. aSig1 := extractFloat128Frac1( a );
  7783. aSig0 := extractFloat128Frac0( a );
  7784. aExp := extractFloat128Exp( a );
  7785. aSign := extractFloat128Sign( a );
  7786. if ( aExp = $7FFF ) then
  7787. begin
  7788. if ( aSig0 or aSig1 )<>0 then
  7789. begin
  7790. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7791. exit;
  7792. end;
  7793. result:=packFloat64( aSign, $7FF, 0);
  7794. exit;
  7795. end;
  7796. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7797. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7798. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7799. begin
  7800. aSig0 := aSig0 or int64( $4000000000000000 );
  7801. dec(aExp,$3C01);
  7802. end;
  7803. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7804. end;
  7805. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7806. {*----------------------------------------------------------------------------
  7807. | Returns the result of converting the quadruple-precision floating-point
  7808. | value `a' to the extended double-precision floating-point format. The
  7809. | conversion is performed according to the IEC/IEEE Standard for Binary
  7810. | Floating-Point Arithmetic.
  7811. *----------------------------------------------------------------------------*}
  7812. function float128_to_floatx80(a: float128): floatx80;
  7813. var
  7814. aSign: flag;
  7815. aExp: int32;
  7816. aSig0, aSig1: bits64;
  7817. begin
  7818. aSig1 := extractFloat128Frac1( a );
  7819. aSig0 := extractFloat128Frac0( a );
  7820. aExp := extractFloat128Exp( a );
  7821. aSign := extractFloat128Sign( a );
  7822. if ( aExp = $7FFF ) then begin
  7823. if ( aSig0 or aSig1 <> 0 ) then begin
  7824. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7825. exit;
  7826. end;
  7827. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7828. exit;
  7829. end;
  7830. if ( aExp = 0 ) then begin
  7831. if ( ( aSig0 or aSig1 ) = 0 ) then
  7832. begin
  7833. result := packFloatx80( aSign, 0, 0 );
  7834. exit;
  7835. end;
  7836. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7837. end
  7838. else begin
  7839. aSig0 := aSig0 or int64( $0001000000000000 );
  7840. end;
  7841. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7842. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7843. end;
  7844. {$endif FPC_SOFTFLOAT_FLOATX80}
  7845. {*----------------------------------------------------------------------------
  7846. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7847. | Returns the result as a quadruple-precision floating-point value. The
  7848. | operation is performed according to the IEC/IEEE Standard for Binary
  7849. | Floating-Point Arithmetic.
  7850. *----------------------------------------------------------------------------*}
  7851. function float128_round_to_int(a: float128): float128;
  7852. var
  7853. aSign: flag;
  7854. aExp: int32;
  7855. lastBitMask, roundBitsMask: bits64;
  7856. roundingMode: int8;
  7857. z: float128;
  7858. begin
  7859. aExp := extractFloat128Exp( a );
  7860. if ( $402F <= aExp ) then
  7861. begin
  7862. if ( $406F <= aExp ) then
  7863. begin
  7864. if ( ( aExp = $7FFF )
  7865. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7866. ) then
  7867. begin
  7868. result := propagateFloat128NaN( a, a );
  7869. exit;
  7870. end;
  7871. result := a;
  7872. exit;
  7873. end;
  7874. lastBitMask := 1;
  7875. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7876. roundBitsMask := lastBitMask - 1;
  7877. z := a;
  7878. roundingMode := softfloat_rounding_mode;
  7879. if ( roundingMode = float_round_nearest_even ) then
  7880. begin
  7881. if ( lastBitMask )<>0 then
  7882. begin
  7883. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7884. if ( ( z.low and roundBitsMask ) = 0 ) then
  7885. z.low := z.low and not(lastBitMask);
  7886. end
  7887. else begin
  7888. if ( sbits64(z.low) < 0 ) then
  7889. begin
  7890. inc(z.high);
  7891. if ( bits64( z.low shl 1 ) = 0 ) then
  7892. z.high := z.high and not bits64( 1 );
  7893. end;
  7894. end;
  7895. end
  7896. else if ( roundingMode <> float_round_to_zero ) then
  7897. begin
  7898. if ( extractFloat128Sign( z )
  7899. xor ord( roundingMode = float_round_up ) )<>0 then
  7900. begin
  7901. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7902. end;
  7903. end;
  7904. z.low := z.low and not(roundBitsMask);
  7905. end
  7906. else begin
  7907. if ( aExp < $3FFF ) then
  7908. begin
  7909. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7910. begin
  7911. result := a;
  7912. exit;
  7913. end;
  7914. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7915. aSign := extractFloat128Sign( a );
  7916. case softfloat_rounding_mode of
  7917. float_round_nearest_even:
  7918. if ( ( aExp = $3FFE )
  7919. and ( (extractFloat128Frac0( a )<>0)
  7920. or (extractFloat128Frac1( a )<>0) )
  7921. ) then begin
  7922. begin
  7923. result := packFloat128( aSign, $3FFF, 0, 0 );
  7924. exit;
  7925. end;
  7926. end;
  7927. float_round_down:
  7928. begin
  7929. if aSign<>0 then
  7930. result:=packFloat128( 1, $3FFF, 0, 0 )
  7931. else
  7932. result:=packFloat128( 0, 0, 0, 0 );
  7933. exit;
  7934. end;
  7935. float_round_up:
  7936. begin
  7937. if aSign<>0 then
  7938. result := packFloat128( 1, 0, 0, 0 )
  7939. else
  7940. result:=packFloat128( 0, $3FFF, 0, 0 );
  7941. exit;
  7942. end;
  7943. end;
  7944. result := packFloat128( aSign, 0, 0, 0 );
  7945. exit;
  7946. end;
  7947. lastBitMask := 1;
  7948. lastBitMask := lastBitMask shl ($402F - aExp);
  7949. roundBitsMask := lastBitMask - 1;
  7950. z.low := 0;
  7951. z.high := a.high;
  7952. roundingMode := softfloat_rounding_mode;
  7953. if ( roundingMode = float_round_nearest_even ) then begin
  7954. inc(z.high,lastBitMask shr 1);
  7955. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7956. z.high := z.high and not(lastBitMask);
  7957. end;
  7958. end
  7959. else if ( roundingMode <> float_round_to_zero ) then begin
  7960. if ( (extractFloat128Sign( z )<>0)
  7961. xor ( roundingMode = float_round_up ) ) then begin
  7962. z.high := z.high or ord( a.low <> 0 );
  7963. z.high := z.high+roundBitsMask;
  7964. end;
  7965. end;
  7966. z.high := z.high and not(roundBitsMask);
  7967. end;
  7968. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7969. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7970. end;
  7971. result := z;
  7972. end;
  7973. {*----------------------------------------------------------------------------
  7974. | Returns the result of adding the absolute values of the quadruple-precision
  7975. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7976. | before being returned. `zSign' is ignored if the result is a NaN.
  7977. | The addition is performed according to the IEC/IEEE Standard for Binary
  7978. | Floating-Point Arithmetic.
  7979. *----------------------------------------------------------------------------*}
  7980. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7981. var
  7982. aExp, bExp, zExp: int32;
  7983. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7984. expDiff: int32;
  7985. label
  7986. shiftRight1,roundAndPack;
  7987. begin
  7988. aSig1 := extractFloat128Frac1( a );
  7989. aSig0 := extractFloat128Frac0( a );
  7990. aExp := extractFloat128Exp( a );
  7991. bSig1 := extractFloat128Frac1( b );
  7992. bSig0 := extractFloat128Frac0( b );
  7993. bExp := extractFloat128Exp( b );
  7994. expDiff := aExp - bExp;
  7995. if ( 0 < expDiff ) then begin
  7996. if ( aExp = $7FFF ) then begin
  7997. if ( aSig0 or aSig1 )<>0 then
  7998. begin
  7999. result := propagateFloat128NaN( a, b );
  8000. exit;
  8001. end;
  8002. result := a;
  8003. exit;
  8004. end;
  8005. if ( bExp = 0 ) then begin
  8006. dec(expDiff);
  8007. end
  8008. else begin
  8009. bSig0 := bSig0 or int64( $0001000000000000 );
  8010. end;
  8011. shift128ExtraRightJamming(
  8012. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8013. zExp := aExp;
  8014. end
  8015. else if ( expDiff < 0 ) then begin
  8016. if ( bExp = $7FFF ) then begin
  8017. if ( bSig0 or bSig1 )<>0 then
  8018. begin
  8019. result := propagateFloat128NaN( a, b );
  8020. exit;
  8021. end;
  8022. result := packFloat128( zSign, $7FFF, 0, 0 );
  8023. exit;
  8024. end;
  8025. if ( aExp = 0 ) then begin
  8026. inc(expDiff);
  8027. end
  8028. else begin
  8029. aSig0 := aSig0 or int64( $0001000000000000 );
  8030. end;
  8031. shift128ExtraRightJamming(
  8032. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8033. zExp := bExp;
  8034. end
  8035. else begin
  8036. if ( aExp = $7FFF ) then begin
  8037. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8038. result := propagateFloat128NaN( a, b );
  8039. exit;
  8040. end;
  8041. result := a;
  8042. exit;
  8043. end;
  8044. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8045. if ( aExp = 0 ) then
  8046. begin
  8047. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8048. exit;
  8049. end;
  8050. zSig2 := 0;
  8051. zSig0 := zSig0 or int64( $0002000000000000 );
  8052. zExp := aExp;
  8053. goto shiftRight1;
  8054. end;
  8055. aSig0 := aSig0 or int64( $0001000000000000 );
  8056. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8057. dec(zExp);
  8058. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8059. inc(zExp);
  8060. shiftRight1:
  8061. shift128ExtraRightJamming(
  8062. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8063. roundAndPack:
  8064. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8065. end;
  8066. {*----------------------------------------------------------------------------
  8067. | Returns the result of subtracting the absolute values of the quadruple-
  8068. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8069. | difference is negated before being returned. `zSign' is ignored if the
  8070. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8071. | Standard for Binary Floating-Point Arithmetic.
  8072. *----------------------------------------------------------------------------*}
  8073. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8074. var
  8075. aExp, bExp, zExp: int32;
  8076. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8077. expDiff: int32;
  8078. z: float128;
  8079. label
  8080. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8081. begin
  8082. aSig1 := extractFloat128Frac1( a );
  8083. aSig0 := extractFloat128Frac0( a );
  8084. aExp := extractFloat128Exp( a );
  8085. bSig1 := extractFloat128Frac1( b );
  8086. bSig0 := extractFloat128Frac0( b );
  8087. bExp := extractFloat128Exp( b );
  8088. expDiff := aExp - bExp;
  8089. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8090. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8091. if ( 0 < expDiff ) then goto aExpBigger;
  8092. if ( expDiff < 0 ) then goto bExpBigger;
  8093. if ( aExp = $7FFF ) then begin
  8094. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8095. result := propagateFloat128NaN( a, b );
  8096. exit;
  8097. end;
  8098. float_raise( float_flag_invalid );
  8099. z.low := float128_default_nan_low;
  8100. z.high := float128_default_nan_high;
  8101. result := z;
  8102. exit;
  8103. end;
  8104. if ( aExp = 0 ) then begin
  8105. aExp := 1;
  8106. bExp := 1;
  8107. end;
  8108. if ( bSig0 < aSig0 ) then goto aBigger;
  8109. if ( aSig0 < bSig0 ) then goto bBigger;
  8110. if ( bSig1 < aSig1 ) then goto aBigger;
  8111. if ( aSig1 < bSig1 ) then goto bBigger;
  8112. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8113. exit;
  8114. bExpBigger:
  8115. if ( bExp = $7FFF ) then begin
  8116. if ( bSig0 or bSig1 )<>0 then
  8117. begin
  8118. result := propagateFloat128NaN( a, b );
  8119. exit;
  8120. end;
  8121. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8122. exit;
  8123. end;
  8124. if ( aExp = 0 ) then begin
  8125. inc(expDiff);
  8126. end
  8127. else begin
  8128. aSig0 := aSig0 or int64( $4000000000000000 );
  8129. end;
  8130. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8131. bSig0 := bSig0 or int64( $4000000000000000 );
  8132. bBigger:
  8133. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8134. zExp := bExp;
  8135. zSign := zSign xor 1;
  8136. goto normalizeRoundAndPack;
  8137. aExpBigger:
  8138. if ( aExp = $7FFF ) then begin
  8139. if ( aSig0 or aSig1 )<>0 then
  8140. begin
  8141. result := propagateFloat128NaN( a, b );
  8142. exit;
  8143. end;
  8144. result := a;
  8145. exit;
  8146. end;
  8147. if ( bExp = 0 ) then begin
  8148. dec(expDiff);
  8149. end
  8150. else begin
  8151. bSig0 := bSig0 or int64( $4000000000000000 );
  8152. end;
  8153. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8154. aSig0 := aSig0 or int64( $4000000000000000 );
  8155. aBigger:
  8156. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8157. zExp := aExp;
  8158. normalizeRoundAndPack:
  8159. dec(zExp);
  8160. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8161. end;
  8162. {*----------------------------------------------------------------------------
  8163. | Returns the result of adding the quadruple-precision floating-point values
  8164. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8165. | for Binary Floating-Point Arithmetic.
  8166. *----------------------------------------------------------------------------*}
  8167. function float128_add(a: float128; b: float128): float128;
  8168. var
  8169. aSign, bSign: flag;
  8170. begin
  8171. aSign := extractFloat128Sign( a );
  8172. bSign := extractFloat128Sign( b );
  8173. if ( aSign = bSign ) then begin
  8174. result := addFloat128Sigs( a, b, aSign );
  8175. end
  8176. else begin
  8177. result := subFloat128Sigs( a, b, aSign );
  8178. end;
  8179. end;
  8180. {*----------------------------------------------------------------------------
  8181. | Returns the result of subtracting the quadruple-precision floating-point
  8182. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8183. | Standard for Binary Floating-Point Arithmetic.
  8184. *----------------------------------------------------------------------------*}
  8185. function float128_sub(a: float128; b: float128): float128;
  8186. var
  8187. aSign, bSign: flag;
  8188. begin
  8189. aSign := extractFloat128Sign( a );
  8190. bSign := extractFloat128Sign( b );
  8191. if ( aSign = bSign ) then begin
  8192. result := subFloat128Sigs( a, b, aSign );
  8193. end
  8194. else begin
  8195. result := addFloat128Sigs( a, b, aSign );
  8196. end;
  8197. end;
  8198. {*----------------------------------------------------------------------------
  8199. | Returns the result of multiplying the quadruple-precision floating-point
  8200. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8201. | Standard for Binary Floating-Point Arithmetic.
  8202. *----------------------------------------------------------------------------*}
  8203. function float128_mul(a: float128; b: float128): float128;
  8204. var
  8205. aSign, bSign, zSign: flag;
  8206. aExp, bExp, zExp: int32;
  8207. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8208. z: float128;
  8209. label
  8210. invalid;
  8211. begin
  8212. aSig1 := extractFloat128Frac1( a );
  8213. aSig0 := extractFloat128Frac0( a );
  8214. aExp := extractFloat128Exp( a );
  8215. aSign := extractFloat128Sign( a );
  8216. bSig1 := extractFloat128Frac1( b );
  8217. bSig0 := extractFloat128Frac0( b );
  8218. bExp := extractFloat128Exp( b );
  8219. bSign := extractFloat128Sign( b );
  8220. zSign := aSign xor bSign;
  8221. if ( aExp = $7FFF ) then begin
  8222. if ( (( aSig0 or aSig1 )<>0)
  8223. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8224. result := propagateFloat128NaN( a, b );
  8225. exit;
  8226. end;
  8227. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8228. result := packFloat128( zSign, $7FFF, 0, 0 );
  8229. exit;
  8230. end;
  8231. if ( bExp = $7FFF ) then begin
  8232. if ( bSig0 or bSig1 )<>0 then
  8233. begin
  8234. result := propagateFloat128NaN( a, b );
  8235. exit;
  8236. end;
  8237. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8238. invalid:
  8239. float_raise( float_flag_invalid );
  8240. z.low := float128_default_nan_low;
  8241. z.high := float128_default_nan_high;
  8242. result := z;
  8243. exit;
  8244. end;
  8245. result := packFloat128( zSign, $7FFF, 0, 0 );
  8246. exit;
  8247. end;
  8248. if ( aExp = 0 ) then begin
  8249. if ( ( aSig0 or aSig1 ) = 0 ) then
  8250. begin
  8251. result := packFloat128( zSign, 0, 0, 0 );
  8252. exit;
  8253. end;
  8254. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8255. end;
  8256. if ( bExp = 0 ) then begin
  8257. if ( ( bSig0 or bSig1 ) = 0 ) then
  8258. begin
  8259. result := packFloat128( zSign, 0, 0, 0 );
  8260. exit;
  8261. end;
  8262. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8263. end;
  8264. zExp := aExp + bExp - $4000;
  8265. aSig0 := aSig0 or int64( $0001000000000000 );
  8266. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8267. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8268. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8269. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8270. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8271. shift128ExtraRightJamming(
  8272. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8273. inc(zExp);
  8274. end;
  8275. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8276. end;
  8277. {*----------------------------------------------------------------------------
  8278. | Returns the result of dividing the quadruple-precision floating-point value
  8279. | `a' by the corresponding value `b'. The operation is performed according to
  8280. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8281. *----------------------------------------------------------------------------*}
  8282. function float128_div(a: float128; b: float128): float128;
  8283. var
  8284. aSign, bSign, zSign: flag;
  8285. aExp, bExp, zExp: int32;
  8286. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8287. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8288. z: float128;
  8289. label
  8290. invalid;
  8291. begin
  8292. aSig1 := extractFloat128Frac1( a );
  8293. aSig0 := extractFloat128Frac0( a );
  8294. aExp := extractFloat128Exp( a );
  8295. aSign := extractFloat128Sign( a );
  8296. bSig1 := extractFloat128Frac1( b );
  8297. bSig0 := extractFloat128Frac0( b );
  8298. bExp := extractFloat128Exp( b );
  8299. bSign := extractFloat128Sign( b );
  8300. zSign := aSign xor bSign;
  8301. if ( aExp = $7FFF ) then begin
  8302. if ( aSig0 or aSig1 )<>0 then
  8303. begin
  8304. result := propagateFloat128NaN( a, b );
  8305. exit;
  8306. end;
  8307. if ( bExp = $7FFF ) then begin
  8308. if ( bSig0 or bSig1 )<>0 then
  8309. begin
  8310. result := propagateFloat128NaN( a, b );
  8311. exit;
  8312. end;
  8313. goto invalid;
  8314. end;
  8315. result := packFloat128( zSign, $7FFF, 0, 0 );
  8316. exit;
  8317. end;
  8318. if ( bExp = $7FFF ) then begin
  8319. if ( bSig0 or bSig1 )<>0 then
  8320. begin
  8321. result := propagateFloat128NaN( a, b );
  8322. exit;
  8323. end;
  8324. result := packFloat128( zSign, 0, 0, 0 );
  8325. exit;
  8326. end;
  8327. if ( bExp = 0 ) then begin
  8328. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8329. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8330. invalid:
  8331. float_raise( float_flag_invalid );
  8332. z.low := float128_default_nan_low;
  8333. z.high := float128_default_nan_high;
  8334. result := z;
  8335. exit;
  8336. end;
  8337. float_raise( float_flag_divbyzero );
  8338. result := packFloat128( zSign, $7FFF, 0, 0 );
  8339. exit;
  8340. end;
  8341. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8342. end;
  8343. if ( aExp = 0 ) then begin
  8344. if ( ( aSig0 or aSig1 ) = 0 ) then
  8345. begin
  8346. result := packFloat128( zSign, 0, 0, 0 );
  8347. exit;
  8348. end;
  8349. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8350. end;
  8351. zExp := aExp - bExp + $3FFD;
  8352. shortShift128Left(
  8353. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8354. shortShift128Left(
  8355. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8356. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8357. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8358. inc(zExp);
  8359. end;
  8360. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8361. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8362. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8363. while ( sbits64(rem0) < 0 ) do begin
  8364. dec(zSig0);
  8365. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8366. end;
  8367. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8368. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8369. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8370. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8371. while ( sbits64(rem1) < 0 ) do begin
  8372. dec(zSig1);
  8373. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8374. end;
  8375. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8376. end;
  8377. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8378. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8379. end;
  8380. {*----------------------------------------------------------------------------
  8381. | Returns the remainder of the quadruple-precision floating-point value `a'
  8382. | with respect to the corresponding value `b'. The operation is performed
  8383. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8384. *----------------------------------------------------------------------------*}
  8385. function float128_rem(a: float128; b: float128): float128;
  8386. var
  8387. aSign, zSign: flag;
  8388. aExp, bExp, expDiff: int32;
  8389. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8390. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8391. sigMean0: sbits64;
  8392. z: float128;
  8393. label
  8394. invalid;
  8395. begin
  8396. aSig1 := extractFloat128Frac1( a );
  8397. aSig0 := extractFloat128Frac0( a );
  8398. aExp := extractFloat128Exp( a );
  8399. aSign := extractFloat128Sign( a );
  8400. bSig1 := extractFloat128Frac1( b );
  8401. bSig0 := extractFloat128Frac0( b );
  8402. bExp := extractFloat128Exp( b );
  8403. if ( aExp = $7FFF ) then begin
  8404. if ( (( aSig0 or aSig1 )<>0)
  8405. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8406. result := propagateFloat128NaN( a, b );
  8407. exit;
  8408. end;
  8409. goto invalid;
  8410. end;
  8411. if ( bExp = $7FFF ) then begin
  8412. if ( bSig0 or bSig1 )<>0 then
  8413. begin
  8414. result := propagateFloat128NaN( a, b );
  8415. exit;
  8416. end;
  8417. result := a;
  8418. exit;
  8419. end;
  8420. if ( bExp = 0 ) then begin
  8421. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8422. invalid:
  8423. float_raise( float_flag_invalid );
  8424. z.low := float128_default_nan_low;
  8425. z.high := float128_default_nan_high;
  8426. result := z;
  8427. exit;
  8428. end;
  8429. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8430. end;
  8431. if ( aExp = 0 ) then begin
  8432. if ( ( aSig0 or aSig1 ) = 0 ) then
  8433. begin
  8434. result := a;
  8435. exit;
  8436. end;
  8437. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8438. end;
  8439. expDiff := aExp - bExp;
  8440. if ( expDiff < -1 ) then
  8441. begin
  8442. result := a;
  8443. exit;
  8444. end;
  8445. shortShift128Left(
  8446. aSig0 or int64( $0001000000000000 ),
  8447. aSig1,
  8448. 15 - ord( expDiff < 0 ),
  8449. aSig0,
  8450. aSig1
  8451. );
  8452. shortShift128Left(
  8453. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8454. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8455. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8456. dec(expDiff,64);
  8457. while ( 0 < expDiff ) do begin
  8458. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8459. if ( 4 < q ) then
  8460. q := q - 4
  8461. else
  8462. q := 0;
  8463. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8464. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8465. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8466. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8467. dec(expDiff,61);
  8468. end;
  8469. if ( -64 < expDiff ) then begin
  8470. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8471. if ( 4 < q ) then
  8472. q := q - 4
  8473. else
  8474. q := 0;
  8475. q := q shr (- expDiff);
  8476. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8477. inc(expDiff,52);
  8478. if ( expDiff < 0 ) then begin
  8479. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8480. end
  8481. else begin
  8482. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8483. end;
  8484. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8485. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8486. end
  8487. else begin
  8488. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8489. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8490. end;
  8491. repeat
  8492. alternateASig0 := aSig0;
  8493. alternateASig1 := aSig1;
  8494. inc(q);
  8495. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8496. until not( 0 <= sbits64(aSig0) );
  8497. add128(
  8498. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8499. if ( ( sigMean0 < 0 )
  8500. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8501. aSig0 := alternateASig0;
  8502. aSig1 := alternateASig1;
  8503. end;
  8504. zSign := ord( sbits64(aSig0) < 0 );
  8505. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8506. result :=
  8507. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8508. end;
  8509. {*----------------------------------------------------------------------------
  8510. | Returns the square root of the quadruple-precision floating-point value `a'.
  8511. | The operation is performed according to the IEC/IEEE Standard for Binary
  8512. | Floating-Point Arithmetic.
  8513. *----------------------------------------------------------------------------*}
  8514. function float128_sqrt(a: float128): float128;
  8515. var
  8516. aSign: flag;
  8517. aExp, zExp: int32;
  8518. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8519. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8520. z: float128;
  8521. label
  8522. invalid;
  8523. begin
  8524. aSig1 := extractFloat128Frac1( a );
  8525. aSig0 := extractFloat128Frac0( a );
  8526. aExp := extractFloat128Exp( a );
  8527. aSign := extractFloat128Sign( a );
  8528. if ( aExp = $7FFF ) then begin
  8529. if ( aSig0 or aSig1 )<>0 then
  8530. begin
  8531. result := propagateFloat128NaN( a, a );
  8532. exit;
  8533. end;
  8534. if ( aSign=0 ) then
  8535. begin
  8536. result := a;
  8537. exit;
  8538. end;
  8539. goto invalid;
  8540. end;
  8541. if ( aSign<>0 ) then begin
  8542. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8543. begin
  8544. result := a;
  8545. exit;
  8546. end;
  8547. invalid:
  8548. float_raise( float_flag_invalid );
  8549. z.low := float128_default_nan_low;
  8550. z.high := float128_default_nan_high;
  8551. result := z;
  8552. exit;
  8553. end;
  8554. if ( aExp = 0 ) then begin
  8555. if ( ( aSig0 or aSig1 ) = 0 ) then
  8556. begin
  8557. result := packFloat128( 0, 0, 0, 0 );
  8558. exit;
  8559. end;
  8560. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8561. end;
  8562. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  8563. aSig0 := aSig0 or int64( $0001000000000000 );
  8564. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  8565. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8566. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8567. doubleZSig0 := zSig0 shl 1;
  8568. mul64To128( zSig0, zSig0, term0, term1 );
  8569. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8570. while ( sbits64(rem0) < 0 ) do begin
  8571. dec(zSig0);
  8572. dec(doubleZSig0,2);
  8573. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8574. end;
  8575. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8576. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8577. if ( zSig1 = 0 ) then zSig1 := 1;
  8578. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8579. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8580. mul64To128( zSig1, zSig1, term2, term3 );
  8581. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8582. while ( sbits64(rem1) < 0 ) do begin
  8583. dec(zSig1);
  8584. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8585. term3 := term3 or 1;
  8586. term2 := term2 or doubleZSig0;
  8587. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8588. end;
  8589. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8590. end;
  8591. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8592. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8593. end;
  8594. {*----------------------------------------------------------------------------
  8595. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8596. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8597. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8598. *----------------------------------------------------------------------------*}
  8599. function float128_eq(a: float128; b: float128): flag;
  8600. begin
  8601. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8602. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8603. or ( ( extractFloat128Exp( b ) = $7FFF )
  8604. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8605. ) then begin
  8606. if ( (float128_is_signaling_nan( a )<>0)
  8607. or (float128_is_signaling_nan( b )<>0) ) then begin
  8608. float_raise( float_flag_invalid );
  8609. end;
  8610. result := 0;
  8611. exit;
  8612. end;
  8613. result := ord(
  8614. ( a.low = b.low )
  8615. and ( ( a.high = b.high )
  8616. or ( ( a.low = 0 )
  8617. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8618. ));
  8619. end;
  8620. {*----------------------------------------------------------------------------
  8621. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8622. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8623. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8624. | Arithmetic.
  8625. *----------------------------------------------------------------------------*}
  8626. function float128_le(a: float128; b: float128): flag;
  8627. var
  8628. aSign, bSign: flag;
  8629. begin
  8630. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8631. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8632. or ( ( extractFloat128Exp( b ) = $7FFF )
  8633. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8634. ) then begin
  8635. float_raise( float_flag_invalid );
  8636. result := 0;
  8637. exit;
  8638. end;
  8639. aSign := extractFloat128Sign( a );
  8640. bSign := extractFloat128Sign( b );
  8641. if ( aSign <> bSign ) then begin
  8642. result := ord(
  8643. (aSign<>0)
  8644. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8645. = 0 ));
  8646. exit;
  8647. end;
  8648. if aSign<>0 then
  8649. result := le128( b.high, b.low, a.high, a.low )
  8650. else
  8651. result := le128( a.high, a.low, b.high, b.low );
  8652. end;
  8653. {*----------------------------------------------------------------------------
  8654. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8655. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8656. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8657. *----------------------------------------------------------------------------*}
  8658. function float128_lt(a: float128; b: float128): flag;
  8659. var
  8660. aSign, bSign: flag;
  8661. begin
  8662. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8663. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8664. or ( ( extractFloat128Exp( b ) = $7FFF )
  8665. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8666. ) then begin
  8667. float_raise( float_flag_invalid );
  8668. result := 0;
  8669. exit;
  8670. end;
  8671. aSign := extractFloat128Sign( a );
  8672. bSign := extractFloat128Sign( b );
  8673. if ( aSign <> bSign ) then begin
  8674. result := ord(
  8675. (aSign<>0)
  8676. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8677. <> 0 ));
  8678. exit;
  8679. end;
  8680. if aSign<>0 then
  8681. result := lt128( b.high, b.low, a.high, a.low )
  8682. else
  8683. result := lt128( a.high, a.low, b.high, b.low );
  8684. end;
  8685. {*----------------------------------------------------------------------------
  8686. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8687. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8688. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8689. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8690. *----------------------------------------------------------------------------*}
  8691. function float128_eq_signaling(a: float128; b: float128): flag;
  8692. begin
  8693. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8694. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8695. or ( ( extractFloat128Exp( b ) = $7FFF )
  8696. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8697. ) then begin
  8698. float_raise( float_flag_invalid );
  8699. result := 0;
  8700. exit;
  8701. end;
  8702. result := ord(
  8703. ( a.low = b.low )
  8704. and ( ( a.high = b.high )
  8705. or ( ( a.low = 0 )
  8706. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8707. ));
  8708. end;
  8709. {*----------------------------------------------------------------------------
  8710. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8711. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8712. | cause an exception. Otherwise, the comparison is performed according to the
  8713. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8714. *----------------------------------------------------------------------------*}
  8715. function float128_le_quiet(a: float128; b: float128): flag;
  8716. var
  8717. aSign, bSign: flag;
  8718. begin
  8719. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8720. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8721. or ( ( extractFloat128Exp( b ) = $7FFF )
  8722. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8723. ) then begin
  8724. if ( (float128_is_signaling_nan( a )<>0)
  8725. or (float128_is_signaling_nan( b )<>0) ) then begin
  8726. float_raise( float_flag_invalid );
  8727. end;
  8728. result := 0;
  8729. exit;
  8730. end;
  8731. aSign := extractFloat128Sign( a );
  8732. bSign := extractFloat128Sign( b );
  8733. if ( aSign <> bSign ) then begin
  8734. result := ord(
  8735. (aSign<>0)
  8736. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8737. = 0 ));
  8738. exit;
  8739. end;
  8740. if aSign<>0 then
  8741. result := le128( b.high, b.low, a.high, a.low )
  8742. else
  8743. result := le128( a.high, a.low, b.high, b.low );
  8744. end;
  8745. {*----------------------------------------------------------------------------
  8746. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8747. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8748. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8749. | Standard for Binary Floating-Point Arithmetic.
  8750. *----------------------------------------------------------------------------*}
  8751. function float128_lt_quiet(a: float128; b: float128): flag;
  8752. var
  8753. aSign, bSign: flag;
  8754. begin
  8755. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8756. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8757. or ( ( extractFloat128Exp( b ) = $7FFF )
  8758. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8759. ) then begin
  8760. if ( (float128_is_signaling_nan( a )<>0)
  8761. or (float128_is_signaling_nan( b )<>0) ) then begin
  8762. float_raise( float_flag_invalid );
  8763. end;
  8764. result := 0;
  8765. exit;
  8766. end;
  8767. aSign := extractFloat128Sign( a );
  8768. bSign := extractFloat128Sign( b );
  8769. if ( aSign <> bSign ) then begin
  8770. result := ord(
  8771. (aSign<>0)
  8772. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8773. <> 0 ));
  8774. exit;
  8775. end;
  8776. if aSign<>0 then
  8777. result:=lt128( b.high, b.low, a.high, a.low )
  8778. else
  8779. result:=lt128( a.high, a.low, b.high, b.low );
  8780. end;
  8781. {----------------------------------------------------------------------------
  8782. | Returns the result of converting the double-precision floating-point value
  8783. | `a' to the quadruple-precision floating-point format. The conversion is
  8784. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8785. | Arithmetic.
  8786. *----------------------------------------------------------------------------}
  8787. function float64_to_float128( a : float64) : float128;
  8788. var
  8789. aSign : flag;
  8790. aExp : int16;
  8791. aSig, zSig0, zSig1 : bits64;
  8792. begin
  8793. aSig := extractFloat64Frac( a );
  8794. aExp := extractFloat64Exp( a );
  8795. aSign := extractFloat64Sign( a );
  8796. if ( aExp = $7FF ) then begin
  8797. if ( aSig<>0 ) then begin
  8798. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8799. exit;
  8800. end;
  8801. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8802. exit;
  8803. end;
  8804. if ( aExp = 0 ) then begin
  8805. if ( aSig = 0 ) then
  8806. begin
  8807. result:=packFloat128( aSign, 0, 0, 0 );
  8808. exit;
  8809. end;
  8810. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8811. dec(aExp);
  8812. end;
  8813. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8814. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8815. end;
  8816. {$endif FPC_SOFTFLOAT_FLOAT128}
  8817. {$endif not(defined(fpc_softfpu_interface))}
  8818. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8819. end.
  8820. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}