softfpu.pp 327 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959596059615962596359645965596659675968596959705971597259735974597559765977597859795980598159825983598459855986598759885989599059915992599359945995599659975998599960006001600260036004600560066007600860096010601160126013601460156016601760186019602060216022602360246025602660276028602960306031603260336034603560366037603860396040604160426043604460456046604760486049605060516052605360546055605660576058605960606061606260636064606560666067606860696070607160726073607460756076607760786079608060816082608360846085608660876088608960906091609260936094609560966097609860996100610161026103610461056106610761086109611061116112611361146115611661176118611961206121612261236124612561266127612861296130613161326133613461356136613761386139614061416142614361446145614661476148614961506151615261536154615561566157615861596160616161626163616461656166616761686169617061716172617361746175617661776178617961806181618261836184618561866187618861896190619161926193619461956196619761986199620062016202620362046205620662076208620962106211621262136214621562166217621862196220622162226223622462256226622762286229623062316232623362346235623662376238623962406241624262436244624562466247624862496250625162526253625462556256625762586259626062616262626362646265626662676268626962706271627262736274627562766277627862796280628162826283628462856286628762886289629062916292629362946295629662976298629963006301630263036304630563066307630863096310631163126313631463156316631763186319632063216322632363246325632663276328632963306331633263336334633563366337633863396340634163426343634463456346634763486349635063516352635363546355635663576358635963606361636263636364636563666367636863696370637163726373637463756376637763786379638063816382638363846385638663876388638963906391639263936394639563966397639863996400640164026403640464056406640764086409641064116412641364146415641664176418641964206421642264236424642564266427642864296430643164326433643464356436643764386439644064416442644364446445644664476448644964506451645264536454645564566457645864596460646164626463646464656466646764686469647064716472647364746475647664776478647964806481648264836484648564866487648864896490649164926493649464956496649764986499650065016502650365046505650665076508650965106511651265136514651565166517651865196520652165226523652465256526652765286529653065316532653365346535653665376538653965406541654265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676667766786679668066816682668366846685668666876688668966906691669266936694669566966697669866996700670167026703670467056706670767086709671067116712671367146715671667176718671967206721672267236724672567266727672867296730673167326733673467356736673767386739674067416742674367446745674667476748674967506751675267536754675567566757675867596760676167626763676467656766676767686769677067716772677367746775677667776778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727772787279728072817282728372847285728672877288728972907291729272937294729572967297729872997300730173027303730473057306730773087309731073117312731373147315731673177318731973207321732273237324732573267327732873297330733173327333733473357336733773387339734073417342734373447345734673477348734973507351735273537354735573567357735873597360736173627363736473657366736773687369737073717372737373747375737673777378737973807381738273837384738573867387738873897390739173927393739473957396739773987399740074017402740374047405740674077408740974107411741274137414741574167417741874197420742174227423742474257426742774287429743074317432743374347435743674377438743974407441744274437444744574467447744874497450745174527453745474557456745774587459746074617462746374647465746674677468746974707471747274737474747574767477747874797480748174827483748474857486748774887489749074917492749374947495749674977498749975007501750275037504750575067507750875097510751175127513751475157516751775187519752075217522752375247525752675277528752975307531753275337534753575367537753875397540754175427543754475457546754775487549755075517552755375547555755675577558755975607561756275637564756575667567756875697570757175727573757475757576757775787579758075817582758375847585758675877588758975907591759275937594759575967597759875997600760176027603760476057606760776087609761076117612761376147615761676177618761976207621762276237624762576267627762876297630763176327633763476357636763776387639764076417642764376447645764676477648764976507651765276537654765576567657765876597660766176627663766476657666766776687669767076717672767376747675767676777678767976807681768276837684768576867687768876897690769176927693769476957696769776987699770077017702770377047705770677077708770977107711771277137714771577167717771877197720772177227723772477257726772777287729773077317732773377347735773677377738773977407741774277437744774577467747774877497750775177527753775477557756775777587759776077617762776377647765776677677768776977707771777277737774777577767777777877797780778177827783778477857786778777887789779077917792779377947795779677977798779978007801780278037804780578067807780878097810781178127813781478157816781778187819782078217822782378247825782678277828782978307831783278337834783578367837783878397840784178427843784478457846784778487849785078517852785378547855785678577858785978607861786278637864786578667867786878697870787178727873787478757876787778787879788078817882788378847885788678877888788978907891789278937894789578967897789878997900790179027903790479057906790779087909791079117912791379147915791679177918791979207921792279237924792579267927792879297930793179327933793479357936793779387939794079417942794379447945794679477948794979507951795279537954795579567957795879597960796179627963796479657966796779687969797079717972797379747975797679777978797979807981798279837984798579867987798879897990799179927993799479957996799779987999800080018002800380048005800680078008800980108011801280138014801580168017801880198020802180228023802480258026802780288029803080318032803380348035803680378038803980408041804280438044804580468047804880498050805180528053805480558056805780588059806080618062806380648065806680678068806980708071807280738074807580768077807880798080808180828083808480858086808780888089809080918092809380948095809680978098809981008101810281038104810581068107810881098110811181128113811481158116811781188119812081218122812381248125812681278128812981308131813281338134813581368137813881398140814181428143814481458146814781488149815081518152815381548155815681578158815981608161816281638164816581668167816881698170817181728173817481758176817781788179818081818182818381848185818681878188818981908191819281938194819581968197819881998200820182028203820482058206820782088209821082118212821382148215821682178218821982208221822282238224822582268227822882298230823182328233823482358236823782388239824082418242824382448245824682478248824982508251825282538254825582568257825882598260826182628263826482658266826782688269827082718272827382748275827682778278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877787788779878087818782878387848785878687878788878987908791879287938794879587968797879887998800880188028803880488058806880788088809881088118812881388148815881688178818881988208821882288238824882588268827882888298830883188328833883488358836883788388839884088418842884388448845884688478848884988508851885288538854885588568857885888598860886188628863886488658866886788688869887088718872887388748875887688778878887988808881888288838884888588868887888888898890889188928893889488958896889788988899890089018902890389048905890689078908890989108911891289138914891589168917891889198920892189228923892489258926892789288929893089318932893389348935893689378938893989408941894289438944894589468947894889498950895189528953895489558956895789588959896089618962896389648965896689678968896989708971897289738974897589768977897889798980898189828983898489858986898789888989899089918992899389948995899689978998899990009001900290039004900590069007900890099010901190129013901490159016901790189019902090219022902390249025902690279028902990309031903290339034903590369037903890399040904190429043904490459046904790489049905090519052905390549055905690579058905990609061906290639064906590669067906890699070907190729073907490759076907790789079908090819082908390849085908690879088908990909091909290939094909590969097909890999100910191029103910491059106910791089109911091119112911391149115911691179118911991209121912291239124912591269127912891299130913191329133913491359136913791389139914091419142914391449145914691479148914991509151915291539154915591569157915891599160916191629163916491659166916791689169917091719172917391749175917691779178917991809181918291839184918591869187918891899190919191929193919491959196919791989199920092019202920392049205920692079208920992109211921292139214921592169217921892199220922192229223922492259226922792289229923092319232923392349235923692379238923992409241924292439244924592469247924892499250925192529253925492559256925792589259926092619262926392649265926692679268926992709271927292739274927592769277927892799280928192829283928492859286928792889289929092919292929392949295929692979298929993009301930293039304930593069307930893099310931193129313931493159316931793189319932093219322932393249325932693279328932993309331933293339334933593369337933893399340934193429343934493459346934793489349935093519352935393549355935693579358935993609361936293639364936593669367936893699370937193729373937493759376937793789379938093819382938393849385938693879388938993909391
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. {$ifndef FPC_SYSTEM_HAS_float32}
  78. float32 = longword;
  79. {$define FPC_SYSTEM_HAS_float32}
  80. {$endif ndef FPC_SYSTEM_HAS_float32}
  81. { we use here a record in the function header because
  82. the record allows bitwise conversion to single }
  83. float32rec = record
  84. float32 : float32;
  85. end;
  86. flag = byte;
  87. bits8 = byte;
  88. sbits8 = shortint;
  89. bits16 = word;
  90. sbits16 = smallint;
  91. sbits32 = longint;
  92. bits32 = longword;
  93. {$ifndef fpc}
  94. qword = int64;
  95. {$endif}
  96. { now part of the system unit
  97. uint64 = qword;
  98. }
  99. bits64 = qword;
  100. sbits64 = int64;
  101. {$ifdef ENDIAN_LITTLE}
  102. {$ifndef FPC_SYSTEM_HAS_float64}
  103. float64 = record
  104. case byte of
  105. // force the record to be aligned like a double
  106. // else *_to_double will fail for cpus like sparc
  107. // and avoid expensive unpacking/packing operations
  108. 1: (dummy : double);
  109. 2: (low,high : bits32);
  110. end;
  111. {$endif ndef FPC_SYSTEM_HAS_float64}
  112. floatx80 = record
  113. case byte of
  114. // force the record to be aligned like a double
  115. // else *_to_double will fail for cpus like sparc
  116. // and avoid expensive unpacking/packing operations
  117. 1: (dummy : extended);
  118. 2: (low : qword;high : word);
  119. end;
  120. float128 = record
  121. case byte of
  122. // force the record to be aligned like a double
  123. // else *_to_double will fail for cpus like sparc
  124. // and avoid expensive unpacking/packing operations
  125. 1: (dummy : qword);
  126. 2: (low,high : qword);
  127. end;
  128. {$else}
  129. {$ifndef FPC_SYSTEM_HAS_float64}
  130. float64 = record
  131. case byte of
  132. // force the record to be aligned like a double
  133. // else *_to_double will fail for cpus like sparc
  134. 1: (dummy : double);
  135. 2: (high,low : bits32);
  136. end;
  137. {$endif ndef FPC_SYSTEM_HAS_float64}
  138. floatx80 = record
  139. case byte of
  140. // force the record to be aligned like a double
  141. // else *_to_double will fail for cpus like sparc
  142. // and avoid expensive unpacking/packing operations
  143. 1: (dummy : qword);
  144. 2: (high : word;low : qword);
  145. end;
  146. float128 = record
  147. case byte of
  148. // force the record to be aligned like a double
  149. // else *_to_double will fail for cpus like sparc
  150. // and avoid expensive unpacking/packing operations
  151. 1: (dummy : qword);
  152. 2: (high : qword;low : qword);
  153. end;
  154. {$endif}
  155. {$define FPC_SYSTEM_HAS_float64}
  156. {*
  157. -------------------------------------------------------------------------------
  158. Returns 1 if the double-precision floating-point value `a' is less than
  159. the corresponding value `b', and 0 otherwise. The comparison is performed
  160. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  161. -------------------------------------------------------------------------------
  162. *}
  163. Function float64_lt(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  164. {*
  165. -------------------------------------------------------------------------------
  166. Returns 1 if the double-precision floating-point value `a' is less than
  167. or equal to the corresponding value `b', and 0 otherwise. The comparison
  168. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  169. Arithmetic.
  170. -------------------------------------------------------------------------------
  171. *}
  172. Function float64_le(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  173. {*
  174. -------------------------------------------------------------------------------
  175. Returns 1 if the double-precision floating-point value `a' is equal to
  176. the corresponding value `b', and 0 otherwise. The comparison is performed
  177. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  178. -------------------------------------------------------------------------------
  179. *}
  180. Function float64_eq(a: float64;b: float64): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  181. {*
  182. -------------------------------------------------------------------------------
  183. Returns the square root of the double-precision floating-point value `a'.
  184. The operation is performed according to the IEC/IEEE Standard for Binary
  185. Floating-Point Arithmetic.
  186. -------------------------------------------------------------------------------
  187. *}
  188. function float64_sqrt( a: float64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  189. {*
  190. -------------------------------------------------------------------------------
  191. Returns the remainder of the double-precision floating-point value `a'
  192. with respect to the corresponding value `b'. The operation is performed
  193. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  194. -------------------------------------------------------------------------------
  195. *}
  196. Function float64_rem(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  197. {*
  198. -------------------------------------------------------------------------------
  199. Returns the result of dividing the double-precision floating-point value `a'
  200. by the corresponding value `b'. The operation is performed according to the
  201. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  202. -------------------------------------------------------------------------------
  203. *}
  204. Function float64_div(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  205. {*
  206. -------------------------------------------------------------------------------
  207. Returns the result of multiplying the double-precision floating-point values
  208. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  209. for Binary Floating-Point Arithmetic.
  210. -------------------------------------------------------------------------------
  211. *}
  212. Function float64_mul( a: float64; b:float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  213. {*
  214. -------------------------------------------------------------------------------
  215. Returns the result of subtracting the double-precision floating-point values
  216. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  217. for Binary Floating-Point Arithmetic.
  218. -------------------------------------------------------------------------------
  219. *}
  220. Function float64_sub(a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  221. {*
  222. -------------------------------------------------------------------------------
  223. Returns the result of adding the double-precision floating-point values `a'
  224. and `b'. The operation is performed according to the IEC/IEEE Standard for
  225. Binary Floating-Point Arithmetic.
  226. -------------------------------------------------------------------------------
  227. *}
  228. Function float64_add( a: float64; b : float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  229. {*
  230. -------------------------------------------------------------------------------
  231. Rounds the double-precision floating-point value `a' to an integer,
  232. and returns the result as a double-precision floating-point value. The
  233. operation is performed according to the IEC/IEEE Standard for Binary
  234. Floating-Point Arithmetic.
  235. -------------------------------------------------------------------------------
  236. *}
  237. Function float64_round_to_int(a: float64) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  238. {*
  239. -------------------------------------------------------------------------------
  240. Returns the result of converting the double-precision floating-point value
  241. `a' to the single-precision floating-point format. The conversion is
  242. performed according to the IEC/IEEE Standard for Binary Floating-Point
  243. Arithmetic.
  244. -------------------------------------------------------------------------------
  245. *}
  246. Function float64_to_float32(a: float64) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  247. {*
  248. -------------------------------------------------------------------------------
  249. Returns the result of converting the double-precision floating-point value
  250. `a' to the 32-bit two's complement integer format. The conversion is
  251. performed according to the IEC/IEEE Standard for Binary Floating-Point
  252. Arithmetic, except that the conversion is always rounded toward zero.
  253. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  254. the conversion overflows, the largest integer with the same sign as `a' is
  255. returned.
  256. -------------------------------------------------------------------------------
  257. *}
  258. Function float64_to_int32_round_to_zero(a: float64 ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  259. {*
  260. -------------------------------------------------------------------------------
  261. Returns the result of converting the double-precision floating-point value
  262. `a' to the 32-bit two's complement integer format. The conversion is
  263. performed according to the IEC/IEEE Standard for Binary Floating-Point
  264. Arithmetic---which means in particular that the conversion is rounded
  265. according to the current rounding mode. If `a' is a NaN, the largest
  266. positive integer is returned. Otherwise, if the conversion overflows, the
  267. largest integer with the same sign as `a' is returned.
  268. -------------------------------------------------------------------------------
  269. *}
  270. Function float64_to_int32(a: float64): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  271. {*
  272. -------------------------------------------------------------------------------
  273. Returns 1 if the single-precision floating-point value `a' is less than
  274. the corresponding value `b', and 0 otherwise. The comparison is performed
  275. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  276. -------------------------------------------------------------------------------
  277. *}
  278. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  279. {*
  280. -------------------------------------------------------------------------------
  281. Returns 1 if the single-precision floating-point value `a' is less than
  282. or equal to the corresponding value `b', and 0 otherwise. The comparison
  283. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  284. Arithmetic.
  285. -------------------------------------------------------------------------------
  286. *}
  287. Function float32_le( a: float32rec; b : float32rec ):flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  288. {*
  289. -------------------------------------------------------------------------------
  290. Returns 1 if the single-precision floating-point value `a' is equal to
  291. the corresponding value `b', and 0 otherwise. The comparison is performed
  292. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  293. -------------------------------------------------------------------------------
  294. *}
  295. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  296. {*
  297. -------------------------------------------------------------------------------
  298. Returns the square root of the single-precision floating-point value `a'.
  299. The operation is performed according to the IEC/IEEE Standard for Binary
  300. Floating-Point Arithmetic.
  301. -------------------------------------------------------------------------------
  302. *}
  303. Function float32_sqrt(a: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  304. {*
  305. -------------------------------------------------------------------------------
  306. Returns the remainder of the single-precision floating-point value `a'
  307. with respect to the corresponding value `b'. The operation is performed
  308. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  309. -------------------------------------------------------------------------------
  310. *}
  311. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  312. {*
  313. -------------------------------------------------------------------------------
  314. Returns the result of dividing the single-precision floating-point value `a'
  315. by the corresponding value `b'. The operation is performed according to the
  316. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  317. -------------------------------------------------------------------------------
  318. *}
  319. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  320. {*
  321. -------------------------------------------------------------------------------
  322. Returns the result of multiplying the single-precision floating-point values
  323. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  324. for Binary Floating-Point Arithmetic.
  325. -------------------------------------------------------------------------------
  326. *}
  327. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  328. {*
  329. -------------------------------------------------------------------------------
  330. Returns the result of subtracting the single-precision floating-point values
  331. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  332. for Binary Floating-Point Arithmetic.
  333. -------------------------------------------------------------------------------
  334. *}
  335. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  336. {*
  337. -------------------------------------------------------------------------------
  338. Returns the result of adding the single-precision floating-point values `a'
  339. and `b'. The operation is performed according to the IEC/IEEE Standard for
  340. Binary Floating-Point Arithmetic.
  341. -------------------------------------------------------------------------------
  342. *}
  343. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  344. {*
  345. -------------------------------------------------------------------------------
  346. Rounds the single-precision floating-point value `a' to an integer,
  347. and returns the result as a single-precision floating-point value. The
  348. operation is performed according to the IEC/IEEE Standard for Binary
  349. Floating-Point Arithmetic.
  350. -------------------------------------------------------------------------------
  351. *}
  352. Function float32_round_to_int( a: float32rec): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  353. {*
  354. -------------------------------------------------------------------------------
  355. Returns the result of converting the single-precision floating-point value
  356. `a' to the double-precision floating-point format. The conversion is
  357. performed according to the IEC/IEEE Standard for Binary Floating-Point
  358. Arithmetic.
  359. -------------------------------------------------------------------------------
  360. *}
  361. Function float32_to_float64( a : float32rec) : Float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  362. {*
  363. -------------------------------------------------------------------------------
  364. Returns the result of converting the single-precision floating-point value
  365. `a' to the 32-bit two's complement integer format. The conversion is
  366. performed according to the IEC/IEEE Standard for Binary Floating-Point
  367. Arithmetic, except that the conversion is always rounded toward zero.
  368. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  369. the conversion overflows, the largest integer with the same sign as `a' is
  370. returned.
  371. -------------------------------------------------------------------------------
  372. *}
  373. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  374. {*
  375. -------------------------------------------------------------------------------
  376. Returns the result of converting the single-precision floating-point value
  377. `a' to the 32-bit two's complement integer format. The conversion is
  378. performed according to the IEC/IEEE Standard for Binary Floating-Point
  379. Arithmetic---which means in particular that the conversion is rounded
  380. according to the current rounding mode. If `a' is a NaN, the largest
  381. positive integer is returned. Otherwise, if the conversion overflows, the
  382. largest integer with the same sign as `a' is returned.
  383. -------------------------------------------------------------------------------
  384. *}
  385. Function float32_to_int32( a : float32rec) : int32; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  386. {*
  387. -------------------------------------------------------------------------------
  388. Returns the result of converting the 32-bit two's complement integer `a' to
  389. the double-precision floating-point format. The conversion is performed
  390. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  391. -------------------------------------------------------------------------------
  392. *}
  393. Function int32_to_float64( a: int32) : float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  394. {*
  395. -------------------------------------------------------------------------------
  396. Returns the result of converting the 32-bit two's complement integer `a' to
  397. the single-precision floating-point format. The conversion is performed
  398. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  399. -------------------------------------------------------------------------------
  400. *}
  401. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  402. {*----------------------------------------------------------------------------
  403. | Returns the result of converting the 64-bit two's complement integer `a'
  404. | to the double-precision floating-point format. The conversion is performed
  405. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  406. *----------------------------------------------------------------------------*}
  407. Function int64_to_float64( a: int64 ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  408. Function qword_to_float64( a: qword ): float64; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  409. {*----------------------------------------------------------------------------
  410. | Returns the result of converting the 64-bit two's complement integer `a'
  411. | to the single-precision floating-point format. The conversion is performed
  412. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  413. *----------------------------------------------------------------------------*}
  414. Function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  415. Function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  416. // +++
  417. function float32_to_int64( a: float32 ): int64;
  418. function float32_to_int64_round_to_zero( a: float32 ): int64;
  419. function float32_eq_signaling( a: float32; b: float32) : flag;
  420. function float32_le_quiet( a: float32 ; b : float32 ): flag;
  421. function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  422. function float32_is_signaling_nan( a : float32 ): flag;
  423. function float32_is_nan( a : float32 ): flag;
  424. function float64_to_int64( a: float64 ): int64;
  425. function float64_to_int64_round_to_zero( a: float64 ): int64;
  426. function float64_eq_signaling( a: float64; b: float64): flag;
  427. function float64_le_quiet(a: float64 ; b: float64 ): flag;
  428. function float64_lt_quiet(a: float64; b: float64 ): Flag;
  429. function float64_is_signaling_nan( a : float64 ): flag;
  430. function float64_is_nan( a : float64 ): flag;
  431. // ===
  432. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  433. {*----------------------------------------------------------------------------
  434. | Extended double-precision rounding precision
  435. *----------------------------------------------------------------------------*}
  436. var // threadvar!?
  437. floatx80_rounding_precision : int8 = 80;
  438. function int32_to_floatx80( a: int32 ): floatx80;
  439. function int64_to_floatx80( a: int64 ): floatx80;
  440. function qword_to_floatx80( a: qword ): floatx80;
  441. function float32_to_floatx80( a: float32 ): floatx80;
  442. function float64_to_floatx80( a: float64 ): floatx80;
  443. function floatx80_to_int32( a: floatx80 ): int32;
  444. function floatx80_to_int32_round_to_zero( a: floatx80 ): int32;
  445. function floatx80_to_int64( a: floatx80 ): int64;
  446. function floatx80_to_int64_round_to_zero( a: floatx80 ): int64;
  447. function floatx80_to_float32( a: floatx80 ): float32;
  448. function floatx80_to_float64( a: floatx80 ): float64;
  449. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  450. function floatx80_to_float128( a: floatx80 ): float128;
  451. {$endif FPC_SOFTFLOAT_FLOAT128}
  452. function floatx80_round_to_int( a: floatx80 ): floatx80;
  453. function floatx80_add( a: floatx80; b: floatx80 ): floatx80;
  454. function floatx80_sub( a: floatx80; b: floatx80 ): floatx80;
  455. function floatx80_mul( a: floatx80; b: floatx80 ): floatx80;
  456. function floatx80_div( a: floatx80; b: floatx80 ): floatx80;
  457. function floatx80_rem( a: floatx80; b: floatx80 ): floatx80;
  458. function floatx80_sqrt( a: floatx80 ): floatx80;
  459. function floatx80_eq( a: floatx80; b: floatx80 ): flag;
  460. function floatx80_le( a: floatx80; b: floatx80 ): flag;
  461. function floatx80_lt( a: floatx80; b: floatx80 ): flag;
  462. function floatx80_eq_signaling( a: floatx80; b: floatx80 ): flag;
  463. function floatx80_le_quiet( a: floatx80; b: floatx80 ): flag;
  464. function floatx80_lt_quiet( a: floatx80; b: floatx80 ): flag;
  465. function floatx80_is_signaling_nan( a: floatx80 ): flag;
  466. function floatx80_is_nan(a : floatx80 ): flag;
  467. {$endif FPC_SOFTFLOAT_FLOATX80}
  468. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  469. function int32_to_float128( a: int32 ): float128;
  470. function int64_to_float128( a: int64 ): float128;
  471. function qword_to_float128( a: qword ): float128;
  472. function float32_to_float128( a: float32 ): float128;
  473. function float128_is_nan( a : float128): flag;
  474. function float128_is_signaling_nan( a : float128): flag;
  475. function float128_to_int32(a: float128): int32;
  476. function float128_to_int32_round_to_zero(a: float128): int32;
  477. function float128_to_int64(a: float128): int64;
  478. function float128_to_int64_round_to_zero(a: float128): int64;
  479. function float128_to_float32(a: float128): float32;
  480. function float128_to_float64(a: float128): float64;
  481. function float64_to_float128( a : float64) : float128;
  482. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  483. function float128_to_floatx80(a: float128): floatx80;
  484. {$endif FPC_SOFTFLOAT_FLOATX80}
  485. function float128_round_to_int(a: float128): float128;
  486. function float128_add(a: float128; b: float128): float128;
  487. function float128_sub(a: float128; b: float128): float128;
  488. function float128_mul(a: float128; b: float128): float128;
  489. function float128_div(a: float128; b: float128): float128;
  490. function float128_rem(a: float128; b: float128): float128;
  491. function float128_sqrt(a: float128): float128;
  492. function float128_eq(a: float128; b: float128): flag;
  493. function float128_le(a: float128; b: float128): flag;
  494. function float128_lt(a: float128; b: float128): flag;
  495. function float128_eq_signaling(a: float128; b: float128): flag;
  496. function float128_le_quiet(a: float128; b: float128): flag;
  497. function float128_lt_quiet(a: float128; b: float128): flag;
  498. {$endif FPC_SOFTFLOAT_FLOAT128}
  499. CONST
  500. {-------------------------------------------------------------------------------
  501. Software IEC/IEEE floating-point underflow tininess-detection mode.
  502. -------------------------------------------------------------------------------
  503. *}
  504. float_tininess_after_rounding = 0;
  505. float_tininess_before_rounding = 1;
  506. {*
  507. -------------------------------------------------------------------------------
  508. Underflow tininess-detection mode, statically initialized to default value.
  509. (The declaration in `softfloat.h' must match the `int8' type here.)
  510. -------------------------------------------------------------------------------
  511. *}
  512. var // threadvar!?
  513. softfloat_detect_tininess: int8 = float_tininess_after_rounding;
  514. {$endif not(defined(fpc_softfpu_implementation))}
  515. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  516. implementation
  517. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  518. {$if not(defined(fpc_softfpu_interface))}
  519. {$ifdef FPC}
  520. { disable range and overflow checking explicitly }
  521. { This might be more essential for x80 and 128-bit
  522. floating point types and could, maybe be
  523. restricted to code handle floatx80 and float128 }
  524. {$push}
  525. {$R-}
  526. {$Q-}
  527. {$endif FPC}
  528. (*****************************************************************************)
  529. (*----------------------------------------------------------------------------*)
  530. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  531. (* division and square root approximations. (Can be specialized to target if *)
  532. (* desired.) *)
  533. (* ---------------------------------------------------------------------------*)
  534. (*****************************************************************************)
  535. { This procedure serves as a single access point to softfloat_exception_flags.
  536. It also helps to reduce code size a bit because softfloat_exception_flags is
  537. a threadvar. }
  538. procedure set_inexact_flag;
  539. begin
  540. include(softfloat_exception_flags,float_flag_inexact);
  541. end;
  542. {*----------------------------------------------------------------------------
  543. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  544. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  545. | input. If `zSign' is 1, the input is negated before being converted to an
  546. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  547. | is simply rounded to an integer, with the inexact exception raised if the
  548. | input cannot be represented exactly as an integer. However, if the fixed-
  549. | point input is too large, the invalid exception is raised and the largest
  550. | positive or negative integer is returned.
  551. *----------------------------------------------------------------------------*}
  552. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  553. var
  554. roundingMode: TFPURoundingMode;
  555. roundNearestEven: boolean;
  556. roundIncrement, roundBits: int8;
  557. z: int32;
  558. begin
  559. roundingMode := softfloat_rounding_mode;
  560. roundNearestEven := (roundingMode = float_round_nearest_even);
  561. roundIncrement := $40;
  562. if not roundNearestEven then
  563. begin
  564. if ( roundingMode = float_round_to_zero ) then
  565. begin
  566. roundIncrement := 0;
  567. end
  568. else begin
  569. roundIncrement := $7F;
  570. if ( zSign<>0 ) then
  571. begin
  572. if ( roundingMode = float_round_up ) then
  573. roundIncrement := 0;
  574. end
  575. else begin
  576. if ( roundingMode = float_round_down ) then
  577. roundIncrement := 0;
  578. end;
  579. end;
  580. end;
  581. roundBits := lo(absZ) and $7F;
  582. absZ := ( absZ + roundIncrement ) shr 7;
  583. absZ := absZ and not( bits64( ord( ( roundBits xor $40 ) = 0 ) and ord(roundNearestEven) ));
  584. z := absZ;
  585. if ( zSign<>0 ) then
  586. z := - z;
  587. if ( longint(hi( absZ )) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  588. begin
  589. float_raise( float_flag_invalid );
  590. if zSign<>0 then
  591. result:=sbits32($80000000)
  592. else
  593. result:=$7FFFFFFF;
  594. exit;
  595. end;
  596. if ( roundBits<>0 ) then
  597. set_inexact_flag;
  598. result:=z;
  599. end;
  600. {*----------------------------------------------------------------------------
  601. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  602. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  603. | and returns the properly rounded 64-bit integer corresponding to the input.
  604. | If `zSign' is 1, the input is negated before being converted to an integer.
  605. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  606. | the inexact exception raised if the input cannot be represented exactly as
  607. | an integer. However, if the fixed-point input is too large, the invalid
  608. | exception is raised and the largest positive or negative integer is
  609. | returned.
  610. *----------------------------------------------------------------------------*}
  611. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  612. var
  613. roundingMode: TFPURoundingMode;
  614. roundNearestEven, increment: flag;
  615. z: int64;
  616. label
  617. overflow;
  618. begin
  619. roundingMode := softfloat_rounding_mode;
  620. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  621. increment := ord( sbits64(absZ1) < 0 );
  622. if ( roundNearestEven=0 ) then
  623. begin
  624. if ( roundingMode = float_round_to_zero ) then
  625. begin
  626. increment := 0;
  627. end
  628. else begin
  629. if ( zSign<>0 ) then
  630. begin
  631. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  632. end
  633. else begin
  634. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  635. end;
  636. end;
  637. end;
  638. if ( increment<>0 ) then
  639. begin
  640. inc(absZ0);
  641. if ( absZ0 = 0 ) then
  642. goto overflow;
  643. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  644. end;
  645. z := absZ0;
  646. if ( zSign<>0 ) then
  647. z := - z;
  648. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  649. begin
  650. overflow:
  651. float_raise( float_flag_invalid );
  652. if zSign<>0 then
  653. result:=int64($8000000000000000)
  654. else
  655. result:=int64($7FFFFFFFFFFFFFFF);
  656. exit;
  657. end;
  658. if ( absZ1<>0 ) then
  659. set_inexact_flag;
  660. result:=z;
  661. end;
  662. {*
  663. -------------------------------------------------------------------------------
  664. Shifts `a' right by the number of bits given in `count'. If any nonzero
  665. bits are shifted off, they are ``jammed'' into the least significant bit of
  666. the result by setting the least significant bit to 1. The value of `count'
  667. can be arbitrarily large; in particular, if `count' is greater than 32, the
  668. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  669. The result is stored in the location pointed to by `zPtr'.
  670. -------------------------------------------------------------------------------
  671. *}
  672. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  673. var
  674. z: Bits32;
  675. Begin
  676. if ( count = 0 ) then
  677. z := a
  678. else
  679. if ( count < 32 ) then
  680. Begin
  681. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  682. End
  683. else
  684. Begin
  685. z := bits32( a <> 0 );
  686. End;
  687. zPtr := z;
  688. End;
  689. {*----------------------------------------------------------------------------
  690. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  691. | number of bits given in `count'. Any bits shifted off are lost. The value
  692. | of `count' can be arbitrarily large; in particular, if `count' is greater
  693. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  694. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  695. *----------------------------------------------------------------------------*}
  696. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  697. var
  698. z0, z1: bits64;
  699. negCount: int8;
  700. begin
  701. negCount := ( - count ) and 63;
  702. if ( count = 0 ) then
  703. begin
  704. z1 := a1;
  705. z0 := a0;
  706. end
  707. else if ( count < 64 ) then
  708. begin
  709. z1 := ( a0 shl negCount ) or ( a1 shr count );
  710. z0 := a0 shr count;
  711. end
  712. else
  713. begin
  714. if ( count < 128 ) then
  715. z1 := a0 shr ( count and 63 )
  716. else
  717. z1 := 0;
  718. z0 := 0;
  719. end;
  720. z1Ptr := z1;
  721. z0Ptr := z0;
  722. end;
  723. {*----------------------------------------------------------------------------
  724. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  725. | number of bits given in `count'. If any nonzero bits are shifted off, they
  726. | are ``jammed'' into the least significant bit of the result by setting the
  727. | least significant bit to 1. The value of `count' can be arbitrarily large;
  728. | in particular, if `count' is greater than 128, the result will be either
  729. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  730. | nonzero. The result is broken into two 64-bit pieces which are stored at
  731. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  732. *----------------------------------------------------------------------------*}
  733. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  734. var
  735. z0,z1 : bits64;
  736. negCount : int8;
  737. begin
  738. negCount := ( - count ) and 63;
  739. if ( count = 0 ) then begin
  740. z1 := a1;
  741. z0 := a0;
  742. end
  743. else if ( count < 64 ) then begin
  744. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  745. z0 := a0 shr count;
  746. end
  747. else begin
  748. if ( count = 64 ) then begin
  749. z1 := a0 or ord( a1 <> 0 );
  750. end
  751. else if ( count < 128 ) then begin
  752. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  753. end
  754. else begin
  755. z1 := ord( ( a0 or a1 ) <> 0 );
  756. end;
  757. z0 := 0;
  758. end;
  759. z1Ptr := z1;
  760. z0Ptr := z0;
  761. end;
  762. {*
  763. -------------------------------------------------------------------------------
  764. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  765. number of bits given in `count'. Any bits shifted off are lost. The value
  766. of `count' can be arbitrarily large; in particular, if `count' is greater
  767. than 64, the result will be 0. The result is broken into two 32-bit pieces
  768. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  769. -------------------------------------------------------------------------------
  770. *}
  771. Procedure
  772. shift64Right(
  773. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  774. Var
  775. z0, z1: bits32;
  776. negCount : int8;
  777. Begin
  778. negCount := ( - count ) AND 31;
  779. if ( count = 0 ) then
  780. Begin
  781. z1 := a1;
  782. z0 := a0;
  783. End
  784. else if ( count < 32 ) then
  785. Begin
  786. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  787. z0 := a0 shr count;
  788. End
  789. else
  790. Begin
  791. if (count < 64) then
  792. z1 := ( a0 shr ( count AND 31 ) )
  793. else
  794. z1 := 0;
  795. z0 := 0;
  796. End;
  797. z1Ptr := z1;
  798. z0Ptr := z0;
  799. End;
  800. {*
  801. -------------------------------------------------------------------------------
  802. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  803. number of bits given in `count'. If any nonzero bits are shifted off, they
  804. are ``jammed'' into the least significant bit of the result by setting the
  805. least significant bit to 1. The value of `count' can be arbitrarily large;
  806. in particular, if `count' is greater than 64, the result will be either 0
  807. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  808. nonzero. The result is broken into two 32-bit pieces which are stored at
  809. the locations pointed to by `z0Ptr' and `z1Ptr'.
  810. -------------------------------------------------------------------------------
  811. *}
  812. Procedure
  813. shift64RightJamming(
  814. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  815. VAR
  816. z0, z1 : bits32;
  817. negCount : int8;
  818. Begin
  819. negCount := ( - count ) AND 31;
  820. if ( count = 0 ) then
  821. Begin
  822. z1 := a1;
  823. z0 := a0;
  824. End
  825. else
  826. if ( count < 32 ) then
  827. Begin
  828. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  829. z0 := a0 shr count;
  830. End
  831. else
  832. Begin
  833. if ( count = 32 ) then
  834. Begin
  835. z1 := a0 OR bits32( a1 <> 0 );
  836. End
  837. else
  838. if ( count < 64 ) Then
  839. Begin
  840. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  841. End
  842. else
  843. Begin
  844. z1 := bits32( ( a0 OR a1 ) <> 0 );
  845. End;
  846. z0 := 0;
  847. End;
  848. z1Ptr := z1;
  849. z0Ptr := z0;
  850. End;
  851. {*----------------------------------------------------------------------------
  852. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  853. | bits are shifted off, they are ``jammed'' into the least significant bit of
  854. | the result by setting the least significant bit to 1. The value of `count'
  855. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  856. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  857. | The result is stored in the location pointed to by `zPtr'.
  858. *----------------------------------------------------------------------------*}
  859. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  860. var
  861. z: bits64;
  862. begin
  863. if ( count = 0 ) then
  864. begin
  865. z := a;
  866. end
  867. else if ( count < 64 ) then
  868. begin
  869. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  870. end
  871. else
  872. begin
  873. z := ord( a <> 0 );
  874. end;
  875. zPtr := z;
  876. end;
  877. {$if not defined(shift64ExtraRightJamming)}
  878. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  879. overload;
  880. forward;
  881. {$endif}
  882. {*
  883. -------------------------------------------------------------------------------
  884. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  885. by 32 _plus_ the number of bits given in `count'. The shifted result is
  886. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  887. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  888. off form a third 32-bit result as follows: The _last_ bit shifted off is
  889. the most-significant bit of the extra result, and the other 31 bits of the
  890. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  891. were all zero. This extra result is stored in the location pointed to by
  892. `z2Ptr'. The value of `count' can be arbitrarily large.
  893. (This routine makes more sense if `a0', `a1', and `a2' are considered
  894. to form a fixed-point value with binary point between `a1' and `a2'. This
  895. fixed-point value is shifted right by the number of bits given in `count',
  896. and the integer part of the result is returned at the locations pointed to
  897. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  898. corrupted as described above, and is returned at the location pointed to by
  899. `z2Ptr'.)
  900. -------------------------------------------------------------------------------
  901. }
  902. Procedure
  903. shift64ExtraRightJamming(
  904. a0: bits32;
  905. a1: bits32;
  906. a2: bits32;
  907. count: int16;
  908. VAR z0Ptr: bits32;
  909. VAR z1Ptr: bits32;
  910. VAR z2Ptr: bits32
  911. ); overload;
  912. Var
  913. z0, z1, z2: bits32;
  914. negCount : int8;
  915. Begin
  916. negCount := ( - count ) AND 31;
  917. if ( count = 0 ) then
  918. Begin
  919. z2 := a2;
  920. z1 := a1;
  921. z0 := a0;
  922. End
  923. else
  924. Begin
  925. if ( count < 32 ) Then
  926. Begin
  927. z2 := a1 shl negCount;
  928. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  929. z0 := a0 shr count;
  930. End
  931. else
  932. Begin
  933. if ( count = 32 ) then
  934. Begin
  935. z2 := a1;
  936. z1 := a0;
  937. End
  938. else
  939. Begin
  940. a2 := a2 or a1;
  941. if ( count < 64 ) then
  942. Begin
  943. z2 := a0 shl negCount;
  944. z1 := a0 shr ( count AND 31 );
  945. End
  946. else
  947. Begin
  948. if count = 64 then
  949. z2 := a0
  950. else
  951. z2 := bits32(a0 <> 0);
  952. z1 := 0;
  953. End;
  954. End;
  955. z0 := 0;
  956. End;
  957. z2 := z2 or bits32( a2 <> 0 );
  958. End;
  959. z2Ptr := z2;
  960. z1Ptr := z1;
  961. z0Ptr := z0;
  962. End;
  963. {*
  964. -------------------------------------------------------------------------------
  965. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  966. number of bits given in `count'. Any bits shifted off are lost. The value
  967. of `count' must be less than 32. The result is broken into two 32-bit
  968. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  969. -------------------------------------------------------------------------------
  970. *}
  971. Procedure
  972. shortShift64Left(
  973. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  974. Begin
  975. z1Ptr := a1 shl count;
  976. if count = 0 then
  977. z0Ptr := a0
  978. else
  979. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  980. End;
  981. {*
  982. -------------------------------------------------------------------------------
  983. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  984. by the number of bits given in `count'. Any bits shifted off are lost.
  985. The value of `count' must be less than 32. The result is broken into three
  986. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  987. `z1Ptr', and `z2Ptr'.
  988. -------------------------------------------------------------------------------
  989. *}
  990. Procedure
  991. shortShift96Left(
  992. a0: bits32;
  993. a1: bits32;
  994. a2: bits32;
  995. count: int16;
  996. VAR z0Ptr: bits32;
  997. VAR z1Ptr: bits32;
  998. VAR z2Ptr: bits32
  999. );
  1000. Var
  1001. z0, z1, z2: bits32;
  1002. negCount: int8;
  1003. Begin
  1004. z2 := a2 shl count;
  1005. z1 := a1 shl count;
  1006. z0 := a0 shl count;
  1007. if ( 0 < count ) then
  1008. Begin
  1009. negCount := ( ( - count ) AND 31 );
  1010. z1 := z1 or (a2 shr negCount);
  1011. z0 := z0 or (a1 shr negCount);
  1012. End;
  1013. z2Ptr := z2;
  1014. z1Ptr := z1;
  1015. z0Ptr := z0;
  1016. End;
  1017. {*----------------------------------------------------------------------------
  1018. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  1019. | number of bits given in `count'. Any bits shifted off are lost. The value
  1020. | of `count' must be less than 64. The result is broken into two 64-bit
  1021. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1022. *----------------------------------------------------------------------------*}
  1023. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  1024. begin
  1025. z1Ptr := a1 shl count;
  1026. if count=0 then
  1027. z0Ptr:=a0
  1028. else
  1029. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  1030. end;
  1031. {*
  1032. -------------------------------------------------------------------------------
  1033. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  1034. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  1035. any carry out is lost. The result is broken into two 32-bit pieces which
  1036. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1037. -------------------------------------------------------------------------------
  1038. *}
  1039. Procedure
  1040. add64(
  1041. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1042. Var
  1043. z1: bits32;
  1044. Begin
  1045. z1 := a1 + b1;
  1046. z1Ptr := z1;
  1047. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  1048. End;
  1049. {*
  1050. -------------------------------------------------------------------------------
  1051. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  1052. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1053. modulo 2^96, so any carry out is lost. The result is broken into three
  1054. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1055. `z1Ptr', and `z2Ptr'.
  1056. -------------------------------------------------------------------------------
  1057. *}
  1058. Procedure
  1059. add96(
  1060. a0: bits32;
  1061. a1: bits32;
  1062. a2: bits32;
  1063. b0: bits32;
  1064. b1: bits32;
  1065. b2: bits32;
  1066. VAR z0Ptr: bits32;
  1067. VAR z1Ptr: bits32;
  1068. VAR z2Ptr: bits32
  1069. );
  1070. var
  1071. z0, z1, z2: bits32;
  1072. carry0, carry1: int8;
  1073. Begin
  1074. z2 := a2 + b2;
  1075. carry1 := int8( z2 < a2 );
  1076. z1 := a1 + b1;
  1077. carry0 := int8( z1 < a1 );
  1078. z0 := a0 + b0;
  1079. z1 := z1 + carry1;
  1080. z0 := z0 + bits32( z1 < carry1 );
  1081. z0 := z0 + carry0;
  1082. z2Ptr := z2;
  1083. z1Ptr := z1;
  1084. z0Ptr := z0;
  1085. End;
  1086. {*----------------------------------------------------------------------------
  1087. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  1088. | by the number of bits given in `count'. Any bits shifted off are lost.
  1089. | The value of `count' must be less than 64. The result is broken into three
  1090. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1091. | `z1Ptr', and `z2Ptr'.
  1092. *----------------------------------------------------------------------------*}
  1093. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1094. var
  1095. z0, z1, z2 : bits64;
  1096. negCount : int8;
  1097. begin
  1098. z2 := a2 shl count;
  1099. z1 := a1 shl count;
  1100. z0 := a0 shl count;
  1101. if ( 0 < count ) then
  1102. begin
  1103. negCount := ( ( - count ) and 63 );
  1104. z1 := z1 or (a2 shr negCount);
  1105. z0 := z0 or (a1 shr negCount);
  1106. end;
  1107. z2Ptr := z2;
  1108. z1Ptr := z1;
  1109. z0Ptr := z0;
  1110. end;
  1111. {*----------------------------------------------------------------------------
  1112. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1113. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1114. | any carry out is lost. The result is broken into two 64-bit pieces which
  1115. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1116. *----------------------------------------------------------------------------*}
  1117. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1118. var
  1119. z1 : bits64;
  1120. begin
  1121. z1 := a1 + b1;
  1122. z1Ptr := z1;
  1123. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1124. end;
  1125. {*----------------------------------------------------------------------------
  1126. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1127. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1128. | modulo 2^192, so any carry out is lost. The result is broken into three
  1129. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1130. | `z1Ptr', and `z2Ptr'.
  1131. *----------------------------------------------------------------------------*}
  1132. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1133. var
  1134. z0, z1, z2 : bits64;
  1135. carry0, carry1 : int8;
  1136. begin
  1137. z2 := a2 + b2;
  1138. carry1 := ord( z2 < a2 );
  1139. z1 := a1 + b1;
  1140. carry0 := ord( z1 < a1 );
  1141. z0 := a0 + b0;
  1142. inc(z1, carry1);
  1143. inc(z0, ord( z1 < carry1 ));
  1144. inc(z0, carry0);
  1145. z2Ptr := z2;
  1146. z1Ptr := z1;
  1147. z0Ptr := z0;
  1148. end;
  1149. {*
  1150. -------------------------------------------------------------------------------
  1151. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1152. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1153. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1154. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1155. `z1Ptr'.
  1156. -------------------------------------------------------------------------------
  1157. *}
  1158. Procedure
  1159. sub64(
  1160. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1161. Begin
  1162. z1Ptr := a1 - b1;
  1163. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1164. End;
  1165. {*
  1166. -------------------------------------------------------------------------------
  1167. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1168. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1169. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1170. into three 32-bit pieces which are stored at the locations pointed to by
  1171. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1172. -------------------------------------------------------------------------------
  1173. *}
  1174. Procedure
  1175. sub96(
  1176. a0:bits32;
  1177. a1:bits32;
  1178. a2:bits32;
  1179. b0:bits32;
  1180. b1:bits32;
  1181. b2:bits32;
  1182. VAR z0Ptr:bits32;
  1183. VAR z1Ptr:bits32;
  1184. VAR z2Ptr:bits32
  1185. );
  1186. Var
  1187. z0, z1, z2: bits32;
  1188. borrow0, borrow1: int8;
  1189. Begin
  1190. z2 := a2 - b2;
  1191. borrow1 := int8( a2 < b2 );
  1192. z1 := a1 - b1;
  1193. borrow0 := int8( a1 < b1 );
  1194. z0 := a0 - b0;
  1195. z0 := z0 - bits32( z1 < borrow1 );
  1196. z1 := z1 - borrow1;
  1197. z0 := z0 -borrow0;
  1198. z2Ptr := z2;
  1199. z1Ptr := z1;
  1200. z0Ptr := z0;
  1201. End;
  1202. {*----------------------------------------------------------------------------
  1203. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1204. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1205. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1206. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1207. | `z1Ptr'.
  1208. *----------------------------------------------------------------------------*}
  1209. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1210. begin
  1211. z1Ptr := a1 - b1;
  1212. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1213. end;
  1214. {*----------------------------------------------------------------------------
  1215. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1216. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1217. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1218. | result is broken into three 64-bit pieces which are stored at the locations
  1219. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1220. *----------------------------------------------------------------------------*}
  1221. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1222. var
  1223. z0, z1, z2 : bits64;
  1224. borrow0, borrow1 : int8;
  1225. begin
  1226. z2 := a2 - b2;
  1227. borrow1 := ord( a2 < b2 );
  1228. z1 := a1 - b1;
  1229. borrow0 := ord( a1 < b1 );
  1230. z0 := a0 - b0;
  1231. dec(z0, ord( z1 < borrow1 ));
  1232. dec(z1, borrow1);
  1233. dec(z0, borrow0);
  1234. z2Ptr := z2;
  1235. z1Ptr := z1;
  1236. z0Ptr := z0;
  1237. end;
  1238. {*
  1239. -------------------------------------------------------------------------------
  1240. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1241. into two 32-bit pieces which are stored at the locations pointed to by
  1242. `z0Ptr' and `z1Ptr'.
  1243. -------------------------------------------------------------------------------
  1244. *}
  1245. {$IFDEF SOFTFPU_COMPILER_MUL32TO64}
  1246. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr :bits32 );{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1247. var
  1248. tmp: qword;
  1249. begin
  1250. tmp:=qword(a) * b;
  1251. z0ptr:=hi(tmp);
  1252. z1ptr:=lo(tmp);
  1253. end;
  1254. {$ELSE}
  1255. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1256. :bits32 );
  1257. Var
  1258. aHigh, aLow, bHigh, bLow: bits16;
  1259. z0, zMiddleA, zMiddleB, z1: bits32;
  1260. Begin
  1261. aLow := bits16(a);
  1262. aHigh := a shr 16;
  1263. bLow := bits16(b);
  1264. bHigh := b shr 16;
  1265. z1 := ( bits32( aLow) ) * bLow;
  1266. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1267. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1268. z0 := ( bits32 (aHigh) ) * bHigh;
  1269. zMiddleA := zMiddleA + zMiddleB;
  1270. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1271. zMiddleA := zmiddleA shl 16;
  1272. z1 := z1 + zMiddleA;
  1273. z0 := z0 + bits32( z1 < zMiddleA );
  1274. z1Ptr := z1;
  1275. z0Ptr := z0;
  1276. End;
  1277. {$ENDIF}
  1278. {*
  1279. -------------------------------------------------------------------------------
  1280. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1281. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1282. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1283. `z2Ptr'.
  1284. -------------------------------------------------------------------------------
  1285. *}
  1286. Procedure
  1287. mul64By32To96(
  1288. a0:bits32;
  1289. a1:bits32;
  1290. b:bits32;
  1291. VAR z0Ptr:bits32;
  1292. VAR z1Ptr:bits32;
  1293. VAR z2Ptr:bits32
  1294. );
  1295. Var
  1296. z0, z1, z2, more1: bits32;
  1297. Begin
  1298. mul32To64( a1, b, z1, z2 );
  1299. mul32To64( a0, b, z0, more1 );
  1300. add64( z0, more1, 0, z1, z0, z1 );
  1301. z2Ptr := z2;
  1302. z1Ptr := z1;
  1303. z0Ptr := z0;
  1304. End;
  1305. {*
  1306. -------------------------------------------------------------------------------
  1307. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1308. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1309. product. The product is broken into four 32-bit pieces which are stored at
  1310. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1311. -------------------------------------------------------------------------------
  1312. *}
  1313. Procedure
  1314. mul64To128(
  1315. a0:bits32;
  1316. a1:bits32;
  1317. b0:bits32;
  1318. b1:bits32;
  1319. VAR z0Ptr:bits32;
  1320. VAR z1Ptr:bits32;
  1321. VAR z2Ptr:bits32;
  1322. VAR z3Ptr:bits32
  1323. );
  1324. Var
  1325. z0, z1, z2, z3: bits32;
  1326. more1, more2: bits32;
  1327. Begin
  1328. mul32To64( a1, b1, z2, z3 );
  1329. mul32To64( a1, b0, z1, more2 );
  1330. add64( z1, more2, 0, z2, z1, z2 );
  1331. mul32To64( a0, b0, z0, more1 );
  1332. add64( z0, more1, 0, z1, z0, z1 );
  1333. mul32To64( a0, b1, more1, more2 );
  1334. add64( more1, more2, 0, z2, more1, z2 );
  1335. add64( z0, z1, 0, more1, z0, z1 );
  1336. z3Ptr := z3;
  1337. z2Ptr := z2;
  1338. z1Ptr := z1;
  1339. z0Ptr := z0;
  1340. End;
  1341. {*----------------------------------------------------------------------------
  1342. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1343. | into two 64-bit pieces which are stored at the locations pointed to by
  1344. | `z0Ptr' and `z1Ptr'.
  1345. *----------------------------------------------------------------------------*}
  1346. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1347. var
  1348. aHigh, aLow, bHigh, bLow : bits32;
  1349. z0, zMiddleA, zMiddleB, z1 : bits64;
  1350. begin
  1351. aLow := a;
  1352. aHigh := a shr 32;
  1353. bLow := b;
  1354. bHigh := b shr 32;
  1355. z1 := ( bits64(aLow) ) * bLow;
  1356. zMiddleA := ( bits64( aLow )) * bHigh;
  1357. zMiddleB := ( bits64( aHigh )) * bLow;
  1358. z0 := ( bits64(aHigh) ) * bHigh;
  1359. inc(zMiddleA, zMiddleB);
  1360. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1361. zMiddleA := zMiddleA shl 32;
  1362. inc(z1, zMiddleA);
  1363. inc(z0, ord( z1 < zMiddleA ));
  1364. z1Ptr := z1;
  1365. z0Ptr := z0;
  1366. end;
  1367. {*----------------------------------------------------------------------------
  1368. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1369. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1370. | product. The product is broken into four 64-bit pieces which are stored at
  1371. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1372. *----------------------------------------------------------------------------*}
  1373. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1374. var
  1375. z0,z1,z2,z3,more1,more2 : bits64;
  1376. begin
  1377. mul64To128( a1, b1, z2, z3 );
  1378. mul64To128( a1, b0, z1, more2 );
  1379. add128( z1, more2, 0, z2, z1, z2 );
  1380. mul64To128( a0, b0, z0, more1 );
  1381. add128( z0, more1, 0, z1, z0, z1 );
  1382. mul64To128( a0, b1, more1, more2 );
  1383. add128( more1, more2, 0, z2, more1, z2 );
  1384. add128( z0, z1, 0, more1, z0, z1 );
  1385. z3Ptr := z3;
  1386. z2Ptr := z2;
  1387. z1Ptr := z1;
  1388. z0Ptr := z0;
  1389. end;
  1390. {*----------------------------------------------------------------------------
  1391. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1392. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1393. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1394. | `z2Ptr'.
  1395. *----------------------------------------------------------------------------*}
  1396. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1397. var
  1398. z0, z1, z2, more1 : bits64;
  1399. begin
  1400. mul64To128( a1, b, z1, z2 );
  1401. mul64To128( a0, b, z0, more1 );
  1402. add128( z0, more1, 0, z1, z0, z1 );
  1403. z2Ptr := z2;
  1404. z1Ptr := z1;
  1405. z0Ptr := z0;
  1406. end;
  1407. {*----------------------------------------------------------------------------
  1408. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1409. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1410. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1411. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1412. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1413. | unsigned integer is returned.
  1414. *----------------------------------------------------------------------------*}
  1415. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1416. var
  1417. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1418. begin
  1419. if ( b <= a0 ) then
  1420. begin
  1421. result:=qword( $FFFFFFFFFFFFFFFF );
  1422. exit;
  1423. end;
  1424. b0 := b shr 32;
  1425. if ( b0 shl 32 <= a0 ) then
  1426. z:=qword( $FFFFFFFF00000000 )
  1427. else
  1428. z:=( a0 div b0 ) shl 32;
  1429. mul64To128( b, z, term0, term1 );
  1430. sub128( a0, a1, term0, term1, rem0, rem1 );
  1431. while ( ( sbits64(rem0) ) < 0 ) do begin
  1432. dec(z,qword( $100000000 ));
  1433. b1 := b shl 32;
  1434. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1435. end;
  1436. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1437. if ( b0 shl 32 <= rem0 ) then
  1438. z:=z or $FFFFFFFF
  1439. else
  1440. z:=z or rem0 div b0;
  1441. result:=z;
  1442. end;
  1443. {*
  1444. -------------------------------------------------------------------------------
  1445. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1446. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1447. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1448. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1449. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1450. unsigned integer is returned.
  1451. -------------------------------------------------------------------------------
  1452. *}
  1453. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1454. Var
  1455. b0, b1: bits32;
  1456. rem0, rem1, term0, term1: bits32;
  1457. z: bits32;
  1458. Begin
  1459. if ( b <= a0 ) then
  1460. Begin
  1461. estimateDiv64To32 := $FFFFFFFF;
  1462. exit;
  1463. End;
  1464. b0 := b shr 16;
  1465. if ( b0 shl 16 <= a0 ) then
  1466. z:= $FFFF0000
  1467. else
  1468. z:= ( a0 div b0 ) shl 16;
  1469. mul32To64( b, z, term0, term1 );
  1470. sub64( a0, a1, term0, term1, rem0, rem1 );
  1471. while ( ( sbits32 (rem0) ) < 0 ) do
  1472. Begin
  1473. z := z - $10000;
  1474. b1 := b shl 16;
  1475. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1476. End;
  1477. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1478. if ( b0 shl 16 <= rem0 ) then
  1479. z := z or $FFFF
  1480. else
  1481. z := z or (rem0 div b0);
  1482. estimateDiv64To32 := z;
  1483. End;
  1484. {*
  1485. -------------------------------------------------------------------------------
  1486. Returns an approximation to the square root of the 32-bit significand given
  1487. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1488. `aExp' (the least significant bit) is 1, the integer returned approximates
  1489. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1490. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1491. case, the approximation returned lies strictly within +/-2 of the exact
  1492. value.
  1493. -------------------------------------------------------------------------------
  1494. *}
  1495. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1496. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1497. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1498. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1499. );
  1500. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1501. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1502. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1503. );
  1504. Var
  1505. index: int8;
  1506. z: bits32;
  1507. Begin
  1508. index := ( a shr 27 ) AND 15;
  1509. if ( aExp AND 1 ) <> 0 then
  1510. Begin
  1511. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1512. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1513. a := a shr 1;
  1514. End
  1515. else
  1516. Begin
  1517. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1518. z := a div z + z;
  1519. if ( $20000 <= z ) then
  1520. z := $FFFF8000
  1521. else
  1522. z := ( z shl 15 );
  1523. if ( z <= a ) then
  1524. Begin
  1525. estimateSqrt32 := bits32 ( SarLongint( sbits32 (a)) );
  1526. exit;
  1527. End;
  1528. End;
  1529. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1530. End;
  1531. {*
  1532. -------------------------------------------------------------------------------
  1533. Returns the number of leading 0 bits before the most-significant 1 bit of
  1534. `a'. If `a' is zero, 32 is returned.
  1535. -------------------------------------------------------------------------------
  1536. *}
  1537. Function countLeadingZeros32( a:bits32 ): int8;
  1538. const countLeadingZerosHigh:array[0..255] of int8 = (
  1539. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1540. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1541. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1542. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1543. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1544. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1545. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1546. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1547. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1548. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1549. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1550. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1551. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1552. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1553. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1554. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1555. );
  1556. Var
  1557. shiftCount: int8;
  1558. Begin
  1559. shiftCount := 0;
  1560. if ( a < $10000 ) then
  1561. Begin
  1562. shiftCount := shiftcount + 16;
  1563. a := a shl 16;
  1564. End;
  1565. if ( a < $1000000 ) then
  1566. Begin
  1567. shiftCount := shiftcount + 8;
  1568. a := a shl 8;
  1569. end;
  1570. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1571. countLeadingZeros32:= shiftCount;
  1572. End;
  1573. {*----------------------------------------------------------------------------
  1574. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1575. | `a'. If `a' is zero, 64 is returned.
  1576. *----------------------------------------------------------------------------*}
  1577. function countLeadingZeros64( a : bits64): int8;
  1578. var
  1579. shiftcount : int8;
  1580. Begin
  1581. shiftCount := 0;
  1582. if ( a < bits64(bits64(1) shl 32 )) then
  1583. shiftCount := shiftcount + 32
  1584. else
  1585. a := a shr 32;
  1586. shiftCount := shiftCount + countLeadingZeros32( a );
  1587. countLeadingZeros64:= shiftCount;
  1588. End;
  1589. {*
  1590. -------------------------------------------------------------------------------
  1591. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1592. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1593. Otherwise, returns 0.
  1594. -------------------------------------------------------------------------------
  1595. *}
  1596. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1597. Begin
  1598. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1599. End;
  1600. {*
  1601. -------------------------------------------------------------------------------
  1602. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1603. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1604. returns 0.
  1605. -------------------------------------------------------------------------------
  1606. *}
  1607. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;{$IFDEF SOFTFPU_INLINE}inline;{$ENDIF}
  1608. Begin
  1609. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1610. End;
  1611. const
  1612. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1613. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1614. (*****************************************************************************)
  1615. (* End Low-Level arithmetic *)
  1616. (*****************************************************************************)
  1617. {*----------------------------------------------------------------------------
  1618. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1619. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1620. | returns 0.
  1621. *----------------------------------------------------------------------------*}
  1622. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1623. begin
  1624. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1625. end;
  1626. {*
  1627. -------------------------------------------------------------------------------
  1628. Functions and definitions to determine: (1) whether tininess for underflow
  1629. is detected before or after rounding by default, (2) what (if anything)
  1630. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1631. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1632. are propagated from function inputs to output. These details are ENDIAN
  1633. specific
  1634. -------------------------------------------------------------------------------
  1635. *}
  1636. {$IFDEF ENDIAN_LITTLE}
  1637. {*
  1638. -------------------------------------------------------------------------------
  1639. Internal canonical NaN format.
  1640. -------------------------------------------------------------------------------
  1641. *}
  1642. TYPE
  1643. commonNaNT = record
  1644. high, low : bits32;
  1645. sign: flag;
  1646. end;
  1647. {*
  1648. -------------------------------------------------------------------------------
  1649. The pattern for a default generated single-precision NaN.
  1650. -------------------------------------------------------------------------------
  1651. *}
  1652. const float32_default_nan = $FFC00000;
  1653. {*
  1654. -------------------------------------------------------------------------------
  1655. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1656. otherwise returns 0.
  1657. -------------------------------------------------------------------------------
  1658. *}
  1659. Function float32_is_nan( a : float32 ): flag;
  1660. Begin
  1661. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1662. End;
  1663. {*
  1664. -------------------------------------------------------------------------------
  1665. Returns 1 if the single-precision floating-point value `a' is a signaling
  1666. NaN; otherwise returns 0.
  1667. -------------------------------------------------------------------------------
  1668. *}
  1669. Function float32_is_signaling_nan( a : float32 ): flag;
  1670. Begin
  1671. float32_is_signaling_nan := flag
  1672. (( ( ( a shr 22 ) and $1FF ) = $1FE ) and (( a and $003FFFFF )<>0));
  1673. End;
  1674. {*
  1675. -------------------------------------------------------------------------------
  1676. Returns the result of converting the single-precision floating-point NaN
  1677. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1678. exception is raised.
  1679. -------------------------------------------------------------------------------
  1680. *}
  1681. function float32ToCommonNaN(a: float32) : commonNaNT;
  1682. var
  1683. z : commonNaNT ;
  1684. Begin
  1685. if ( float32_is_signaling_nan( a ) <> 0) then
  1686. float_raise( float_flag_invalid );
  1687. z.sign := a shr 31;
  1688. z.low := 0;
  1689. z.high := a shl 9;
  1690. result := z;
  1691. End;
  1692. {*
  1693. -------------------------------------------------------------------------------
  1694. Returns the result of converting the canonical NaN `a' to the single-
  1695. precision floating-point format.
  1696. -------------------------------------------------------------------------------
  1697. *}
  1698. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1699. Begin
  1700. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1701. End;
  1702. {*
  1703. -------------------------------------------------------------------------------
  1704. Takes two single-precision floating-point values `a' and `b', one of which
  1705. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1706. signaling NaN, the invalid exception is raised.
  1707. -------------------------------------------------------------------------------
  1708. *}
  1709. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1710. Var
  1711. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1712. label returnLargerSignificand;
  1713. Begin
  1714. aIsNaN := float32_is_nan( a );
  1715. aIsSignalingNaN := float32_is_signaling_nan( a );
  1716. bIsNaN := float32_is_nan( b );
  1717. bIsSignalingNaN := float32_is_signaling_nan( b );
  1718. a := a or $00400000;
  1719. b := b or $00400000;
  1720. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1721. float_raise( float_flag_invalid );
  1722. if ( aIsSignalingNaN )<> 0 then
  1723. Begin
  1724. if ( bIsSignalingNaN ) <> 0 then
  1725. goto returnLargerSignificand;
  1726. if bIsNan <> 0 then
  1727. propagateFloat32NaN := b
  1728. else
  1729. propagateFloat32NaN := a;
  1730. exit;
  1731. End
  1732. else if ( aIsNaN <> 0) then
  1733. Begin
  1734. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1735. Begin
  1736. propagateFloat32NaN := a;
  1737. exit;
  1738. End;
  1739. returnLargerSignificand:
  1740. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1741. Begin
  1742. propagateFloat32NaN := b;
  1743. exit;
  1744. End;
  1745. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1746. Begin
  1747. propagateFloat32NaN := a;
  1748. End;
  1749. if a < b then
  1750. propagateFloat32NaN := a
  1751. else
  1752. propagateFloat32NaN := b;
  1753. exit;
  1754. End
  1755. else
  1756. Begin
  1757. propagateFloat32NaN := b;
  1758. exit;
  1759. End;
  1760. End;
  1761. {*
  1762. -------------------------------------------------------------------------------
  1763. The pattern for a default generated double-precision NaN. The `high' and
  1764. `low' values hold the most- and least-significant bits, respectively.
  1765. -------------------------------------------------------------------------------
  1766. *}
  1767. const
  1768. float64_default_nan_high = $FFF80000;
  1769. float64_default_nan_low = $00000000;
  1770. {*
  1771. -------------------------------------------------------------------------------
  1772. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1773. otherwise returns 0.
  1774. -------------------------------------------------------------------------------
  1775. *}
  1776. Function float64_is_nan( a : float64 ) : flag;
  1777. Begin
  1778. float64_is_nan :=
  1779. flag(( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1780. and (( a.low or ( a.high and $000FFFFF ) )<>0));
  1781. End;
  1782. {*
  1783. -------------------------------------------------------------------------------
  1784. Returns 1 if the double-precision floating-point value `a' is a signaling
  1785. NaN; otherwise returns 0.
  1786. -------------------------------------------------------------------------------
  1787. *}
  1788. Function float64_is_signaling_nan( a : float64 ): flag;
  1789. Begin
  1790. float64_is_signaling_nan :=
  1791. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1792. and ( a.low or ( a.high and $0007FFFF ) );
  1793. End;
  1794. {*
  1795. -------------------------------------------------------------------------------
  1796. Returns the result of converting the double-precision floating-point NaN
  1797. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1798. exception is raised.
  1799. -------------------------------------------------------------------------------
  1800. *}
  1801. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1802. Var
  1803. z : commonNaNT;
  1804. Begin
  1805. if ( float64_is_signaling_nan( a )<>0 ) then
  1806. float_raise( float_flag_invalid );
  1807. z.sign := a.high shr 31;
  1808. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1809. result := z;
  1810. End;
  1811. {*
  1812. -------------------------------------------------------------------------------
  1813. Returns the result of converting the canonical NaN `a' to the double-
  1814. precision floating-point format.
  1815. -------------------------------------------------------------------------------
  1816. *}
  1817. function commonNaNToFloat64( a : commonNaNT) : float64;
  1818. Var
  1819. z: float64;
  1820. Begin
  1821. shift64Right( a.high, a.low, 12, z.high, z.low );
  1822. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1823. result := z;
  1824. End;
  1825. {*
  1826. -------------------------------------------------------------------------------
  1827. Takes two double-precision floating-point values `a' and `b', one of which
  1828. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1829. signaling NaN, the invalid exception is raised.
  1830. -------------------------------------------------------------------------------
  1831. *}
  1832. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1833. Var
  1834. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1835. label returnLargerSignificand;
  1836. Begin
  1837. aIsNaN := float64_is_nan( a );
  1838. aIsSignalingNaN := float64_is_signaling_nan( a );
  1839. bIsNaN := float64_is_nan( b );
  1840. bIsSignalingNaN := float64_is_signaling_nan( b );
  1841. a.high := a.high or $00080000;
  1842. b.high := b.high or $00080000;
  1843. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1844. float_raise( float_flag_invalid );
  1845. if ( aIsSignalingNaN )<>0 then
  1846. Begin
  1847. if ( bIsSignalingNaN )<>0 then
  1848. goto returnLargerSignificand;
  1849. if bIsNan <> 0 then
  1850. c := b
  1851. else
  1852. c := a;
  1853. exit;
  1854. End
  1855. else if ( aIsNaN )<> 0 then
  1856. Begin
  1857. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1858. Begin
  1859. c := a;
  1860. exit;
  1861. End;
  1862. returnLargerSignificand:
  1863. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1864. Begin
  1865. c := b;
  1866. exit;
  1867. End;
  1868. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1869. Begin
  1870. c := a;
  1871. exit;
  1872. End;
  1873. if a.high < b.high then
  1874. c := a
  1875. else
  1876. c := b;
  1877. exit;
  1878. End
  1879. else
  1880. Begin
  1881. c := b;
  1882. exit;
  1883. End;
  1884. End;
  1885. {*----------------------------------------------------------------------------
  1886. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1887. | otherwise returns 0.
  1888. *----------------------------------------------------------------------------*}
  1889. function float128_is_nan( a : float128): flag;
  1890. begin
  1891. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1892. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1893. end;
  1894. {*----------------------------------------------------------------------------
  1895. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1896. | signaling NaN; otherwise returns 0.
  1897. *----------------------------------------------------------------------------*}
  1898. function float128_is_signaling_nan( a : float128): flag;
  1899. begin
  1900. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1901. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1902. end;
  1903. {*----------------------------------------------------------------------------
  1904. | Returns the result of converting the quadruple-precision floating-point NaN
  1905. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1906. | exception is raised.
  1907. *----------------------------------------------------------------------------*}
  1908. function float128ToCommonNaN( a : float128): commonNaNT;
  1909. var
  1910. z: commonNaNT;
  1911. qhigh,qlow : qword;
  1912. begin
  1913. if ( float128_is_signaling_nan( a )<>0) then
  1914. float_raise( float_flag_invalid );
  1915. z.sign := a.high shr 63;
  1916. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1917. z.high:=qhigh shr 32;
  1918. z.low:=qhigh and $ffffffff;
  1919. result:=z;
  1920. end;
  1921. {*----------------------------------------------------------------------------
  1922. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1923. | precision floating-point format.
  1924. *----------------------------------------------------------------------------*}
  1925. function commonNaNToFloat128( a : commonNaNT): float128;
  1926. var
  1927. z: float128;
  1928. begin
  1929. shift128Right( a.high, a.low, 16, z.high, z.low );
  1930. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1931. result:=z;
  1932. end;
  1933. {*----------------------------------------------------------------------------
  1934. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1935. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1936. | `b' is a signaling NaN, the invalid exception is raised.
  1937. *----------------------------------------------------------------------------*}
  1938. function propagateFloat128NaN( a: float128; b : float128): float128;
  1939. var
  1940. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1941. label
  1942. returnLargerSignificand;
  1943. begin
  1944. aIsNaN := float128_is_nan( a );
  1945. aIsSignalingNaN := float128_is_signaling_nan( a );
  1946. bIsNaN := float128_is_nan( b );
  1947. bIsSignalingNaN := float128_is_signaling_nan( b );
  1948. a.high := a.high or int64( $0000800000000000 );
  1949. b.high := b.high or int64( $0000800000000000 );
  1950. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1951. float_raise( float_flag_invalid );
  1952. if ( aIsSignalingNaN )<>0 then
  1953. begin
  1954. if ( bIsSignalingNaN )<>0 then
  1955. goto returnLargerSignificand;
  1956. if bIsNaN<>0 then
  1957. result := b
  1958. else
  1959. result := a;
  1960. exit;
  1961. end
  1962. else if ( aIsNaN )<>0 then
  1963. begin
  1964. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1965. begin
  1966. result := a;
  1967. exit;
  1968. end;
  1969. returnLargerSignificand:
  1970. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1971. begin
  1972. result := b;
  1973. exit;
  1974. end;
  1975. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1976. begin
  1977. result := a;
  1978. exit
  1979. end;
  1980. if ( a.high < b.high ) then
  1981. result := a
  1982. else
  1983. result := b;
  1984. exit;
  1985. end
  1986. else
  1987. result:=b;
  1988. end;
  1989. {$ELSE}
  1990. { Big endian code }
  1991. (*----------------------------------------------------------------------------
  1992. | Internal canonical NaN format.
  1993. *----------------------------------------------------------------------------*)
  1994. type
  1995. commonNANT = record
  1996. high, low : bits32;
  1997. sign : flag;
  1998. end;
  1999. (*----------------------------------------------------------------------------
  2000. | The pattern for a default generated single-precision NaN.
  2001. *----------------------------------------------------------------------------*)
  2002. const float32_default_nan = $7FFFFFFF;
  2003. (*----------------------------------------------------------------------------
  2004. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  2005. | otherwise returns 0.
  2006. *----------------------------------------------------------------------------*)
  2007. function float32_is_nan(a: float32): flag;
  2008. begin
  2009. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  2010. end;
  2011. (*----------------------------------------------------------------------------
  2012. | Returns 1 if the single-precision floating-point value `a' is a signaling
  2013. | NaN; otherwise returns 0.
  2014. *----------------------------------------------------------------------------*)
  2015. function float32_is_signaling_nan(a: float32):flag;
  2016. begin
  2017. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  2018. end;
  2019. (*----------------------------------------------------------------------------
  2020. | Returns the result of converting the single-precision floating-point NaN
  2021. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2022. | exception is raised.
  2023. *----------------------------------------------------------------------------*)
  2024. function float32ToCommonNaN( a: float32) : commonNaNT;
  2025. var
  2026. z: commonNANT;
  2027. begin
  2028. if float32_is_signaling_nan(a)<>0 then
  2029. float_raise(float_flag_invalid);
  2030. z.sign := a shr 31;
  2031. z.low := 0;
  2032. z.high := a shl 9;
  2033. result:=z;
  2034. end;
  2035. (*----------------------------------------------------------------------------
  2036. | Returns the result of converting the canonical NaN `a' to the single-
  2037. | precision floating-point format.
  2038. *----------------------------------------------------------------------------*)
  2039. function CommonNanToFloat32(a : CommonNaNT): float32;
  2040. begin
  2041. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  2042. end;
  2043. (*----------------------------------------------------------------------------
  2044. | Takes two single-precision floating-point values `a' and `b', one of which
  2045. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2046. | signaling NaN, the invalid exception is raised.
  2047. *----------------------------------------------------------------------------*)
  2048. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  2049. var
  2050. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2051. begin
  2052. aIsNaN := float32_is_nan( a );
  2053. aIsSignalingNaN := float32_is_signaling_nan( a );
  2054. bIsNaN := float32_is_nan( b );
  2055. bIsSignalingNaN := float32_is_signaling_nan( b );
  2056. a := a or $00400000;
  2057. b := b or $00400000;
  2058. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2059. float_raise( float_flag_invalid );
  2060. if bIsSignalingNaN<>0 then
  2061. propagateFloat32Nan := b
  2062. else if aIsSignalingNan<>0 then
  2063. propagateFloat32Nan := a
  2064. else if bIsNan<>0 then
  2065. propagateFloat32Nan := b
  2066. else
  2067. propagateFloat32Nan := a;
  2068. end;
  2069. (*----------------------------------------------------------------------------
  2070. | The pattern for a default generated double-precision NaN. The `high' and
  2071. | `low' values hold the most- and least-significant bits, respectively.
  2072. *----------------------------------------------------------------------------*)
  2073. const
  2074. float64_default_nan_high = $7FFFFFFF;
  2075. float64_default_nan_low = $FFFFFFFF;
  2076. (*----------------------------------------------------------------------------
  2077. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2078. | otherwise returns 0.
  2079. *----------------------------------------------------------------------------*)
  2080. function float64_is_nan(a: float64): flag;
  2081. begin
  2082. float64_is_nan := flag (
  2083. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2084. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2085. end;
  2086. (*----------------------------------------------------------------------------
  2087. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2088. | NaN; otherwise returns 0.
  2089. *----------------------------------------------------------------------------*)
  2090. function float64_is_signaling_nan( a:float64): flag;
  2091. begin
  2092. float64_is_signaling_nan := flag(
  2093. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2094. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2095. end;
  2096. (*----------------------------------------------------------------------------
  2097. | Returns the result of converting the double-precision floating-point NaN
  2098. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2099. | exception is raised.
  2100. *----------------------------------------------------------------------------*)
  2101. function float64ToCommonNaN( a : float64) : commonNaNT;
  2102. var
  2103. z : commonNaNT;
  2104. begin
  2105. if ( float64_is_signaling_nan( a )<>0 ) then
  2106. float_raise( float_flag_invalid );
  2107. z.sign := a.high shr 31;
  2108. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2109. result:=z;
  2110. end;
  2111. (*----------------------------------------------------------------------------
  2112. | Returns the result of converting the canonical NaN `a' to the double-
  2113. | precision floating-point format.
  2114. *----------------------------------------------------------------------------*)
  2115. function commonNaNToFloat64( a : commonNaNT): float64;
  2116. var
  2117. z: float64;
  2118. begin
  2119. shift64Right( a.high, a.low, 12, z.high, z.low );
  2120. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2121. result:=z;
  2122. end;
  2123. (*----------------------------------------------------------------------------
  2124. | Takes two double-precision floating-point values `a' and `b', one of which
  2125. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2126. | signaling NaN, the invalid exception is raised.
  2127. *----------------------------------------------------------------------------*)
  2128. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2129. var
  2130. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2131. begin
  2132. aIsNaN := float64_is_nan( a );
  2133. aIsSignalingNaN := float64_is_signaling_nan( a );
  2134. bIsNaN := float64_is_nan( b );
  2135. bIsSignalingNaN := float64_is_signaling_nan( b );
  2136. a.high := a.high or $00080000;
  2137. b.high := b.high or $00080000;
  2138. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2139. float_raise( float_flag_invalid );
  2140. if bIsSignalingNaN<>0 then
  2141. c := b
  2142. else if aIsSignalingNan<>0 then
  2143. c := a
  2144. else if bIsNan<>0 then
  2145. c := b
  2146. else
  2147. c := a;
  2148. end;
  2149. {*----------------------------------------------------------------------------
  2150. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  2151. | otherwise returns 0.
  2152. *----------------------------------------------------------------------------*}
  2153. function float128_is_nan( a : float128): flag;
  2154. begin
  2155. result:= ord(( bits64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  2156. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  2157. end;
  2158. {*----------------------------------------------------------------------------
  2159. | Returns 1 if the quadruple-precision floating-point value `a' is a
  2160. | signaling NaN; otherwise returns 0.
  2161. *----------------------------------------------------------------------------*}
  2162. function float128_is_signaling_nan( a : float128): flag;
  2163. begin
  2164. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  2165. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  2166. end;
  2167. {*----------------------------------------------------------------------------
  2168. | Returns the result of converting the quadruple-precision floating-point NaN
  2169. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2170. | exception is raised.
  2171. *----------------------------------------------------------------------------*}
  2172. function float128ToCommonNaN( a : float128): commonNaNT;
  2173. var
  2174. z: commonNaNT;
  2175. qhigh,qlow : qword;
  2176. begin
  2177. if ( float128_is_signaling_nan( a )<>0) then
  2178. float_raise( float_flag_invalid );
  2179. z.sign := a.high shr 63;
  2180. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  2181. z.high:=qhigh shr 32;
  2182. z.low:=qhigh and $ffffffff;
  2183. result:=z;
  2184. end;
  2185. {*----------------------------------------------------------------------------
  2186. | Returns the result of converting the canonical NaN `a' to the quadruple-
  2187. | precision floating-point format.
  2188. *----------------------------------------------------------------------------*}
  2189. function commonNaNToFloat128( a : commonNaNT): float128;
  2190. var
  2191. z: float128;
  2192. begin
  2193. shift128Right( a.high, a.low, 16, z.high, z.low );
  2194. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  2195. result:=z;
  2196. end;
  2197. {*----------------------------------------------------------------------------
  2198. | Takes two quadruple-precision floating-point values `a' and `b', one of
  2199. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  2200. | `b' is a signaling NaN, the invalid exception is raised.
  2201. *----------------------------------------------------------------------------*}
  2202. function propagateFloat128NaN( a: float128; b : float128): float128;
  2203. var
  2204. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  2205. label
  2206. returnLargerSignificand;
  2207. begin
  2208. aIsNaN := float128_is_nan( a );
  2209. aIsSignalingNaN := float128_is_signaling_nan( a );
  2210. bIsNaN := float128_is_nan( b );
  2211. bIsSignalingNaN := float128_is_signaling_nan( b );
  2212. a.high := a.high or int64( $0000800000000000 );
  2213. b.high := b.high or int64( $0000800000000000 );
  2214. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  2215. float_raise( float_flag_invalid );
  2216. if ( aIsSignalingNaN )<>0 then
  2217. begin
  2218. if ( bIsSignalingNaN )<>0 then
  2219. goto returnLargerSignificand;
  2220. if bIsNaN<>0 then
  2221. result := b
  2222. else
  2223. result := a;
  2224. exit;
  2225. end
  2226. else if ( aIsNaN )<>0 then
  2227. begin
  2228. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  2229. begin
  2230. result := a;
  2231. exit;
  2232. end;
  2233. returnLargerSignificand:
  2234. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  2235. begin
  2236. result := b;
  2237. exit;
  2238. end;
  2239. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  2240. begin
  2241. result := a;
  2242. exit
  2243. end;
  2244. if ( a.high < b.high ) then
  2245. result := a
  2246. else
  2247. result := b;
  2248. exit;
  2249. end
  2250. else
  2251. result:=b;
  2252. end;
  2253. {$ENDIF}
  2254. (****************************************************************************)
  2255. (* END ENDIAN SPECIFIC CODE *)
  2256. (****************************************************************************)
  2257. {*
  2258. -------------------------------------------------------------------------------
  2259. Returns the fraction bits of the single-precision floating-point value `a'.
  2260. -------------------------------------------------------------------------------
  2261. *}
  2262. Function ExtractFloat32Frac(a : Float32) : Bits32; inline;
  2263. Begin
  2264. ExtractFloat32Frac := A AND $007FFFFF;
  2265. End;
  2266. {*
  2267. -------------------------------------------------------------------------------
  2268. Returns the exponent bits of the single-precision floating-point value `a'.
  2269. -------------------------------------------------------------------------------
  2270. *}
  2271. Function extractFloat32Exp( a: float32 ): Int16; inline;
  2272. Begin
  2273. extractFloat32Exp := (a shr 23) AND $FF;
  2274. End;
  2275. {*
  2276. -------------------------------------------------------------------------------
  2277. Returns the sign bit of the single-precision floating-point value `a'.
  2278. -------------------------------------------------------------------------------
  2279. *}
  2280. Function extractFloat32Sign( a: float32 ): Flag; inline;
  2281. Begin
  2282. extractFloat32Sign := a shr 31;
  2283. End;
  2284. {*
  2285. -------------------------------------------------------------------------------
  2286. Normalizes the subnormal single-precision floating-point value represented
  2287. by the denormalized significand `aSig'. The normalized exponent and
  2288. significand are stored at the locations pointed to by `zExpPtr' and
  2289. `zSigPtr', respectively.
  2290. -------------------------------------------------------------------------------
  2291. *}
  2292. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2293. Var
  2294. ShiftCount : BYTE;
  2295. Begin
  2296. shiftCount := countLeadingZeros32( aSig ) - 8;
  2297. zSigPtr := aSig shl shiftCount;
  2298. zExpPtr := 1 - shiftCount;
  2299. End;
  2300. {*
  2301. -------------------------------------------------------------------------------
  2302. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2303. single-precision floating-point value, returning the result. After being
  2304. shifted into the proper positions, the three fields are simply added
  2305. together to form the result. This means that any integer portion of `zSig'
  2306. will be added into the exponent. Since a properly normalized significand
  2307. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2308. than the desired result exponent whenever `zSig' is a complete, normalized
  2309. significand.
  2310. -------------------------------------------------------------------------------
  2311. *}
  2312. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32; inline;
  2313. Begin
  2314. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2315. + zSig;
  2316. End;
  2317. {*
  2318. -------------------------------------------------------------------------------
  2319. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2320. and significand `zSig', and returns the proper single-precision floating-
  2321. point value corresponding to the abstract input. Ordinarily, the abstract
  2322. value is simply rounded and packed into the single-precision format, with
  2323. the inexact exception raised if the abstract input cannot be represented
  2324. exactly. However, if the abstract value is too large, the overflow and
  2325. inexact exceptions are raised and an infinity or maximal finite value is
  2326. returned. If the abstract value is too small, the input value is rounded to
  2327. a subnormal number, and the underflow and inexact exceptions are raised if
  2328. the abstract input cannot be represented exactly as a subnormal single-
  2329. precision floating-point number.
  2330. The input significand `zSig' has its binary point between bits 30
  2331. and 29, which is 7 bits to the left of the usual location. This shifted
  2332. significand must be normalized or smaller. If `zSig' is not normalized,
  2333. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2334. and it must not require rounding. In the usual case that `zSig' is
  2335. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2336. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2337. Binary Floating-Point Arithmetic.
  2338. -------------------------------------------------------------------------------
  2339. *}
  2340. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2341. Var
  2342. roundingMode : TFPURoundingMode;
  2343. roundNearestEven : boolean;
  2344. roundIncrement, roundBits : BYTE;
  2345. IsTiny : boolean;
  2346. Begin
  2347. roundingMode := softfloat_rounding_mode;
  2348. roundNearestEven := (roundingMode = float_round_nearest_even);
  2349. roundIncrement := $40;
  2350. if not roundNearestEven then
  2351. Begin
  2352. if ( roundingMode = float_round_to_zero ) Then
  2353. Begin
  2354. roundIncrement := 0;
  2355. End
  2356. else
  2357. Begin
  2358. roundIncrement := $7F;
  2359. if ( zSign <> 0 ) then
  2360. Begin
  2361. if roundingMode = float_round_up then roundIncrement := 0;
  2362. End
  2363. else
  2364. Begin
  2365. if roundingMode = float_round_down then roundIncrement := 0;
  2366. End;
  2367. End
  2368. End;
  2369. roundBits := zSig AND $7F;
  2370. if ($FD <= bits16 (zExp) ) then
  2371. Begin
  2372. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2373. Begin
  2374. float_raise( [float_flag_overflow,float_flag_inexact] );
  2375. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2376. exit;
  2377. End;
  2378. if ( zExp < 0 ) then
  2379. Begin
  2380. isTiny :=
  2381. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2382. OR ( zExp < -1 )
  2383. OR ( (zSig + roundIncrement) < $80000000 );
  2384. shift32RightJamming( zSig, - zExp, zSig );
  2385. zExp := 0;
  2386. roundBits := zSig AND $7F;
  2387. if ( isTiny and (roundBits<>0) ) then
  2388. float_raise( float_flag_underflow );
  2389. End;
  2390. End;
  2391. if ( roundBits )<> 0 then
  2392. set_inexact_flag;
  2393. zSig := ( zSig + roundIncrement ) shr 7;
  2394. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and ord(roundNearestEven) );
  2395. if ( zSig = 0 ) then zExp := 0;
  2396. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2397. End;
  2398. {*
  2399. -------------------------------------------------------------------------------
  2400. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2401. and significand `zSig', and returns the proper single-precision floating-
  2402. point value corresponding to the abstract input. This routine is just like
  2403. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2404. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2405. floating-point exponent.
  2406. -------------------------------------------------------------------------------
  2407. *}
  2408. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2409. Var
  2410. ShiftCount : int8;
  2411. Begin
  2412. shiftCount := countLeadingZeros32( zSig ) - 1;
  2413. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2414. End;
  2415. {*
  2416. -------------------------------------------------------------------------------
  2417. Returns the most-significant 20 fraction bits of the double-precision
  2418. floating-point value `a'.
  2419. -------------------------------------------------------------------------------
  2420. *}
  2421. Function extractFloat64Frac0(a: float64): bits32; inline;
  2422. Begin
  2423. extractFloat64Frac0 := a.high and $000FFFFF;
  2424. End;
  2425. {*
  2426. -------------------------------------------------------------------------------
  2427. Returns the least-significant 32 fraction bits of the double-precision
  2428. floating-point value `a'.
  2429. -------------------------------------------------------------------------------
  2430. *}
  2431. Function extractFloat64Frac1(a: float64): bits32; inline;
  2432. Begin
  2433. extractFloat64Frac1 := a.low;
  2434. End;
  2435. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2436. Function extractFloat64Frac(a: float64): bits64; inline;
  2437. Begin
  2438. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2439. End;
  2440. {*
  2441. -------------------------------------------------------------------------------
  2442. Returns the exponent bits of the double-precision floating-point value `a'.
  2443. -------------------------------------------------------------------------------
  2444. *}
  2445. Function extractFloat64Exp(a: float64): int16; inline;
  2446. Begin
  2447. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2448. End;
  2449. {*
  2450. -------------------------------------------------------------------------------
  2451. Returns the sign bit of the double-precision floating-point value `a'.
  2452. -------------------------------------------------------------------------------
  2453. *}
  2454. Function extractFloat64Sign(a: float64) : flag; inline;
  2455. Begin
  2456. extractFloat64Sign := a.high shr 31;
  2457. End;
  2458. {*
  2459. -------------------------------------------------------------------------------
  2460. Normalizes the subnormal double-precision floating-point value represented
  2461. by the denormalized significand formed by the concatenation of `aSig0' and
  2462. `aSig1'. The normalized exponent is stored at the location pointed to by
  2463. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2464. stored at the location pointed to by `zSig0Ptr', and the least significant
  2465. 32 bits of the normalized significand are stored at the location pointed to
  2466. by `zSig1Ptr'.
  2467. -------------------------------------------------------------------------------
  2468. *}
  2469. Procedure normalizeFloat64Subnormal(
  2470. aSig0: bits32;
  2471. aSig1: bits32;
  2472. VAR zExpPtr : Int16;
  2473. VAR zSig0Ptr : Bits32;
  2474. VAR zSig1Ptr : Bits32
  2475. );
  2476. Var
  2477. ShiftCount : Int8;
  2478. Begin
  2479. if ( aSig0 = 0 ) then
  2480. Begin
  2481. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2482. if ( shiftCount < 0 ) then
  2483. Begin
  2484. zSig0Ptr := aSig1 shr ( - shiftCount );
  2485. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2486. End
  2487. else
  2488. Begin
  2489. zSig0Ptr := aSig1 shl shiftCount;
  2490. zSig1Ptr := 0;
  2491. End;
  2492. zExpPtr := - shiftCount - 31;
  2493. End
  2494. else
  2495. Begin
  2496. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2497. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2498. zExpPtr := 1 - shiftCount;
  2499. End;
  2500. End;
  2501. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2502. var
  2503. shiftCount : int8;
  2504. begin
  2505. shiftCount := countLeadingZeros64( aSig ) - 11;
  2506. zSigPtr := aSig shl shiftCount;
  2507. zExpPtr := 1 - shiftCount;
  2508. end;
  2509. {*
  2510. -------------------------------------------------------------------------------
  2511. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2512. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2513. point value, returning the result. After being shifted into the proper
  2514. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2515. together to form the most significant 32 bits of the result. This means
  2516. that any integer portion of `zSig0' will be added into the exponent. Since
  2517. a properly normalized significand will have an integer portion equal to 1,
  2518. the `zExp' input should be 1 less than the desired result exponent whenever
  2519. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2520. -------------------------------------------------------------------------------
  2521. *}
  2522. Procedure
  2523. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2524. var
  2525. z: Float64;
  2526. Begin
  2527. z.low := zSig1;
  2528. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2529. c := z;
  2530. End;
  2531. {*----------------------------------------------------------------------------
  2532. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2533. | double-precision floating-point value, returning the result. After being
  2534. | shifted into the proper positions, the three fields are simply added
  2535. | together to form the result. This means that any integer portion of `zSig'
  2536. | will be added into the exponent. Since a properly normalized significand
  2537. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2538. | than the desired result exponent whenever `zSig' is a complete, normalized
  2539. | significand.
  2540. *----------------------------------------------------------------------------*}
  2541. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2542. begin
  2543. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2544. end;
  2545. {*
  2546. -------------------------------------------------------------------------------
  2547. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2548. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2549. and `zSig2', and returns the proper double-precision floating-point value
  2550. corresponding to the abstract input. Ordinarily, the abstract value is
  2551. simply rounded and packed into the double-precision format, with the inexact
  2552. exception raised if the abstract input cannot be represented exactly.
  2553. However, if the abstract value is too large, the overflow and inexact
  2554. exceptions are raised and an infinity or maximal finite value is returned.
  2555. If the abstract value is too small, the input value is rounded to a
  2556. subnormal number, and the underflow and inexact exceptions are raised if the
  2557. abstract input cannot be represented exactly as a subnormal double-precision
  2558. floating-point number.
  2559. The input significand must be normalized or smaller. If the input
  2560. significand is not normalized, `zExp' must be 0; in that case, the result
  2561. returned is a subnormal number, and it must not require rounding. In the
  2562. usual case that the input significand is normalized, `zExp' must be 1 less
  2563. than the ``true'' floating-point exponent. The handling of underflow and
  2564. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2565. -------------------------------------------------------------------------------
  2566. *}
  2567. Procedure
  2568. roundAndPackFloat64(
  2569. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2570. Var
  2571. roundingMode : TFPURoundingMode;
  2572. roundNearestEven, increment, isTiny : Flag;
  2573. Begin
  2574. roundingMode := softfloat_rounding_mode;
  2575. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2576. increment := flag( sbits32 (zSig2) < 0 );
  2577. if ( roundNearestEven = flag(FALSE) ) then
  2578. Begin
  2579. if ( roundingMode = float_round_to_zero ) then
  2580. increment := 0
  2581. else
  2582. Begin
  2583. if ( zSign )<> 0 then
  2584. Begin
  2585. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2586. End
  2587. else
  2588. Begin
  2589. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2590. End
  2591. End
  2592. End;
  2593. if ( $7FD <= bits16 (zExp) ) then
  2594. Begin
  2595. if (( $7FD < zExp )
  2596. or (( zExp = $7FD )
  2597. and (zSig0=$001FFFFF) and (zSig1=$FFFFFFFF)
  2598. and (increment<>0)
  2599. )
  2600. ) then
  2601. Begin
  2602. float_raise( [float_flag_overflow,float_flag_inexact] );
  2603. if (( roundingMode = float_round_to_zero )
  2604. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2605. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2606. ) then
  2607. Begin
  2608. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2609. exit;
  2610. End;
  2611. packFloat64( zSign, $7FF, 0, 0, c );
  2612. exit;
  2613. End;
  2614. if ( zExp < 0 ) then
  2615. Begin
  2616. isTiny :=
  2617. flag( softfloat_detect_tininess = float_tininess_before_rounding )
  2618. or flag( zExp < -1 )
  2619. or flag(increment = 0)
  2620. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2621. shift64ExtraRightJamming(
  2622. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2623. zExp := 0;
  2624. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2625. if ( roundNearestEven )<>0 then
  2626. Begin
  2627. increment := flag( sbits32 (zSig2) < 0 );
  2628. End
  2629. else
  2630. Begin
  2631. if ( zSign )<>0 then
  2632. Begin
  2633. increment := flag(( roundingMode = float_round_down ) and (zSig2<>0));
  2634. End
  2635. else
  2636. Begin
  2637. increment := flag(( roundingMode = float_round_up ) and (zSig2<>0));
  2638. End
  2639. End;
  2640. End;
  2641. End;
  2642. if ( zSig2 )<>0 then
  2643. set_inexact_flag;
  2644. if ( increment )<>0 then
  2645. Begin
  2646. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2647. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2648. End
  2649. else
  2650. Begin
  2651. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2652. End;
  2653. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2654. End;
  2655. {*----------------------------------------------------------------------------
  2656. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2657. | and significand `zSig', and returns the proper double-precision floating-
  2658. | point value corresponding to the abstract input. Ordinarily, the abstract
  2659. | value is simply rounded and packed into the double-precision format, with
  2660. | the inexact exception raised if the abstract input cannot be represented
  2661. | exactly. However, if the abstract value is too large, the overflow and
  2662. | inexact exceptions are raised and an infinity or maximal finite value is
  2663. | returned. If the abstract value is too small, the input value is rounded
  2664. | to a subnormal number, and the underflow and inexact exceptions are raised
  2665. | if the abstract input cannot be represented exactly as a subnormal double-
  2666. | precision floating-point number.
  2667. | The input significand `zSig' has its binary point between bits 62
  2668. | and 61, which is 10 bits to the left of the usual location. This shifted
  2669. | significand must be normalized or smaller. If `zSig' is not normalized,
  2670. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2671. | and it must not require rounding. In the usual case that `zSig' is
  2672. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2673. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2674. | Binary Floating-Point Arithmetic.
  2675. *----------------------------------------------------------------------------*}
  2676. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2677. var
  2678. roundingMode: TFPURoundingMode;
  2679. roundNearestEven: flag;
  2680. roundIncrement, roundBits: int16;
  2681. isTiny: flag;
  2682. begin
  2683. roundingMode := softfloat_rounding_mode;
  2684. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2685. roundIncrement := $200;
  2686. if ( roundNearestEven=0 ) then
  2687. begin
  2688. if ( roundingMode = float_round_to_zero ) then
  2689. begin
  2690. roundIncrement := 0;
  2691. end
  2692. else begin
  2693. roundIncrement := $3FF;
  2694. if ( zSign<>0 ) then
  2695. begin
  2696. if ( roundingMode = float_round_up ) then
  2697. roundIncrement := 0;
  2698. end
  2699. else begin
  2700. if ( roundingMode = float_round_down ) then
  2701. roundIncrement := 0;
  2702. end
  2703. end
  2704. end;
  2705. roundBits := zSig and $3FF;
  2706. if ( $7FD <= bits16(zExp) ) then
  2707. begin
  2708. if ( ( $7FD < zExp )
  2709. or ( ( zExp = $7FD )
  2710. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2711. ) then
  2712. begin
  2713. float_raise( [float_flag_overflow,float_flag_inexact] );
  2714. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2715. exit;
  2716. end;
  2717. if ( zExp < 0 ) then
  2718. begin
  2719. isTiny := ord(
  2720. ( softfloat_detect_tininess = float_tininess_before_rounding )
  2721. or ( zExp < -1 )
  2722. or ( (zSig + roundIncrement) < bits64( $8000000000000000 ) ) );
  2723. shift64RightJamming( zSig, - zExp, zSig );
  2724. zExp := 0;
  2725. roundBits := zSig and $3FF;
  2726. if ( isTiny and roundBits )<>0 then
  2727. float_raise( float_flag_underflow );
  2728. end
  2729. end;
  2730. if ( roundBits<>0 ) then
  2731. set_inexact_flag;
  2732. zSig := ( zSig + roundIncrement ) shr 10;
  2733. zSig := zSig and not(qword(ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven ));
  2734. if ( zSig = 0 ) then
  2735. zExp := 0;
  2736. result:=packFloat64( zSign, zExp, zSig );
  2737. end;
  2738. {*
  2739. -------------------------------------------------------------------------------
  2740. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2741. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2742. returns the proper double-precision floating-point value corresponding
  2743. to the abstract input. This routine is just like `roundAndPackFloat64'
  2744. except that the input significand has fewer bits and does not have to be
  2745. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2746. point exponent.
  2747. -------------------------------------------------------------------------------
  2748. *}
  2749. Procedure
  2750. normalizeRoundAndPackFloat64(
  2751. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2752. Var
  2753. shiftCount : int8;
  2754. zSig2 : bits32;
  2755. Begin
  2756. if ( zSig0 = 0 ) then
  2757. Begin
  2758. zSig0 := zSig1;
  2759. zSig1 := 0;
  2760. zExp := zExp -32;
  2761. End;
  2762. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2763. if ( 0 <= shiftCount ) then
  2764. Begin
  2765. zSig2 := 0;
  2766. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2767. End
  2768. else
  2769. Begin
  2770. shift64ExtraRightJamming
  2771. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2772. End;
  2773. zExp := zExp - shiftCount;
  2774. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2775. End;
  2776. {*
  2777. ----------------------------------------------------------------------------
  2778. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2779. and significand `zSig', and returns the proper double-precision floating-
  2780. point value corresponding to the abstract input. This routine is just like
  2781. `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  2782. Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2783. floating-point exponent.
  2784. ----------------------------------------------------------------------------
  2785. *}
  2786. function normalizeRoundAndPackFloat64(zSign: flag; zExp: int16; zSig: bits64): float64;
  2787. var
  2788. shiftCount: int8;
  2789. begin
  2790. shiftCount := countLeadingZeros64( zSig ) - 1;
  2791. result := roundAndPackFloat64( zSign, zExp - shiftCount, zSig shl shiftCount);
  2792. end;
  2793. {*
  2794. -------------------------------------------------------------------------------
  2795. Returns the result of converting the 32-bit two's complement integer `a' to
  2796. the single-precision floating-point format. The conversion is performed
  2797. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2798. -------------------------------------------------------------------------------
  2799. *}
  2800. Function int32_to_float32( a: int32): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2801. Var
  2802. zSign : Flag;
  2803. Begin
  2804. if ( a = 0 ) then
  2805. Begin
  2806. int32_to_float32.float32 := 0;
  2807. exit;
  2808. End;
  2809. if ( a = sbits32 ($80000000) ) then
  2810. Begin
  2811. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2812. exit;
  2813. end;
  2814. zSign := flag( a < 0 );
  2815. If zSign<>0 then
  2816. a := -a;
  2817. int32_to_float32.float32:=
  2818. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2819. End;
  2820. {*
  2821. -------------------------------------------------------------------------------
  2822. Returns the result of converting the 32-bit two's complement integer `a' to
  2823. the double-precision floating-point format. The conversion is performed
  2824. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2825. -------------------------------------------------------------------------------
  2826. *}
  2827. Function int32_to_float64( a: int32) : float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2828. var
  2829. zSign : flag;
  2830. absA : bits32;
  2831. shiftCount : int8;
  2832. zSig0, zSig1 : bits32;
  2833. Begin
  2834. if ( a = 0 ) then
  2835. Begin
  2836. packFloat64( 0, 0, 0, 0, result );
  2837. exit;
  2838. end;
  2839. zSign := flag( a < 0 );
  2840. if ZSign<>0 then
  2841. AbsA := -a
  2842. else
  2843. AbsA := a;
  2844. shiftCount := countLeadingZeros32( absA ) - 11;
  2845. if ( 0 <= shiftCount ) then
  2846. Begin
  2847. zSig0 := absA shl shiftCount;
  2848. zSig1 := 0;
  2849. End
  2850. else
  2851. Begin
  2852. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2853. End;
  2854. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2855. End;
  2856. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  2857. {$if not defined(packFloatx80)}
  2858. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  2859. forward;
  2860. {$endif}
  2861. {*----------------------------------------------------------------------------
  2862. | Returns the result of converting the 32-bit two's complement integer `a'
  2863. | to the extended double-precision floating-point format. The conversion
  2864. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2865. | Arithmetic.
  2866. *----------------------------------------------------------------------------*}
  2867. function int32_to_floatx80( a: int32 ): floatx80;
  2868. var
  2869. zSign: flag;
  2870. absA: uint32;
  2871. shiftCount: int8;
  2872. zSig: bits64;
  2873. begin
  2874. if ( a = 0 ) then begin
  2875. result := packFloatx80( 0, 0, 0 );
  2876. exit;
  2877. end;
  2878. zSign := ord( a < 0 );
  2879. if zSign <> 0 then absA := - a else absA := a;
  2880. shiftCount := countLeadingZeros32( absA ) + 32;
  2881. zSig := absA;
  2882. result := packFloatx80( zSign, $403E - shiftCount, zSig shl shiftCount );
  2883. end;
  2884. {$endif FPC_SOFTFLOAT_FLOATX80}
  2885. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  2886. {$if not defined(packFloat128)}
  2887. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64 ) : float128;
  2888. forward;
  2889. {$endif}
  2890. {*----------------------------------------------------------------------------
  2891. | Returns the result of converting the 32-bit two's complement integer `a' to
  2892. | the quadruple-precision floating-point format. The conversion is performed
  2893. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2894. *----------------------------------------------------------------------------*}
  2895. function int32_to_float128( a: int32 ): float128;
  2896. var
  2897. zSign: flag;
  2898. absA: uint32;
  2899. shiftCount: int8;
  2900. zSig0: bits64;
  2901. begin
  2902. if ( a = 0 ) then begin
  2903. result := packFloat128( 0, 0, 0, 0 );
  2904. exit;
  2905. end;
  2906. zSign := ord( a < 0 );
  2907. if zSign <> 0 then absA := - a else absA := a;
  2908. shiftCount := countLeadingZeros32( absA ) + 17;
  2909. zSig0 := absA;
  2910. result := packFloat128( zSign, $402E - shiftCount, zSig0 shl shiftCount, 0 );
  2911. end;
  2912. {$endif FPC_SOFTFLOAT_FLOAT128}
  2913. {*
  2914. -------------------------------------------------------------------------------
  2915. Returns the result of converting the single-precision floating-point value
  2916. `a' to the 32-bit two's complement integer format. The conversion is
  2917. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2918. Arithmetic---which means in particular that the conversion is rounded
  2919. according to the current rounding mode. If `a' is a NaN, the largest
  2920. positive integer is returned. Otherwise, if the conversion overflows, the
  2921. largest integer with the same sign as `a' is returned.
  2922. -------------------------------------------------------------------------------
  2923. *}
  2924. Function float32_to_int32( a : float32rec) : int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  2925. Var
  2926. aSign: flag;
  2927. aExp, shiftCount: int16;
  2928. aSig, aSigExtra: bits32;
  2929. z: int32;
  2930. roundingMode: TFPURoundingMode;
  2931. Begin
  2932. aSig := extractFloat32Frac( a.float32 );
  2933. aExp := extractFloat32Exp( a.float32 );
  2934. aSign := extractFloat32Sign( a.float32 );
  2935. shiftCount := aExp - $96;
  2936. if ( 0 <= shiftCount ) then
  2937. Begin
  2938. if ( $9E <= aExp ) then
  2939. Begin
  2940. if ( a.float32 <> $CF000000 ) then
  2941. Begin
  2942. float_raise( float_flag_invalid );
  2943. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2944. Begin
  2945. float32_to_int32 := $7FFFFFFF;
  2946. exit;
  2947. End;
  2948. End;
  2949. float32_to_int32 := sbits32 ($80000000);
  2950. exit;
  2951. End;
  2952. z := ( aSig or $00800000 ) shl shiftCount;
  2953. if ( aSign<>0 ) then z := - z;
  2954. End
  2955. else
  2956. Begin
  2957. if ( aExp < $7E ) then
  2958. Begin
  2959. aSigExtra := aExp OR aSig;
  2960. z := 0;
  2961. End
  2962. else
  2963. Begin
  2964. aSig := aSig OR $00800000;
  2965. aSigExtra := aSig shl ( shiftCount and 31 );
  2966. z := aSig shr ( - shiftCount );
  2967. End;
  2968. if ( aSigExtra<>0 ) then
  2969. set_inexact_flag;
  2970. roundingMode := softfloat_rounding_mode;
  2971. if ( roundingMode = float_round_nearest_even ) then
  2972. Begin
  2973. if ( sbits32 (aSigExtra) < 0 ) then
  2974. Begin
  2975. Inc(z);
  2976. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2977. z := z and not 1;
  2978. End;
  2979. if ( aSign<>0 ) then
  2980. z := - z;
  2981. End
  2982. else
  2983. Begin
  2984. aSigExtra := flag( aSigExtra <> 0 );
  2985. if ( aSign<>0 ) then
  2986. Begin
  2987. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2988. z := - z;
  2989. End
  2990. else
  2991. Begin
  2992. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2993. End
  2994. End;
  2995. End;
  2996. float32_to_int32 := z;
  2997. End;
  2998. {*
  2999. -------------------------------------------------------------------------------
  3000. Returns the result of converting the single-precision floating-point value
  3001. `a' to the 32-bit two's complement integer format. The conversion is
  3002. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3003. Arithmetic, except that the conversion is always rounded toward zero.
  3004. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3005. the conversion overflows, the largest integer with the same sign as `a' is
  3006. returned.
  3007. -------------------------------------------------------------------------------
  3008. *}
  3009. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3010. Var
  3011. aSign : flag;
  3012. aExp, shiftCount : int16;
  3013. aSig : bits32;
  3014. z : int32;
  3015. Begin
  3016. aSig := extractFloat32Frac( a.float32 );
  3017. aExp := extractFloat32Exp( a.float32 );
  3018. aSign := extractFloat32Sign( a.float32 );
  3019. shiftCount := aExp - $9E;
  3020. if ( 0 <= shiftCount ) then
  3021. Begin
  3022. if ( a.float32 <> $CF000000 ) then
  3023. Begin
  3024. float_raise( float_flag_invalid );
  3025. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  3026. Begin
  3027. float32_to_int32_round_to_zero := $7FFFFFFF;
  3028. exit;
  3029. end;
  3030. End;
  3031. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  3032. exit;
  3033. End
  3034. else
  3035. if ( aExp <= $7E ) then
  3036. Begin
  3037. if ( aExp or aSig )<>0 then
  3038. set_inexact_flag;
  3039. float32_to_int32_round_to_zero := 0;
  3040. exit;
  3041. End;
  3042. aSig := ( aSig or $00800000 ) shl 8;
  3043. z := aSig shr ( - shiftCount );
  3044. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  3045. Begin
  3046. set_inexact_flag;
  3047. End;
  3048. if ( aSign<>0 ) then z := - z;
  3049. float32_to_int32_round_to_zero := z;
  3050. End;
  3051. {*----------------------------------------------------------------------------
  3052. | Returns the result of converting the single-precision floating-point value
  3053. | `a' to the 64-bit two's complement integer format. The conversion is
  3054. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3055. | Arithmetic---which means in particular that the conversion is rounded
  3056. | according to the current rounding mode. If `a' is a NaN, the largest
  3057. | positive integer is returned. Otherwise, if the conversion overflows, the
  3058. | largest integer with the same sign as `a' is returned.
  3059. *----------------------------------------------------------------------------*}
  3060. function float32_to_int64( a: float32 ): int64;
  3061. var
  3062. aSign: flag;
  3063. aExp, shiftCount: int16;
  3064. aSig: bits32;
  3065. aSig64, aSigExtra: bits64;
  3066. begin
  3067. aSig := extractFloat32Frac( a );
  3068. aExp := extractFloat32Exp( a );
  3069. aSign := extractFloat32Sign( a );
  3070. shiftCount := $BE - aExp;
  3071. if ( shiftCount < 0 ) then begin
  3072. float_raise( float_flag_invalid );
  3073. if ( aSign = 0 ) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3074. result := $7FFFFFFFFFFFFFFF;
  3075. exit;
  3076. end;
  3077. result := $8000000000000000;
  3078. exit;
  3079. end;
  3080. if ( aExp <> 0 ) then aSig := aSig or $00800000;
  3081. aSig64 := aSig;
  3082. aSig64 := aSig64 shl 40;
  3083. shift64ExtraRightJamming( aSig64, 0, shiftCount, aSig64, aSigExtra );
  3084. result := roundAndPackInt64( aSign, aSig64, aSigExtra );
  3085. end;
  3086. {*----------------------------------------------------------------------------
  3087. | Returns the result of converting the single-precision floating-point value
  3088. | `a' to the 64-bit two's complement integer format. The conversion is
  3089. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3090. | Arithmetic, except that the conversion is always rounded toward zero. If
  3091. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  3092. | conversion overflows, the largest integer with the same sign as `a' is
  3093. | returned.
  3094. *----------------------------------------------------------------------------*}
  3095. function float32_to_int64_round_to_zero( a: float32 ): int64;
  3096. var
  3097. aSign: flag;
  3098. aExp, shiftCount: int16;
  3099. aSig: bits32;
  3100. aSig64: bits64;
  3101. z: int64;
  3102. begin
  3103. aSig := extractFloat32Frac( a );
  3104. aExp := extractFloat32Exp( a );
  3105. aSign := extractFloat32Sign( a );
  3106. shiftCount := aExp - $BE;
  3107. if ( 0 <= shiftCount ) then begin
  3108. if ( a <> $DF000000 ) then begin
  3109. float_raise( float_flag_invalid );
  3110. if ( aSign = 0) or ( ( aExp = $FF ) and ( aSig <> 0 ) ) then begin
  3111. result := $7FFFFFFFFFFFFFFF;
  3112. exit;
  3113. end;
  3114. end;
  3115. result := $8000000000000000;
  3116. exit;
  3117. end
  3118. else if ( aExp <= $7E ) then begin
  3119. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  3120. result := 0;
  3121. exit;
  3122. end;
  3123. aSig64 := aSig or $00800000;
  3124. aSig64 := aSig64 shl 40;
  3125. z := aSig64 shr ( - shiftCount );
  3126. if bits64( aSig64 shl ( shiftCount and 63 ) ) <> 0 then
  3127. set_inexact_flag;
  3128. if ( aSign <> 0 ) then z := - z;
  3129. result := z;
  3130. end;
  3131. {*
  3132. -------------------------------------------------------------------------------
  3133. Returns the result of converting the single-precision floating-point value
  3134. `a' to the double-precision floating-point format. The conversion is
  3135. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3136. Arithmetic.
  3137. -------------------------------------------------------------------------------
  3138. *}
  3139. Function float32_to_float64( a : float32rec) : Float64;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3140. Var
  3141. aSign : flag;
  3142. aExp : int16;
  3143. aSig, zSig0, zSig1: bits32;
  3144. tmp : CommonNanT;
  3145. Begin
  3146. aSig := extractFloat32Frac( a.float32 );
  3147. aExp := extractFloat32Exp( a.float32 );
  3148. aSign := extractFloat32Sign( a.float32 );
  3149. if ( aExp = $FF ) then
  3150. Begin
  3151. if ( aSig<>0 ) then
  3152. Begin
  3153. tmp:=float32ToCommonNaN(a.float32);
  3154. result:=commonNaNToFloat64(tmp);
  3155. exit;
  3156. End;
  3157. packFloat64( aSign, $7FF, 0, 0, result);
  3158. exit;
  3159. End;
  3160. if ( aExp = 0 ) then
  3161. Begin
  3162. if ( aSig = 0 ) then
  3163. Begin
  3164. packFloat64( aSign, 0, 0, 0, result );
  3165. exit;
  3166. end;
  3167. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3168. Dec(aExp);
  3169. End;
  3170. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  3171. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  3172. End;
  3173. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  3174. {*----------------------------------------------------------------------------
  3175. | Returns the result of converting the canonical NaN `a' to the extended
  3176. | double-precision floating-point format.
  3177. *----------------------------------------------------------------------------*}
  3178. function commonNaNToFloatx80( a : commonNaNT ) : floatx80;
  3179. var
  3180. z : floatx80;
  3181. begin
  3182. z.low := bits64( $C000000000000000 ) or ( a.high shr 1 );
  3183. z.high := ( bits16( a.sign ) shl 15 ) or $7FFF;
  3184. result := z;
  3185. end;
  3186. {*----------------------------------------------------------------------------
  3187. | Returns the result of converting the single-precision floating-point value
  3188. | `a' to the extended double-precision floating-point format. The conversion
  3189. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3190. | Arithmetic.
  3191. *----------------------------------------------------------------------------*}
  3192. function float32_to_floatx80( a: float32 ): floatx80;
  3193. var
  3194. aSign: flag;
  3195. aExp: int16;
  3196. aSig: bits32;
  3197. tmp: commonNaNT;
  3198. begin
  3199. aSig := extractFloat32Frac( a );
  3200. aExp := extractFloat32Exp( a );
  3201. aSign := extractFloat32Sign( a );
  3202. if ( aExp = $FF ) then begin
  3203. if ( aSig <> 0 ) then begin
  3204. tmp:=float32ToCommonNaN(a);
  3205. result := commonNaNToFloatx80( tmp );
  3206. exit;
  3207. end;
  3208. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  3209. exit;
  3210. end;
  3211. if ( aExp = 0 ) then begin
  3212. if ( aSig = 0 ) then begin
  3213. result := packFloatx80( aSign, 0, 0 );
  3214. exit;
  3215. end;
  3216. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3217. end;
  3218. aSig := aSig or $00800000;
  3219. result := packFloatx80( aSign, aExp + $3F80, bits64(aSig) shl 40 );
  3220. end;
  3221. {$endif FPC_SOFTFLOAT_FLOATX80}
  3222. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  3223. {*----------------------------------------------------------------------------
  3224. | Returns the result of converting the single-precision floating-point value
  3225. | `a' to the double-precision floating-point format. The conversion is
  3226. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3227. | Arithmetic.
  3228. *----------------------------------------------------------------------------*}
  3229. function float32_to_float128( a: float32 ): float128;
  3230. var
  3231. aSign: flag;
  3232. aExp: int16;
  3233. aSig: bits32;
  3234. tmp: commonNaNT;
  3235. begin
  3236. aSig := extractFloat32Frac( a );
  3237. aExp := extractFloat32Exp( a );
  3238. aSign := extractFloat32Sign( a );
  3239. if ( aExp = $FF ) then begin
  3240. if ( aSig <> 0 ) then begin
  3241. tmp:=float32ToCommonNaN(a);
  3242. result := commonNaNToFloat128( tmp );
  3243. exit;
  3244. end;
  3245. result := packFloat128( aSign, $7FFF, 0, 0 );
  3246. exit;
  3247. end;
  3248. if ( aExp = 0 ) then begin
  3249. if ( aSig = 0 ) then begin
  3250. result := packFloat128( aSign, 0, 0, 0 );
  3251. exit;
  3252. end;
  3253. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3254. dec( aExp );
  3255. end;
  3256. result := packFloat128( aSign, aExp + $3F80, bits64( aSig ) shl 25, 0 );
  3257. end;
  3258. {$endif FPC_SOFTFLOAT_FLOAT128}
  3259. {*
  3260. -------------------------------------------------------------------------------
  3261. Rounds the single-precision floating-point value `a' to an integer,
  3262. and returns the result as a single-precision floating-point value. The
  3263. operation is performed according to the IEC/IEEE Standard for Binary
  3264. Floating-Point Arithmetic.
  3265. -------------------------------------------------------------------------------
  3266. *}
  3267. Function float32_round_to_int( a: float32rec): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3268. Var
  3269. aSign: flag;
  3270. aExp: int16;
  3271. lastBitMask, roundBitsMask: bits32;
  3272. roundingMode: TFPURoundingMode;
  3273. z: float32;
  3274. Begin
  3275. aExp := extractFloat32Exp( a.float32 );
  3276. if ( $96 <= aExp ) then
  3277. Begin
  3278. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3279. Begin
  3280. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  3281. exit;
  3282. End;
  3283. float32_round_to_int:=a;
  3284. exit;
  3285. End;
  3286. if ( aExp <= $7E ) then
  3287. Begin
  3288. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  3289. Begin
  3290. float32_round_to_int:=a;
  3291. exit;
  3292. end;
  3293. set_inexact_flag;
  3294. aSign := extractFloat32Sign( a.float32 );
  3295. case ( softfloat_rounding_mode ) of
  3296. float_round_nearest_even:
  3297. Begin
  3298. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  3299. Begin
  3300. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  3301. exit;
  3302. End;
  3303. End;
  3304. float_round_down:
  3305. Begin
  3306. if aSign <> 0 then
  3307. float32_round_to_int.float32 := $BF800000
  3308. else
  3309. float32_round_to_int.float32 := 0;
  3310. exit;
  3311. End;
  3312. float_round_up:
  3313. Begin
  3314. if aSign <> 0 then
  3315. float32_round_to_int.float32 := $80000000
  3316. else
  3317. float32_round_to_int.float32 := $3F800000;
  3318. exit;
  3319. End;
  3320. end;
  3321. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  3322. exit;
  3323. End;
  3324. lastBitMask := 1;
  3325. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  3326. lastBitMask := lastBitMask shl ($96 - aExp);
  3327. roundBitsMask := lastBitMask - 1;
  3328. z := a.float32;
  3329. roundingMode := softfloat_rounding_mode;
  3330. if ( roundingMode = float_round_nearest_even ) then
  3331. Begin
  3332. z := z + (lastBitMask shr 1);
  3333. if ( ( z and roundBitsMask ) = 0 ) then
  3334. z := z and not lastBitMask;
  3335. End
  3336. else if ( roundingMode <> float_round_to_zero ) then
  3337. Begin
  3338. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  3339. Begin
  3340. z := z + roundBitsMask;
  3341. End;
  3342. End;
  3343. z := z and not roundBitsMask;
  3344. if ( z <> a.float32 ) then
  3345. set_inexact_flag;
  3346. float32_round_to_int.float32 := z;
  3347. End;
  3348. {*
  3349. -------------------------------------------------------------------------------
  3350. Returns the result of adding the absolute values of the single-precision
  3351. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3352. before being returned. `zSign' is ignored if the result is a NaN.
  3353. The addition is performed according to the IEC/IEEE Standard for Binary
  3354. Floating-Point Arithmetic.
  3355. -------------------------------------------------------------------------------
  3356. *}
  3357. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  3358. Var
  3359. aExp, bExp, zExp: int16;
  3360. aSig, bSig, zSig: bits32;
  3361. expDiff: int16;
  3362. label roundAndPack;
  3363. Begin
  3364. aSig:=extractFloat32Frac( a );
  3365. aExp:=extractFloat32Exp( a );
  3366. bSig:=extractFloat32Frac( b );
  3367. bExp := extractFloat32Exp( b );
  3368. expDiff := aExp - bExp;
  3369. aSig := aSig shl 6;
  3370. bSig := bSig shl 6;
  3371. if ( 0 < expDiff ) then
  3372. Begin
  3373. if ( aExp = $FF ) then
  3374. Begin
  3375. if ( aSig <> 0) then
  3376. Begin
  3377. addFloat32Sigs := propagateFloat32NaN( a, b );
  3378. exit;
  3379. End;
  3380. addFloat32Sigs := a;
  3381. exit;
  3382. End;
  3383. if ( bExp = 0 ) then
  3384. Begin
  3385. Dec(expDiff);
  3386. End
  3387. else
  3388. Begin
  3389. bSig := bSig or $20000000;
  3390. End;
  3391. shift32RightJamming( bSig, expDiff, bSig );
  3392. zExp := aExp;
  3393. End
  3394. else
  3395. If ( expDiff < 0 ) then
  3396. Begin
  3397. if ( bExp = $FF ) then
  3398. Begin
  3399. if ( bSig<>0 ) then
  3400. Begin
  3401. addFloat32Sigs := propagateFloat32NaN( a, b );
  3402. exit;
  3403. end;
  3404. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  3405. exit;
  3406. End;
  3407. if ( aExp = 0 ) then
  3408. Begin
  3409. Inc(expDiff);
  3410. End
  3411. else
  3412. Begin
  3413. aSig := aSig OR $20000000;
  3414. End;
  3415. shift32RightJamming( aSig, - expDiff, aSig );
  3416. zExp := bExp;
  3417. End
  3418. else
  3419. Begin
  3420. if ( aExp = $FF ) then
  3421. Begin
  3422. if ( aSig OR bSig )<> 0 then
  3423. Begin
  3424. addFloat32Sigs := propagateFloat32NaN( a, b );
  3425. exit;
  3426. end;
  3427. addFloat32Sigs := a;
  3428. exit;
  3429. End;
  3430. if ( aExp = 0 ) then
  3431. Begin
  3432. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3433. exit;
  3434. end;
  3435. zSig := $40000000 + aSig + bSig;
  3436. zExp := aExp;
  3437. goto roundAndPack;
  3438. End;
  3439. aSig := aSig OR $20000000;
  3440. zSig := ( aSig + bSig ) shl 1;
  3441. Dec(zExp);
  3442. if ( sbits32 (zSig) < 0 ) then
  3443. Begin
  3444. zSig := aSig + bSig;
  3445. Inc(zExp);
  3446. End;
  3447. roundAndPack:
  3448. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3449. End;
  3450. {*
  3451. -------------------------------------------------------------------------------
  3452. Returns the result of subtracting the absolute values of the single-
  3453. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3454. difference is negated before being returned. `zSign' is ignored if the
  3455. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3456. Standard for Binary Floating-Point Arithmetic.
  3457. -------------------------------------------------------------------------------
  3458. *}
  3459. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3460. Var
  3461. aExp, bExp, zExp: int16;
  3462. aSig, bSig, zSig: bits32;
  3463. expDiff : int16;
  3464. label aExpBigger;
  3465. label bExpBigger;
  3466. label aBigger;
  3467. label bBigger;
  3468. label normalizeRoundAndPack;
  3469. Begin
  3470. aSig := extractFloat32Frac( a );
  3471. aExp := extractFloat32Exp( a );
  3472. bSig := extractFloat32Frac( b );
  3473. bExp := extractFloat32Exp( b );
  3474. expDiff := aExp - bExp;
  3475. aSig := aSig shl 7;
  3476. bSig := bSig shl 7;
  3477. if ( 0 < expDiff ) then goto aExpBigger;
  3478. if ( expDiff < 0 ) then goto bExpBigger;
  3479. if ( aExp = $FF ) then
  3480. Begin
  3481. if ( aSig OR bSig )<> 0 then
  3482. Begin
  3483. subFloat32Sigs := propagateFloat32NaN( a, b );
  3484. exit;
  3485. End;
  3486. float_raise( float_flag_invalid );
  3487. subFloat32Sigs := float32_default_nan;
  3488. exit;
  3489. End;
  3490. if ( aExp = 0 ) then
  3491. Begin
  3492. aExp := 1;
  3493. bExp := 1;
  3494. End;
  3495. if ( bSig < aSig ) Then goto aBigger;
  3496. if ( aSig < bSig ) Then goto bBigger;
  3497. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3498. exit;
  3499. bExpBigger:
  3500. if ( bExp = $FF ) then
  3501. Begin
  3502. if ( bSig<>0 ) then
  3503. Begin
  3504. subFloat32Sigs := propagateFloat32NaN( a, b );
  3505. exit;
  3506. End;
  3507. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3508. exit;
  3509. End;
  3510. if ( aExp = 0 ) then
  3511. Begin
  3512. Inc(expDiff);
  3513. End
  3514. else
  3515. Begin
  3516. aSig := aSig OR $40000000;
  3517. End;
  3518. shift32RightJamming( aSig, - expDiff, aSig );
  3519. bSig := bSig OR $40000000;
  3520. bBigger:
  3521. zSig := bSig - aSig;
  3522. zExp := bExp;
  3523. zSign := zSign xor 1;
  3524. goto normalizeRoundAndPack;
  3525. aExpBigger:
  3526. if ( aExp = $FF ) then
  3527. Begin
  3528. if ( aSig <> 0) then
  3529. Begin
  3530. subFloat32Sigs := propagateFloat32NaN( a, b );
  3531. exit;
  3532. End;
  3533. subFloat32Sigs := a;
  3534. exit;
  3535. End;
  3536. if ( bExp = 0 ) then
  3537. Begin
  3538. Dec(expDiff);
  3539. End
  3540. else
  3541. Begin
  3542. bSig := bSig OR $40000000;
  3543. End;
  3544. shift32RightJamming( bSig, expDiff, bSig );
  3545. aSig := aSig OR $40000000;
  3546. aBigger:
  3547. zSig := aSig - bSig;
  3548. zExp := aExp;
  3549. normalizeRoundAndPack:
  3550. Dec(zExp);
  3551. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3552. End;
  3553. {*
  3554. -------------------------------------------------------------------------------
  3555. Returns the result of adding the single-precision floating-point values `a'
  3556. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3557. Binary Floating-Point Arithmetic.
  3558. -------------------------------------------------------------------------------
  3559. *}
  3560. Function float32_add( a: float32rec; b:float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3561. Var
  3562. aSign, bSign: Flag;
  3563. Begin
  3564. aSign := extractFloat32Sign( a.float32 );
  3565. bSign := extractFloat32Sign( b.float32 );
  3566. if ( aSign = bSign ) then
  3567. Begin
  3568. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3569. End
  3570. else
  3571. Begin
  3572. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3573. End;
  3574. End;
  3575. {*
  3576. -------------------------------------------------------------------------------
  3577. Returns the result of subtracting the single-precision floating-point values
  3578. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3579. for Binary Floating-Point Arithmetic.
  3580. -------------------------------------------------------------------------------
  3581. *}
  3582. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3583. Var
  3584. aSign, bSign: flag;
  3585. Begin
  3586. aSign := extractFloat32Sign( a.float32 );
  3587. bSign := extractFloat32Sign( b.float32 );
  3588. if ( aSign = bSign ) then
  3589. Begin
  3590. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3591. End
  3592. else
  3593. Begin
  3594. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3595. End;
  3596. End;
  3597. {*
  3598. -------------------------------------------------------------------------------
  3599. Returns the result of multiplying the single-precision floating-point values
  3600. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3601. for Binary Floating-Point Arithmetic.
  3602. -------------------------------------------------------------------------------
  3603. *}
  3604. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3605. Var
  3606. aSign, bSign, zSign: flag;
  3607. aExp, bExp, zExp : int16;
  3608. aSig, bSig, zSig0, zSig1: bits32;
  3609. Begin
  3610. aSig := extractFloat32Frac( a.float32 );
  3611. aExp := extractFloat32Exp( a.float32 );
  3612. aSign := extractFloat32Sign( a.float32 );
  3613. bSig := extractFloat32Frac( b.float32 );
  3614. bExp := extractFloat32Exp( b.float32 );
  3615. bSign := extractFloat32Sign( b.float32 );
  3616. zSign := aSign xor bSign;
  3617. if ( aExp = $FF ) then
  3618. Begin
  3619. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3620. Begin
  3621. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3622. exit;
  3623. End;
  3624. if ( ( bits32(bExp) OR bSig ) = 0 ) then
  3625. Begin
  3626. float_raise( float_flag_invalid );
  3627. float32_mul.float32 := float32_default_nan;
  3628. exit;
  3629. End;
  3630. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3631. exit;
  3632. End;
  3633. if ( bExp = $FF ) then
  3634. Begin
  3635. if ( bSig <> 0 ) then
  3636. Begin
  3637. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3638. exit;
  3639. End;
  3640. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3641. Begin
  3642. float_raise( float_flag_invalid );
  3643. float32_mul.float32 := float32_default_nan;
  3644. exit;
  3645. End;
  3646. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3647. exit;
  3648. End;
  3649. if ( aExp = 0 ) then
  3650. Begin
  3651. if ( aSig = 0 ) then
  3652. Begin
  3653. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3654. exit;
  3655. End;
  3656. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3657. End;
  3658. if ( bExp = 0 ) then
  3659. Begin
  3660. if ( bSig = 0 ) then
  3661. Begin
  3662. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3663. exit;
  3664. End;
  3665. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3666. End;
  3667. zExp := aExp + bExp - $7F;
  3668. aSig := ( aSig OR $00800000 ) shl 7;
  3669. bSig := ( bSig OR $00800000 ) shl 8;
  3670. mul32To64( aSig, bSig, zSig0, zSig1 );
  3671. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3672. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3673. Begin
  3674. zSig0 := zSig0 shl 1;
  3675. Dec(zExp);
  3676. End;
  3677. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3678. End;
  3679. {*
  3680. -------------------------------------------------------------------------------
  3681. Returns the result of dividing the single-precision floating-point value `a'
  3682. by the corresponding value `b'. The operation is performed according to the
  3683. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3684. -------------------------------------------------------------------------------
  3685. *}
  3686. Function float32_div(a: float32rec;b: float32rec ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3687. Var
  3688. aSign, bSign, zSign: flag;
  3689. aExp, bExp, zExp: int16;
  3690. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3691. Begin
  3692. aSig := extractFloat32Frac( a.float32 );
  3693. aExp := extractFloat32Exp( a.float32 );
  3694. aSign := extractFloat32Sign( a.float32 );
  3695. bSig := extractFloat32Frac( b.float32 );
  3696. bExp := extractFloat32Exp( b.float32 );
  3697. bSign := extractFloat32Sign( b.float32 );
  3698. zSign := aSign xor bSign;
  3699. if ( aExp = $FF ) then
  3700. Begin
  3701. if ( aSig <> 0 ) then
  3702. Begin
  3703. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3704. exit;
  3705. End;
  3706. if ( bExp = $FF ) then
  3707. Begin
  3708. if ( bSig <> 0) then
  3709. Begin
  3710. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3711. exit;
  3712. End;
  3713. float_raise( float_flag_invalid );
  3714. float32_div.float32 := float32_default_nan;
  3715. exit;
  3716. End;
  3717. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3718. exit;
  3719. End;
  3720. if ( bExp = $FF ) then
  3721. Begin
  3722. if ( bSig <> 0) then
  3723. Begin
  3724. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3725. exit;
  3726. End;
  3727. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3728. exit;
  3729. End;
  3730. if ( bExp = 0 ) Then
  3731. Begin
  3732. if ( bSig = 0 ) Then
  3733. Begin
  3734. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3735. Begin
  3736. float_raise( float_flag_invalid );
  3737. float32_div.float32 := float32_default_nan;
  3738. exit;
  3739. End;
  3740. float_raise( float_flag_divbyzero );
  3741. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3742. exit;
  3743. End;
  3744. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3745. End;
  3746. if ( aExp = 0 ) Then
  3747. Begin
  3748. if ( aSig = 0 ) Then
  3749. Begin
  3750. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3751. exit;
  3752. End;
  3753. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3754. End;
  3755. zExp := aExp - bExp + $7D;
  3756. aSig := ( aSig OR $00800000 ) shl 7;
  3757. bSig := ( bSig OR $00800000 ) shl 8;
  3758. if ( bSig <= ( aSig + aSig ) ) then
  3759. Begin
  3760. aSig := aSig shr 1;
  3761. Inc(zExp);
  3762. End;
  3763. zSig := estimateDiv64To32( aSig, 0, bSig );
  3764. if ( ( zSig and $3F ) <= 2 ) then
  3765. Begin
  3766. mul32To64( bSig, zSig, term0, term1 );
  3767. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3768. while ( sbits32 (rem0) < 0 ) do
  3769. Begin
  3770. Dec(zSig);
  3771. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3772. End;
  3773. zSig := zSig or bits32( rem1 <> 0 );
  3774. End;
  3775. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3776. End;
  3777. {*
  3778. -------------------------------------------------------------------------------
  3779. Returns the remainder of the single-precision floating-point value `a'
  3780. with respect to the corresponding value `b'. The operation is performed
  3781. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3782. -------------------------------------------------------------------------------
  3783. *}
  3784. Function float32_rem(a: float32rec; b: float32rec ):float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3785. Var
  3786. aSign, zSign: flag;
  3787. aExp, bExp, expDiff: int16;
  3788. aSig, bSig, q, alternateASig: bits32;
  3789. sigMean: sbits32;
  3790. Begin
  3791. aSig := extractFloat32Frac( a.float32 );
  3792. aExp := extractFloat32Exp( a.float32 );
  3793. aSign := extractFloat32Sign( a.float32 );
  3794. bSig := extractFloat32Frac( b.float32 );
  3795. bExp := extractFloat32Exp( b.float32 );
  3796. if ( aExp = $FF ) then
  3797. Begin
  3798. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3799. Begin
  3800. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3801. exit;
  3802. End;
  3803. float_raise( float_flag_invalid );
  3804. float32_rem.float32 := float32_default_nan;
  3805. exit;
  3806. End;
  3807. if ( bExp = $FF ) then
  3808. Begin
  3809. if ( bSig <> 0 ) then
  3810. Begin
  3811. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3812. exit;
  3813. End;
  3814. float32_rem := a;
  3815. exit;
  3816. End;
  3817. if ( bExp = 0 ) then
  3818. Begin
  3819. if ( bSig = 0 ) then
  3820. Begin
  3821. float_raise( float_flag_invalid );
  3822. float32_rem.float32 := float32_default_nan;
  3823. exit;
  3824. End;
  3825. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3826. End;
  3827. if ( aExp = 0 ) then
  3828. Begin
  3829. if ( aSig = 0 ) then
  3830. Begin
  3831. float32_rem := a;
  3832. exit;
  3833. End;
  3834. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3835. End;
  3836. expDiff := aExp - bExp;
  3837. aSig := ( aSig OR $00800000 ) shl 8;
  3838. bSig := ( bSig OR $00800000 ) shl 8;
  3839. if ( expDiff < 0 ) then
  3840. Begin
  3841. if ( expDiff < -1 ) then
  3842. Begin
  3843. float32_rem := a;
  3844. exit;
  3845. End;
  3846. aSig := aSig shr 1;
  3847. End;
  3848. q := bits32( bSig <= aSig );
  3849. if ( q <> 0) then
  3850. aSig := aSig - bSig;
  3851. expDiff := expDiff - 32;
  3852. while ( 0 < expDiff ) do
  3853. Begin
  3854. q := estimateDiv64To32( aSig, 0, bSig );
  3855. if (2 < q) then
  3856. q := q - 2
  3857. else
  3858. q := 0;
  3859. aSig := - ( ( bSig shr 2 ) * q );
  3860. expDiff := expDiff - 30;
  3861. End;
  3862. expDiff := expDiff + 32;
  3863. if ( 0 < expDiff ) then
  3864. Begin
  3865. q := estimateDiv64To32( aSig, 0, bSig );
  3866. if (2 < q) then
  3867. q := q - 2
  3868. else
  3869. q := 0;
  3870. q := q shr (32 - expDiff);
  3871. bSig := bSig shr 2;
  3872. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3873. End
  3874. else
  3875. Begin
  3876. aSig := aSig shr 2;
  3877. bSig := bSig shr 2;
  3878. End;
  3879. Repeat
  3880. alternateASig := aSig;
  3881. Inc(q);
  3882. aSig := aSig - bSig;
  3883. Until not ( 0 <= sbits32 (aSig) );
  3884. sigMean := aSig + alternateASig;
  3885. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3886. Begin
  3887. aSig := alternateASig;
  3888. End;
  3889. zSign := flag( sbits32 (aSig) < 0 );
  3890. if ( zSign<>0 ) then
  3891. aSig := - aSig;
  3892. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3893. End;
  3894. {*
  3895. -------------------------------------------------------------------------------
  3896. Returns the square root of the single-precision floating-point value `a'.
  3897. The operation is performed according to the IEC/IEEE Standard for Binary
  3898. Floating-Point Arithmetic.
  3899. -------------------------------------------------------------------------------
  3900. *}
  3901. Function float32_sqrt(a: float32rec ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3902. Var
  3903. aSign : flag;
  3904. aExp, zExp : int16;
  3905. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3906. label roundAndPack;
  3907. Begin
  3908. aSig := extractFloat32Frac( a.float32 );
  3909. aExp := extractFloat32Exp( a.float32 );
  3910. aSign := extractFloat32Sign( a.float32 );
  3911. if ( aExp = $FF ) then
  3912. Begin
  3913. if ( aSig <> 0) then
  3914. Begin
  3915. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3916. exit;
  3917. End;
  3918. if ( aSign = 0) then
  3919. Begin
  3920. float32_sqrt := a;
  3921. exit;
  3922. End;
  3923. float_raise( float_flag_invalid );
  3924. float32_sqrt.float32 := float32_default_nan;
  3925. exit;
  3926. End;
  3927. if ( aSign <> 0) then
  3928. Begin
  3929. if ( ( bits32(aExp) OR aSig ) = 0 ) then
  3930. Begin
  3931. float32_sqrt := a;
  3932. exit;
  3933. End;
  3934. float_raise( float_flag_invalid );
  3935. float32_sqrt.float32 := float32_default_nan;
  3936. exit;
  3937. End;
  3938. if ( aExp = 0 ) then
  3939. Begin
  3940. if ( aSig = 0 ) then
  3941. Begin
  3942. float32_sqrt.float32 := 0;
  3943. exit;
  3944. End;
  3945. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3946. End;
  3947. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3948. aSig := ( aSig OR $00800000 ) shl 8;
  3949. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3950. if ( ( zSig and $7F ) <= 5 ) then
  3951. Begin
  3952. if ( zSig < 2 ) then
  3953. Begin
  3954. zSig := $7FFFFFFF;
  3955. goto roundAndPack;
  3956. End
  3957. else
  3958. Begin
  3959. aSig := aSig shr (aExp and 1);
  3960. mul32To64( zSig, zSig, term0, term1 );
  3961. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3962. while ( sbits32 (rem0) < 0 ) do
  3963. Begin
  3964. Dec(zSig);
  3965. shortShift64Left( 0, zSig, 1, term0, term1 );
  3966. term1 := term1 or 1;
  3967. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3968. End;
  3969. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3970. End;
  3971. End;
  3972. shift32RightJamming( zSig, 1, zSig );
  3973. roundAndPack:
  3974. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3975. End;
  3976. {*
  3977. -------------------------------------------------------------------------------
  3978. Returns 1 if the single-precision floating-point value `a' is equal to
  3979. the corresponding value `b', and 0 otherwise. The comparison is performed
  3980. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3981. -------------------------------------------------------------------------------
  3982. *}
  3983. Function float32_eq( a:float32rec; b:float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  3984. Begin
  3985. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3986. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3987. ) then
  3988. Begin
  3989. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3990. Begin
  3991. float_raise( float_flag_invalid );
  3992. End;
  3993. float32_eq := 0;
  3994. exit;
  3995. End;
  3996. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3997. End;
  3998. {*
  3999. -------------------------------------------------------------------------------
  4000. Returns 1 if the single-precision floating-point value `a' is less than
  4001. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4002. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4003. Arithmetic.
  4004. -------------------------------------------------------------------------------
  4005. *}
  4006. Function float32_le( a: float32rec; b : float32rec ):flag;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4007. var
  4008. aSign, bSign: flag;
  4009. Begin
  4010. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  4011. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  4012. ) then
  4013. Begin
  4014. float_raise( float_flag_invalid );
  4015. float32_le := 0;
  4016. exit;
  4017. End;
  4018. aSign := extractFloat32Sign( a.float32 );
  4019. bSign := extractFloat32Sign( b.float32 );
  4020. if ( aSign <> bSign ) then
  4021. Begin
  4022. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  4023. exit;
  4024. End;
  4025. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  4026. End;
  4027. {*
  4028. -------------------------------------------------------------------------------
  4029. Returns 1 if the single-precision floating-point value `a' is less than
  4030. the corresponding value `b', and 0 otherwise. The comparison is performed
  4031. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4032. -------------------------------------------------------------------------------
  4033. *}
  4034. Function float32_lt( a:float32rec ; b : float32rec): flag; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4035. var
  4036. aSign, bSign: flag;
  4037. Begin
  4038. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  4039. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  4040. ) then
  4041. Begin
  4042. float_raise( float_flag_invalid );
  4043. float32_lt :=0;
  4044. exit;
  4045. End;
  4046. aSign := extractFloat32Sign( a.float32 );
  4047. bSign := extractFloat32Sign( b.float32 );
  4048. if ( aSign <> bSign ) then
  4049. Begin
  4050. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  4051. exit;
  4052. End;
  4053. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  4054. End;
  4055. {*
  4056. -------------------------------------------------------------------------------
  4057. Returns 1 if the single-precision floating-point value `a' is equal to
  4058. the corresponding value `b', and 0 otherwise. The invalid exception is
  4059. raised if either operand is a NaN. Otherwise, the comparison is performed
  4060. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4061. -------------------------------------------------------------------------------
  4062. *}
  4063. Function float32_eq_signaling( a: float32; b: float32) : flag;
  4064. Begin
  4065. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  4066. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  4067. ) then
  4068. Begin
  4069. float_raise( float_flag_invalid );
  4070. float32_eq_signaling := 0;
  4071. exit;
  4072. End;
  4073. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  4074. End;
  4075. {*
  4076. -------------------------------------------------------------------------------
  4077. Returns 1 if the single-precision floating-point value `a' is less than or
  4078. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4079. cause an exception. Otherwise, the comparison is performed according to the
  4080. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4081. -------------------------------------------------------------------------------
  4082. *}
  4083. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  4084. Var
  4085. aSign, bSign: flag;
  4086. Begin
  4087. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4088. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4089. ) then
  4090. Begin
  4091. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4092. Begin
  4093. float_raise( float_flag_invalid );
  4094. End;
  4095. float32_le_quiet := 0;
  4096. exit;
  4097. End;
  4098. aSign := extractFloat32Sign( a );
  4099. bSign := extractFloat32Sign( b );
  4100. if ( aSign <> bSign ) then
  4101. Begin
  4102. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  4103. exit;
  4104. End;
  4105. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  4106. End;
  4107. {*
  4108. -------------------------------------------------------------------------------
  4109. Returns 1 if the single-precision floating-point value `a' is less than
  4110. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4111. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4112. Standard for Binary Floating-Point Arithmetic.
  4113. -------------------------------------------------------------------------------
  4114. *}
  4115. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  4116. Var
  4117. aSign, bSign: flag;
  4118. Begin
  4119. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  4120. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  4121. ) then
  4122. Begin
  4123. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  4124. Begin
  4125. float_raise( float_flag_invalid );
  4126. End;
  4127. float32_lt_quiet := 0;
  4128. exit;
  4129. End;
  4130. aSign := extractFloat32Sign( a );
  4131. bSign := extractFloat32Sign( b );
  4132. if ( aSign <> bSign ) then
  4133. Begin
  4134. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  4135. exit;
  4136. End;
  4137. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  4138. End;
  4139. {*
  4140. -------------------------------------------------------------------------------
  4141. Returns the result of converting the double-precision floating-point value
  4142. `a' to the 32-bit two's complement integer format. The conversion is
  4143. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4144. Arithmetic---which means in particular that the conversion is rounded
  4145. according to the current rounding mode. If `a' is a NaN, the largest
  4146. positive integer is returned. Otherwise, if the conversion overflows, the
  4147. largest integer with the same sign as `a' is returned.
  4148. -------------------------------------------------------------------------------
  4149. *}
  4150. Function float64_to_int32(a: float64): int32;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  4151. var
  4152. aSign: flag;
  4153. aExp, shiftCount: int16;
  4154. aSig0, aSig1, absZ, aSigExtra: bits32;
  4155. z: int32;
  4156. roundingMode: TFPURoundingMode;
  4157. label invalid;
  4158. Begin
  4159. aSig1 := extractFloat64Frac1( a );
  4160. aSig0 := extractFloat64Frac0( a );
  4161. aExp := extractFloat64Exp( a );
  4162. aSign := extractFloat64Sign( a );
  4163. shiftCount := aExp - $413;
  4164. if ( 0 <= shiftCount ) then
  4165. Begin
  4166. if ( $41E < aExp ) then
  4167. Begin
  4168. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4169. aSign := 0;
  4170. goto invalid;
  4171. End;
  4172. shortShift64Left(
  4173. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4174. if ( $80000000 < absZ ) then
  4175. goto invalid;
  4176. End
  4177. else
  4178. Begin
  4179. aSig1 := flag( aSig1 <> 0 );
  4180. if ( aExp < $3FE ) then
  4181. Begin
  4182. aSigExtra := aExp OR aSig0 OR aSig1;
  4183. absZ := 0;
  4184. End
  4185. else
  4186. Begin
  4187. aSig0 := aSig0 OR $00100000;
  4188. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4189. absZ := aSig0 shr ( - shiftCount );
  4190. End;
  4191. End;
  4192. roundingMode := softfloat_rounding_mode;
  4193. if ( roundingMode = float_round_nearest_even ) then
  4194. Begin
  4195. if ( sbits32(aSigExtra) < 0 ) then
  4196. Begin
  4197. Inc(absZ);
  4198. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  4199. absZ := absZ and not 1;
  4200. End;
  4201. if aSign <> 0 then
  4202. z := - absZ
  4203. else
  4204. z := absZ;
  4205. End
  4206. else
  4207. Begin
  4208. aSigExtra := bits32( aSigExtra <> 0 );
  4209. if ( aSign <> 0) then
  4210. Begin
  4211. z := - ( absZ
  4212. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  4213. End
  4214. else
  4215. Begin
  4216. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  4217. End
  4218. End;
  4219. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  4220. Begin
  4221. invalid:
  4222. float_raise( float_flag_invalid );
  4223. if (aSign <> 0 ) then
  4224. float64_to_int32 := sbits32 ($80000000)
  4225. else
  4226. float64_to_int32 := $7FFFFFFF;
  4227. exit;
  4228. End;
  4229. if ( aSigExtra <> 0) then
  4230. set_inexact_flag;
  4231. float64_to_int32 := z;
  4232. End;
  4233. {*
  4234. -------------------------------------------------------------------------------
  4235. Returns the result of converting the double-precision floating-point value
  4236. `a' to the 32-bit two's complement integer format. The conversion is
  4237. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4238. Arithmetic, except that the conversion is always rounded toward zero.
  4239. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4240. the conversion overflows, the largest integer with the same sign as `a' is
  4241. returned.
  4242. -------------------------------------------------------------------------------
  4243. *}
  4244. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  4245. {$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  4246. Var
  4247. aSign: flag;
  4248. aExp, shiftCount: int16;
  4249. aSig0, aSig1, absZ, aSigExtra: bits32;
  4250. z: int32;
  4251. label invalid;
  4252. Begin
  4253. aSig1 := extractFloat64Frac1( a );
  4254. aSig0 := extractFloat64Frac0( a );
  4255. aExp := extractFloat64Exp( a );
  4256. aSign := extractFloat64Sign( a );
  4257. shiftCount := aExp - $413;
  4258. if ( 0 <= shiftCount ) then
  4259. Begin
  4260. if ( $41E < aExp ) then
  4261. Begin
  4262. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  4263. aSign := 0;
  4264. goto invalid;
  4265. End;
  4266. shortShift64Left(
  4267. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  4268. End
  4269. else
  4270. Begin
  4271. if ( aExp < $3FF ) then
  4272. Begin
  4273. if ( bits32(aExp) OR aSig0 OR aSig1 )<>0 then
  4274. Begin
  4275. set_inexact_flag;
  4276. End;
  4277. float64_to_int32_round_to_zero := 0;
  4278. exit;
  4279. End;
  4280. aSig0 := aSig0 or $00100000;
  4281. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  4282. absZ := aSig0 shr ( - shiftCount );
  4283. End;
  4284. if aSign <> 0 then
  4285. z := - absZ
  4286. else
  4287. z := absZ;
  4288. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  4289. Begin
  4290. invalid:
  4291. float_raise( float_flag_invalid );
  4292. if (aSign <> 0) then
  4293. float64_to_int32_round_to_zero := sbits32 ($80000000)
  4294. else
  4295. float64_to_int32_round_to_zero := $7FFFFFFF;
  4296. exit;
  4297. End;
  4298. if ( aSigExtra <> 0) then
  4299. set_inexact_flag;
  4300. float64_to_int32_round_to_zero := z;
  4301. End;
  4302. {*----------------------------------------------------------------------------
  4303. | Returns the result of converting the double-precision floating-point value
  4304. | `a' to the 64-bit two's complement integer format. The conversion is
  4305. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4306. | Arithmetic---which means in particular that the conversion is rounded
  4307. | according to the current rounding mode. If `a' is a NaN, the largest
  4308. | positive integer is returned. Otherwise, if the conversion overflows, the
  4309. | largest integer with the same sign as `a' is returned.
  4310. *----------------------------------------------------------------------------*}
  4311. function float64_to_int64( a: float64 ): int64;
  4312. var
  4313. aSign: flag;
  4314. aExp, shiftCount: int16;
  4315. aSig, aSigExtra: bits64;
  4316. begin
  4317. aSig := extractFloat64Frac( a );
  4318. aExp := extractFloat64Exp( a );
  4319. aSign := extractFloat64Sign( a );
  4320. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4321. shiftCount := $433 - aExp;
  4322. if ( shiftCount <= 0 ) then begin
  4323. if ( $43E < aExp ) then begin
  4324. float_raise( float_flag_invalid );
  4325. if ( ( aSign = 0 )
  4326. or ( ( aExp = $7FF )
  4327. and ( aSig <> $0010000000000000 ) )
  4328. ) then begin
  4329. result := $7FFFFFFFFFFFFFFF;
  4330. exit;
  4331. end;
  4332. result := $8000000000000000;
  4333. exit;
  4334. end;
  4335. aSigExtra := 0;
  4336. aSig := aSig shl ( - shiftCount );
  4337. end
  4338. else
  4339. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  4340. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  4341. end;
  4342. {*----------------------------------------------------------------------------
  4343. | Returns the result of converting the double-precision floating-point value
  4344. | `a' to the 64-bit two's complement integer format. The conversion is
  4345. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  4346. | Arithmetic, except that the conversion is always rounded toward zero.
  4347. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4348. | the conversion overflows, the largest integer with the same sign as `a' is
  4349. | returned.
  4350. *----------------------------------------------------------------------------*}
  4351. {$define FPC_SYSTEM_HAS_float64_to_int64_round_to_zero}
  4352. function float64_to_int64_round_to_zero( a: float64 ): int64;
  4353. var
  4354. aSign: flag;
  4355. aExp, shiftCount: int16;
  4356. aSig: bits64;
  4357. z: int64;
  4358. begin
  4359. aSig := extractFloat64Frac( a );
  4360. aExp := extractFloat64Exp( a );
  4361. aSign := extractFloat64Sign( a );
  4362. if ( aExp <> 0 ) then aSig := aSig or $0010000000000000;
  4363. shiftCount := aExp - $433;
  4364. if ( 0 <= shiftCount ) then begin
  4365. if ( $43E <= aExp ) then begin
  4366. if ( bits64 ( a ) <> bits64( $C3E0000000000000 ) ) then begin
  4367. float_raise( float_flag_invalid );
  4368. if ( ( aSign = 0 )
  4369. or ( ( aExp = $7FF )
  4370. and ( aSig <> $0010000000000000 ) )
  4371. ) then begin
  4372. result := $7FFFFFFFFFFFFFFF;
  4373. exit;
  4374. end;
  4375. end;
  4376. result := $8000000000000000;
  4377. exit;
  4378. end;
  4379. z := aSig shl shiftCount;
  4380. end
  4381. else begin
  4382. if ( aExp < $3FE ) then begin
  4383. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  4384. result := 0;
  4385. exit;
  4386. end;
  4387. z := aSig shr ( - shiftCount );
  4388. if ( bits64( aSig shl ( shiftCount and 63 ) ) <> 0 ) then
  4389. set_inexact_flag;
  4390. end;
  4391. if ( aSign <> 0 ) then z := - z;
  4392. result := z;
  4393. end;
  4394. {*
  4395. -------------------------------------------------------------------------------
  4396. Returns the result of converting the double-precision floating-point value
  4397. `a' to the single-precision floating-point format. The conversion is
  4398. performed according to the IEC/IEEE Standard for Binary Floating-Point
  4399. Arithmetic.
  4400. -------------------------------------------------------------------------------
  4401. *}
  4402. Function float64_to_float32(a: float64 ): float32rec;{$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  4403. Var
  4404. aSign: flag;
  4405. aExp: int16;
  4406. aSig0, aSig1, zSig: bits32;
  4407. allZero: bits32;
  4408. tmp : CommonNanT;
  4409. Begin
  4410. aSig1 := extractFloat64Frac1( a );
  4411. aSig0 := extractFloat64Frac0( a );
  4412. aExp := extractFloat64Exp( a );
  4413. aSign := extractFloat64Sign( a );
  4414. if ( aExp = $7FF ) then
  4415. Begin
  4416. if ( aSig0 OR aSig1 ) <> 0 then
  4417. Begin
  4418. tmp:=float64ToCommonNaN(a);
  4419. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  4420. exit;
  4421. End;
  4422. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  4423. exit;
  4424. End;
  4425. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  4426. if ( aExp <> 0) then
  4427. zSig := zSig OR $40000000;
  4428. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  4429. End;
  4430. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  4431. {*----------------------------------------------------------------------------
  4432. | Returns the result of converting the double-precision floating-point value
  4433. | `a' to the extended double-precision floating-point format. The conversion
  4434. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4435. | Arithmetic.
  4436. *----------------------------------------------------------------------------*}
  4437. function float64_to_floatx80( a: float64 ): floatx80;
  4438. var
  4439. aSign: flag;
  4440. aExp: int16;
  4441. aSig: bits64;
  4442. begin
  4443. aSig := extractFloat64Frac( a );
  4444. aExp := extractFloat64Exp( a );
  4445. aSign := extractFloat64Sign( a );
  4446. if ( aExp = $7FF ) then begin
  4447. if ( aSig <> 0 ) then begin
  4448. result := commonNaNToFloatx80( float64ToCommonNaN( a ) );
  4449. exit;
  4450. end;
  4451. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  4452. exit;
  4453. end;
  4454. if ( aExp = 0 ) then begin
  4455. if ( aSig = 0 ) then begin
  4456. result := packFloatx80( aSign, 0, 0 );
  4457. exit;
  4458. end;
  4459. normalizeFloat64Subnormal( aSig, aExp, aSig );
  4460. end;
  4461. result :=
  4462. packFloatx80(
  4463. aSign, aExp + $3C00, ( aSig or $0010000000000000 ) shl 11 );
  4464. end;
  4465. {$endif FPC_SOFTFLOAT_FLOATX80}
  4466. {*
  4467. -------------------------------------------------------------------------------
  4468. Rounds the double-precision floating-point value `a' to an integer,
  4469. and returns the result as a double-precision floating-point value. The
  4470. operation is performed according to the IEC/IEEE Standard for Binary
  4471. Floating-Point Arithmetic.
  4472. -------------------------------------------------------------------------------
  4473. *}
  4474. function float64_round_to_int(a: float64) : Float64;{$ifdef FPC_IS_SYSTEM} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  4475. Var
  4476. aSign: flag;
  4477. aExp: int16;
  4478. lastBitMask, roundBitsMask: bits32;
  4479. roundingMode: TFPURoundingMode;
  4480. z: float64;
  4481. Begin
  4482. aExp := extractFloat64Exp( a );
  4483. if ( $413 <= aExp ) then
  4484. Begin
  4485. if ( $433 <= aExp ) then
  4486. Begin
  4487. if ( ( aExp = $7FF )
  4488. AND
  4489. (
  4490. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  4491. ) <>0)
  4492. ) then
  4493. Begin
  4494. propagateFloat64NaN( a, a, result );
  4495. exit;
  4496. End;
  4497. result := a;
  4498. exit;
  4499. End;
  4500. lastBitMask := 1;
  4501. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  4502. roundBitsMask := lastBitMask - 1;
  4503. z := a;
  4504. roundingMode := softfloat_rounding_mode;
  4505. if ( roundingMode = float_round_nearest_even ) then
  4506. Begin
  4507. if ( lastBitMask <> 0) then
  4508. Begin
  4509. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  4510. if ( ( z.low and roundBitsMask ) = 0 ) then
  4511. z.low := z.low and not lastBitMask;
  4512. End
  4513. else
  4514. Begin
  4515. if ( sbits32 (z.low) < 0 ) then
  4516. Begin
  4517. Inc(z.high);
  4518. if ( bits32 ( z.low shl 1 ) = 0 ) then
  4519. z.high := z.high and not 1;
  4520. End;
  4521. End;
  4522. End
  4523. else if ( roundingMode <> float_round_to_zero ) then
  4524. Begin
  4525. if ( extractFloat64Sign( z )
  4526. xor flag( roundingMode = float_round_up ) )<> 0 then
  4527. Begin
  4528. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  4529. End;
  4530. End;
  4531. z.low := z.low and not roundBitsMask;
  4532. End
  4533. else
  4534. Begin
  4535. if ( aExp <= $3FE ) then
  4536. Begin
  4537. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4538. Begin
  4539. result := a;
  4540. exit;
  4541. End;
  4542. set_inexact_flag;
  4543. aSign := extractFloat64Sign( a );
  4544. case ( softfloat_rounding_mode ) of
  4545. float_round_nearest_even:
  4546. Begin
  4547. if ( ( aExp = $3FE )
  4548. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4549. ) then
  4550. Begin
  4551. packFloat64( aSign, $3FF, 0, 0, result );
  4552. exit;
  4553. End;
  4554. End;
  4555. float_round_down:
  4556. Begin
  4557. if aSign<>0 then
  4558. packFloat64( 1, $3FF, 0, 0, result )
  4559. else
  4560. packFloat64( 0, 0, 0, 0, result );
  4561. exit;
  4562. End;
  4563. float_round_up:
  4564. Begin
  4565. if aSign <> 0 then
  4566. packFloat64( 1, 0, 0, 0, result )
  4567. else
  4568. packFloat64( 0, $3FF, 0, 0, result );
  4569. exit;
  4570. End;
  4571. end;
  4572. packFloat64( aSign, 0, 0, 0, result );
  4573. exit;
  4574. End;
  4575. lastBitMask := 1;
  4576. lastBitMask := lastBitMask shl ($413 - aExp);
  4577. roundBitsMask := lastBitMask - 1;
  4578. z.low := 0;
  4579. z.high := a.high;
  4580. roundingMode := softfloat_rounding_mode;
  4581. if ( roundingMode = float_round_nearest_even ) then
  4582. Begin
  4583. z.high := z.high + lastBitMask shr 1;
  4584. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4585. Begin
  4586. z.high := z.high and not lastBitMask;
  4587. End;
  4588. End
  4589. else if ( roundingMode <> float_round_to_zero ) then
  4590. Begin
  4591. if ( extractFloat64Sign( z )
  4592. xor flag( roundingMode = float_round_up ) )<> 0 then
  4593. Begin
  4594. z.high := z.high or bits32( a.low <> 0 );
  4595. z.high := z.high + roundBitsMask;
  4596. End;
  4597. End;
  4598. z.high := z.high and not roundBitsMask;
  4599. End;
  4600. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4601. Begin
  4602. set_inexact_flag;
  4603. End;
  4604. result := z;
  4605. End;
  4606. {*
  4607. -------------------------------------------------------------------------------
  4608. Returns the result of adding the absolute values of the double-precision
  4609. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4610. before being returned. `zSign' is ignored if the result is a NaN.
  4611. The addition is performed according to the IEC/IEEE Standard for Binary
  4612. Floating-Point Arithmetic.
  4613. -------------------------------------------------------------------------------
  4614. *}
  4615. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4616. Var
  4617. aExp, bExp, zExp: int16;
  4618. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4619. expDiff: int16;
  4620. label shiftRight1;
  4621. label roundAndPack;
  4622. Begin
  4623. aSig1 := extractFloat64Frac1( a );
  4624. aSig0 := extractFloat64Frac0( a );
  4625. aExp := extractFloat64Exp( a );
  4626. bSig1 := extractFloat64Frac1( b );
  4627. bSig0 := extractFloat64Frac0( b );
  4628. bExp := extractFloat64Exp( b );
  4629. expDiff := aExp - bExp;
  4630. if ( 0 < expDiff ) then
  4631. Begin
  4632. if ( aExp = $7FF ) then
  4633. Begin
  4634. if ( aSig0 OR aSig1 ) <> 0 then
  4635. Begin
  4636. propagateFloat64NaN( a, b, out );
  4637. exit;
  4638. end;
  4639. out := a;
  4640. exit;
  4641. End;
  4642. if ( bExp = 0 ) then
  4643. Begin
  4644. Dec(expDiff);
  4645. End
  4646. else
  4647. Begin
  4648. bSig0 := bSig0 or $00100000;
  4649. End;
  4650. shift64ExtraRightJamming(
  4651. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4652. zExp := aExp;
  4653. End
  4654. else if ( expDiff < 0 ) then
  4655. Begin
  4656. if ( bExp = $7FF ) then
  4657. Begin
  4658. if ( bSig0 OR bSig1 ) <> 0 then
  4659. Begin
  4660. propagateFloat64NaN( a, b, out );
  4661. exit;
  4662. End;
  4663. packFloat64( zSign, $7FF, 0, 0, out );
  4664. exit;
  4665. End;
  4666. if ( aExp = 0 ) then
  4667. Begin
  4668. Inc(expDiff);
  4669. End
  4670. else
  4671. Begin
  4672. aSig0 := aSig0 or $00100000;
  4673. End;
  4674. shift64ExtraRightJamming(
  4675. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4676. zExp := bExp;
  4677. End
  4678. else
  4679. Begin
  4680. if ( aExp = $7FF ) then
  4681. Begin
  4682. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4683. Begin
  4684. propagateFloat64NaN( a, b, out );
  4685. exit;
  4686. End;
  4687. out := a;
  4688. exit;
  4689. End;
  4690. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4691. if ( aExp = 0 ) then
  4692. Begin
  4693. packFloat64( zSign, 0, zSig0, zSig1, out );
  4694. exit;
  4695. End;
  4696. zSig2 := 0;
  4697. zSig0 := zSig0 or $00200000;
  4698. zExp := aExp;
  4699. goto shiftRight1;
  4700. End;
  4701. aSig0 := aSig0 or $00100000;
  4702. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4703. Dec(zExp);
  4704. if ( zSig0 < $00200000 ) then
  4705. goto roundAndPack;
  4706. Inc(zExp);
  4707. shiftRight1:
  4708. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4709. roundAndPack:
  4710. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4711. End;
  4712. {*
  4713. -------------------------------------------------------------------------------
  4714. Returns the result of subtracting the absolute values of the double-
  4715. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4716. difference is negated before being returned. `zSign' is ignored if the
  4717. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4718. Standard for Binary Floating-Point Arithmetic.
  4719. -------------------------------------------------------------------------------
  4720. *}
  4721. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4722. Var
  4723. aExp, bExp, zExp: int16;
  4724. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4725. expDiff: int16;
  4726. z: float64;
  4727. label aExpBigger;
  4728. label bExpBigger;
  4729. label aBigger;
  4730. label bBigger;
  4731. label normalizeRoundAndPack;
  4732. Begin
  4733. aSig1 := extractFloat64Frac1( a );
  4734. aSig0 := extractFloat64Frac0( a );
  4735. aExp := extractFloat64Exp( a );
  4736. bSig1 := extractFloat64Frac1( b );
  4737. bSig0 := extractFloat64Frac0( b );
  4738. bExp := extractFloat64Exp( b );
  4739. expDiff := aExp - bExp;
  4740. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4741. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4742. if ( 0 < expDiff ) then goto aExpBigger;
  4743. if ( expDiff < 0 ) then goto bExpBigger;
  4744. if ( aExp = $7FF ) then
  4745. Begin
  4746. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4747. Begin
  4748. propagateFloat64NaN( a, b, out );
  4749. exit;
  4750. End;
  4751. float_raise( float_flag_invalid );
  4752. z.low := float64_default_nan_low;
  4753. z.high := float64_default_nan_high;
  4754. out := z;
  4755. exit;
  4756. End;
  4757. if ( aExp = 0 ) then
  4758. Begin
  4759. aExp := 1;
  4760. bExp := 1;
  4761. End;
  4762. if ( bSig0 < aSig0 ) then goto aBigger;
  4763. if ( aSig0 < bSig0 ) then goto bBigger;
  4764. if ( bSig1 < aSig1 ) then goto aBigger;
  4765. if ( aSig1 < bSig1 ) then goto bBigger;
  4766. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4767. exit;
  4768. bExpBigger:
  4769. if ( bExp = $7FF ) then
  4770. Begin
  4771. if ( bSig0 OR bSig1 ) <> 0 then
  4772. Begin
  4773. propagateFloat64NaN( a, b, out );
  4774. exit;
  4775. End;
  4776. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4777. exit;
  4778. End;
  4779. if ( aExp = 0 ) then
  4780. Begin
  4781. Inc(expDiff);
  4782. End
  4783. else
  4784. Begin
  4785. aSig0 := aSig0 or $40000000;
  4786. End;
  4787. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4788. bSig0 := bSig0 or $40000000;
  4789. bBigger:
  4790. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4791. zExp := bExp;
  4792. zSign := zSign xor 1;
  4793. goto normalizeRoundAndPack;
  4794. aExpBigger:
  4795. if ( aExp = $7FF ) then
  4796. Begin
  4797. if ( aSig0 OR aSig1 ) <> 0 then
  4798. Begin
  4799. propagateFloat64NaN( a, b, out );
  4800. exit;
  4801. End;
  4802. out := a;
  4803. exit;
  4804. End;
  4805. if ( bExp = 0 ) then
  4806. Begin
  4807. Dec(expDiff);
  4808. End
  4809. else
  4810. Begin
  4811. bSig0 := bSig0 or $40000000;
  4812. End;
  4813. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4814. aSig0 := aSig0 or $40000000;
  4815. aBigger:
  4816. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4817. zExp := aExp;
  4818. normalizeRoundAndPack:
  4819. Dec(zExp);
  4820. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4821. End;
  4822. {*
  4823. -------------------------------------------------------------------------------
  4824. Returns the result of adding the double-precision floating-point values `a'
  4825. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4826. Binary Floating-Point Arithmetic.
  4827. -------------------------------------------------------------------------------
  4828. *}
  4829. Function float64_add( a: float64; b : float64) : Float64;
  4830. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4831. Var
  4832. aSign, bSign: flag;
  4833. Begin
  4834. aSign := extractFloat64Sign( a );
  4835. bSign := extractFloat64Sign( b );
  4836. if ( aSign = bSign ) then
  4837. Begin
  4838. addFloat64Sigs( a, b, aSign, result );
  4839. End
  4840. else
  4841. Begin
  4842. subFloat64Sigs( a, b, aSign, result );
  4843. End;
  4844. End;
  4845. {*
  4846. -------------------------------------------------------------------------------
  4847. Returns the result of subtracting the double-precision floating-point values
  4848. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4849. for Binary Floating-Point Arithmetic.
  4850. -------------------------------------------------------------------------------
  4851. *}
  4852. Function float64_sub(a: float64; b : float64) : Float64;
  4853. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4854. Var
  4855. aSign, bSign: flag;
  4856. Begin
  4857. aSign := extractFloat64Sign( a );
  4858. bSign := extractFloat64Sign( b );
  4859. if ( aSign = bSign ) then
  4860. Begin
  4861. subFloat64Sigs( a, b, aSign, result );
  4862. End
  4863. else
  4864. Begin
  4865. addFloat64Sigs( a, b, aSign, result );
  4866. End;
  4867. End;
  4868. {*
  4869. -------------------------------------------------------------------------------
  4870. Returns the result of multiplying the double-precision floating-point values
  4871. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4872. for Binary Floating-Point Arithmetic.
  4873. -------------------------------------------------------------------------------
  4874. *}
  4875. Function float64_mul( a: float64; b:float64) : Float64;
  4876. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4877. Var
  4878. aSign, bSign, zSign: flag;
  4879. aExp, bExp, zExp: int16;
  4880. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4881. z: float64;
  4882. label invalid;
  4883. Begin
  4884. aSig1 := extractFloat64Frac1( a );
  4885. aSig0 := extractFloat64Frac0( a );
  4886. aExp := extractFloat64Exp( a );
  4887. aSign := extractFloat64Sign( a );
  4888. bSig1 := extractFloat64Frac1( b );
  4889. bSig0 := extractFloat64Frac0( b );
  4890. bExp := extractFloat64Exp( b );
  4891. bSign := extractFloat64Sign( b );
  4892. zSign := aSign xor bSign;
  4893. if ( aExp = $7FF ) then
  4894. Begin
  4895. if ( (( aSig0 OR aSig1 ) <>0)
  4896. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4897. Begin
  4898. propagateFloat64NaN( a, b, result );
  4899. exit;
  4900. End;
  4901. if ( ( bits32(bExp) OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4902. packFloat64( zSign, $7FF, 0, 0, result );
  4903. exit;
  4904. End;
  4905. if ( bExp = $7FF ) then
  4906. Begin
  4907. if ( bSig0 OR bSig1 )<> 0 then
  4908. Begin
  4909. propagateFloat64NaN( a, b, result );
  4910. exit;
  4911. End;
  4912. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4913. Begin
  4914. invalid:
  4915. float_raise( float_flag_invalid );
  4916. z.low := float64_default_nan_low;
  4917. z.high := float64_default_nan_high;
  4918. result := z;
  4919. exit;
  4920. End;
  4921. packFloat64( zSign, $7FF, 0, 0, result );
  4922. exit;
  4923. End;
  4924. if ( aExp = 0 ) then
  4925. Begin
  4926. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4927. Begin
  4928. packFloat64( zSign, 0, 0, 0, result );
  4929. exit;
  4930. End;
  4931. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4932. End;
  4933. if ( bExp = 0 ) then
  4934. Begin
  4935. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4936. Begin
  4937. packFloat64( zSign, 0, 0, 0, result );
  4938. exit;
  4939. End;
  4940. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4941. End;
  4942. zExp := aExp + bExp - $400;
  4943. aSig0 := aSig0 or $00100000;
  4944. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4945. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4946. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4947. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4948. if ( $00200000 <= zSig0 ) then
  4949. Begin
  4950. shift64ExtraRightJamming(
  4951. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4952. Inc(zExp);
  4953. End;
  4954. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4955. End;
  4956. {*
  4957. -------------------------------------------------------------------------------
  4958. Returns the result of dividing the double-precision floating-point value `a'
  4959. by the corresponding value `b'. The operation is performed according to the
  4960. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4961. -------------------------------------------------------------------------------
  4962. *}
  4963. Function float64_div(a: float64; b : float64) : Float64;
  4964. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4965. Var
  4966. aSign, bSign, zSign: flag;
  4967. aExp, bExp, zExp: int16;
  4968. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4969. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4970. z: float64;
  4971. label invalid;
  4972. Begin
  4973. aSig1 := extractFloat64Frac1( a );
  4974. aSig0 := extractFloat64Frac0( a );
  4975. aExp := extractFloat64Exp( a );
  4976. aSign := extractFloat64Sign( a );
  4977. bSig1 := extractFloat64Frac1( b );
  4978. bSig0 := extractFloat64Frac0( b );
  4979. bExp := extractFloat64Exp( b );
  4980. bSign := extractFloat64Sign( b );
  4981. zSign := aSign xor bSign;
  4982. if ( aExp = $7FF ) then
  4983. Begin
  4984. if ( aSig0 OR aSig1 )<> 0 then
  4985. Begin
  4986. propagateFloat64NaN( a, b, result );
  4987. exit;
  4988. end;
  4989. if ( bExp = $7FF ) then
  4990. Begin
  4991. if ( bSig0 OR bSig1 )<>0 then
  4992. Begin
  4993. propagateFloat64NaN( a, b, result );
  4994. exit;
  4995. End;
  4996. goto invalid;
  4997. End;
  4998. packFloat64( zSign, $7FF, 0, 0, result );
  4999. exit;
  5000. End;
  5001. if ( bExp = $7FF ) then
  5002. Begin
  5003. if ( bSig0 OR bSig1 )<> 0 then
  5004. Begin
  5005. propagateFloat64NaN( a, b, result );
  5006. exit;
  5007. End;
  5008. packFloat64( zSign, 0, 0, 0, result );
  5009. exit;
  5010. End;
  5011. if ( bExp = 0 ) then
  5012. Begin
  5013. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5014. Begin
  5015. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5016. Begin
  5017. invalid:
  5018. float_raise( float_flag_invalid );
  5019. z.low := float64_default_nan_low;
  5020. z.high := float64_default_nan_high;
  5021. result := z;
  5022. exit;
  5023. End;
  5024. float_raise( float_flag_divbyzero );
  5025. packFloat64( zSign, $7FF, 0, 0, result );
  5026. exit;
  5027. End;
  5028. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5029. End;
  5030. if ( aExp = 0 ) then
  5031. Begin
  5032. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5033. Begin
  5034. packFloat64( zSign, 0, 0, 0, result );
  5035. exit;
  5036. End;
  5037. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5038. End;
  5039. zExp := aExp - bExp + $3FD;
  5040. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  5041. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5042. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  5043. Begin
  5044. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  5045. Inc(zExp);
  5046. End;
  5047. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5048. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  5049. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  5050. while ( sbits32 (rem0) < 0 ) do
  5051. Begin
  5052. Dec(zSig0);
  5053. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  5054. End;
  5055. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  5056. if ( ( zSig1 and $3FF ) <= 4 ) then
  5057. Begin
  5058. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  5059. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  5060. while ( sbits32 (rem1) < 0 ) do
  5061. Begin
  5062. Dec(zSig1);
  5063. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  5064. End;
  5065. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5066. End;
  5067. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  5068. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  5069. End;
  5070. {*
  5071. -------------------------------------------------------------------------------
  5072. Returns the remainder of the double-precision floating-point value `a'
  5073. with respect to the corresponding value `b'. The operation is performed
  5074. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5075. -------------------------------------------------------------------------------
  5076. *}
  5077. Function float64_rem(a: float64; b : float64) : float64;
  5078. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  5079. Var
  5080. aSign, zSign: flag;
  5081. aExp, bExp, expDiff: int16;
  5082. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  5083. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  5084. sigMean0: sbits32;
  5085. z: float64;
  5086. label invalid;
  5087. Begin
  5088. aSig1 := extractFloat64Frac1( a );
  5089. aSig0 := extractFloat64Frac0( a );
  5090. aExp := extractFloat64Exp( a );
  5091. aSign := extractFloat64Sign( a );
  5092. bSig1 := extractFloat64Frac1( b );
  5093. bSig0 := extractFloat64Frac0( b );
  5094. bExp := extractFloat64Exp( b );
  5095. if ( aExp = $7FF ) then
  5096. Begin
  5097. if ((( aSig0 OR aSig1 )<>0)
  5098. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  5099. Begin
  5100. propagateFloat64NaN( a, b, result );
  5101. exit;
  5102. End;
  5103. goto invalid;
  5104. End;
  5105. if ( bExp = $7FF ) then
  5106. Begin
  5107. if ( bSig0 OR bSig1 ) <> 0 then
  5108. Begin
  5109. propagateFloat64NaN( a, b, result );
  5110. exit;
  5111. End;
  5112. result := a;
  5113. exit;
  5114. End;
  5115. if ( bExp = 0 ) then
  5116. Begin
  5117. if ( ( bSig0 OR bSig1 ) = 0 ) then
  5118. Begin
  5119. invalid:
  5120. float_raise( float_flag_invalid );
  5121. z.low := float64_default_nan_low;
  5122. z.high := float64_default_nan_high;
  5123. result := z;
  5124. exit;
  5125. End;
  5126. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  5127. End;
  5128. if ( aExp = 0 ) then
  5129. Begin
  5130. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5131. Begin
  5132. result := a;
  5133. exit;
  5134. End;
  5135. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5136. End;
  5137. expDiff := aExp - bExp;
  5138. if ( expDiff < -1 ) then
  5139. Begin
  5140. result := a;
  5141. exit;
  5142. End;
  5143. shortShift64Left(
  5144. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  5145. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  5146. q := le64( bSig0, bSig1, aSig0, aSig1 );
  5147. if ( q )<>0 then
  5148. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5149. expDiff := expDiff - 32;
  5150. while ( 0 < expDiff ) do
  5151. Begin
  5152. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5153. if 4 < q then
  5154. q:= q - 4
  5155. else
  5156. q := 0;
  5157. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5158. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  5159. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  5160. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  5161. expDiff := expDiff - 29;
  5162. End;
  5163. if ( -32 < expDiff ) then
  5164. Begin
  5165. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  5166. if 4 < q then
  5167. q := q - 4
  5168. else
  5169. q := 0;
  5170. q := q shr (- expDiff);
  5171. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5172. expDiff := expDiff + 24;
  5173. if ( expDiff < 0 ) then
  5174. Begin
  5175. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  5176. End
  5177. else
  5178. Begin
  5179. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  5180. End;
  5181. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  5182. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  5183. End
  5184. else
  5185. Begin
  5186. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  5187. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  5188. End;
  5189. Repeat
  5190. alternateASig0 := aSig0;
  5191. alternateASig1 := aSig1;
  5192. Inc(q);
  5193. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  5194. Until not ( 0 <= sbits32 (aSig0) );
  5195. add64(
  5196. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  5197. if ( ( sigMean0 < 0 )
  5198. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  5199. Begin
  5200. aSig0 := alternateASig0;
  5201. aSig1 := alternateASig1;
  5202. End;
  5203. zSign := flag( sbits32 (aSig0) < 0 );
  5204. if ( zSign <> 0 ) then
  5205. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  5206. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  5207. End;
  5208. {*
  5209. -------------------------------------------------------------------------------
  5210. Returns the square root of the double-precision floating-point value `a'.
  5211. The operation is performed according to the IEC/IEEE Standard for Binary
  5212. Floating-Point Arithmetic.
  5213. -------------------------------------------------------------------------------
  5214. *}
  5215. function float64_sqrt( a: float64 ): float64;
  5216. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  5217. Var
  5218. aSign: flag;
  5219. aExp, zExp: int16;
  5220. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  5221. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  5222. label invalid;
  5223. Begin
  5224. aSig1 := extractFloat64Frac1( a );
  5225. aSig0 := extractFloat64Frac0( a );
  5226. aExp := extractFloat64Exp( a );
  5227. aSign := extractFloat64Sign( a );
  5228. if ( aExp = $7FF ) then
  5229. Begin
  5230. if ( aSig0 OR aSig1 ) <> 0 then
  5231. Begin
  5232. propagateFloat64NaN( a, a, result );
  5233. exit;
  5234. End;
  5235. if ( aSign = 0) then
  5236. Begin
  5237. result := a;
  5238. exit;
  5239. End;
  5240. goto invalid;
  5241. End;
  5242. if ( aSign <> 0 ) then
  5243. Begin
  5244. if ( ( bits32(aExp) OR aSig0 OR aSig1 ) = 0 ) then
  5245. Begin
  5246. result := a;
  5247. exit;
  5248. End;
  5249. invalid:
  5250. float_raise( float_flag_invalid );
  5251. result.low := float64_default_nan_low;
  5252. result.high := float64_default_nan_high;
  5253. exit;
  5254. End;
  5255. if ( aExp = 0 ) then
  5256. Begin
  5257. if ( ( aSig0 OR aSig1 ) = 0 ) then
  5258. Begin
  5259. packFloat64( 0, 0, 0, 0, result );
  5260. exit;
  5261. End;
  5262. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  5263. End;
  5264. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  5265. aSig0 := aSig0 or $00100000;
  5266. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  5267. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  5268. if ( zSig0 = 0 ) then
  5269. zSig0 := $7FFFFFFF;
  5270. doubleZSig0 := zSig0 + zSig0;
  5271. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  5272. mul32To64( zSig0, zSig0, term0, term1 );
  5273. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  5274. while ( sbits32 (rem0) < 0 ) do
  5275. Begin
  5276. Dec(zSig0);
  5277. doubleZSig0 := doubleZSig0 - 2;
  5278. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  5279. End;
  5280. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  5281. if ( ( zSig1 and $1FF ) <= 5 ) then
  5282. Begin
  5283. if ( zSig1 = 0 ) then
  5284. zSig1 := 1;
  5285. mul32To64( doubleZSig0, zSig1, term1, term2 );
  5286. sub64( rem1, 0, term1, term2, rem1, rem2 );
  5287. mul32To64( zSig1, zSig1, term2, term3 );
  5288. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  5289. while ( sbits32 (rem1) < 0 ) do
  5290. Begin
  5291. Dec(zSig1);
  5292. shortShift64Left( 0, zSig1, 1, term2, term3 );
  5293. term3 := term3 or 1;
  5294. term2 := term2 or doubleZSig0;
  5295. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  5296. End;
  5297. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  5298. End;
  5299. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  5300. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, result );
  5301. End;
  5302. {*
  5303. -------------------------------------------------------------------------------
  5304. Returns 1 if the double-precision floating-point value `a' is equal to
  5305. the corresponding value `b', and 0 otherwise. The comparison is performed
  5306. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5307. -------------------------------------------------------------------------------
  5308. *}
  5309. Function float64_eq(a: float64; b: float64): flag;
  5310. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  5311. Begin
  5312. if
  5313. (
  5314. ( extractFloat64Exp( a ) = $7FF )
  5315. AND
  5316. (
  5317. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5318. )
  5319. )
  5320. OR (
  5321. ( extractFloat64Exp( b ) = $7FF )
  5322. AND (
  5323. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5324. )
  5325. )
  5326. ) then
  5327. Begin
  5328. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5329. float_raise( float_flag_invalid );
  5330. float64_eq := 0;
  5331. exit;
  5332. End;
  5333. float64_eq := flag(
  5334. ( a.low = b.low )
  5335. AND ( ( a.high = b.high )
  5336. OR ( ( a.low = 0 )
  5337. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5338. ));
  5339. End;
  5340. {*
  5341. -------------------------------------------------------------------------------
  5342. Returns 1 if the double-precision floating-point value `a' is less than
  5343. or equal to the corresponding value `b', and 0 otherwise. The comparison
  5344. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5345. Arithmetic.
  5346. -------------------------------------------------------------------------------
  5347. *}
  5348. Function float64_le(a: float64;b: float64): flag;
  5349. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  5350. Var
  5351. aSign, bSign: flag;
  5352. Begin
  5353. if
  5354. (
  5355. ( extractFloat64Exp( a ) = $7FF )
  5356. AND
  5357. (
  5358. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5359. )
  5360. )
  5361. OR (
  5362. ( extractFloat64Exp( b ) = $7FF )
  5363. AND (
  5364. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5365. )
  5366. )
  5367. ) then
  5368. Begin
  5369. float_raise( float_flag_invalid );
  5370. float64_le := 0;
  5371. exit;
  5372. End;
  5373. aSign := extractFloat64Sign( a );
  5374. bSign := extractFloat64Sign( b );
  5375. if ( aSign <> bSign ) then
  5376. Begin
  5377. float64_le := flag(
  5378. (aSign <> 0)
  5379. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5380. = 0 ));
  5381. exit;
  5382. End;
  5383. if aSign <> 0 then
  5384. float64_le := le64( b.high, b.low, a.high, a.low )
  5385. else
  5386. float64_le := le64( a.high, a.low, b.high, b.low );
  5387. End;
  5388. {*
  5389. -------------------------------------------------------------------------------
  5390. Returns 1 if the double-precision floating-point value `a' is less than
  5391. the corresponding value `b', and 0 otherwise. The comparison is performed
  5392. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5393. -------------------------------------------------------------------------------
  5394. *}
  5395. Function float64_lt(a: float64;b: float64): flag;
  5396. {$ifdef FPC_IS_SYSTEM}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  5397. Var
  5398. aSign, bSign: flag;
  5399. Begin
  5400. if
  5401. (
  5402. ( extractFloat64Exp( a ) = $7FF )
  5403. AND
  5404. (
  5405. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5406. )
  5407. )
  5408. OR (
  5409. ( extractFloat64Exp( b ) = $7FF )
  5410. AND (
  5411. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5412. )
  5413. )
  5414. ) then
  5415. Begin
  5416. float_raise( float_flag_invalid );
  5417. float64_lt := 0;
  5418. exit;
  5419. End;
  5420. aSign := extractFloat64Sign( a );
  5421. bSign := extractFloat64Sign( b );
  5422. if ( aSign <> bSign ) then
  5423. Begin
  5424. float64_lt := flag(
  5425. (aSign <> 0)
  5426. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5427. <> 0 ));
  5428. exit;
  5429. End;
  5430. if aSign <> 0 then
  5431. float64_lt := lt64( b.high, b.low, a.high, a.low )
  5432. else
  5433. float64_lt := lt64( a.high, a.low, b.high, b.low );
  5434. End;
  5435. {*
  5436. -------------------------------------------------------------------------------
  5437. Returns 1 if the double-precision floating-point value `a' is equal to
  5438. the corresponding value `b', and 0 otherwise. The invalid exception is
  5439. raised if either operand is a NaN. Otherwise, the comparison is performed
  5440. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5441. -------------------------------------------------------------------------------
  5442. *}
  5443. Function float64_eq_signaling( a: float64; b: float64): flag;
  5444. Begin
  5445. if
  5446. (
  5447. ( extractFloat64Exp( a ) = $7FF )
  5448. AND
  5449. (
  5450. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5451. )
  5452. )
  5453. OR (
  5454. ( extractFloat64Exp( b ) = $7FF )
  5455. AND (
  5456. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5457. )
  5458. )
  5459. ) then
  5460. Begin
  5461. float_raise( float_flag_invalid );
  5462. float64_eq_signaling := 0;
  5463. exit;
  5464. End;
  5465. float64_eq_signaling := flag(
  5466. ( a.low = b.low )
  5467. AND ( ( a.high = b.high )
  5468. OR ( ( a.low = 0 )
  5469. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  5470. ));
  5471. End;
  5472. {*
  5473. -------------------------------------------------------------------------------
  5474. Returns 1 if the double-precision floating-point value `a' is less than or
  5475. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5476. cause an exception. Otherwise, the comparison is performed according to the
  5477. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5478. -------------------------------------------------------------------------------
  5479. *}
  5480. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  5481. Var
  5482. aSign, bSign : flag;
  5483. Begin
  5484. if
  5485. (
  5486. ( extractFloat64Exp( a ) = $7FF )
  5487. AND
  5488. (
  5489. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5490. )
  5491. )
  5492. OR (
  5493. ( extractFloat64Exp( b ) = $7FF )
  5494. AND (
  5495. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5496. )
  5497. )
  5498. ) then
  5499. Begin
  5500. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5501. float_raise( float_flag_invalid );
  5502. float64_le_quiet := 0;
  5503. exit;
  5504. End;
  5505. aSign := extractFloat64Sign( a );
  5506. bSign := extractFloat64Sign( b );
  5507. if ( aSign <> bSign ) then
  5508. Begin
  5509. float64_le_quiet := flag
  5510. ((aSign <> 0)
  5511. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5512. = 0 ));
  5513. exit;
  5514. End;
  5515. if aSign <> 0 then
  5516. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  5517. else
  5518. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  5519. End;
  5520. {*
  5521. -------------------------------------------------------------------------------
  5522. Returns 1 if the double-precision floating-point value `a' is less than
  5523. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5524. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5525. Standard for Binary Floating-Point Arithmetic.
  5526. -------------------------------------------------------------------------------
  5527. *}
  5528. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5529. Var
  5530. aSign, bSign: flag;
  5531. Begin
  5532. if
  5533. (
  5534. ( extractFloat64Exp( a ) = $7FF )
  5535. AND
  5536. (
  5537. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5538. )
  5539. )
  5540. OR (
  5541. ( extractFloat64Exp( b ) = $7FF )
  5542. AND (
  5543. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5544. )
  5545. )
  5546. ) then
  5547. Begin
  5548. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5549. float_raise( float_flag_invalid );
  5550. float64_lt_quiet := 0;
  5551. exit;
  5552. End;
  5553. aSign := extractFloat64Sign( a );
  5554. bSign := extractFloat64Sign( b );
  5555. if ( aSign <> bSign ) then
  5556. Begin
  5557. float64_lt_quiet := flag(
  5558. (aSign<>0)
  5559. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5560. <> 0 ));
  5561. exit;
  5562. End;
  5563. If aSign <> 0 then
  5564. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5565. else
  5566. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5567. End;
  5568. {*----------------------------------------------------------------------------
  5569. | Returns the result of converting the 64-bit two's complement integer `a'
  5570. | to the single-precision floating-point format. The conversion is performed
  5571. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5572. *----------------------------------------------------------------------------*}
  5573. function int64_to_float32( a: int64 ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5574. var
  5575. zSign : flag;
  5576. absA : uint64;
  5577. shiftCount: int8;
  5578. Begin
  5579. if ( a = 0 ) then
  5580. begin
  5581. int64_to_float32.float32 := 0;
  5582. exit;
  5583. end;
  5584. if a < 0 then
  5585. zSign := flag(TRUE)
  5586. else
  5587. zSign := flag(FALSE);
  5588. if zSign<>0 then
  5589. absA := -a
  5590. else
  5591. absA := a;
  5592. shiftCount := countLeadingZeros64( absA ) - 40;
  5593. if ( 0 <= shiftCount ) then
  5594. begin
  5595. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5596. end
  5597. else
  5598. begin
  5599. shiftCount := shiftCount + 7;
  5600. if ( shiftCount < 0 ) then
  5601. shift64RightJamming( absA, - shiftCount, absA )
  5602. else
  5603. absA := absA shl shiftCount;
  5604. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5605. end;
  5606. End;
  5607. {*----------------------------------------------------------------------------
  5608. | Returns the result of converting the 64-bit two's complement integer `a'
  5609. | to the single-precision floating-point format. The conversion is performed
  5610. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5611. | Unisgned version.
  5612. *----------------------------------------------------------------------------*}
  5613. function qword_to_float32( a: qword ): float32rec; {$ifdef FPC_IS_SYSTEM}compilerproc;{$endif FPC_IS_SYSTEM}
  5614. var
  5615. absA : uint64;
  5616. shiftCount: int8;
  5617. Begin
  5618. if ( a = 0 ) then
  5619. begin
  5620. qword_to_float32.float32 := 0;
  5621. exit;
  5622. end;
  5623. absA := a;
  5624. shiftCount := countLeadingZeros64( absA ) - 40;
  5625. if ( 0 <= shiftCount ) then
  5626. begin
  5627. qword_to_float32.float32:= packFloat32( 0, $95 - shiftCount, absA shl shiftCount );
  5628. end
  5629. else
  5630. begin
  5631. shiftCount := shiftCount + 7;
  5632. if ( shiftCount < 0 ) then
  5633. shift64RightJamming( absA, - shiftCount, absA )
  5634. else
  5635. absA := absA shl shiftCount;
  5636. qword_to_float32.float32:=roundAndPackFloat32( 0, $9C - shiftCount, absA );
  5637. end;
  5638. End;
  5639. {*----------------------------------------------------------------------------
  5640. | Returns the result of converting the 64-bit two's complement integer `a'
  5641. | to the double-precision floating-point format. The conversion is performed
  5642. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5643. *----------------------------------------------------------------------------*}
  5644. function qword_to_float64( a: qword ): float64;
  5645. {$ifdef FPC_IS_SYSTEM}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5646. var
  5647. shiftCount: int8;
  5648. Begin
  5649. if ( a = 0 ) then
  5650. result := packFloat64( 0, 0, 0 )
  5651. else
  5652. begin
  5653. shiftCount := countLeadingZeros64(a) - 1;
  5654. { numbers with <= 53 significant bits are converted exactly }
  5655. if (shiftCount > 9) then
  5656. result := packFloat64(0, $43c - shiftCount, a shl (shiftCount-10))
  5657. else if (shiftCount>=0) then
  5658. result := roundAndPackFloat64( 0, $43c - shiftCount, a shl shiftCount)
  5659. else
  5660. begin
  5661. { the only possible negative value is -1, in case bit 63 of 'a' is set }
  5662. shift64RightJamming(a, 1, a);
  5663. result := roundAndPackFloat64(0, $43d, a);
  5664. end;
  5665. end;
  5666. End;
  5667. {*----------------------------------------------------------------------------
  5668. | Returns the result of converting the 64-bit two's complement integer `a'
  5669. | to the double-precision floating-point format. The conversion is performed
  5670. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5671. *----------------------------------------------------------------------------*}
  5672. function int64_to_float64( a: int64 ): float64;
  5673. {$ifdef FPC_IS_SYSTEM}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5674. Begin
  5675. if ( a = 0 ) then
  5676. result := packFloat64( 0, 0, 0 )
  5677. else if (a = int64($8000000000000000)) then
  5678. result := packFloat64( 1, $43e, 0 )
  5679. else if (a < 0) then
  5680. result := normalizeRoundAndPackFloat64( 1, $43c, -a )
  5681. else
  5682. result := normalizeRoundAndPackFloat64( 0, $43c, a );
  5683. End;
  5684. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5685. {*----------------------------------------------------------------------------
  5686. | Returns the result of converting the 64-bit two's complement integer `a'
  5687. | to the extended double-precision floating-point format. The conversion
  5688. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5689. | Arithmetic.
  5690. *----------------------------------------------------------------------------*}
  5691. function int64_to_floatx80( a: int64 ): floatx80;
  5692. var
  5693. zSign: flag;
  5694. absA: uint64;
  5695. shiftCount: int8;
  5696. begin
  5697. if ( a = 0 ) then begin
  5698. result := packFloatx80( 0, 0, 0 );
  5699. exit;
  5700. end;
  5701. zSign := ord( a < 0 );
  5702. if zSign <> 0 then absA := - a else absA := a;
  5703. shiftCount := countLeadingZeros64( absA );
  5704. result := packFloatx80( zSign, $403E - shiftCount, absA shl shiftCount );
  5705. end;
  5706. {*----------------------------------------------------------------------------
  5707. | Returns the result of converting the 64-bit two's complement integer `a'
  5708. | to the extended double-precision floating-point format. The conversion
  5709. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5710. | Arithmetic.
  5711. | Unsigned version.
  5712. *----------------------------------------------------------------------------*}
  5713. function qword_to_floatx80( a: qword ): floatx80;
  5714. var
  5715. absA: bits64;
  5716. shiftCount: int8;
  5717. begin
  5718. if ( a = 0 ) then begin
  5719. result := packFloatx80( 0, 0, 0 );
  5720. exit;
  5721. end;
  5722. absA := a;
  5723. shiftCount := countLeadingZeros64( absA );
  5724. result := packFloatx80( 0, $403E - shiftCount, absA shl shiftCount );
  5725. end;
  5726. {$endif FPC_SOFTFLOAT_FLOATX80}
  5727. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5728. {*----------------------------------------------------------------------------
  5729. | Returns the result of converting the 64-bit two's complement integer `a' to
  5730. | the quadruple-precision floating-point format. The conversion is performed
  5731. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5732. *----------------------------------------------------------------------------*}
  5733. function int64_to_float128( a: int64 ): float128;
  5734. var
  5735. zSign: flag;
  5736. absA: uint64;
  5737. shiftCount: int8;
  5738. zExp: int32;
  5739. zSig0, zSig1: bits64;
  5740. begin
  5741. if ( a = 0 ) then begin
  5742. result := packFloat128( 0, 0, 0, 0 );
  5743. exit;
  5744. end;
  5745. zSign := ord( a < 0 );
  5746. if zSign <> 0 then absA := - a else absA := a;
  5747. shiftCount := countLeadingZeros64( absA ) + 49;
  5748. zExp := $406E - shiftCount;
  5749. if ( 64 <= shiftCount ) then begin
  5750. zSig1 := 0;
  5751. zSig0 := absA;
  5752. dec( shiftCount, 64 );
  5753. end
  5754. else begin
  5755. zSig1 := absA;
  5756. zSig0 := 0;
  5757. end;
  5758. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5759. result := packFloat128( zSign, zExp, zSig0, zSig1 );
  5760. end;
  5761. {*----------------------------------------------------------------------------
  5762. | Returns the result of converting the 64-bit two's complement integer `a' to
  5763. | the quadruple-precision floating-point format. The conversion is performed
  5764. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5765. | Unsigned version.
  5766. *----------------------------------------------------------------------------*}
  5767. function qword_to_float128( a: qword ): float128;
  5768. var
  5769. absA: bits64;
  5770. shiftCount: int8;
  5771. zExp: int32;
  5772. zSig0, zSig1: bits64;
  5773. begin
  5774. if ( a = 0 ) then begin
  5775. result := packFloat128( 0, 0, 0, 0 );
  5776. exit;
  5777. end;
  5778. absA := a;
  5779. shiftCount := countLeadingZeros64( absA ) + 49;
  5780. zExp := $406E - shiftCount;
  5781. if ( 64 <= shiftCount ) then begin
  5782. zSig1 := 0;
  5783. zSig0 := absA;
  5784. dec( shiftCount, 64 );
  5785. end
  5786. else begin
  5787. zSig1 := absA;
  5788. zSig0 := 0;
  5789. end;
  5790. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5791. result := packFloat128( 0, zExp, zSig0, zSig1 );
  5792. end;
  5793. {$endif FPC_SOFTFLOAT_FLOAT128}
  5794. {*----------------------------------------------------------------------------
  5795. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5796. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5797. | Otherwise, returns 0.
  5798. *----------------------------------------------------------------------------*}
  5799. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5800. begin
  5801. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5802. end;
  5803. {*----------------------------------------------------------------------------
  5804. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5805. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5806. | Otherwise, returns 0.
  5807. *----------------------------------------------------------------------------*}
  5808. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5809. begin
  5810. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5811. end;
  5812. {*----------------------------------------------------------------------------
  5813. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5814. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5815. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5816. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5817. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5818. | the most-significant bit of the extra result, and the other 63 bits of the
  5819. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5820. | were all zero. This extra result is stored in the location pointed to by
  5821. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5822. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5823. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5824. | fixed-point value is shifted right by the number of bits given in `count',
  5825. | and the integer part of the result is returned at the locations pointed to
  5826. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5827. | corrupted as described above, and is returned at the location pointed to by
  5828. | `z2Ptr'.)
  5829. *----------------------------------------------------------------------------*}
  5830. procedure shift128ExtraRightJamming(
  5831. a0: bits64;
  5832. a1: bits64;
  5833. a2: bits64;
  5834. count: int16;
  5835. var z0Ptr: bits64;
  5836. var z1Ptr: bits64;
  5837. var z2Ptr: bits64);
  5838. var
  5839. z0, z1, z2: bits64;
  5840. negCount: int8;
  5841. begin
  5842. negCount := ( - count ) and 63;
  5843. if ( count = 0 ) then
  5844. begin
  5845. z2 := a2;
  5846. z1 := a1;
  5847. z0 := a0;
  5848. end
  5849. else begin
  5850. if ( count < 64 ) then
  5851. begin
  5852. z2 := a1 shl negCount;
  5853. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5854. z0 := a0 shr count;
  5855. end
  5856. else begin
  5857. if ( count = 64 ) then
  5858. begin
  5859. z2 := a1;
  5860. z1 := a0;
  5861. end
  5862. else begin
  5863. a2 := a2 or a1;
  5864. if ( count < 128 ) then
  5865. begin
  5866. z2 := a0 shl negCount;
  5867. z1 := a0 shr ( count and 63 );
  5868. end
  5869. else begin
  5870. if ( count = 128 ) then
  5871. z2 := a0
  5872. else
  5873. z2 := ord( a0 <> 0 );
  5874. z1 := 0;
  5875. end;
  5876. end;
  5877. z0 := 0;
  5878. end;
  5879. z2 := z2 or ord( a2 <> 0 );
  5880. end;
  5881. z2Ptr := z2;
  5882. z1Ptr := z1;
  5883. z0Ptr := z0;
  5884. end;
  5885. {*----------------------------------------------------------------------------
  5886. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5887. | _plus_ the number of bits given in `count'. The shifted result is at most
  5888. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5889. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5890. | shifted off is the most-significant bit of the extra result, and the other
  5891. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5892. | bits shifted off were all zero. This extra result is stored in the location
  5893. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5894. | (This routine makes more sense if `a0' and `a1' are considered to form
  5895. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5896. | point value is shifted right by the number of bits given in `count', and
  5897. | the integer part of the result is returned at the location pointed to by
  5898. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5899. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5900. *----------------------------------------------------------------------------*}
  5901. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5902. var
  5903. z0, z1: bits64;
  5904. negCount: int8;
  5905. begin
  5906. negCount := ( - count ) and 63;
  5907. if ( count = 0 ) then
  5908. begin
  5909. z1 := a1;
  5910. z0 := a0;
  5911. end
  5912. else if ( count < 64 ) then
  5913. begin
  5914. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5915. z0 := a0 shr count;
  5916. end
  5917. else begin
  5918. if ( count = 64 ) then
  5919. begin
  5920. z1 := a0 or ord( a1 <> 0 );
  5921. end
  5922. else begin
  5923. z1 := ord( ( a0 or a1 ) <> 0 );
  5924. end;
  5925. z0 := 0;
  5926. end;
  5927. z1Ptr := z1;
  5928. z0Ptr := z0;
  5929. end;
  5930. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5931. {*----------------------------------------------------------------------------
  5932. | Returns the fraction bits of the extended double-precision floating-point
  5933. | value `a'.
  5934. *----------------------------------------------------------------------------*}
  5935. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5936. begin
  5937. result:=a.low;
  5938. end;
  5939. {*----------------------------------------------------------------------------
  5940. | Returns the exponent bits of the extended double-precision floating-point
  5941. | value `a'.
  5942. *----------------------------------------------------------------------------*}
  5943. function extractFloatx80Exp(a : floatx80): int32;inline;
  5944. begin
  5945. result:=a.high and $7FFF;
  5946. end;
  5947. {*----------------------------------------------------------------------------
  5948. | Returns the sign bit of the extended double-precision floating-point value
  5949. | `a'.
  5950. *----------------------------------------------------------------------------*}
  5951. function extractFloatx80Sign(a : floatx80): flag;inline;
  5952. begin
  5953. result:=a.high shr 15;
  5954. end;
  5955. {*----------------------------------------------------------------------------
  5956. | Normalizes the subnormal extended double-precision floating-point value
  5957. | represented by the denormalized significand `aSig'. The normalized exponent
  5958. | and significand are stored at the locations pointed to by `zExpPtr' and
  5959. | `zSigPtr', respectively.
  5960. *----------------------------------------------------------------------------*}
  5961. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5962. var
  5963. shiftCount: int8;
  5964. begin
  5965. shiftCount := countLeadingZeros64( aSig );
  5966. zSigPtr := aSig shl shiftCount;
  5967. zExpPtr := 1 - shiftCount;
  5968. end;
  5969. {*----------------------------------------------------------------------------
  5970. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5971. | extended double-precision floating-point value, returning the result.
  5972. *----------------------------------------------------------------------------*}
  5973. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5974. var
  5975. z: floatx80;
  5976. begin
  5977. z.low := zSig;
  5978. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5979. result:=z;
  5980. end;
  5981. {*----------------------------------------------------------------------------
  5982. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5983. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5984. | and returns the proper extended double-precision floating-point value
  5985. | corresponding to the abstract input. Ordinarily, the abstract value is
  5986. | rounded and packed into the extended double-precision format, with the
  5987. | inexact exception raised if the abstract input cannot be represented
  5988. | exactly. However, if the abstract value is too large, the overflow and
  5989. | inexact exceptions are raised and an infinity or maximal finite value is
  5990. | returned. If the abstract value is too small, the input value is rounded to
  5991. | a subnormal number, and the underflow and inexact exceptions are raised if
  5992. | the abstract input cannot be represented exactly as a subnormal extended
  5993. | double-precision floating-point number.
  5994. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5995. | number of bits as single or double precision, respectively. Otherwise, the
  5996. | result is rounded to the full precision of the extended double-precision
  5997. | format.
  5998. | The input significand must be normalized or smaller. If the input
  5999. | significand is not normalized, `zExp' must be 0; in that case, the result
  6000. | returned is a subnormal number, and it must not require rounding. The
  6001. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  6002. | Floating-Point Arithmetic.
  6003. *----------------------------------------------------------------------------*}
  6004. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6005. var
  6006. roundingMode: TFPURoundingMode;
  6007. roundNearestEven, increment, isTiny: flag;
  6008. roundIncrement, roundMask, roundBits: int64;
  6009. label
  6010. precision80, overflow;
  6011. begin
  6012. roundingMode := softfloat_rounding_mode;
  6013. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  6014. if ( roundingPrecision = 80 ) then
  6015. goto precision80;
  6016. if ( roundingPrecision = 64 ) then
  6017. begin
  6018. roundIncrement := int64( $0000000000000400 );
  6019. roundMask := int64( $00000000000007FF );
  6020. end
  6021. else if ( roundingPrecision = 32 ) then
  6022. begin
  6023. roundIncrement := int64( $0000008000000000 );
  6024. roundMask := int64( $000000FFFFFFFFFF );
  6025. end
  6026. else begin
  6027. goto precision80;
  6028. end;
  6029. zSig0 := zSig0 or ord( zSig1 <> 0 );
  6030. if ( not (roundNearestEven<>0) ) then
  6031. begin
  6032. if ( roundingMode = float_round_to_zero ) then
  6033. begin
  6034. roundIncrement := 0;
  6035. end
  6036. else begin
  6037. roundIncrement := roundMask;
  6038. if ( zSign<>0 ) then
  6039. begin
  6040. if ( roundingMode = float_round_up ) then
  6041. roundIncrement := 0;
  6042. end
  6043. else begin
  6044. if ( roundingMode = float_round_down ) then
  6045. roundIncrement := 0;
  6046. end;
  6047. end;
  6048. end;
  6049. roundBits := zSig0 and roundMask;
  6050. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6051. if ( ( $7FFE < zExp )
  6052. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  6053. ) then begin
  6054. goto overflow;
  6055. end;
  6056. if ( zExp <= 0 ) then begin
  6057. isTiny := ord (
  6058. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6059. or ( zExp < 0 )
  6060. or ( zSig0 <= zSig0 + roundIncrement ) );
  6061. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  6062. zExp := 0;
  6063. roundBits := zSig0 and roundMask;
  6064. if ( isTiny <> 0 ) and ( roundBits <> 0 ) then float_raise( float_flag_underflow );
  6065. if ( roundBits <> 0 ) then set_inexact_flag;
  6066. inc( zSig0, roundIncrement );
  6067. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6068. roundIncrement := roundMask + 1;
  6069. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6070. roundMask := roundMask or roundIncrement;
  6071. end;
  6072. zSig0 := zSig0 and not roundMask;
  6073. result:=packFloatx80( zSign, zExp, zSig0 );
  6074. exit;
  6075. end;
  6076. end;
  6077. if ( roundBits <> 0 ) then set_inexact_flag;
  6078. inc( zSig0, roundIncrement );
  6079. if ( zSig0 < roundIncrement ) then begin
  6080. inc(zExp);
  6081. zSig0 := bits64( $8000000000000000 );
  6082. end;
  6083. roundIncrement := roundMask + 1;
  6084. if ( roundNearestEven <> 0 ) and ( roundBits shl 1 = roundIncrement ) then begin
  6085. roundMask := roundMask or roundIncrement;
  6086. end;
  6087. zSig0 := zSig0 and not roundMask;
  6088. if ( zSig0 = 0 ) then zExp := 0;
  6089. result:=packFloatx80( zSign, zExp, zSig0 );
  6090. exit;
  6091. precision80:
  6092. increment := ord ( sbits64( zSig1 ) < 0 );
  6093. if ( roundNearestEven = 0 ) then begin
  6094. if ( roundingMode = float_round_to_zero ) then begin
  6095. increment := 0;
  6096. end
  6097. else begin
  6098. if ( zSign <> 0 ) then begin
  6099. increment := ord ( roundingMode = float_round_down ) and zSig1;
  6100. end
  6101. else begin
  6102. increment := ord ( roundingMode = float_round_up ) and zSig1;
  6103. end;
  6104. end;
  6105. end;
  6106. if ( $7FFD <= bits32( zExp - 1 ) ) then begin
  6107. if ( ( $7FFE < zExp )
  6108. or ( ( zExp = $7FFE )
  6109. and ( zSig0 = bits64( $FFFFFFFFFFFFFFFF ) )
  6110. and ( increment <> 0 )
  6111. )
  6112. ) then begin
  6113. roundMask := 0;
  6114. overflow:
  6115. float_raise( [float_flag_overflow,float_flag_inexact] );
  6116. if ( ( roundingMode = float_round_to_zero )
  6117. or ( ( zSign <> 0) and ( roundingMode = float_round_up ) )
  6118. or ( ( zSign = 0) and ( roundingMode = float_round_down ) )
  6119. ) then begin
  6120. result:=packFloatx80( zSign, $7FFE, not roundMask );
  6121. exit;
  6122. end;
  6123. result:=packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6124. exit;
  6125. end;
  6126. if ( zExp <= 0 ) then begin
  6127. isTiny := ord(
  6128. ( softfloat_detect_tininess = float_tininess_before_rounding )
  6129. or ( zExp < 0 )
  6130. or ( increment = 0 )
  6131. or ( zSig0 < bits64( $FFFFFFFFFFFFFFFF ) ) );
  6132. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  6133. zExp := 0;
  6134. if ( ( isTiny <> 0 ) and ( zSig1 <> 0 ) ) then float_raise( float_flag_underflow );
  6135. if ( zSig1 <> 0 ) then set_inexact_flag;
  6136. if ( roundNearestEven <> 0 ) then begin
  6137. increment := ord( sbits64( zSig1 ) < 0 );
  6138. end
  6139. else begin
  6140. if ( zSign <> 0 ) then begin
  6141. increment := ord( roundingMode = float_round_down ) and zSig1;
  6142. end
  6143. else begin
  6144. increment := ord( roundingMode = float_round_up ) and zSig1;
  6145. end;
  6146. end;
  6147. if ( increment <> 0 ) then begin
  6148. inc(zSig0);
  6149. zSig0 :=
  6150. not ( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6151. if ( sbits64( zSig0 ) < 0 ) then zExp := 1;
  6152. end;
  6153. result:=packFloatx80( zSign, zExp, zSig0 );
  6154. exit;
  6155. end;
  6156. end;
  6157. if ( zSig1 <> 0 ) then set_inexact_flag;
  6158. if ( increment <> 0 ) then begin
  6159. inc(zSig0);
  6160. if ( zSig0 = 0 ) then begin
  6161. inc(zExp);
  6162. zSig0 := bits64( $8000000000000000 );
  6163. end
  6164. else begin
  6165. zSig0 := zSig0 and not bits64( ord( bits64( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  6166. end;
  6167. end
  6168. else begin
  6169. if ( zSig0 = 0 ) then zExp := 0;
  6170. end;
  6171. result:=packFloatx80( zSign, zExp, zSig0 );
  6172. end;
  6173. {*----------------------------------------------------------------------------
  6174. | Takes an abstract floating-point value having sign `zSign', exponent
  6175. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  6176. | and returns the proper extended double-precision floating-point value
  6177. | corresponding to the abstract input. This routine is just like
  6178. | `roundAndPackFloatx80' except that the input significand does not have to be
  6179. | normalized.
  6180. *----------------------------------------------------------------------------*}
  6181. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  6182. var
  6183. shiftCount: int8;
  6184. begin
  6185. if ( zSig0 = 0 ) then begin
  6186. zSig0 := zSig1;
  6187. zSig1 := 0;
  6188. dec( zExp, 64 );
  6189. end;
  6190. shiftCount := countLeadingZeros64( zSig0 );
  6191. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6192. zExp := zExp - shiftCount;
  6193. result :=
  6194. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  6195. end;
  6196. {*----------------------------------------------------------------------------
  6197. | Returns the result of converting the extended double-precision floating-
  6198. | point value `a' to the 32-bit two's complement integer format. The
  6199. | conversion is performed according to the IEC/IEEE Standard for Binary
  6200. | Floating-Point Arithmetic---which means in particular that the conversion
  6201. | is rounded according to the current rounding mode. If `a' is a NaN, the
  6202. | largest positive integer is returned. Otherwise, if the conversion
  6203. | overflows, the largest integer with the same sign as `a' is returned.
  6204. *----------------------------------------------------------------------------*}
  6205. function floatx80_to_int32(a: floatx80): int32;
  6206. var
  6207. aSign: flag;
  6208. aExp, shiftCount: int32;
  6209. aSig: bits64;
  6210. begin
  6211. aSig := extractFloatx80Frac( a );
  6212. aExp := extractFloatx80Exp( a );
  6213. aSign := extractFloatx80Sign( a );
  6214. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then aSign := 0;
  6215. shiftCount := $4037 - aExp;
  6216. if ( shiftCount <= 0 ) then shiftCount := 1;
  6217. shift64RightJamming( aSig, shiftCount, aSig );
  6218. result := roundAndPackInt32( aSign, aSig );
  6219. end;
  6220. {*----------------------------------------------------------------------------
  6221. | Returns the result of converting the extended double-precision floating-
  6222. | point value `a' to the 32-bit two's complement integer format. The
  6223. | conversion is performed according to the IEC/IEEE Standard for Binary
  6224. | Floating-Point Arithmetic, except that the conversion is always rounded
  6225. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6226. | Otherwise, if the conversion overflows, the largest integer with the same
  6227. | sign as `a' is returned.
  6228. *----------------------------------------------------------------------------*}
  6229. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  6230. var
  6231. aSign: flag;
  6232. aExp, shiftCount: int32;
  6233. aSig, savedASig: bits64;
  6234. z: int32;
  6235. label
  6236. invalid;
  6237. begin
  6238. aSig := extractFloatx80Frac( a );
  6239. aExp := extractFloatx80Exp( a );
  6240. aSign := extractFloatx80Sign( a );
  6241. if ( $401E < aExp ) then begin
  6242. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 )<>0 ) then aSign := 0;
  6243. goto invalid;
  6244. end
  6245. else if ( aExp < $3FFF ) then begin
  6246. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6247. result := 0;
  6248. exit;
  6249. end;
  6250. shiftCount := $403E - aExp;
  6251. savedASig := aSig;
  6252. aSig := aSig shr shiftCount;
  6253. z := aSig;
  6254. if ( aSign <> 0 ) then z := - z;
  6255. if ( ord( z < 0 ) xor aSign ) <> 0 then begin
  6256. invalid:
  6257. float_raise( float_flag_invalid );
  6258. if aSign <> 0 then result := sbits32( $80000000 ) else result := $7FFFFFFF;
  6259. exit;
  6260. end;
  6261. if ( ( aSig shl shiftCount ) <> savedASig ) then begin
  6262. set_inexact_flag;
  6263. end;
  6264. result := z;
  6265. end;
  6266. {*----------------------------------------------------------------------------
  6267. | Returns the result of converting the extended double-precision floating-
  6268. | point value `a' to the 64-bit two's complement integer format. The
  6269. | conversion is performed according to the IEC/IEEE Standard for Binary
  6270. | Floating-Point Arithmetic---which means in particular that the conversion
  6271. | is rounded according to the current rounding mode. If `a' is a NaN,
  6272. | the largest positive integer is returned. Otherwise, if the conversion
  6273. | overflows, the largest integer with the same sign as `a' is returned.
  6274. *----------------------------------------------------------------------------*}
  6275. function floatx80_to_int64(a: floatx80): int64;
  6276. var
  6277. aSign: flag;
  6278. aExp, shiftCount: int32;
  6279. aSig, aSigExtra: bits64;
  6280. begin
  6281. aSig := extractFloatx80Frac( a );
  6282. aExp := extractFloatx80Exp( a );
  6283. aSign := extractFloatx80Sign( a );
  6284. shiftCount := $403E - aExp;
  6285. if ( shiftCount <= 0 ) then begin
  6286. if ( shiftCount <> 0 ) then begin
  6287. float_raise( float_flag_invalid );
  6288. if ( ( aSign = 0 )
  6289. or ( ( aExp = $7FFF )
  6290. and ( aSig <> bits64( $8000000000000000 ) ) )
  6291. ) then begin
  6292. result := $7FFFFFFFFFFFFFFF;
  6293. exit;
  6294. end;
  6295. result := $8000000000000000;
  6296. exit;
  6297. end;
  6298. aSigExtra := 0;
  6299. end
  6300. else begin
  6301. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  6302. end;
  6303. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  6304. end;
  6305. {*----------------------------------------------------------------------------
  6306. | Returns the result of converting the extended double-precision floating-
  6307. | point value `a' to the 64-bit two's complement integer format. The
  6308. | conversion is performed according to the IEC/IEEE Standard for Binary
  6309. | Floating-Point Arithmetic, except that the conversion is always rounded
  6310. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  6311. | Otherwise, if the conversion overflows, the largest integer with the same
  6312. | sign as `a' is returned.
  6313. *----------------------------------------------------------------------------*}
  6314. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  6315. var
  6316. aSign: flag;
  6317. aExp, shiftCount: int32;
  6318. aSig: bits64;
  6319. z: int64;
  6320. begin
  6321. aSig := extractFloatx80Frac( a );
  6322. aExp := extractFloatx80Exp( a );
  6323. aSign := extractFloatx80Sign( a );
  6324. shiftCount := aExp - $403E;
  6325. if ( 0 <= shiftCount ) then begin
  6326. aSig := $7FFFFFFFFFFFFFFF;
  6327. if ( ( a.high <> $C03E ) or ( aSig <> 0 ) ) then begin
  6328. float_raise( float_flag_invalid );
  6329. if ( ( aSign = 0 ) or ( ( aExp = $7FFF ) and ( aSig <> 0 ) ) ) then begin
  6330. result := $7FFFFFFFFFFFFFFF;
  6331. exit;
  6332. end;
  6333. end;
  6334. result := $8000000000000000;
  6335. exit;
  6336. end
  6337. else if ( aExp < $3FFF ) then begin
  6338. if ( aExp or aSig <> 0 ) then set_inexact_flag;
  6339. result := 0;
  6340. exit;
  6341. end;
  6342. z := aSig shr ( - shiftCount );
  6343. if bits64( aSig shl ( shiftCount and 63 ) ) <> 0 then begin
  6344. set_inexact_flag;
  6345. end;
  6346. if ( aSign <> 0 ) then z := - z;
  6347. result := z;
  6348. end;
  6349. {*----------------------------------------------------------------------------
  6350. | The pattern for a default generated extended double-precision NaN. The
  6351. | `high' and `low' values hold the most- and least-significant bits,
  6352. | respectively.
  6353. *----------------------------------------------------------------------------*}
  6354. const
  6355. floatx80_default_nan_high = $FFFF;
  6356. floatx80_default_nan_low = bits64( $C000000000000000 );
  6357. {*----------------------------------------------------------------------------
  6358. | Returns 1 if the extended double-precision floating-point value `a' is a
  6359. | signaling NaN; otherwise returns 0.
  6360. *----------------------------------------------------------------------------*}
  6361. function floatx80_is_signaling_nan(a : floatx80): flag;
  6362. var
  6363. aLow: bits64;
  6364. begin
  6365. aLow := a.low and not $4000000000000000;
  6366. result := ord(
  6367. ( a.high and $7FFF = $7FFF )
  6368. and ( bits64( aLow shl 1 ) <> 0 )
  6369. and ( a.low = aLow ) );
  6370. end;
  6371. {*----------------------------------------------------------------------------
  6372. | Returns the result of converting the extended double-precision floating-
  6373. | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  6374. | invalid exception is raised.
  6375. *----------------------------------------------------------------------------*}
  6376. function floatx80ToCommonNaN(a : floatx80): commonNaNT;
  6377. var
  6378. z: commonNaNT;
  6379. begin
  6380. if floatx80_is_signaling_nan( a ) <> 0 then float_raise( float_flag_invalid );
  6381. z.sign := a.high shr 15;
  6382. z.low := 0;
  6383. z.high := a.low shl 1;
  6384. result := z;
  6385. end;
  6386. {*----------------------------------------------------------------------------
  6387. | Returns 1 if the extended double-precision floating-point value `a' is a
  6388. | NaN; otherwise returns 0.
  6389. *----------------------------------------------------------------------------*}
  6390. function floatx80_is_nan(a : floatx80 ): flag;
  6391. begin
  6392. result := ord( ( ( a.high and $7FFF ) = $7FFF ) and ( bits64( a.low shl 1 ) <> 0 ) );
  6393. end;
  6394. {*----------------------------------------------------------------------------
  6395. | Takes two extended double-precision floating-point values `a' and `b', one
  6396. | of which is a NaN, and returns the appropriate NaN result. If either `a' or
  6397. | `b' is a signaling NaN, the invalid exception is raised.
  6398. *----------------------------------------------------------------------------*}
  6399. function propagateFloatx80NaN(a, b: floatx80): floatx80;
  6400. var
  6401. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  6402. label
  6403. returnLargerSignificand;
  6404. begin
  6405. aIsNaN := floatx80_is_nan( a );
  6406. aIsSignalingNaN := floatx80_is_signaling_nan( a );
  6407. bIsNaN := floatx80_is_nan( b );
  6408. bIsSignalingNaN := floatx80_is_signaling_nan( b );
  6409. a.low := a.low or $C000000000000000;
  6410. b.low := b.low or $C000000000000000;
  6411. if aIsSignalingNaN or bIsSignalingNaN <> 0 then float_raise( float_flag_invalid );
  6412. if aIsSignalingNaN <> 0 then begin
  6413. if bIsSignalingNaN <> 0 then goto returnLargerSignificand;
  6414. if bIsNaN <> 0 then result := b else result := a;
  6415. exit;
  6416. end
  6417. else if aIsNaN <>0 then begin
  6418. if ( bIsSignalingNaN <> 0 ) or ( bIsNaN = 0) then begin
  6419. result := a;
  6420. exit;
  6421. end;
  6422. returnLargerSignificand:
  6423. if ( a.low < b.low ) then begin
  6424. result := b;
  6425. exit;
  6426. end;
  6427. if ( b.low < a.low ) then begin
  6428. result := a;
  6429. exit;
  6430. end;
  6431. if a.high < b.high then result := a else result := b;
  6432. exit;
  6433. end
  6434. else
  6435. result := b;
  6436. end;
  6437. {*----------------------------------------------------------------------------
  6438. | Returns the result of converting the extended double-precision floating-
  6439. | point value `a' to the single-precision floating-point format. The
  6440. | conversion is performed according to the IEC/IEEE Standard for Binary
  6441. | Floating-Point Arithmetic.
  6442. *----------------------------------------------------------------------------*}
  6443. function floatx80_to_float32(a: floatx80): float32;
  6444. var
  6445. aSign: flag;
  6446. aExp: int32;
  6447. aSig: bits64;
  6448. begin
  6449. aSig := extractFloatx80Frac( a );
  6450. aExp := extractFloatx80Exp( a );
  6451. aSign := extractFloatx80Sign( a );
  6452. if ( aExp = $7FFF ) then begin
  6453. if bits64( aSig shl 1 ) <> 0 then begin
  6454. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  6455. exit;
  6456. end;
  6457. result := packFloat32( aSign, $FF, 0 );
  6458. exit;
  6459. end;
  6460. shift64RightJamming( aSig, 33, aSig );
  6461. if ( aExp or aSig <> 0 ) then dec( aExp, $3F81 );
  6462. result := roundAndPackFloat32( aSign, aExp, aSig );
  6463. end;
  6464. {*----------------------------------------------------------------------------
  6465. | Returns the result of converting the extended double-precision floating-
  6466. | point value `a' to the double-precision floating-point format. The
  6467. | conversion is performed according to the IEC/IEEE Standard for Binary
  6468. | Floating-Point Arithmetic.
  6469. *----------------------------------------------------------------------------*}
  6470. function floatx80_to_float64(a: floatx80): float64;
  6471. var
  6472. aSign: flag;
  6473. aExp: int32;
  6474. aSig, zSig: bits64;
  6475. begin
  6476. aSig := extractFloatx80Frac( a );
  6477. aExp := extractFloatx80Exp( a );
  6478. aSign := extractFloatx80Sign( a );
  6479. if ( aExp = $7FFF ) then begin
  6480. if bits64( aSig shl 1 ) <> 0 then begin
  6481. result:=commonNaNToFloat64(floatx80ToCommonNaN(a));
  6482. exit;
  6483. end;
  6484. result := packFloat64( aSign, $7FF, 0 );
  6485. exit;
  6486. end;
  6487. shift64RightJamming( aSig, 1, zSig );
  6488. if ( aExp or aSig <> 0 ) then dec( aExp, $3C01 );
  6489. result := roundAndPackFloat64( aSign, aExp, zSig );
  6490. end;
  6491. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6492. {*----------------------------------------------------------------------------
  6493. | Returns the result of converting the extended double-precision floating-
  6494. | point value `a' to the quadruple-precision floating-point format. The
  6495. | conversion is performed according to the IEC/IEEE Standard for Binary
  6496. | Floating-Point Arithmetic.
  6497. *----------------------------------------------------------------------------*}
  6498. function floatx80_to_float128(a: floatx80): float128;
  6499. var
  6500. aSign: flag;
  6501. aExp: int16;
  6502. aSig, zSig0, zSig1: bits64;
  6503. begin
  6504. aSig := extractFloatx80Frac( a );
  6505. aExp := extractFloatx80Exp( a );
  6506. aSign := extractFloatx80Sign( a );
  6507. if ( aExp = $7FFF ) and ( bits64( aSig shl 1 ) <> 0 ) then begin
  6508. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  6509. exit;
  6510. end;
  6511. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  6512. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  6513. end;
  6514. {$endif FPC_SOFTFLOAT_FLOAT128}
  6515. {*----------------------------------------------------------------------------
  6516. | Rounds the extended double-precision floating-point value `a' to an integer,
  6517. | and Returns the result as an extended quadruple-precision floating-point
  6518. | value. The operation is performed according to the IEC/IEEE Standard for
  6519. | Binary Floating-Point Arithmetic.
  6520. *----------------------------------------------------------------------------*}
  6521. function floatx80_round_to_int(a: floatx80): floatx80;
  6522. var
  6523. aSign: flag;
  6524. aExp: int32;
  6525. lastBitMask, roundBitsMask: bits64;
  6526. roundingMode: TFPURoundingMode;
  6527. z: floatx80;
  6528. begin
  6529. aExp := extractFloatx80Exp( a );
  6530. if ( $403E <= aExp ) then begin
  6531. if ( aExp = $7FFF ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) then begin
  6532. result := propagateFloatx80NaN( a, a );
  6533. exit;
  6534. end;
  6535. result := a;
  6536. exit;
  6537. end;
  6538. if ( aExp < $3FFF ) then begin
  6539. if ( ( aExp = 0 )
  6540. and ( bits64( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) then begin
  6541. result := a;
  6542. exit;
  6543. end;
  6544. set_inexact_flag;
  6545. aSign := extractFloatx80Sign( a );
  6546. case softfloat_rounding_mode of
  6547. float_round_nearest_even:
  6548. if ( ( aExp = $3FFE ) and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  6549. ) then begin
  6550. result :=
  6551. packFloatx80( aSign, $3FFF, bits64( $8000000000000000 ) );
  6552. exit;
  6553. end;
  6554. float_round_down: begin
  6555. if aSign <> 0 then
  6556. result := packFloatx80( 1, $3FFF, bits64( $8000000000000000 ) )
  6557. else
  6558. result := packFloatx80( 0, 0, 0 );
  6559. exit;
  6560. end;
  6561. float_round_up: begin
  6562. if aSign <> 0 then
  6563. result := packFloatx80( 1, 0, 0 )
  6564. else
  6565. result := packFloatx80( 0, $3FFF, bits64( $8000000000000000 ) );
  6566. exit;
  6567. end;
  6568. end;
  6569. result := packFloatx80( aSign, 0, 0 );
  6570. exit;
  6571. end;
  6572. lastBitMask := 1;
  6573. lastBitMask := lastBitMask shl ( $403E - aExp );
  6574. roundBitsMask := lastBitMask - 1;
  6575. z := a;
  6576. roundingMode := softfloat_rounding_mode;
  6577. if ( roundingMode = float_round_nearest_even ) then begin
  6578. inc( z.low, lastBitMask shr 1 );
  6579. if ( ( z.low and roundBitsMask ) = 0 ) then z.low := z.low and not lastBitMask;
  6580. end
  6581. else if ( roundingMode <> float_round_to_zero ) then begin
  6582. if ( extractFloatx80Sign( z ) <> 0 ) xor ( roundingMode = float_round_up ) then begin
  6583. inc( z.low, roundBitsMask );
  6584. end;
  6585. end;
  6586. z.low := z.low and not roundBitsMask;
  6587. if ( z.low = 0 ) then begin
  6588. inc(z.high);
  6589. z.low := bits64( $8000000000000000 );
  6590. end;
  6591. if ( z.low <> a.low ) then set_inexact_flag;
  6592. result := z;
  6593. end;
  6594. {*----------------------------------------------------------------------------
  6595. | Returns the result of adding the absolute values of the extended double-
  6596. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  6597. | negated before being returned. `zSign' is ignored if the result is a NaN.
  6598. | The addition is performed according to the IEC/IEEE Standard for Binary
  6599. | Floating-Point Arithmetic.
  6600. *----------------------------------------------------------------------------*}
  6601. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6602. var
  6603. aExp, bExp, zExp: int32;
  6604. aSig, bSig, zSig0, zSig1: bits64;
  6605. expDiff: int32;
  6606. label
  6607. shiftRight1, roundAndPack;
  6608. begin
  6609. aSig := extractFloatx80Frac( a );
  6610. aExp := extractFloatx80Exp( a );
  6611. bSig := extractFloatx80Frac( b );
  6612. bExp := extractFloatx80Exp( b );
  6613. expDiff := aExp - bExp;
  6614. if ( 0 < expDiff ) then begin
  6615. if ( aExp = $7FFF ) then begin
  6616. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6617. result := propagateFloatx80NaN( a, b );
  6618. exit;
  6619. end;
  6620. result := a;
  6621. exit;
  6622. end;
  6623. if ( bExp = 0 ) then dec(expDiff);
  6624. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6625. zExp := aExp;
  6626. end
  6627. else if ( expDiff < 0 ) then begin
  6628. if ( bExp = $7FFF ) then begin
  6629. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6630. result := propagateFloatx80NaN( a, b );
  6631. exit;
  6632. end;
  6633. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6634. exit;
  6635. end;
  6636. if ( aExp = 0 ) then inc(expDiff);
  6637. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6638. zExp := bExp;
  6639. end
  6640. else begin
  6641. if ( aExp = $7FFF ) then begin
  6642. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6643. result := propagateFloatx80NaN( a, b );
  6644. exit;
  6645. end;
  6646. result := a;
  6647. exit;
  6648. end;
  6649. zSig1 := 0;
  6650. zSig0 := aSig + bSig;
  6651. if ( aExp = 0 ) then begin
  6652. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  6653. goto roundAndPack;
  6654. end;
  6655. zExp := aExp;
  6656. goto shiftRight1;
  6657. end;
  6658. zSig0 := aSig + bSig;
  6659. if ( sbits64( zSig0 ) < 0 ) then goto roundAndPack;
  6660. shiftRight1:
  6661. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  6662. zSig0 := zSig0 or $8000000000000000;
  6663. inc(zExp);
  6664. roundAndPack:
  6665. result :=
  6666. roundAndPackFloatx80(
  6667. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6668. end;
  6669. {*----------------------------------------------------------------------------
  6670. | Returns the result of subtracting the absolute values of the extended
  6671. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  6672. | difference is negated before being returned. `zSign' is ignored if the
  6673. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  6674. | Standard for Binary Floating-Point Arithmetic.
  6675. *----------------------------------------------------------------------------*}
  6676. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  6677. var
  6678. aExp, bExp, zExp: int32;
  6679. aSig, bSig, zSig0, zSig1: bits64;
  6680. expDiff: int32;
  6681. z: floatx80;
  6682. label
  6683. bExpBigger, bBigger, aExpBigger, aBigger, normalizeRoundAndPack;
  6684. begin
  6685. aSig := extractFloatx80Frac( a );
  6686. aExp := extractFloatx80Exp( a );
  6687. bSig := extractFloatx80Frac( b );
  6688. bExp := extractFloatx80Exp( b );
  6689. expDiff := aExp - bExp;
  6690. if ( 0 < expDiff ) then goto aExpBigger;
  6691. if ( expDiff < 0 ) then goto bExpBigger;
  6692. if ( aExp = $7FFF ) then begin
  6693. if ( bits64( ( aSig or bSig ) shl 1 ) <> 0 ) then begin
  6694. result := propagateFloatx80NaN( a, b );
  6695. exit;
  6696. end;
  6697. float_raise( float_flag_invalid );
  6698. z.low := floatx80_default_nan_low;
  6699. z.high := floatx80_default_nan_high;
  6700. result := z;
  6701. exit;
  6702. end;
  6703. if ( aExp = 0 ) then begin
  6704. aExp := 1;
  6705. bExp := 1;
  6706. end;
  6707. zSig1 := 0;
  6708. if ( bSig < aSig ) then goto aBigger;
  6709. if ( aSig < bSig ) then goto bBigger;
  6710. result := packFloatx80( ord( softfloat_rounding_mode = float_round_down ), 0, 0 );
  6711. exit;
  6712. bExpBigger:
  6713. if ( bExp = $7FFF ) then begin
  6714. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6715. result := propagateFloatx80NaN( a, b );
  6716. exit;
  6717. end;
  6718. result := packFloatx80( zSign xor 1, $7FFF, bits64( $8000000000000000 ) );
  6719. exit;
  6720. end;
  6721. if ( aExp = 0 ) then inc(expDiff);
  6722. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  6723. bBigger:
  6724. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6725. zExp := bExp;
  6726. zSign := zSign xor 1;
  6727. goto normalizeRoundAndPack;
  6728. aExpBigger:
  6729. if ( aExp = $7FFF ) then begin
  6730. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6731. result := propagateFloatx80NaN( a, b );
  6732. exit;
  6733. end;
  6734. result := a;
  6735. exit;
  6736. end;
  6737. if ( bExp = 0 ) then dec(expDiff);
  6738. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6739. aBigger:
  6740. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6741. zExp := aExp;
  6742. normalizeRoundAndPack:
  6743. result :=
  6744. normalizeRoundAndPackFloatx80(
  6745. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6746. end;
  6747. {*----------------------------------------------------------------------------
  6748. | Returns the result of adding the extended double-precision floating-point
  6749. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6750. | Standard for Binary Floating-Point Arithmetic.
  6751. *----------------------------------------------------------------------------*}
  6752. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6753. var
  6754. aSign, bSign: flag;
  6755. begin
  6756. aSign := extractFloatx80Sign( a );
  6757. bSign := extractFloatx80Sign( b );
  6758. if ( aSign = bSign ) then begin
  6759. result := addFloatx80Sigs( a, b, aSign );
  6760. end
  6761. else begin
  6762. result := subFloatx80Sigs( a, b, aSign );
  6763. end;
  6764. end;
  6765. {*----------------------------------------------------------------------------
  6766. | Returns the result of subtracting the extended double-precision floating-
  6767. | point values `a' and `b'. The operation is performed according to the
  6768. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6769. *----------------------------------------------------------------------------*}
  6770. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6771. var
  6772. aSign, bSign: flag;
  6773. begin
  6774. aSign := extractFloatx80Sign( a );
  6775. bSign := extractFloatx80Sign( b );
  6776. if ( aSign = bSign ) then begin
  6777. result := subFloatx80Sigs( a, b, aSign );
  6778. end
  6779. else begin
  6780. result := addFloatx80Sigs( a, b, aSign );
  6781. end;
  6782. end;
  6783. {*----------------------------------------------------------------------------
  6784. | Returns the result of multiplying the extended double-precision floating-
  6785. | point values `a' and `b'. The operation is performed according to the
  6786. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6787. *----------------------------------------------------------------------------*}
  6788. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6789. var
  6790. aSign, bSign, zSign: flag;
  6791. aExp, bExp, zExp: int32;
  6792. aSig, bSig, zSig0, zSig1: bits64;
  6793. z: floatx80;
  6794. label
  6795. invalid;
  6796. begin
  6797. aSig := extractFloatx80Frac( a );
  6798. aExp := extractFloatx80Exp( a );
  6799. aSign := extractFloatx80Sign( a );
  6800. bSig := extractFloatx80Frac( b );
  6801. bExp := extractFloatx80Exp( b );
  6802. bSign := extractFloatx80Sign( b );
  6803. zSign := aSign xor bSign;
  6804. if ( aExp = $7FFF ) then begin
  6805. if ( bits64( aSig shl 1 ) <> 0 )
  6806. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6807. result := propagateFloatx80NaN( a, b );
  6808. exit;
  6809. end;
  6810. if ( ( bExp or bSig ) = 0 ) then goto invalid;
  6811. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6812. exit;
  6813. end;
  6814. if ( bExp = $7FFF ) then begin
  6815. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6816. result := propagateFloatx80NaN( a, b );
  6817. exit;
  6818. end;
  6819. if ( ( aExp or aSig ) = 0 ) then begin
  6820. invalid:
  6821. float_raise( float_flag_invalid );
  6822. z.low := floatx80_default_nan_low;
  6823. z.high := floatx80_default_nan_high;
  6824. result := z;
  6825. exit;
  6826. end;
  6827. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6828. exit;
  6829. end;
  6830. if ( aExp = 0 ) then begin
  6831. if ( aSig = 0 ) then begin
  6832. result := packFloatx80( zSign, 0, 0 );
  6833. exit;
  6834. end;
  6835. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6836. end;
  6837. if ( bExp = 0 ) then begin
  6838. if ( bSig = 0 ) then begin
  6839. result := packFloatx80( zSign, 0, 0 );
  6840. exit;
  6841. end;
  6842. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6843. end;
  6844. zExp := aExp + bExp - $3FFE;
  6845. mul64To128( aSig, bSig, zSig0, zSig1 );
  6846. if 0 < sbits64( zSig0 ) then begin
  6847. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6848. dec(zExp);
  6849. end;
  6850. result :=
  6851. roundAndPackFloatx80(
  6852. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6853. end;
  6854. {*----------------------------------------------------------------------------
  6855. | Returns the result of dividing the extended double-precision floating-point
  6856. | value `a' by the corresponding value `b'. The operation is performed
  6857. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6858. *----------------------------------------------------------------------------*}
  6859. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6860. var
  6861. aSign, bSign, zSign: flag;
  6862. aExp, bExp, zExp: int32;
  6863. aSig, bSig, zSig0, zSig1: bits64;
  6864. rem0, rem1, rem2, term0, term1, term2: bits64;
  6865. z: floatx80;
  6866. label
  6867. invalid;
  6868. begin
  6869. aSig := extractFloatx80Frac( a );
  6870. aExp := extractFloatx80Exp( a );
  6871. aSign := extractFloatx80Sign( a );
  6872. bSig := extractFloatx80Frac( b );
  6873. bExp := extractFloatx80Exp( b );
  6874. bSign := extractFloatx80Sign( b );
  6875. zSign := aSign xor bSign;
  6876. if ( aExp = $7FFF ) then begin
  6877. if ( bits64( aSig shl 1 ) <> 0 ) then begin
  6878. result := propagateFloatx80NaN( a, b );
  6879. exit;
  6880. end;
  6881. if ( bExp = $7FFF ) then begin
  6882. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6883. result := propagateFloatx80NaN( a, b );
  6884. exit;
  6885. end;
  6886. goto invalid;
  6887. end;
  6888. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6889. exit;
  6890. end;
  6891. if ( bExp = $7FFF ) then begin
  6892. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6893. result := propagateFloatx80NaN( a, b );
  6894. exit;
  6895. end;
  6896. result := packFloatx80( zSign, 0, 0 );
  6897. exit;
  6898. end;
  6899. if ( bExp = 0 ) then begin
  6900. if ( bSig = 0 ) then begin
  6901. if ( ( aExp or aSig ) = 0 ) then begin
  6902. invalid:
  6903. float_raise( float_flag_invalid );
  6904. z.low := floatx80_default_nan_low;
  6905. z.high := floatx80_default_nan_high;
  6906. result := z;
  6907. exit;
  6908. end;
  6909. float_raise( float_flag_divbyzero );
  6910. result := packFloatx80( zSign, $7FFF, bits64( $8000000000000000 ) );
  6911. exit;
  6912. end;
  6913. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6914. end;
  6915. if ( aExp = 0 ) then begin
  6916. if ( aSig = 0 ) then begin
  6917. result := packFloatx80( zSign, 0, 0 );
  6918. exit;
  6919. end;
  6920. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6921. end;
  6922. zExp := aExp - bExp + $3FFE;
  6923. rem1 := 0;
  6924. if ( bSig <= aSig ) then begin
  6925. shift128Right( aSig, 0, 1, aSig, rem1 );
  6926. inc(zExp);
  6927. end;
  6928. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6929. mul64To128( bSig, zSig0, term0, term1 );
  6930. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6931. while ( sbits64( rem0 ) < 0 ) do begin
  6932. dec(zSig0);
  6933. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6934. end;
  6935. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6936. if ( bits64( zSig1 shl 1 ) <= 8 ) then begin
  6937. mul64To128( bSig, zSig1, term1, term2 );
  6938. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6939. while ( sbits64( rem1 ) < 0 ) do begin
  6940. dec(zSig1);
  6941. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6942. end;
  6943. zSig1 := zSig1 or ord( ( rem1 or rem2 ) <> 0 );
  6944. end;
  6945. result :=
  6946. roundAndPackFloatx80(
  6947. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6948. end;
  6949. {*----------------------------------------------------------------------------
  6950. | Returns the remainder of the extended double-precision floating-point value
  6951. | `a' with respect to the corresponding value `b'. The operation is performed
  6952. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6953. *----------------------------------------------------------------------------*}
  6954. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6955. var
  6956. aSign, zSign: flag;
  6957. aExp, bExp, expDiff: int32;
  6958. aSig0, aSig1, bSig: bits64;
  6959. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6960. z: floatx80;
  6961. label
  6962. invalid;
  6963. begin
  6964. aSig0 := extractFloatx80Frac( a );
  6965. aExp := extractFloatx80Exp( a );
  6966. aSign := extractFloatx80Sign( a );
  6967. bSig := extractFloatx80Frac( b );
  6968. bExp := extractFloatx80Exp( b );
  6969. if ( aExp = $7FFF ) then begin
  6970. if ( bits64( aSig0 shl 1 ) <> 0 )
  6971. or ( ( bExp = $7FFF ) and ( bits64( bSig shl 1 ) <> 0 ) ) then begin
  6972. result := propagateFloatx80NaN( a, b );
  6973. exit;
  6974. end;
  6975. goto invalid;
  6976. end;
  6977. if ( bExp = $7FFF ) then begin
  6978. if ( bits64( bSig shl 1 ) <> 0 ) then begin
  6979. result := propagateFloatx80NaN( a, b );
  6980. exit;
  6981. end;
  6982. result := a;
  6983. exit;
  6984. end;
  6985. if ( bExp = 0 ) then begin
  6986. if ( bSig = 0 ) then begin
  6987. invalid:
  6988. float_raise( float_flag_invalid );
  6989. z.low := floatx80_default_nan_low;
  6990. z.high := floatx80_default_nan_high;
  6991. result := z;
  6992. exit;
  6993. end;
  6994. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6995. end;
  6996. if ( aExp = 0 ) then begin
  6997. if ( bits64( aSig0 shl 1 ) = 0 ) then begin
  6998. result := a;
  6999. exit;
  7000. end;
  7001. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7002. end;
  7003. bSig := bSig or $8000000000000000;
  7004. zSign := aSign;
  7005. expDiff := aExp - bExp;
  7006. aSig1 := 0;
  7007. if ( expDiff < 0 ) then begin
  7008. if ( expDiff < -1 ) then begin
  7009. result := a;
  7010. exit;
  7011. end;
  7012. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  7013. expDiff := 0;
  7014. end;
  7015. q := ord( bSig <= aSig0 );
  7016. if ( q <> 0 ) then dec( aSig0, bSig );
  7017. dec( expDiff, 64 );
  7018. while ( 0 < expDiff ) do begin
  7019. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7020. if ( 2 < q ) then q := q - 2 else q := 0;
  7021. mul64To128( bSig, q, term0, term1 );
  7022. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7023. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  7024. dec( expDiff, 62 );
  7025. end;
  7026. inc( expDiff, 64 );
  7027. if ( 0 < expDiff ) then begin
  7028. q := estimateDiv128To64( aSig0, aSig1, bSig );
  7029. if ( 2 < q ) then q:= q - 2 else q := 0;
  7030. q := q shr ( 64 - expDiff );
  7031. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  7032. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7033. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  7034. while ( le128( term0, term1, aSig0, aSig1 ) <> 0 ) do begin
  7035. inc(q);
  7036. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  7037. end;
  7038. end
  7039. else begin
  7040. term1 := 0;
  7041. term0 := bSig;
  7042. end;
  7043. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  7044. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7045. or ( ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) <> 0 )
  7046. and ( q and 1 <> 0 ) )
  7047. then begin
  7048. aSig0 := alternateASig0;
  7049. aSig1 := alternateASig1;
  7050. zSign := ord( zSign = 0 );
  7051. end;
  7052. result :=
  7053. normalizeRoundAndPackFloatx80(
  7054. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  7055. end;
  7056. {*----------------------------------------------------------------------------
  7057. | Returns the square root of the extended double-precision floating-point
  7058. | value `a'. The operation is performed according to the IEC/IEEE Standard
  7059. | for Binary Floating-Point Arithmetic.
  7060. *----------------------------------------------------------------------------*}
  7061. function floatx80_sqrt(a: floatx80): floatx80;
  7062. var
  7063. aSign: flag;
  7064. aExp, zExp: int32;
  7065. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  7066. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7067. z: floatx80;
  7068. label
  7069. invalid;
  7070. begin
  7071. aSig0 := extractFloatx80Frac( a );
  7072. aExp := extractFloatx80Exp( a );
  7073. aSign := extractFloatx80Sign( a );
  7074. if ( aExp = $7FFF ) then begin
  7075. if ( bits64( aSig0 shl 1 ) <> 0 ) then begin
  7076. result := propagateFloatx80NaN( a, a );
  7077. exit;
  7078. end;
  7079. if ( aSign = 0 ) then begin
  7080. result := a;
  7081. exit;
  7082. end;
  7083. goto invalid;
  7084. end;
  7085. if ( aSign <> 0 ) then begin
  7086. if ( ( aExp or aSig0 ) = 0 ) then begin
  7087. result := a;
  7088. exit;
  7089. end;
  7090. invalid:
  7091. float_raise( float_flag_invalid );
  7092. z.low := floatx80_default_nan_low;
  7093. z.high := floatx80_default_nan_high;
  7094. result := z;
  7095. exit;
  7096. end;
  7097. if ( aExp = 0 ) then begin
  7098. if ( aSig0 = 0 ) then begin
  7099. result := packFloatx80( 0, 0, 0 );
  7100. exit;
  7101. end;
  7102. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  7103. end;
  7104. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFF;
  7105. zSig0 := estimateSqrt32( aExp, aSig0 shr 32 );
  7106. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  7107. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7108. doubleZSig0 := zSig0 shl 1;
  7109. mul64To128( zSig0, zSig0, term0, term1 );
  7110. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7111. while ( sbits64( rem0 ) < 0 ) do begin
  7112. dec(zSig0);
  7113. dec( doubleZSig0, 2 );
  7114. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7115. end;
  7116. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7117. if ( ( zSig1 and $3FFFFFFFFFFFFFFF ) <= 5 ) then begin
  7118. if ( zSig1 = 0 ) then zSig1 := 1;
  7119. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7120. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7121. mul64To128( zSig1, zSig1, term2, term3 );
  7122. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7123. while ( sbits64( rem1 ) < 0 ) do begin
  7124. dec(zSig1);
  7125. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7126. term3 := term3 or 1;
  7127. term2 := term2 or doubleZSig0;
  7128. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7129. end;
  7130. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7131. end;
  7132. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  7133. zSig0 := zSig0 or doubleZSig0;
  7134. result :=
  7135. roundAndPackFloatx80(
  7136. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  7137. end;
  7138. {*----------------------------------------------------------------------------
  7139. | Returns 1 if the extended double-precision floating-point value `a' is
  7140. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  7141. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  7142. | Arithmetic.
  7143. *----------------------------------------------------------------------------*}
  7144. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  7145. begin
  7146. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7147. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 )
  7148. ) or ( ( extractFloatx80Exp( b ) = $7FFF )
  7149. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 )
  7150. ) then begin
  7151. if ( floatx80_is_signaling_nan( a )
  7152. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7153. float_raise( float_flag_invalid );
  7154. end;
  7155. result := 0;
  7156. exit;
  7157. end;
  7158. result := ord(
  7159. ( a.low = b.low )
  7160. and ( ( a.high = b.high )
  7161. or ( ( a.low = 0 )
  7162. and ( bits16 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7163. ) );
  7164. end;
  7165. {*----------------------------------------------------------------------------
  7166. | Returns 1 if the extended double-precision floating-point value `a' is
  7167. | less than or equal to the corresponding value `b', and 0 otherwise. The
  7168. | comparison is performed according to the IEC/IEEE Standard for Binary
  7169. | Floating-Point Arithmetic.
  7170. *----------------------------------------------------------------------------*}
  7171. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  7172. var
  7173. aSign, bSign: flag;
  7174. begin
  7175. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7176. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7177. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7178. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7179. then begin
  7180. float_raise( float_flag_invalid );
  7181. result := 0;
  7182. exit;
  7183. end;
  7184. aSign := extractFloatx80Sign( a );
  7185. bSign := extractFloatx80Sign( b );
  7186. if ( aSign <> bSign ) then begin
  7187. result := ord(
  7188. ( aSign <> 0 )
  7189. or ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low = 0 ) );
  7190. exit;
  7191. end;
  7192. if aSign<>0 then
  7193. result := le128( b.high, b.low, a.high, a.low )
  7194. else
  7195. result := le128( a.high, a.low, b.high, b.low );
  7196. end;
  7197. {*----------------------------------------------------------------------------
  7198. | Returns 1 if the extended double-precision floating-point value `a' is
  7199. | less than the corresponding value `b', and 0 otherwise. The comparison
  7200. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7201. | Arithmetic.
  7202. *----------------------------------------------------------------------------*}
  7203. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  7204. var
  7205. aSign, bSign: flag;
  7206. begin
  7207. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7208. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7209. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7210. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7211. then begin
  7212. float_raise( float_flag_invalid );
  7213. result := 0;
  7214. exit;
  7215. end;
  7216. aSign := extractFloatx80Sign( a );
  7217. bSign := extractFloatx80Sign( b );
  7218. if ( aSign <> bSign ) then begin
  7219. result := ord(
  7220. ( aSign <> 0 )
  7221. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7222. exit;
  7223. end;
  7224. if aSign <> 0 then
  7225. result := lt128( b.high, b.low, a.high, a.low )
  7226. else
  7227. result := lt128( a.high, a.low, b.high, b.low );
  7228. end;
  7229. {*----------------------------------------------------------------------------
  7230. | Returns 1 if the extended double-precision floating-point value `a' is equal
  7231. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  7232. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7233. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7234. *----------------------------------------------------------------------------*}
  7235. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  7236. begin
  7237. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7238. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7239. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7240. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7241. then begin
  7242. float_raise( float_flag_invalid );
  7243. result := 0;
  7244. exit;
  7245. end;
  7246. result := ord(
  7247. ( a.low = b.low )
  7248. and ( ( a.high = b.high )
  7249. or ( ( a.low = 0 )
  7250. and ( bits16( ( a.high or b.high ) shl 1 ) = 0 ) )
  7251. ) );
  7252. end;
  7253. {*----------------------------------------------------------------------------
  7254. | Returns 1 if the extended double-precision floating-point value `a' is less
  7255. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  7256. | do not cause an exception. Otherwise, the comparison is performed according
  7257. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7258. *----------------------------------------------------------------------------*}
  7259. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  7260. var
  7261. aSign, bSign: flag;
  7262. begin
  7263. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7264. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7265. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7266. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7267. then begin
  7268. if ( floatx80_is_signaling_nan( a )
  7269. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7270. float_raise( float_flag_invalid );
  7271. end;
  7272. result := 0;
  7273. exit;
  7274. end;
  7275. aSign := extractFloatx80Sign( a );
  7276. bSign := extractFloatx80Sign( b );
  7277. if ( aSign <> bSign ) then begin
  7278. result := ord(
  7279. ( aSign <> 0 )
  7280. or ( ( bits16( ( a.high or b.high ) shl 1 ) ) or a.low or b.low = 0 ) );
  7281. exit;
  7282. end;
  7283. if aSign <> 0 then
  7284. result := le128( b.high, b.low, a.high, a.low )
  7285. else
  7286. result := le128( a.high, a.low, b.high, b.low );
  7287. end;
  7288. {*----------------------------------------------------------------------------
  7289. | Returns 1 if the extended double-precision floating-point value `a' is less
  7290. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  7291. | an exception. Otherwise, the comparison is performed according to the
  7292. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7293. *----------------------------------------------------------------------------*}
  7294. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  7295. var
  7296. aSign, bSign: flag;
  7297. begin
  7298. if ( ( extractFloatx80Exp( a ) = $7FFF )
  7299. and ( bits64( extractFloatx80Frac( a ) shl 1 ) <> 0 ) )
  7300. or ( ( extractFloatx80Exp( b ) = $7FFF )
  7301. and ( bits64( extractFloatx80Frac( b ) shl 1 ) <> 0 ) )
  7302. then begin
  7303. if ( floatx80_is_signaling_nan( a )
  7304. or floatx80_is_signaling_nan( b ) <> 0 ) then begin
  7305. float_raise( float_flag_invalid );
  7306. end;
  7307. result := 0;
  7308. exit;
  7309. end;
  7310. aSign := extractFloatx80Sign( a );
  7311. bSign := extractFloatx80Sign( b );
  7312. if ( aSign <> bSign ) then begin
  7313. result := ord(
  7314. ( aSign <> 0 )
  7315. and ( bits16( ( a.high or b.high ) shl 1 ) or a.low or b.low <> 0 ) );
  7316. exit;
  7317. end;
  7318. if aSign <> 0 then
  7319. result := lt128( b.high, b.low, a.high, a.low )
  7320. else
  7321. result := lt128( a.high, a.low, b.high, b.low );
  7322. end;
  7323. {$endif FPC_SOFTFLOAT_FLOATX80}
  7324. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  7325. {*----------------------------------------------------------------------------
  7326. | Returns the least-significant 64 fraction bits of the quadruple-precision
  7327. | floating-point value `a'.
  7328. *----------------------------------------------------------------------------*}
  7329. function extractFloat128Frac1(a : float128): bits64;
  7330. begin
  7331. result:=a.low;
  7332. end;
  7333. {*----------------------------------------------------------------------------
  7334. | Returns the most-significant 48 fraction bits of the quadruple-precision
  7335. | floating-point value `a'.
  7336. *----------------------------------------------------------------------------*}
  7337. function extractFloat128Frac0(a : float128): bits64;
  7338. begin
  7339. result:=a.high and int64($0000FFFFFFFFFFFF);
  7340. end;
  7341. {*----------------------------------------------------------------------------
  7342. | Returns the exponent bits of the quadruple-precision floating-point value
  7343. | `a'.
  7344. *----------------------------------------------------------------------------*}
  7345. function extractFloat128Exp(a : float128): int32;
  7346. begin
  7347. result:=( a.high shr 48 ) and $7FFF;
  7348. end;
  7349. {*----------------------------------------------------------------------------
  7350. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  7351. *----------------------------------------------------------------------------*}
  7352. function extractFloat128Sign(a : float128): flag;
  7353. begin
  7354. result:=a.high shr 63;
  7355. end;
  7356. {*----------------------------------------------------------------------------
  7357. | Normalizes the subnormal quadruple-precision floating-point value
  7358. | represented by the denormalized significand formed by the concatenation of
  7359. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  7360. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  7361. | significand are stored at the location pointed to by `zSig0Ptr', and the
  7362. | least significant 64 bits of the normalized significand are stored at the
  7363. | location pointed to by `zSig1Ptr'.
  7364. *----------------------------------------------------------------------------*}
  7365. procedure normalizeFloat128Subnormal(
  7366. aSig0: bits64;
  7367. aSig1: bits64;
  7368. var zExpPtr: int32;
  7369. var zSig0Ptr: bits64;
  7370. var zSig1Ptr: bits64);
  7371. var
  7372. shiftCount: int8;
  7373. begin
  7374. if ( aSig0 = 0 ) then
  7375. begin
  7376. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  7377. if ( shiftCount < 0 ) then
  7378. begin
  7379. zSig0Ptr := aSig1 shr ( - shiftCount );
  7380. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  7381. end
  7382. else begin
  7383. zSig0Ptr := aSig1 shl shiftCount;
  7384. zSig1Ptr := 0;
  7385. end;
  7386. zExpPtr := - shiftCount - 63;
  7387. end
  7388. else begin
  7389. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  7390. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  7391. zExpPtr := 1 - shiftCount;
  7392. end;
  7393. end;
  7394. {*----------------------------------------------------------------------------
  7395. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  7396. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  7397. | floating-point value, returning the result. After being shifted into the
  7398. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  7399. | added together to form the most significant 32 bits of the result. This
  7400. | means that any integer portion of `zSig0' will be added into the exponent.
  7401. | Since a properly normalized significand will have an integer portion equal
  7402. | to 1, the `zExp' input should be 1 less than the desired result exponent
  7403. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  7404. | significand.
  7405. *----------------------------------------------------------------------------*}
  7406. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  7407. var
  7408. z: float128;
  7409. begin
  7410. z.low := zSig1;
  7411. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  7412. result:=z;
  7413. end;
  7414. {*----------------------------------------------------------------------------
  7415. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7416. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  7417. | and `zSig2', and returns the proper quadruple-precision floating-point value
  7418. | corresponding to the abstract input. Ordinarily, the abstract value is
  7419. | simply rounded and packed into the quadruple-precision format, with the
  7420. | inexact exception raised if the abstract input cannot be represented
  7421. | exactly. However, if the abstract value is too large, the overflow and
  7422. | inexact exceptions are raised and an infinity or maximal finite value is
  7423. | returned. If the abstract value is too small, the input value is rounded to
  7424. | a subnormal number, and the underflow and inexact exceptions are raised if
  7425. | the abstract input cannot be represented exactly as a subnormal quadruple-
  7426. | precision floating-point number.
  7427. | The input significand must be normalized or smaller. If the input
  7428. | significand is not normalized, `zExp' must be 0; in that case, the result
  7429. | returned is a subnormal number, and it must not require rounding. In the
  7430. | usual case that the input significand is normalized, `zExp' must be 1 less
  7431. | than the ``true'' floating-point exponent. The handling of underflow and
  7432. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7433. *----------------------------------------------------------------------------*}
  7434. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  7435. var
  7436. roundingMode: TFPURoundingMode;
  7437. roundNearestEven, increment, isTiny: flag;
  7438. begin
  7439. roundingMode := softfloat_rounding_mode;
  7440. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  7441. increment := ord( sbits64(zSig2) < 0 );
  7442. if ( roundNearestEven=0 ) then
  7443. begin
  7444. if ( roundingMode = float_round_to_zero ) then
  7445. begin
  7446. increment := 0;
  7447. end
  7448. else begin
  7449. if ( zSign<>0 ) then
  7450. begin
  7451. increment := ord( roundingMode = float_round_down ) and zSig2;
  7452. end
  7453. else begin
  7454. increment := ord( roundingMode = float_round_up ) and zSig2;
  7455. end;
  7456. end;
  7457. end;
  7458. if ( $7FFD <= bits32(zExp) ) then
  7459. begin
  7460. if ( ord( $7FFD < zExp )
  7461. or ( ord( zExp = $7FFD )
  7462. and eq128(
  7463. int64( $0001FFFFFFFFFFFF ),
  7464. bits64( $FFFFFFFFFFFFFFFF ),
  7465. zSig0,
  7466. zSig1
  7467. )
  7468. and increment
  7469. )
  7470. )<>0 then
  7471. begin
  7472. float_raise( [float_flag_overflow,float_flag_inexact] );
  7473. if ( ord( roundingMode = float_round_to_zero )
  7474. or ( zSign and ord( roundingMode = float_round_up ) )
  7475. or ( ord( zSign = 0) and ord( roundingMode = float_round_down ) )
  7476. )<>0 then
  7477. begin
  7478. result :=
  7479. packFloat128(
  7480. zSign,
  7481. $7FFE,
  7482. int64( $0000FFFFFFFFFFFF ),
  7483. bits64( $FFFFFFFFFFFFFFFF )
  7484. );
  7485. exit;
  7486. end;
  7487. result:=packFloat128( zSign, $7FFF, 0, 0 );
  7488. exit;
  7489. end;
  7490. if ( zExp < 0 ) then
  7491. begin
  7492. isTiny :=
  7493. ord(( softfloat_detect_tininess = float_tininess_before_rounding )
  7494. or ( zExp < -1 )
  7495. or not( increment<>0 )
  7496. or boolean(lt128(
  7497. zSig0,
  7498. zSig1,
  7499. int64( $0001FFFFFFFFFFFF ),
  7500. bits64( $FFFFFFFFFFFFFFFF )
  7501. )));
  7502. shift128ExtraRightJamming(
  7503. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  7504. zExp := 0;
  7505. if ( isTiny and zSig2 )<>0 then
  7506. float_raise( float_flag_underflow );
  7507. if ( roundNearestEven<>0 ) then
  7508. begin
  7509. increment := ord( sbits64(zSig2) < 0 );
  7510. end
  7511. else begin
  7512. if ( zSign<>0 ) then
  7513. begin
  7514. increment := ord( roundingMode = float_round_down ) and zSig2;
  7515. end
  7516. else begin
  7517. increment := ord( roundingMode = float_round_up ) and zSig2;
  7518. end;
  7519. end;
  7520. end;
  7521. end;
  7522. if ( zSig2<>0 ) then
  7523. set_inexact_flag;
  7524. if ( increment<>0 ) then
  7525. begin
  7526. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  7527. zSig1 := zSig1 and not bits64( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  7528. end
  7529. else begin
  7530. if ( ( zSig0 or zSig1 ) = 0 ) then
  7531. zExp := 0;
  7532. end;
  7533. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  7534. end;
  7535. {*----------------------------------------------------------------------------
  7536. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  7537. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  7538. | returns the proper quadruple-precision floating-point value corresponding
  7539. | to the abstract input. This routine is just like `roundAndPackFloat128'
  7540. | except that the input significand has fewer bits and does not have to be
  7541. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  7542. | point exponent.
  7543. *----------------------------------------------------------------------------*}
  7544. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  7545. var
  7546. shiftCount: int8;
  7547. zSig2: bits64;
  7548. begin
  7549. if ( zSig0 = 0 ) then
  7550. begin
  7551. zSig0 := zSig1;
  7552. zSig1 := 0;
  7553. dec(zExp, 64);
  7554. end;
  7555. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  7556. if ( 0 <= shiftCount ) then
  7557. begin
  7558. zSig2 := 0;
  7559. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  7560. end
  7561. else begin
  7562. shift128ExtraRightJamming(
  7563. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  7564. end;
  7565. dec(zExp, shiftCount);
  7566. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7567. end;
  7568. {*----------------------------------------------------------------------------
  7569. | Returns the result of converting the quadruple-precision floating-point
  7570. | value `a' to the 32-bit two's complement integer format. The conversion
  7571. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7572. | Arithmetic---which means in particular that the conversion is rounded
  7573. | according to the current rounding mode. If `a' is a NaN, the largest
  7574. | positive integer is returned. Otherwise, if the conversion overflows, the
  7575. | largest integer with the same sign as `a' is returned.
  7576. *----------------------------------------------------------------------------*}
  7577. function float128_to_int32(a: float128): int32;
  7578. var
  7579. aSign: flag;
  7580. aExp, shiftCount: int32;
  7581. aSig0, aSig1: bits64;
  7582. begin
  7583. aSig1 := extractFloat128Frac1( a );
  7584. aSig0 := extractFloat128Frac0( a );
  7585. aExp := extractFloat128Exp( a );
  7586. aSign := extractFloat128Sign( a );
  7587. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  7588. aSign := 0;
  7589. if ( aExp<>0 ) then
  7590. aSig0 := aSig0 or int64( $0001000000000000 );
  7591. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7592. shiftCount := $4028 - aExp;
  7593. if ( 0 < shiftCount ) then
  7594. shift64RightJamming( aSig0, shiftCount, aSig0 );
  7595. result := roundAndPackInt32( aSign, aSig0 );
  7596. end;
  7597. {*----------------------------------------------------------------------------
  7598. | Returns the result of converting the quadruple-precision floating-point
  7599. | value `a' to the 32-bit two's complement integer format. The conversion
  7600. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7601. | Arithmetic, except that the conversion is always rounded toward zero. If
  7602. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  7603. | conversion overflows, the largest integer with the same sign as `a' is
  7604. | returned.
  7605. *----------------------------------------------------------------------------*}
  7606. function float128_to_int32_round_to_zero(a: float128): int32;
  7607. var
  7608. aSign: flag;
  7609. aExp, shiftCount: int32;
  7610. aSig0, aSig1, savedASig: bits64;
  7611. z: int32;
  7612. label
  7613. invalid;
  7614. begin
  7615. aSig1 := extractFloat128Frac1( a );
  7616. aSig0 := extractFloat128Frac0( a );
  7617. aExp := extractFloat128Exp( a );
  7618. aSign := extractFloat128Sign( a );
  7619. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7620. if ( $401E < aExp ) then
  7621. begin
  7622. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  7623. aSign := 0;
  7624. goto invalid;
  7625. end
  7626. else if ( aExp < $3FFF ) then
  7627. begin
  7628. if ( aExp or aSig0 )<>0 then
  7629. set_inexact_flag;
  7630. result := 0;
  7631. exit;
  7632. end;
  7633. aSig0 := aSig0 or int64( $0001000000000000 );
  7634. shiftCount := $402F - aExp;
  7635. savedASig := aSig0;
  7636. aSig0 := aSig0 shr shiftCount;
  7637. z := aSig0;
  7638. if ( aSign )<>0 then
  7639. z := - z;
  7640. if ( ord( z < 0 ) xor aSign )<>0 then
  7641. begin
  7642. invalid:
  7643. float_raise( float_flag_invalid );
  7644. if aSign<>0 then
  7645. result:= int32( $80000000 )
  7646. else
  7647. result:=$7FFFFFFF;
  7648. exit;
  7649. end;
  7650. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  7651. begin
  7652. set_inexact_flag;
  7653. end;
  7654. result := z;
  7655. end;
  7656. {*----------------------------------------------------------------------------
  7657. | Returns the result of converting the quadruple-precision floating-point
  7658. | value `a' to the 64-bit two's complement integer format. The conversion
  7659. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7660. | Arithmetic---which means in particular that the conversion is rounded
  7661. | according to the current rounding mode. If `a' is a NaN, the largest
  7662. | positive integer is returned. Otherwise, if the conversion overflows, the
  7663. | largest integer with the same sign as `a' is returned.
  7664. *----------------------------------------------------------------------------*}
  7665. function float128_to_int64(a: float128): int64;
  7666. var
  7667. aSign: flag;
  7668. aExp, shiftCount: int32;
  7669. aSig0, aSig1: bits64;
  7670. begin
  7671. aSig1 := extractFloat128Frac1( a );
  7672. aSig0 := extractFloat128Frac0( a );
  7673. aExp := extractFloat128Exp( a );
  7674. aSign := extractFloat128Sign( a );
  7675. if ( aExp<>0 ) then
  7676. aSig0 := aSig0 or int64( $0001000000000000 );
  7677. shiftCount := $402F - aExp;
  7678. if ( shiftCount <= 0 ) then
  7679. begin
  7680. if ( $403E < aExp ) then
  7681. begin
  7682. float_raise( float_flag_invalid );
  7683. if ( (aSign=0)
  7684. or ( ( aExp = $7FFF )
  7685. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  7686. )
  7687. ) then
  7688. begin
  7689. result := int64( $7FFFFFFFFFFFFFFF );
  7690. exit;
  7691. end;
  7692. result := int64( $8000000000000000 );
  7693. exit;
  7694. end;
  7695. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  7696. end
  7697. else begin
  7698. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  7699. end;
  7700. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  7701. end;
  7702. {*----------------------------------------------------------------------------
  7703. | Returns the result of converting the quadruple-precision floating-point
  7704. | value `a' to the 64-bit two's complement integer format. The conversion
  7705. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7706. | Arithmetic, except that the conversion is always rounded toward zero.
  7707. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  7708. | the conversion overflows, the largest integer with the same sign as `a' is
  7709. | returned.
  7710. *----------------------------------------------------------------------------*}
  7711. function float128_to_int64_round_to_zero(a: float128): int64;
  7712. var
  7713. aSign: flag;
  7714. aExp, shiftCount: int32;
  7715. aSig0, aSig1: bits64;
  7716. z: int64;
  7717. begin
  7718. aSig1 := extractFloat128Frac1( a );
  7719. aSig0 := extractFloat128Frac0( a );
  7720. aExp := extractFloat128Exp( a );
  7721. aSign := extractFloat128Sign( a );
  7722. if ( aExp<>0 ) then
  7723. aSig0 := aSig0 or int64( $0001000000000000 );
  7724. shiftCount := aExp - $402F;
  7725. if ( 0 < shiftCount ) then
  7726. begin
  7727. if ( $403E <= aExp ) then
  7728. begin
  7729. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  7730. if ( ( a.high = bits64( $C03E000000000000 ) )
  7731. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  7732. begin
  7733. if ( aSig1<>0 ) then
  7734. set_inexact_flag;
  7735. end
  7736. else begin
  7737. float_raise( float_flag_invalid );
  7738. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  7739. begin
  7740. result := int64( $7FFFFFFFFFFFFFFF );
  7741. exit;
  7742. end;
  7743. end;
  7744. result := int64( $8000000000000000 );
  7745. exit;
  7746. end;
  7747. z := ( aSig0 shl shiftCount ) or ( aSig1 shr ( ( - shiftCount ) and 63 ) );
  7748. if ( int64( aSig1 shl shiftCount )<>0 ) then
  7749. begin
  7750. set_inexact_flag;
  7751. end;
  7752. end
  7753. else begin
  7754. if ( aExp < $3FFF ) then
  7755. begin
  7756. if ( aExp or aSig0 or aSig1 )<>0 then
  7757. begin
  7758. set_inexact_flag;
  7759. end;
  7760. result := 0;
  7761. exit;
  7762. end;
  7763. z := aSig0 shr ( - shiftCount );
  7764. if ( (aSig1<>0)
  7765. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  7766. begin
  7767. set_inexact_flag;
  7768. end;
  7769. end;
  7770. if ( aSign<>0 ) then
  7771. z := - z;
  7772. result := z;
  7773. end;
  7774. {*----------------------------------------------------------------------------
  7775. | Returns the result of converting the quadruple-precision floating-point
  7776. | value `a' to the single-precision floating-point format. The conversion
  7777. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7778. | Arithmetic.
  7779. *----------------------------------------------------------------------------*}
  7780. function float128_to_float32(a: float128): float32;
  7781. var
  7782. aSign: flag;
  7783. aExp: int32;
  7784. aSig0, aSig1: bits64;
  7785. zSig: bits32;
  7786. begin
  7787. aSig1 := extractFloat128Frac1( a );
  7788. aSig0 := extractFloat128Frac0( a );
  7789. aExp := extractFloat128Exp( a );
  7790. aSign := extractFloat128Sign( a );
  7791. if ( aExp = $7FFF ) then
  7792. begin
  7793. if ( aSig0 or aSig1 )<>0 then
  7794. begin
  7795. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  7796. exit;
  7797. end;
  7798. result := packFloat32( aSign, $FF, 0 );
  7799. exit;
  7800. end;
  7801. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7802. shift64RightJamming( aSig0, 18, aSig0 );
  7803. zSig := aSig0;
  7804. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7805. begin
  7806. zSig := zSig or $40000000;
  7807. dec(aExp,$3F81);
  7808. end;
  7809. result := roundAndPackFloat32( aSign, aExp, zSig );
  7810. end;
  7811. {*----------------------------------------------------------------------------
  7812. | Returns the result of converting the quadruple-precision floating-point
  7813. | value `a' to the double-precision floating-point format. The conversion
  7814. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7815. | Arithmetic.
  7816. *----------------------------------------------------------------------------*}
  7817. function float128_to_float64(a: float128): float64;
  7818. var
  7819. aSign: flag;
  7820. aExp: int32;
  7821. aSig0, aSig1: bits64;
  7822. begin
  7823. aSig1 := extractFloat128Frac1( a );
  7824. aSig0 := extractFloat128Frac0( a );
  7825. aExp := extractFloat128Exp( a );
  7826. aSign := extractFloat128Sign( a );
  7827. if ( aExp = $7FFF ) then
  7828. begin
  7829. if ( aSig0 or aSig1 )<>0 then
  7830. begin
  7831. result:=commonNaNToFloat64(float128ToCommonNaN(a));
  7832. exit;
  7833. end;
  7834. result:=packFloat64( aSign, $7FF, 0);
  7835. exit;
  7836. end;
  7837. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7838. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7839. if ( aExp<>0 ) or (aSig0 <> 0 ) then
  7840. begin
  7841. aSig0 := aSig0 or int64( $4000000000000000 );
  7842. dec(aExp,$3C01);
  7843. end;
  7844. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7845. end;
  7846. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7847. {*----------------------------------------------------------------------------
  7848. | Returns the result of converting the quadruple-precision floating-point
  7849. | value `a' to the extended double-precision floating-point format. The
  7850. | conversion is performed according to the IEC/IEEE Standard for Binary
  7851. | Floating-Point Arithmetic.
  7852. *----------------------------------------------------------------------------*}
  7853. function float128_to_floatx80(a: float128): floatx80;
  7854. var
  7855. aSign: flag;
  7856. aExp: int32;
  7857. aSig0, aSig1: bits64;
  7858. begin
  7859. aSig1 := extractFloat128Frac1( a );
  7860. aSig0 := extractFloat128Frac0( a );
  7861. aExp := extractFloat128Exp( a );
  7862. aSign := extractFloat128Sign( a );
  7863. if ( aExp = $7FFF ) then begin
  7864. if ( aSig0 or aSig1 <> 0 ) then begin
  7865. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7866. exit;
  7867. end;
  7868. result := packFloatx80( aSign, $7FFF, bits64( $8000000000000000 ) );
  7869. exit;
  7870. end;
  7871. if ( aExp = 0 ) then begin
  7872. if ( ( aSig0 or aSig1 ) = 0 ) then
  7873. begin
  7874. result := packFloatx80( aSign, 0, 0 );
  7875. exit;
  7876. end;
  7877. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7878. end
  7879. else begin
  7880. aSig0 := aSig0 or int64( $0001000000000000 );
  7881. end;
  7882. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7883. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7884. end;
  7885. {$endif FPC_SOFTFLOAT_FLOATX80}
  7886. {*----------------------------------------------------------------------------
  7887. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7888. | Returns the result as a quadruple-precision floating-point value. The
  7889. | operation is performed according to the IEC/IEEE Standard for Binary
  7890. | Floating-Point Arithmetic.
  7891. *----------------------------------------------------------------------------*}
  7892. function float128_round_to_int(a: float128): float128;
  7893. var
  7894. aSign: flag;
  7895. aExp: int32;
  7896. lastBitMask, roundBitsMask: bits64;
  7897. roundingMode: TFPURoundingMode;
  7898. z: float128;
  7899. begin
  7900. aExp := extractFloat128Exp( a );
  7901. if ( $402F <= aExp ) then
  7902. begin
  7903. if ( $406F <= aExp ) then
  7904. begin
  7905. if ( ( aExp = $7FFF )
  7906. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7907. ) then
  7908. begin
  7909. result := propagateFloat128NaN( a, a );
  7910. exit;
  7911. end;
  7912. result := a;
  7913. exit;
  7914. end;
  7915. lastBitMask := 1;
  7916. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7917. roundBitsMask := lastBitMask - 1;
  7918. z := a;
  7919. roundingMode := softfloat_rounding_mode;
  7920. if ( roundingMode = float_round_nearest_even ) then
  7921. begin
  7922. if ( lastBitMask )<>0 then
  7923. begin
  7924. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7925. if ( ( z.low and roundBitsMask ) = 0 ) then
  7926. z.low := z.low and not(lastBitMask);
  7927. end
  7928. else begin
  7929. if ( sbits64(z.low) < 0 ) then
  7930. begin
  7931. inc(z.high);
  7932. if ( bits64( z.low shl 1 ) = 0 ) then
  7933. z.high := z.high and not bits64( 1 );
  7934. end;
  7935. end;
  7936. end
  7937. else if ( roundingMode <> float_round_to_zero ) then
  7938. begin
  7939. if ( extractFloat128Sign( z )
  7940. xor ord( roundingMode = float_round_up ) )<>0 then
  7941. begin
  7942. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7943. end;
  7944. end;
  7945. z.low := z.low and not(roundBitsMask);
  7946. end
  7947. else begin
  7948. if ( aExp < $3FFF ) then
  7949. begin
  7950. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7951. begin
  7952. result := a;
  7953. exit;
  7954. end;
  7955. set_inexact_flag;
  7956. aSign := extractFloat128Sign( a );
  7957. case softfloat_rounding_mode of
  7958. float_round_nearest_even:
  7959. if ( ( aExp = $3FFE )
  7960. and ( (extractFloat128Frac0( a )<>0)
  7961. or (extractFloat128Frac1( a )<>0) )
  7962. ) then begin
  7963. begin
  7964. result := packFloat128( aSign, $3FFF, 0, 0 );
  7965. exit;
  7966. end;
  7967. end;
  7968. float_round_down:
  7969. begin
  7970. if aSign<>0 then
  7971. result:=packFloat128( 1, $3FFF, 0, 0 )
  7972. else
  7973. result:=packFloat128( 0, 0, 0, 0 );
  7974. exit;
  7975. end;
  7976. float_round_up:
  7977. begin
  7978. if aSign<>0 then
  7979. result := packFloat128( 1, 0, 0, 0 )
  7980. else
  7981. result:=packFloat128( 0, $3FFF, 0, 0 );
  7982. exit;
  7983. end;
  7984. end;
  7985. result := packFloat128( aSign, 0, 0, 0 );
  7986. exit;
  7987. end;
  7988. lastBitMask := 1;
  7989. lastBitMask := lastBitMask shl ($402F - aExp);
  7990. roundBitsMask := lastBitMask - 1;
  7991. z.low := 0;
  7992. z.high := a.high;
  7993. roundingMode := softfloat_rounding_mode;
  7994. if ( roundingMode = float_round_nearest_even ) then begin
  7995. inc(z.high,lastBitMask shr 1);
  7996. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7997. z.high := z.high and not(lastBitMask);
  7998. end;
  7999. end
  8000. else if ( roundingMode <> float_round_to_zero ) then begin
  8001. if ( (extractFloat128Sign( z )<>0)
  8002. xor ( roundingMode = float_round_up ) ) then begin
  8003. z.high := z.high or ord( a.low <> 0 );
  8004. z.high := z.high+roundBitsMask;
  8005. end;
  8006. end;
  8007. z.high := z.high and not(roundBitsMask);
  8008. end;
  8009. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  8010. set_inexact_flag;
  8011. end;
  8012. result := z;
  8013. end;
  8014. {*----------------------------------------------------------------------------
  8015. | Returns the result of adding the absolute values of the quadruple-precision
  8016. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  8017. | before being returned. `zSign' is ignored if the result is a NaN.
  8018. | The addition is performed according to the IEC/IEEE Standard for Binary
  8019. | Floating-Point Arithmetic.
  8020. *----------------------------------------------------------------------------*}
  8021. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  8022. var
  8023. aExp, bExp, zExp: int32;
  8024. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8025. expDiff: int32;
  8026. label
  8027. shiftRight1,roundAndPack;
  8028. begin
  8029. aSig1 := extractFloat128Frac1( a );
  8030. aSig0 := extractFloat128Frac0( a );
  8031. aExp := extractFloat128Exp( a );
  8032. bSig1 := extractFloat128Frac1( b );
  8033. bSig0 := extractFloat128Frac0( b );
  8034. bExp := extractFloat128Exp( b );
  8035. expDiff := aExp - bExp;
  8036. if ( 0 < expDiff ) then begin
  8037. if ( aExp = $7FFF ) then begin
  8038. if ( aSig0 or aSig1 )<>0 then
  8039. begin
  8040. result := propagateFloat128NaN( a, b );
  8041. exit;
  8042. end;
  8043. result := a;
  8044. exit;
  8045. end;
  8046. if ( bExp = 0 ) then begin
  8047. dec(expDiff);
  8048. end
  8049. else begin
  8050. bSig0 := bSig0 or int64( $0001000000000000 );
  8051. end;
  8052. shift128ExtraRightJamming(
  8053. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  8054. zExp := aExp;
  8055. end
  8056. else if ( expDiff < 0 ) then begin
  8057. if ( bExp = $7FFF ) then begin
  8058. if ( bSig0 or bSig1 )<>0 then
  8059. begin
  8060. result := propagateFloat128NaN( a, b );
  8061. exit;
  8062. end;
  8063. result := packFloat128( zSign, $7FFF, 0, 0 );
  8064. exit;
  8065. end;
  8066. if ( aExp = 0 ) then begin
  8067. inc(expDiff);
  8068. end
  8069. else begin
  8070. aSig0 := aSig0 or int64( $0001000000000000 );
  8071. end;
  8072. shift128ExtraRightJamming(
  8073. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  8074. zExp := bExp;
  8075. end
  8076. else begin
  8077. if ( aExp = $7FFF ) then begin
  8078. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8079. result := propagateFloat128NaN( a, b );
  8080. exit;
  8081. end;
  8082. result := a;
  8083. exit;
  8084. end;
  8085. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8086. if ( aExp = 0 ) then
  8087. begin
  8088. result := packFloat128( zSign, 0, zSig0, zSig1 );
  8089. exit;
  8090. end;
  8091. zSig2 := 0;
  8092. zSig0 := zSig0 or int64( $0002000000000000 );
  8093. zExp := aExp;
  8094. goto shiftRight1;
  8095. end;
  8096. aSig0 := aSig0 or int64( $0001000000000000 );
  8097. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8098. dec(zExp);
  8099. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  8100. inc(zExp);
  8101. shiftRight1:
  8102. shift128ExtraRightJamming(
  8103. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8104. roundAndPack:
  8105. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8106. end;
  8107. {*----------------------------------------------------------------------------
  8108. | Returns the result of subtracting the absolute values of the quadruple-
  8109. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  8110. | difference is negated before being returned. `zSign' is ignored if the
  8111. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  8112. | Standard for Binary Floating-Point Arithmetic.
  8113. *----------------------------------------------------------------------------*}
  8114. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  8115. var
  8116. aExp, bExp, zExp: int32;
  8117. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  8118. expDiff: int32;
  8119. z: float128;
  8120. label
  8121. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  8122. begin
  8123. aSig1 := extractFloat128Frac1( a );
  8124. aSig0 := extractFloat128Frac0( a );
  8125. aExp := extractFloat128Exp( a );
  8126. bSig1 := extractFloat128Frac1( b );
  8127. bSig0 := extractFloat128Frac0( b );
  8128. bExp := extractFloat128Exp( b );
  8129. expDiff := aExp - bExp;
  8130. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  8131. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  8132. if ( 0 < expDiff ) then goto aExpBigger;
  8133. if ( expDiff < 0 ) then goto bExpBigger;
  8134. if ( aExp = $7FFF ) then begin
  8135. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  8136. result := propagateFloat128NaN( a, b );
  8137. exit;
  8138. end;
  8139. float_raise( float_flag_invalid );
  8140. z.low := float128_default_nan_low;
  8141. z.high := float128_default_nan_high;
  8142. result := z;
  8143. exit;
  8144. end;
  8145. if ( aExp = 0 ) then begin
  8146. aExp := 1;
  8147. bExp := 1;
  8148. end;
  8149. if ( bSig0 < aSig0 ) then goto aBigger;
  8150. if ( aSig0 < bSig0 ) then goto bBigger;
  8151. if ( bSig1 < aSig1 ) then goto aBigger;
  8152. if ( aSig1 < bSig1 ) then goto bBigger;
  8153. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  8154. exit;
  8155. bExpBigger:
  8156. if ( bExp = $7FFF ) then begin
  8157. if ( bSig0 or bSig1 )<>0 then
  8158. begin
  8159. result := propagateFloat128NaN( a, b );
  8160. exit;
  8161. end;
  8162. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  8163. exit;
  8164. end;
  8165. if ( aExp = 0 ) then begin
  8166. inc(expDiff);
  8167. end
  8168. else begin
  8169. aSig0 := aSig0 or int64( $4000000000000000 );
  8170. end;
  8171. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8172. bSig0 := bSig0 or int64( $4000000000000000 );
  8173. bBigger:
  8174. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  8175. zExp := bExp;
  8176. zSign := zSign xor 1;
  8177. goto normalizeRoundAndPack;
  8178. aExpBigger:
  8179. if ( aExp = $7FFF ) then begin
  8180. if ( aSig0 or aSig1 )<>0 then
  8181. begin
  8182. result := propagateFloat128NaN( a, b );
  8183. exit;
  8184. end;
  8185. result := a;
  8186. exit;
  8187. end;
  8188. if ( bExp = 0 ) then begin
  8189. dec(expDiff);
  8190. end
  8191. else begin
  8192. bSig0 := bSig0 or int64( $4000000000000000 );
  8193. end;
  8194. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  8195. aSig0 := aSig0 or int64( $4000000000000000 );
  8196. aBigger:
  8197. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  8198. zExp := aExp;
  8199. normalizeRoundAndPack:
  8200. dec(zExp);
  8201. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  8202. end;
  8203. {*----------------------------------------------------------------------------
  8204. | Returns the result of adding the quadruple-precision floating-point values
  8205. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  8206. | for Binary Floating-Point Arithmetic.
  8207. *----------------------------------------------------------------------------*}
  8208. function float128_add(a: float128; b: float128): float128;
  8209. var
  8210. aSign, bSign: flag;
  8211. begin
  8212. aSign := extractFloat128Sign( a );
  8213. bSign := extractFloat128Sign( b );
  8214. if ( aSign = bSign ) then begin
  8215. result := addFloat128Sigs( a, b, aSign );
  8216. end
  8217. else begin
  8218. result := subFloat128Sigs( a, b, aSign );
  8219. end;
  8220. end;
  8221. {*----------------------------------------------------------------------------
  8222. | Returns the result of subtracting the quadruple-precision floating-point
  8223. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8224. | Standard for Binary Floating-Point Arithmetic.
  8225. *----------------------------------------------------------------------------*}
  8226. function float128_sub(a: float128; b: float128): float128;
  8227. var
  8228. aSign, bSign: flag;
  8229. begin
  8230. aSign := extractFloat128Sign( a );
  8231. bSign := extractFloat128Sign( b );
  8232. if ( aSign = bSign ) then begin
  8233. result := subFloat128Sigs( a, b, aSign );
  8234. end
  8235. else begin
  8236. result := addFloat128Sigs( a, b, aSign );
  8237. end;
  8238. end;
  8239. {*----------------------------------------------------------------------------
  8240. | Returns the result of multiplying the quadruple-precision floating-point
  8241. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  8242. | Standard for Binary Floating-Point Arithmetic.
  8243. *----------------------------------------------------------------------------*}
  8244. function float128_mul(a: float128; b: float128): float128;
  8245. var
  8246. aSign, bSign, zSign: flag;
  8247. aExp, bExp, zExp: int32;
  8248. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  8249. z: float128;
  8250. label
  8251. invalid;
  8252. begin
  8253. aSig1 := extractFloat128Frac1( a );
  8254. aSig0 := extractFloat128Frac0( a );
  8255. aExp := extractFloat128Exp( a );
  8256. aSign := extractFloat128Sign( a );
  8257. bSig1 := extractFloat128Frac1( b );
  8258. bSig0 := extractFloat128Frac0( b );
  8259. bExp := extractFloat128Exp( b );
  8260. bSign := extractFloat128Sign( b );
  8261. zSign := aSign xor bSign;
  8262. if ( aExp = $7FFF ) then begin
  8263. if ( (( aSig0 or aSig1 )<>0)
  8264. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8265. result := propagateFloat128NaN( a, b );
  8266. exit;
  8267. end;
  8268. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  8269. result := packFloat128( zSign, $7FFF, 0, 0 );
  8270. exit;
  8271. end;
  8272. if ( bExp = $7FFF ) then begin
  8273. if ( bSig0 or bSig1 )<>0 then
  8274. begin
  8275. result := propagateFloat128NaN( a, b );
  8276. exit;
  8277. end;
  8278. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8279. invalid:
  8280. float_raise( float_flag_invalid );
  8281. z.low := float128_default_nan_low;
  8282. z.high := float128_default_nan_high;
  8283. result := z;
  8284. exit;
  8285. end;
  8286. result := packFloat128( zSign, $7FFF, 0, 0 );
  8287. exit;
  8288. end;
  8289. if ( aExp = 0 ) then begin
  8290. if ( ( aSig0 or aSig1 ) = 0 ) then
  8291. begin
  8292. result := packFloat128( zSign, 0, 0, 0 );
  8293. exit;
  8294. end;
  8295. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8296. end;
  8297. if ( bExp = 0 ) then begin
  8298. if ( ( bSig0 or bSig1 ) = 0 ) then
  8299. begin
  8300. result := packFloat128( zSign, 0, 0, 0 );
  8301. exit;
  8302. end;
  8303. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8304. end;
  8305. zExp := aExp + bExp - $4000;
  8306. aSig0 := aSig0 or int64( $0001000000000000 );
  8307. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  8308. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  8309. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  8310. zSig2 := zSig2 or ord( zSig3 <> 0 );
  8311. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  8312. shift128ExtraRightJamming(
  8313. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  8314. inc(zExp);
  8315. end;
  8316. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8317. end;
  8318. {*----------------------------------------------------------------------------
  8319. | Returns the result of dividing the quadruple-precision floating-point value
  8320. | `a' by the corresponding value `b'. The operation is performed according to
  8321. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8322. *----------------------------------------------------------------------------*}
  8323. function float128_div(a: float128; b: float128): float128;
  8324. var
  8325. aSign, bSign, zSign: flag;
  8326. aExp, bExp, zExp: int32;
  8327. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  8328. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8329. z: float128;
  8330. label
  8331. invalid;
  8332. begin
  8333. aSig1 := extractFloat128Frac1( a );
  8334. aSig0 := extractFloat128Frac0( a );
  8335. aExp := extractFloat128Exp( a );
  8336. aSign := extractFloat128Sign( a );
  8337. bSig1 := extractFloat128Frac1( b );
  8338. bSig0 := extractFloat128Frac0( b );
  8339. bExp := extractFloat128Exp( b );
  8340. bSign := extractFloat128Sign( b );
  8341. zSign := aSign xor bSign;
  8342. if ( aExp = $7FFF ) then begin
  8343. if ( aSig0 or aSig1 )<>0 then
  8344. begin
  8345. result := propagateFloat128NaN( a, b );
  8346. exit;
  8347. end;
  8348. if ( bExp = $7FFF ) then begin
  8349. if ( bSig0 or bSig1 )<>0 then
  8350. begin
  8351. result := propagateFloat128NaN( a, b );
  8352. exit;
  8353. end;
  8354. goto invalid;
  8355. end;
  8356. result := packFloat128( zSign, $7FFF, 0, 0 );
  8357. exit;
  8358. end;
  8359. if ( bExp = $7FFF ) then begin
  8360. if ( bSig0 or bSig1 )<>0 then
  8361. begin
  8362. result := propagateFloat128NaN( a, b );
  8363. exit;
  8364. end;
  8365. result := packFloat128( zSign, 0, 0, 0 );
  8366. exit;
  8367. end;
  8368. if ( bExp = 0 ) then begin
  8369. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8370. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  8371. invalid:
  8372. float_raise( float_flag_invalid );
  8373. z.low := float128_default_nan_low;
  8374. z.high := float128_default_nan_high;
  8375. result := z;
  8376. exit;
  8377. end;
  8378. float_raise( float_flag_divbyzero );
  8379. result := packFloat128( zSign, $7FFF, 0, 0 );
  8380. exit;
  8381. end;
  8382. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8383. end;
  8384. if ( aExp = 0 ) then begin
  8385. if ( ( aSig0 or aSig1 ) = 0 ) then
  8386. begin
  8387. result := packFloat128( zSign, 0, 0, 0 );
  8388. exit;
  8389. end;
  8390. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8391. end;
  8392. zExp := aExp - bExp + $3FFD;
  8393. shortShift128Left(
  8394. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  8395. shortShift128Left(
  8396. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8397. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  8398. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  8399. inc(zExp);
  8400. end;
  8401. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8402. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  8403. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  8404. while ( sbits64(rem0) < 0 ) do begin
  8405. dec(zSig0);
  8406. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  8407. end;
  8408. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  8409. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  8410. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  8411. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  8412. while ( sbits64(rem1) < 0 ) do begin
  8413. dec(zSig1);
  8414. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  8415. end;
  8416. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8417. end;
  8418. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  8419. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  8420. end;
  8421. {*----------------------------------------------------------------------------
  8422. | Returns the remainder of the quadruple-precision floating-point value `a'
  8423. | with respect to the corresponding value `b'. The operation is performed
  8424. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8425. *----------------------------------------------------------------------------*}
  8426. function float128_rem(a: float128; b: float128): float128;
  8427. var
  8428. aSign, zSign: flag;
  8429. aExp, bExp, expDiff: int32;
  8430. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  8431. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  8432. sigMean0: sbits64;
  8433. z: float128;
  8434. label
  8435. invalid;
  8436. begin
  8437. aSig1 := extractFloat128Frac1( a );
  8438. aSig0 := extractFloat128Frac0( a );
  8439. aExp := extractFloat128Exp( a );
  8440. aSign := extractFloat128Sign( a );
  8441. bSig1 := extractFloat128Frac1( b );
  8442. bSig0 := extractFloat128Frac0( b );
  8443. bExp := extractFloat128Exp( b );
  8444. if ( aExp = $7FFF ) then begin
  8445. if ( (( aSig0 or aSig1 )<>0)
  8446. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  8447. result := propagateFloat128NaN( a, b );
  8448. exit;
  8449. end;
  8450. goto invalid;
  8451. end;
  8452. if ( bExp = $7FFF ) then begin
  8453. if ( bSig0 or bSig1 )<>0 then
  8454. begin
  8455. result := propagateFloat128NaN( a, b );
  8456. exit;
  8457. end;
  8458. result := a;
  8459. exit;
  8460. end;
  8461. if ( bExp = 0 ) then begin
  8462. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  8463. invalid:
  8464. float_raise( float_flag_invalid );
  8465. z.low := float128_default_nan_low;
  8466. z.high := float128_default_nan_high;
  8467. result := z;
  8468. exit;
  8469. end;
  8470. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  8471. end;
  8472. if ( aExp = 0 ) then begin
  8473. if ( ( aSig0 or aSig1 ) = 0 ) then
  8474. begin
  8475. result := a;
  8476. exit;
  8477. end;
  8478. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8479. end;
  8480. expDiff := aExp - bExp;
  8481. if ( expDiff < -1 ) then
  8482. begin
  8483. result := a;
  8484. exit;
  8485. end;
  8486. shortShift128Left(
  8487. aSig0 or int64( $0001000000000000 ),
  8488. aSig1,
  8489. 15 - ord( expDiff < 0 ),
  8490. aSig0,
  8491. aSig1
  8492. );
  8493. shortShift128Left(
  8494. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  8495. q := le128( bSig0, bSig1, aSig0, aSig1 );
  8496. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8497. dec(expDiff,64);
  8498. while ( 0 < expDiff ) do begin
  8499. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8500. if ( 4 < q ) then
  8501. q := q - 4
  8502. else
  8503. q := 0;
  8504. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8505. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  8506. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  8507. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  8508. dec(expDiff,61);
  8509. end;
  8510. if ( -64 < expDiff ) then begin
  8511. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  8512. if ( 4 < q ) then
  8513. q := q - 4
  8514. else
  8515. q := 0;
  8516. q := q shr (- expDiff);
  8517. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8518. inc(expDiff,52);
  8519. if ( expDiff < 0 ) then begin
  8520. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  8521. end
  8522. else begin
  8523. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  8524. end;
  8525. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  8526. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  8527. end
  8528. else begin
  8529. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  8530. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  8531. end;
  8532. repeat
  8533. alternateASig0 := aSig0;
  8534. alternateASig1 := aSig1;
  8535. inc(q);
  8536. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  8537. until not( 0 <= sbits64(aSig0) );
  8538. add128(
  8539. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  8540. if ( ( sigMean0 < 0 )
  8541. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  8542. aSig0 := alternateASig0;
  8543. aSig1 := alternateASig1;
  8544. end;
  8545. zSign := ord( sbits64(aSig0) < 0 );
  8546. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  8547. result :=
  8548. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  8549. end;
  8550. {*----------------------------------------------------------------------------
  8551. | Returns the square root of the quadruple-precision floating-point value `a'.
  8552. | The operation is performed according to the IEC/IEEE Standard for Binary
  8553. | Floating-Point Arithmetic.
  8554. *----------------------------------------------------------------------------*}
  8555. function float128_sqrt(a: float128): float128;
  8556. var
  8557. aSign: flag;
  8558. aExp, zExp: int32;
  8559. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  8560. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  8561. z: float128;
  8562. label
  8563. invalid;
  8564. begin
  8565. aSig1 := extractFloat128Frac1( a );
  8566. aSig0 := extractFloat128Frac0( a );
  8567. aExp := extractFloat128Exp( a );
  8568. aSign := extractFloat128Sign( a );
  8569. if ( aExp = $7FFF ) then begin
  8570. if ( aSig0 or aSig1 )<>0 then
  8571. begin
  8572. result := propagateFloat128NaN( a, a );
  8573. exit;
  8574. end;
  8575. if ( aSign=0 ) then
  8576. begin
  8577. result := a;
  8578. exit;
  8579. end;
  8580. goto invalid;
  8581. end;
  8582. if ( aSign<>0 ) then begin
  8583. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  8584. begin
  8585. result := a;
  8586. exit;
  8587. end;
  8588. invalid:
  8589. float_raise( float_flag_invalid );
  8590. z.low := float128_default_nan_low;
  8591. z.high := float128_default_nan_high;
  8592. result := z;
  8593. exit;
  8594. end;
  8595. if ( aExp = 0 ) then begin
  8596. if ( ( aSig0 or aSig1 ) = 0 ) then
  8597. begin
  8598. result := packFloat128( 0, 0, 0, 0 );
  8599. exit;
  8600. end;
  8601. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  8602. end;
  8603. zExp := ( ( aExp - $3FFF ) shr 1 ) + $3FFE;
  8604. aSig0 := aSig0 or int64( $0001000000000000 );
  8605. zSig0 := estimateSqrt32( aExp, aSig0 shr 17 );
  8606. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  8607. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  8608. doubleZSig0 := zSig0 shl 1;
  8609. mul64To128( zSig0, zSig0, term0, term1 );
  8610. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  8611. while ( sbits64(rem0) < 0 ) do begin
  8612. dec(zSig0);
  8613. dec(doubleZSig0,2);
  8614. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  8615. end;
  8616. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  8617. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  8618. if ( zSig1 = 0 ) then zSig1 := 1;
  8619. mul64To128( doubleZSig0, zSig1, term1, term2 );
  8620. sub128( rem1, 0, term1, term2, rem1, rem2 );
  8621. mul64To128( zSig1, zSig1, term2, term3 );
  8622. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  8623. while ( sbits64(rem1) < 0 ) do begin
  8624. dec(zSig1);
  8625. shortShift128Left( 0, zSig1, 1, term2, term3 );
  8626. term3 := term3 or 1;
  8627. term2 := term2 or doubleZSig0;
  8628. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  8629. end;
  8630. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  8631. end;
  8632. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  8633. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  8634. end;
  8635. {*----------------------------------------------------------------------------
  8636. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8637. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8638. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8639. *----------------------------------------------------------------------------*}
  8640. function float128_eq(a: float128; b: float128): flag;
  8641. begin
  8642. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8643. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8644. or ( ( extractFloat128Exp( b ) = $7FFF )
  8645. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8646. ) then begin
  8647. if ( (float128_is_signaling_nan( a )<>0)
  8648. or (float128_is_signaling_nan( b )<>0) ) then begin
  8649. float_raise( float_flag_invalid );
  8650. end;
  8651. result := 0;
  8652. exit;
  8653. end;
  8654. result := ord(
  8655. ( a.low = b.low )
  8656. and ( ( a.high = b.high )
  8657. or ( ( a.low = 0 )
  8658. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  8659. ));
  8660. end;
  8661. {*----------------------------------------------------------------------------
  8662. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8663. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  8664. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  8665. | Arithmetic.
  8666. *----------------------------------------------------------------------------*}
  8667. function float128_le(a: float128; b: float128): flag;
  8668. var
  8669. aSign, bSign: flag;
  8670. begin
  8671. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8672. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8673. or ( ( extractFloat128Exp( b ) = $7FFF )
  8674. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8675. ) then begin
  8676. float_raise( float_flag_invalid );
  8677. result := 0;
  8678. exit;
  8679. end;
  8680. aSign := extractFloat128Sign( a );
  8681. bSign := extractFloat128Sign( b );
  8682. if ( aSign <> bSign ) then begin
  8683. result := ord(
  8684. (aSign<>0)
  8685. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8686. = 0 ));
  8687. exit;
  8688. end;
  8689. if aSign<>0 then
  8690. result := le128( b.high, b.low, a.high, a.low )
  8691. else
  8692. result := le128( a.high, a.low, b.high, b.low );
  8693. end;
  8694. {*----------------------------------------------------------------------------
  8695. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8696. | the corresponding value `b', and 0 otherwise. The comparison is performed
  8697. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8698. *----------------------------------------------------------------------------*}
  8699. function float128_lt(a: float128; b: float128): flag;
  8700. var
  8701. aSign, bSign: flag;
  8702. begin
  8703. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8704. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8705. or ( ( extractFloat128Exp( b ) = $7FFF )
  8706. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8707. ) then begin
  8708. float_raise( float_flag_invalid );
  8709. result := 0;
  8710. exit;
  8711. end;
  8712. aSign := extractFloat128Sign( a );
  8713. bSign := extractFloat128Sign( b );
  8714. if ( aSign <> bSign ) then begin
  8715. result := ord(
  8716. (aSign<>0)
  8717. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8718. <> 0 ));
  8719. exit;
  8720. end;
  8721. if aSign<>0 then
  8722. result := lt128( b.high, b.low, a.high, a.low )
  8723. else
  8724. result := lt128( a.high, a.low, b.high, b.low );
  8725. end;
  8726. {*----------------------------------------------------------------------------
  8727. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  8728. | the corresponding value `b', and 0 otherwise. The invalid exception is
  8729. | raised if either operand is a NaN. Otherwise, the comparison is performed
  8730. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8731. *----------------------------------------------------------------------------*}
  8732. function float128_eq_signaling(a: float128; b: float128): flag;
  8733. begin
  8734. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8735. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8736. or ( ( extractFloat128Exp( b ) = $7FFF )
  8737. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8738. ) then begin
  8739. float_raise( float_flag_invalid );
  8740. result := 0;
  8741. exit;
  8742. end;
  8743. result := ord(
  8744. ( a.low = b.low )
  8745. and ( ( a.high = b.high )
  8746. or ( ( a.low = 0 )
  8747. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  8748. ));
  8749. end;
  8750. {*----------------------------------------------------------------------------
  8751. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8752. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  8753. | cause an exception. Otherwise, the comparison is performed according to the
  8754. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  8755. *----------------------------------------------------------------------------*}
  8756. function float128_le_quiet(a: float128; b: float128): flag;
  8757. var
  8758. aSign, bSign: flag;
  8759. begin
  8760. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8761. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8762. or ( ( extractFloat128Exp( b ) = $7FFF )
  8763. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8764. ) then begin
  8765. if ( (float128_is_signaling_nan( a )<>0)
  8766. or (float128_is_signaling_nan( b )<>0) ) then begin
  8767. float_raise( float_flag_invalid );
  8768. end;
  8769. result := 0;
  8770. exit;
  8771. end;
  8772. aSign := extractFloat128Sign( a );
  8773. bSign := extractFloat128Sign( b );
  8774. if ( aSign <> bSign ) then begin
  8775. result := ord(
  8776. (aSign<>0)
  8777. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8778. = 0 ));
  8779. exit;
  8780. end;
  8781. if aSign<>0 then
  8782. result := le128( b.high, b.low, a.high, a.low )
  8783. else
  8784. result := le128( a.high, a.low, b.high, b.low );
  8785. end;
  8786. {*----------------------------------------------------------------------------
  8787. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  8788. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  8789. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  8790. | Standard for Binary Floating-Point Arithmetic.
  8791. *----------------------------------------------------------------------------*}
  8792. function float128_lt_quiet(a: float128; b: float128): flag;
  8793. var
  8794. aSign, bSign: flag;
  8795. begin
  8796. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  8797. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  8798. or ( ( extractFloat128Exp( b ) = $7FFF )
  8799. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8800. ) then begin
  8801. if ( (float128_is_signaling_nan( a )<>0)
  8802. or (float128_is_signaling_nan( b )<>0) ) then begin
  8803. float_raise( float_flag_invalid );
  8804. end;
  8805. result := 0;
  8806. exit;
  8807. end;
  8808. aSign := extractFloat128Sign( a );
  8809. bSign := extractFloat128Sign( b );
  8810. if ( aSign <> bSign ) then begin
  8811. result := ord(
  8812. (aSign<>0)
  8813. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8814. <> 0 ));
  8815. exit;
  8816. end;
  8817. if aSign<>0 then
  8818. result:=lt128( b.high, b.low, a.high, a.low )
  8819. else
  8820. result:=lt128( a.high, a.low, b.high, b.low );
  8821. end;
  8822. {----------------------------------------------------------------------------
  8823. | Returns the result of converting the double-precision floating-point value
  8824. | `a' to the quadruple-precision floating-point format. The conversion is
  8825. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8826. | Arithmetic.
  8827. *----------------------------------------------------------------------------}
  8828. function float64_to_float128( a : float64) : float128;
  8829. var
  8830. aSign : flag;
  8831. aExp : int16;
  8832. aSig, zSig0, zSig1 : bits64;
  8833. begin
  8834. aSig := extractFloat64Frac( a );
  8835. aExp := extractFloat64Exp( a );
  8836. aSign := extractFloat64Sign( a );
  8837. if ( aExp = $7FF ) then begin
  8838. if ( aSig<>0 ) then begin
  8839. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8840. exit;
  8841. end;
  8842. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8843. exit;
  8844. end;
  8845. if ( aExp = 0 ) then begin
  8846. if ( aSig = 0 ) then
  8847. begin
  8848. result:=packFloat128( aSign, 0, 0, 0 );
  8849. exit;
  8850. end;
  8851. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8852. dec(aExp);
  8853. end;
  8854. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8855. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8856. end;
  8857. {$endif FPC_SOFTFLOAT_FLOAT128}
  8858. {$endif not(defined(fpc_softfpu_interface))}
  8859. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8860. end.
  8861. {$ifdef FPC}
  8862. { restore context modified at implmentation start
  8863. to possibly re-enable range and overflow checking explicitly}
  8864. {$pop}
  8865. {$endif FPC}
  8866. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}