softfpu.pp 295 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959596059615962596359645965596659675968596959705971597259735974597559765977597859795980598159825983598459855986598759885989599059915992599359945995599659975998599960006001600260036004600560066007600860096010601160126013601460156016601760186019602060216022602360246025602660276028602960306031603260336034603560366037603860396040604160426043604460456046604760486049605060516052605360546055605660576058605960606061606260636064606560666067606860696070607160726073607460756076607760786079608060816082608360846085608660876088608960906091609260936094609560966097609860996100610161026103610461056106610761086109611061116112611361146115611661176118611961206121612261236124612561266127612861296130613161326133613461356136613761386139614061416142614361446145614661476148614961506151615261536154615561566157615861596160616161626163616461656166616761686169617061716172617361746175617661776178617961806181618261836184618561866187618861896190619161926193619461956196619761986199620062016202620362046205620662076208620962106211621262136214621562166217621862196220622162226223622462256226622762286229623062316232623362346235623662376238623962406241624262436244624562466247624862496250625162526253625462556256625762586259626062616262626362646265626662676268626962706271627262736274627562766277627862796280628162826283628462856286628762886289629062916292629362946295629662976298629963006301630263036304630563066307630863096310631163126313631463156316631763186319632063216322632363246325632663276328632963306331633263336334633563366337633863396340634163426343634463456346634763486349635063516352635363546355635663576358635963606361636263636364636563666367636863696370637163726373637463756376637763786379638063816382638363846385638663876388638963906391639263936394639563966397639863996400640164026403640464056406640764086409641064116412641364146415641664176418641964206421642264236424642564266427642864296430643164326433643464356436643764386439644064416442644364446445644664476448644964506451645264536454645564566457645864596460646164626463646464656466646764686469647064716472647364746475647664776478647964806481648264836484648564866487648864896490649164926493649464956496649764986499650065016502650365046505650665076508650965106511651265136514651565166517651865196520652165226523652465256526652765286529653065316532653365346535653665376538653965406541654265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676667766786679668066816682668366846685668666876688668966906691669266936694669566966697669866996700670167026703670467056706670767086709671067116712671367146715671667176718671967206721672267236724672567266727672867296730673167326733673467356736673767386739674067416742674367446745674667476748674967506751675267536754675567566757675867596760676167626763676467656766676767686769677067716772677367746775677667776778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727772787279728072817282728372847285728672877288728972907291729272937294729572967297729872997300730173027303730473057306730773087309731073117312731373147315731673177318731973207321732273237324732573267327732873297330733173327333733473357336733773387339734073417342734373447345734673477348734973507351735273537354735573567357735873597360736173627363736473657366736773687369737073717372737373747375737673777378737973807381738273837384738573867387738873897390739173927393739473957396739773987399740074017402740374047405740674077408740974107411741274137414741574167417741874197420742174227423742474257426742774287429743074317432743374347435743674377438743974407441744274437444744574467447744874497450745174527453745474557456745774587459746074617462746374647465746674677468746974707471747274737474747574767477747874797480748174827483748474857486748774887489749074917492749374947495749674977498749975007501750275037504750575067507750875097510751175127513751475157516751775187519752075217522752375247525752675277528752975307531753275337534753575367537753875397540754175427543754475457546754775487549755075517552755375547555755675577558755975607561756275637564756575667567756875697570757175727573757475757576757775787579758075817582758375847585758675877588758975907591759275937594759575967597759875997600760176027603760476057606760776087609761076117612761376147615761676177618761976207621762276237624762576267627762876297630763176327633763476357636763776387639764076417642764376447645764676477648764976507651765276537654765576567657765876597660766176627663766476657666766776687669767076717672767376747675767676777678767976807681768276837684768576867687768876897690769176927693769476957696769776987699770077017702770377047705770677077708770977107711771277137714771577167717771877197720772177227723772477257726772777287729773077317732773377347735773677377738773977407741774277437744774577467747774877497750775177527753775477557756775777587759776077617762776377647765776677677768776977707771777277737774777577767777777877797780778177827783778477857786778777887789779077917792779377947795779677977798779978007801780278037804780578067807780878097810781178127813781478157816781778187819782078217822782378247825782678277828782978307831783278337834783578367837783878397840784178427843784478457846784778487849785078517852785378547855785678577858785978607861786278637864786578667867786878697870787178727873787478757876787778787879788078817882788378847885788678877888788978907891789278937894789578967897789878997900790179027903790479057906790779087909791079117912791379147915791679177918791979207921792279237924792579267927792879297930793179327933793479357936793779387939794079417942794379447945794679477948794979507951795279537954795579567957795879597960796179627963796479657966796779687969797079717972797379747975797679777978797979807981798279837984798579867987798879897990799179927993799479957996799779987999800080018002800380048005800680078008800980108011801280138014801580168017801880198020802180228023802480258026802780288029803080318032803380348035803680378038803980408041804280438044804580468047804880498050805180528053805480558056805780588059806080618062806380648065806680678068806980708071807280738074807580768077807880798080808180828083808480858086808780888089809080918092809380948095809680978098809981008101810281038104810581068107810881098110811181128113811481158116811781188119812081218122812381248125812681278128812981308131813281338134813581368137813881398140814181428143814481458146814781488149815081518152815381548155815681578158815981608161816281638164816581668167816881698170817181728173817481758176817781788179818081818182818381848185818681878188818981908191819281938194819581968197819881998200820182028203820482058206820782088209821082118212821382148215821682178218821982208221822282238224822582268227822882298230823182328233823482358236823782388239824082418242824382448245824682478248824982508251825282538254825582568257825882598260826182628263826482658266826782688269827082718272827382748275827682778278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527
  1. {*
  2. ===============================================================================
  3. The original notice of the softfloat package is shown below. The conversion
  4. to pascal was done by Carl Eric Codere in 2002 ([email protected]).
  5. ===============================================================================
  6. This C source file is part of the SoftFloat IEC/IEEE Floating-Point
  7. Arithmetic Package, Release 2a.
  8. Written by John R. Hauser. This work was made possible in part by the
  9. International Computer Science Institute, located at Suite 600, 1947 Center
  10. Street, Berkeley, California 94704. Funding was partially provided by the
  11. National Science Foundation under grant MIP-9311980. The original version
  12. of this code was written as part of a project to build a fixed-point vector
  13. processor in collaboration with the University of California at Berkeley,
  14. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  15. is available through the Web page
  16. `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
  17. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  18. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  20. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22. Derivative works are acceptable, even for commercial purposes, so long as
  23. (1) they include prominent notice that the work is derivative, and (2) they
  24. include prominent notice akin to these four paragraphs for those parts of
  25. this code that are retained.
  26. ===============================================================================
  27. The float80 and float128 part is translated from the softfloat package
  28. by Florian Klaempfl and contained the following copyright notice
  29. The code might contain some duplicate stuff because the floatx80/float128 port was
  30. done based on the 64 bit enabled softfloat code.
  31. ===============================================================================
  32. This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  33. Package, Release 2b.
  34. Written by John R. Hauser. This work was made possible in part by the
  35. International Computer Science Institute, located at Suite 600, 1947 Center
  36. Street, Berkeley, California 94704. Funding was partially provided by the
  37. National Science Foundation under grant MIP-9311980. The original version
  38. of this code was written as part of a project to build a fixed-point vector
  39. processor in collaboration with the University of California at Berkeley,
  40. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  41. is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  42. arithmetic/SoftFloat.html'.
  43. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  44. been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  45. RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  46. AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  47. COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  48. EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  49. INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  50. OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  51. Derivative works are acceptable, even for commercial purposes, so long as
  52. (1) the source code for the derivative work includes prominent notice that
  53. the work is derivative, and (2) the source code includes prominent notice with
  54. these four paragraphs for those parts of this code that are retained.
  55. ===============================================================================
  56. *}
  57. { $define FPC_SOFTFLOAT_FLOATX80}
  58. { $define FPC_SOFTFLOAT_FLOAT128}
  59. { the softfpu unit can be also embedded directly into the system unit }
  60. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  61. {$mode objfpc}
  62. unit softfpu;
  63. { Overflow checking must be disabled,
  64. since some operations expect overflow!
  65. }
  66. {$Q-}
  67. {$goto on}
  68. interface
  69. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  70. {$if not(defined(fpc_softfpu_implementation))}
  71. {
  72. -------------------------------------------------------------------------------
  73. Software IEC/IEEE floating-point types.
  74. -------------------------------------------------------------------------------
  75. }
  76. TYPE
  77. float32 = longword;
  78. {$define FPC_SYSTEM_HAS_float32}
  79. { we use here a record in the function header because
  80. the record allows bitwise conversion to single }
  81. float32rec = record
  82. float32 : float32;
  83. end;
  84. flag = byte;
  85. uint8 = byte;
  86. int8 = shortint;
  87. uint16 = word;
  88. int16 = smallint;
  89. uint32 = longword;
  90. int32 = longint;
  91. bits8 = byte;
  92. sbits8 = shortint;
  93. bits16 = word;
  94. sbits16 = smallint;
  95. sbits32 = longint;
  96. bits32 = longword;
  97. {$ifndef fpc}
  98. qword = int64;
  99. {$endif}
  100. { now part of the system unit
  101. uint64 = qword;
  102. }
  103. bits64 = qword;
  104. sbits64 = int64;
  105. {$ifdef ENDIAN_LITTLE}
  106. float64 = packed record
  107. low: bits32;
  108. high: bits32;
  109. end;
  110. int64rec = packed record
  111. low: bits32;
  112. high: bits32;
  113. end;
  114. floatx80 = packed record
  115. low : qword;
  116. high : word;
  117. end;
  118. float128 = packed record
  119. low : qword;
  120. high : qword;
  121. end;
  122. {$else}
  123. float64 = record
  124. case byte of
  125. 1: (high,low : bits32);
  126. // force the record to be aligned like a double
  127. // else *_to_double will fail for cpus like sparc
  128. 2: (dummy : double);
  129. end;
  130. int64rec = packed record
  131. high,low : bits32;
  132. end;
  133. floatx80 = packed record
  134. high : word;
  135. low : qword;
  136. end;
  137. float128 = packed record
  138. high : qword;
  139. low : qword;
  140. end;
  141. {$endif}
  142. {$define FPC_SYSTEM_HAS_float64}
  143. {*
  144. -------------------------------------------------------------------------------
  145. Returns 1 if the double-precision floating-point value `a' is less than
  146. the corresponding value `b', and 0 otherwise. The comparison is performed
  147. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  148. -------------------------------------------------------------------------------
  149. *}
  150. Function float64_lt(a: float64;b: float64): flag; compilerproc;
  151. {*
  152. -------------------------------------------------------------------------------
  153. Returns 1 if the double-precision floating-point value `a' is less than
  154. or equal to the corresponding value `b', and 0 otherwise. The comparison
  155. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  156. Arithmetic.
  157. -------------------------------------------------------------------------------
  158. *}
  159. Function float64_le(a: float64;b: float64): flag; compilerproc;
  160. {*
  161. -------------------------------------------------------------------------------
  162. Returns 1 if the double-precision floating-point value `a' is equal to
  163. the corresponding value `b', and 0 otherwise. The comparison is performed
  164. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  165. -------------------------------------------------------------------------------
  166. *}
  167. Function float64_eq(a: float64;b: float64): flag; compilerproc;
  168. {*
  169. -------------------------------------------------------------------------------
  170. Returns the square root of the double-precision floating-point value `a'.
  171. The operation is performed according to the IEC/IEEE Standard for Binary
  172. Floating-Point Arithmetic.
  173. -------------------------------------------------------------------------------
  174. *}
  175. Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
  176. {*
  177. -------------------------------------------------------------------------------
  178. Returns the remainder of the double-precision floating-point value `a'
  179. with respect to the corresponding value `b'. The operation is performed
  180. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  181. -------------------------------------------------------------------------------
  182. *}
  183. Function float64_rem(a: float64; b : float64) : float64; compilerproc;
  184. {*
  185. -------------------------------------------------------------------------------
  186. Returns the result of dividing the double-precision floating-point value `a'
  187. by the corresponding value `b'. The operation is performed according to the
  188. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  189. -------------------------------------------------------------------------------
  190. *}
  191. Function float64_div(a: float64; b : float64) : float64; compilerproc;
  192. {*
  193. -------------------------------------------------------------------------------
  194. Returns the result of multiplying the double-precision floating-point values
  195. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  196. for Binary Floating-Point Arithmetic.
  197. -------------------------------------------------------------------------------
  198. *}
  199. Function float64_mul( a: float64; b:float64) : float64; compilerproc;
  200. {*
  201. -------------------------------------------------------------------------------
  202. Returns the result of subtracting the double-precision floating-point values
  203. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  204. for Binary Floating-Point Arithmetic.
  205. -------------------------------------------------------------------------------
  206. *}
  207. Function float64_sub(a: float64; b : float64) : float64; compilerproc;
  208. {*
  209. -------------------------------------------------------------------------------
  210. Returns the result of adding the double-precision floating-point values `a'
  211. and `b'. The operation is performed according to the IEC/IEEE Standard for
  212. Binary Floating-Point Arithmetic.
  213. -------------------------------------------------------------------------------
  214. *}
  215. Function float64_add( a: float64; b : float64) : float64; compilerproc;
  216. {*
  217. -------------------------------------------------------------------------------
  218. Rounds the double-precision floating-point value `a' to an integer,
  219. and returns the result as a double-precision floating-point value. The
  220. operation is performed according to the IEC/IEEE Standard for Binary
  221. Floating-Point Arithmetic.
  222. -------------------------------------------------------------------------------
  223. *}
  224. Function float64_round_to_int(a: float64) : float64; compilerproc;
  225. {*
  226. -------------------------------------------------------------------------------
  227. Returns the result of converting the double-precision floating-point value
  228. `a' to the single-precision floating-point format. The conversion is
  229. performed according to the IEC/IEEE Standard for Binary Floating-Point
  230. Arithmetic.
  231. -------------------------------------------------------------------------------
  232. *}
  233. Function float64_to_float32(a: float64) : float32rec; compilerproc;
  234. {*
  235. -------------------------------------------------------------------------------
  236. Returns the result of converting the double-precision floating-point value
  237. `a' to the 32-bit two's complement integer format. The conversion is
  238. performed according to the IEC/IEEE Standard for Binary Floating-Point
  239. Arithmetic, except that the conversion is always rounded toward zero.
  240. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  241. the conversion overflows, the largest integer with the same sign as `a' is
  242. returned.
  243. -------------------------------------------------------------------------------
  244. *}
  245. Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
  246. {*
  247. -------------------------------------------------------------------------------
  248. Returns the result of converting the double-precision floating-point value
  249. `a' to the 32-bit two's complement integer format. The conversion is
  250. performed according to the IEC/IEEE Standard for Binary Floating-Point
  251. Arithmetic---which means in particular that the conversion is rounded
  252. according to the current rounding mode. If `a' is a NaN, the largest
  253. positive integer is returned. Otherwise, if the conversion overflows, the
  254. largest integer with the same sign as `a' is returned.
  255. -------------------------------------------------------------------------------
  256. *}
  257. Function float64_to_int32(a: float64): int32; compilerproc;
  258. {*
  259. -------------------------------------------------------------------------------
  260. Returns 1 if the single-precision floating-point value `a' is less than
  261. the corresponding value `b', and 0 otherwise. The comparison is performed
  262. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  263. -------------------------------------------------------------------------------
  264. *}
  265. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  266. {*
  267. -------------------------------------------------------------------------------
  268. Returns 1 if the single-precision floating-point value `a' is less than
  269. or equal to the corresponding value `b', and 0 otherwise. The comparison
  270. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  271. Arithmetic.
  272. -------------------------------------------------------------------------------
  273. *}
  274. Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
  275. {*
  276. -------------------------------------------------------------------------------
  277. Returns 1 if the single-precision floating-point value `a' is equal to
  278. the corresponding value `b', and 0 otherwise. The comparison is performed
  279. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  280. -------------------------------------------------------------------------------
  281. *}
  282. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  283. {*
  284. -------------------------------------------------------------------------------
  285. Returns the square root of the single-precision floating-point value `a'.
  286. The operation is performed according to the IEC/IEEE Standard for Binary
  287. Floating-Point Arithmetic.
  288. -------------------------------------------------------------------------------
  289. *}
  290. Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
  291. {*
  292. -------------------------------------------------------------------------------
  293. Returns the remainder of the single-precision floating-point value `a'
  294. with respect to the corresponding value `b'. The operation is performed
  295. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  296. -------------------------------------------------------------------------------
  297. *}
  298. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  299. {*
  300. -------------------------------------------------------------------------------
  301. Returns the result of dividing the single-precision floating-point value `a'
  302. by the corresponding value `b'. The operation is performed according to the
  303. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  304. -------------------------------------------------------------------------------
  305. *}
  306. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  307. {*
  308. -------------------------------------------------------------------------------
  309. Returns the result of multiplying the single-precision floating-point values
  310. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  311. for Binary Floating-Point Arithmetic.
  312. -------------------------------------------------------------------------------
  313. *}
  314. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  315. {*
  316. -------------------------------------------------------------------------------
  317. Returns the result of subtracting the single-precision floating-point values
  318. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  319. for Binary Floating-Point Arithmetic.
  320. -------------------------------------------------------------------------------
  321. *}
  322. Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
  323. {*
  324. -------------------------------------------------------------------------------
  325. Returns the result of adding the single-precision floating-point values `a'
  326. and `b'. The operation is performed according to the IEC/IEEE Standard for
  327. Binary Floating-Point Arithmetic.
  328. -------------------------------------------------------------------------------
  329. *}
  330. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  331. {*
  332. -------------------------------------------------------------------------------
  333. Rounds the single-precision floating-point value `a' to an integer,
  334. and returns the result as a single-precision floating-point value. The
  335. operation is performed according to the IEC/IEEE Standard for Binary
  336. Floating-Point Arithmetic.
  337. -------------------------------------------------------------------------------
  338. *}
  339. Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
  340. {*
  341. -------------------------------------------------------------------------------
  342. Returns the result of converting the single-precision floating-point value
  343. `a' to the double-precision floating-point format. The conversion is
  344. performed according to the IEC/IEEE Standard for Binary Floating-Point
  345. Arithmetic.
  346. -------------------------------------------------------------------------------
  347. *}
  348. Function float32_to_float64( a : float32rec) : Float64; compilerproc;
  349. {*
  350. -------------------------------------------------------------------------------
  351. Returns the result of converting the single-precision floating-point value
  352. `a' to the 32-bit two's complement integer format. The conversion is
  353. performed according to the IEC/IEEE Standard for Binary Floating-Point
  354. Arithmetic, except that the conversion is always rounded toward zero.
  355. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  356. the conversion overflows, the largest integer with the same sign as `a' is
  357. returned.
  358. -------------------------------------------------------------------------------
  359. *}
  360. Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
  361. {*
  362. -------------------------------------------------------------------------------
  363. Returns the result of converting the single-precision floating-point value
  364. `a' to the 32-bit two's complement integer format. The conversion is
  365. performed according to the IEC/IEEE Standard for Binary Floating-Point
  366. Arithmetic---which means in particular that the conversion is rounded
  367. according to the current rounding mode. If `a' is a NaN, the largest
  368. positive integer is returned. Otherwise, if the conversion overflows, the
  369. largest integer with the same sign as `a' is returned.
  370. -------------------------------------------------------------------------------
  371. *}
  372. Function float32_to_int32( a : float32rec) : int32; compilerproc;
  373. {*
  374. -------------------------------------------------------------------------------
  375. Returns the result of converting the 32-bit two's complement integer `a' to
  376. the double-precision floating-point format. The conversion is performed
  377. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  378. -------------------------------------------------------------------------------
  379. *}
  380. Function int32_to_float64( a: int32) : float64; compilerproc;
  381. {*
  382. -------------------------------------------------------------------------------
  383. Returns the result of converting the 32-bit two's complement integer `a' to
  384. the single-precision floating-point format. The conversion is performed
  385. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  386. -------------------------------------------------------------------------------
  387. *}
  388. Function int32_to_float32( a: int32): float32rec; compilerproc;
  389. {*----------------------------------------------------------------------------
  390. | Returns the result of converting the 64-bit two's complement integer `a'
  391. | to the double-precision floating-point format. The conversion is performed
  392. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  393. *----------------------------------------------------------------------------*}
  394. Function int64_to_float64( a: int64 ): float64; compilerproc;
  395. Function qword_to_float64( a: qword ): float64; compilerproc;
  396. {*----------------------------------------------------------------------------
  397. | Returns the result of converting the 64-bit two's complement integer `a'
  398. | to the single-precision floating-point format. The conversion is performed
  399. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  400. *----------------------------------------------------------------------------*}
  401. Function int64_to_float32( a: int64 ): float32rec; compilerproc;
  402. Function qword_to_float32( a: qword ): float32rec; compilerproc;
  403. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  404. function float128_is_nan( a : float128): flag;
  405. function float128_is_signaling_nan( a : float128): flag;
  406. function float128_to_int32(a: float128): int32;
  407. function float128_to_int32_round_to_zero(a: float128): int32;
  408. function float128_to_int64(a: float128): int64;
  409. function float128_to_int64_round_to_zero(a: float128): int64;
  410. function float128_to_float32(a: float128): float32;
  411. function float128_to_float64(a: float128): float64;
  412. function float64_to_float128( a : float64) : float128;
  413. {$ifdef FPC_SOFTFLOAT_FLOAT80}
  414. function float128_to_floatx80(a: float128): floatx80;
  415. {$endif FPC_SOFTFLOAT_FLOAT80}
  416. function float128_round_to_int(a: float128): float128;
  417. function float128_add(a: float128; b: float128): float128;
  418. function float128_sub(a: float128; b: float128): float128;
  419. function float128_mul(a: float128; b: float128): float128;
  420. function float128_div(a: float128; b: float128): float128;
  421. function float128_rem(a: float128; b: float128): float128;
  422. function float128_sqrt(a: float128): float128;
  423. function float128_eq(a: float128; b: float128): flag;
  424. function float128_le(a: float128; b: float128): flag;
  425. function float128_lt(a: float128; b: float128): flag;
  426. function float128_eq_signaling(a: float128; b: float128): flag;
  427. function float128_le_quiet(a: float128; b: float128): flag;
  428. function float128_lt_quiet(a: float128; b: float128): flag;
  429. {$endif FPC_SOFTFLOAT_FLOAT128}
  430. CONST
  431. {-------------------------------------------------------------------------------
  432. Software IEC/IEEE floating-point underflow tininess-detection mode.
  433. -------------------------------------------------------------------------------
  434. *}
  435. float_tininess_after_rounding = 0;
  436. float_tininess_before_rounding = 1;
  437. {*
  438. -------------------------------------------------------------------------------
  439. Underflow tininess-detection mode, statically initialized to default value.
  440. (The declaration in `softfloat.h' must match the `int8' type here.)
  441. -------------------------------------------------------------------------------
  442. *}
  443. const float_detect_tininess: int8 = float_tininess_after_rounding;
  444. {$endif not(defined(fpc_softfpu_implementation))}
  445. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  446. implementation
  447. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  448. {$if not(defined(fpc_softfpu_interface))}
  449. (*****************************************************************************)
  450. (*----------------------------------------------------------------------------*)
  451. (* Primitive arithmetic functions, including multi-word arithmetic, and *)
  452. (* division and square root approximations. (Can be specialized to target if *)
  453. (* desired.) *)
  454. (* ---------------------------------------------------------------------------*)
  455. (*****************************************************************************)
  456. {*----------------------------------------------------------------------------
  457. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  458. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  459. | input. If `zSign' is 1, the input is negated before being converted to an
  460. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  461. | is simply rounded to an integer, with the inexact exception raised if the
  462. | input cannot be represented exactly as an integer. However, if the fixed-
  463. | point input is too large, the invalid exception is raised and the largest
  464. | positive or negative integer is returned.
  465. *----------------------------------------------------------------------------*}
  466. function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
  467. var
  468. roundingMode: int8;
  469. roundNearestEven: flag;
  470. roundIncrement, roundBits: int8;
  471. z: int32;
  472. begin
  473. roundingMode := softfloat_rounding_mode;
  474. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  475. roundIncrement := $40;
  476. if ( roundNearestEven=0 ) then
  477. begin
  478. if ( roundingMode = float_round_to_zero ) then
  479. begin
  480. roundIncrement := 0;
  481. end
  482. else begin
  483. roundIncrement := $7F;
  484. if ( zSign<>0 ) then
  485. begin
  486. if ( roundingMode = float_round_up ) then
  487. roundIncrement := 0;
  488. end
  489. else begin
  490. if ( roundingMode = float_round_down ) then
  491. roundIncrement := 0;
  492. end;
  493. end;
  494. end;
  495. roundBits := absZ and $7F;
  496. absZ := ( absZ + roundIncrement ) shr 7;
  497. absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
  498. z := absZ;
  499. if ( zSign<>0 ) then
  500. z := - z;
  501. if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
  502. begin
  503. float_raise( float_flag_invalid );
  504. if zSign<>0 then
  505. result:=sbits32($80000000)
  506. else
  507. result:=$7FFFFFFF;
  508. exit;
  509. end;
  510. if ( roundBits<>0 ) then
  511. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  512. result:=z;
  513. end;
  514. {*----------------------------------------------------------------------------
  515. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  516. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  517. | and returns the properly rounded 64-bit integer corresponding to the input.
  518. | If `zSign' is 1, the input is negated before being converted to an integer.
  519. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  520. | the inexact exception raised if the input cannot be represented exactly as
  521. | an integer. However, if the fixed-point input is too large, the invalid
  522. | exception is raised and the largest positive or negative integer is
  523. | returned.
  524. *----------------------------------------------------------------------------*}
  525. function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
  526. var
  527. roundingMode: int8;
  528. roundNearestEven, increment: flag;
  529. z: int64;
  530. label
  531. overflow;
  532. begin
  533. roundingMode := softfloat_rounding_mode;
  534. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  535. increment := ord( sbits64(absZ1) < 0 );
  536. if ( roundNearestEven=0 ) then
  537. begin
  538. if ( roundingMode = float_round_to_zero ) then
  539. begin
  540. increment := 0;
  541. end
  542. else begin
  543. if ( zSign<>0 ) then
  544. begin
  545. increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
  546. end
  547. else begin
  548. increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
  549. end;
  550. end;
  551. end;
  552. if ( increment<>0 ) then
  553. begin
  554. inc(absZ0);
  555. if ( absZ0 = 0 ) then
  556. goto overflow;
  557. absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
  558. end;
  559. z := absZ0;
  560. if ( zSign<>0 ) then
  561. z := - z;
  562. if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
  563. begin
  564. overflow:
  565. float_raise( float_flag_invalid );
  566. if zSign<>0 then
  567. result:=int64($8000000000000000)
  568. else
  569. result:=int64($7FFFFFFFFFFFFFFF);
  570. end;
  571. if ( absZ1<>0 ) then
  572. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  573. result:=z;
  574. end;
  575. {*
  576. -------------------------------------------------------------------------------
  577. Shifts `a' right by the number of bits given in `count'. If any nonzero
  578. bits are shifted off, they are ``jammed'' into the least significant bit of
  579. the result by setting the least significant bit to 1. The value of `count'
  580. can be arbitrarily large; in particular, if `count' is greater than 32, the
  581. result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  582. The result is stored in the location pointed to by `zPtr'.
  583. -------------------------------------------------------------------------------
  584. *}
  585. Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
  586. var
  587. z: Bits32;
  588. Begin
  589. if ( count = 0 ) then
  590. z := a
  591. else
  592. if ( count < 32 ) then
  593. Begin
  594. z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
  595. End
  596. else
  597. Begin
  598. z := bits32( a <> 0 );
  599. End;
  600. zPtr := z;
  601. End;
  602. {*----------------------------------------------------------------------------
  603. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  604. | number of bits given in `count'. Any bits shifted off are lost. The value
  605. | of `count' can be arbitrarily large; in particular, if `count' is greater
  606. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  607. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  608. *----------------------------------------------------------------------------*}
  609. procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  610. var
  611. z0, z1: bits64;
  612. negCount: int8;
  613. begin
  614. negCount := ( - count ) and 63;
  615. if ( count = 0 ) then
  616. begin
  617. z1 := a1;
  618. z0 := a0;
  619. end
  620. else if ( count < 64 ) then
  621. begin
  622. z1 := ( a0 shl negCount ) or ( a1 shr count );
  623. z0 := a0 shr count;
  624. end
  625. else
  626. begin
  627. if ( count shl 64 )<>0 then
  628. z1 := a0 shr ( count and 63 )
  629. else
  630. z1 := 0;
  631. z0 := 0;
  632. end;
  633. z1Ptr := z1;
  634. z0Ptr := z0;
  635. end;
  636. {*----------------------------------------------------------------------------
  637. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  638. | number of bits given in `count'. If any nonzero bits are shifted off, they
  639. | are ``jammed'' into the least significant bit of the result by setting the
  640. | least significant bit to 1. The value of `count' can be arbitrarily large;
  641. | in particular, if `count' is greater than 128, the result will be either
  642. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  643. | nonzero. The result is broken into two 64-bit pieces which are stored at
  644. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  645. *----------------------------------------------------------------------------*}
  646. procedure shift128RightJamming(a0,a1 : bits64; count : int16; var z0Ptr, z1Ptr : bits64);
  647. var
  648. z0,z1 : bits64;
  649. negCount : int8;
  650. begin
  651. negCount := ( - count ) and 63;
  652. if ( count = 0 ) then begin
  653. z1 := a1;
  654. z0 := a0;
  655. end
  656. else if ( count < 64 ) then begin
  657. z1 := ( a0 shl negCount ) or ( a1 shr count ) or ord( ( a1 shl negCount ) <> 0 );
  658. z0 := a0>>count;
  659. end
  660. else begin
  661. if ( count = 64 ) then begin
  662. z1 := a0 or ord( a1 <> 0 );
  663. end
  664. else if ( count < 128 ) then begin
  665. z1 := ( a0 shr ( count and 63 ) ) or ord( ( ( a0 shl negCount ) or a1 ) <> 0 );
  666. end
  667. else begin
  668. z1 := ord( ( a0 or a1 ) <> 0 );
  669. end;
  670. z0 := 0;
  671. end;
  672. z1Ptr := z1;
  673. z0Ptr := z0;
  674. end;
  675. {*
  676. -------------------------------------------------------------------------------
  677. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  678. number of bits given in `count'. Any bits shifted off are lost. The value
  679. of `count' can be arbitrarily large; in particular, if `count' is greater
  680. than 64, the result will be 0. The result is broken into two 32-bit pieces
  681. which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  682. -------------------------------------------------------------------------------
  683. *}
  684. Procedure
  685. shift64Right(
  686. a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
  687. Var
  688. z0, z1: bits32;
  689. negCount : int8;
  690. Begin
  691. negCount := ( - count ) AND 31;
  692. if ( count = 0 ) then
  693. Begin
  694. z1 := a1;
  695. z0 := a0;
  696. End
  697. else if ( count < 32 ) then
  698. Begin
  699. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  700. z0 := a0 shr count;
  701. End
  702. else
  703. Begin
  704. if (count < 64) then
  705. z1 := ( a0 shr ( count AND 31 ) )
  706. else
  707. z1 := 0;
  708. z0 := 0;
  709. End;
  710. z1Ptr := z1;
  711. z0Ptr := z0;
  712. End;
  713. {*
  714. -------------------------------------------------------------------------------
  715. Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
  716. number of bits given in `count'. If any nonzero bits are shifted off, they
  717. are ``jammed'' into the least significant bit of the result by setting the
  718. least significant bit to 1. The value of `count' can be arbitrarily large;
  719. in particular, if `count' is greater than 64, the result will be either 0
  720. or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  721. nonzero. The result is broken into two 32-bit pieces which are stored at
  722. the locations pointed to by `z0Ptr' and `z1Ptr'.
  723. -------------------------------------------------------------------------------
  724. *}
  725. Procedure
  726. shift64RightJamming(
  727. a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
  728. VAR
  729. z0, z1 : bits32;
  730. negCount : int8;
  731. Begin
  732. negCount := ( - count ) AND 31;
  733. if ( count = 0 ) then
  734. Begin
  735. z1 := a1;
  736. z0 := a0;
  737. End
  738. else
  739. if ( count < 32 ) then
  740. Begin
  741. z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
  742. z0 := a0 shr count;
  743. End
  744. else
  745. Begin
  746. if ( count = 32 ) then
  747. Begin
  748. z1 := a0 OR bits32( a1 <> 0 );
  749. End
  750. else
  751. if ( count < 64 ) Then
  752. Begin
  753. z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
  754. End
  755. else
  756. Begin
  757. z1 := bits32( ( a0 OR a1 ) <> 0 );
  758. End;
  759. z0 := 0;
  760. End;
  761. z1Ptr := z1;
  762. z0Ptr := z0;
  763. End;
  764. {*----------------------------------------------------------------------------
  765. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  766. | bits are shifted off, they are ``jammed'' into the least significant bit of
  767. | the result by setting the least significant bit to 1. The value of `count'
  768. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  769. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  770. | The result is stored in the location pointed to by `zPtr'.
  771. *----------------------------------------------------------------------------*}
  772. procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
  773. var
  774. z: bits64;
  775. begin
  776. if ( count = 0 ) then
  777. begin
  778. z := a;
  779. end
  780. else if ( count < 64 ) then
  781. begin
  782. z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
  783. end
  784. else
  785. begin
  786. z := ord( a <> 0 );
  787. end;
  788. zPtr := z;
  789. end;
  790. {*
  791. -------------------------------------------------------------------------------
  792. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
  793. by 32 _plus_ the number of bits given in `count'. The shifted result is
  794. at most 64 nonzero bits; these are broken into two 32-bit pieces which are
  795. stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  796. off form a third 32-bit result as follows: The _last_ bit shifted off is
  797. the most-significant bit of the extra result, and the other 31 bits of the
  798. extra result are all zero if and only if _all_but_the_last_ bits shifted off
  799. were all zero. This extra result is stored in the location pointed to by
  800. `z2Ptr'. The value of `count' can be arbitrarily large.
  801. (This routine makes more sense if `a0', `a1', and `a2' are considered
  802. to form a fixed-point value with binary point between `a1' and `a2'. This
  803. fixed-point value is shifted right by the number of bits given in `count',
  804. and the integer part of the result is returned at the locations pointed to
  805. by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  806. corrupted as described above, and is returned at the location pointed to by
  807. `z2Ptr'.)
  808. -------------------------------------------------------------------------------
  809. }
  810. Procedure
  811. shift64ExtraRightJamming(
  812. a0: bits32;
  813. a1: bits32;
  814. a2: bits32;
  815. count: int16;
  816. VAR z0Ptr: bits32;
  817. VAR z1Ptr: bits32;
  818. VAR z2Ptr: bits32
  819. );
  820. Var
  821. z0, z1, z2: bits32;
  822. negCount : int8;
  823. Begin
  824. negCount := ( - count ) AND 31;
  825. if ( count = 0 ) then
  826. Begin
  827. z2 := a2;
  828. z1 := a1;
  829. z0 := a0;
  830. End
  831. else
  832. Begin
  833. if ( count < 32 ) Then
  834. Begin
  835. z2 := a1 shl negCount;
  836. z1 := ( a0 shl negCount ) OR ( a1 shr count );
  837. z0 := a0 shr count;
  838. End
  839. else
  840. Begin
  841. if ( count = 32 ) then
  842. Begin
  843. z2 := a1;
  844. z1 := a0;
  845. End
  846. else
  847. Begin
  848. a2 := a2 or a1;
  849. if ( count < 64 ) then
  850. Begin
  851. z2 := a0 shl negCount;
  852. z1 := a0 shr ( count AND 31 );
  853. End
  854. else
  855. Begin
  856. if count = 64 then
  857. z2 := a0
  858. else
  859. z2 := bits32(a0 <> 0);
  860. z1 := 0;
  861. End;
  862. End;
  863. z0 := 0;
  864. End;
  865. z2 := z2 or bits32( a2 <> 0 );
  866. End;
  867. z2Ptr := z2;
  868. z1Ptr := z1;
  869. z0Ptr := z0;
  870. End;
  871. {*
  872. -------------------------------------------------------------------------------
  873. Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
  874. number of bits given in `count'. Any bits shifted off are lost. The value
  875. of `count' must be less than 32. The result is broken into two 32-bit
  876. pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  877. -------------------------------------------------------------------------------
  878. *}
  879. Procedure
  880. shortShift64Left(
  881. a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  882. Begin
  883. z1Ptr := a1 shl count;
  884. if count = 0 then
  885. z0Ptr := a0
  886. else
  887. z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
  888. End;
  889. {*
  890. -------------------------------------------------------------------------------
  891. Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
  892. by the number of bits given in `count'. Any bits shifted off are lost.
  893. The value of `count' must be less than 32. The result is broken into three
  894. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  895. `z1Ptr', and `z2Ptr'.
  896. -------------------------------------------------------------------------------
  897. *}
  898. Procedure
  899. shortShift96Left(
  900. a0: bits32;
  901. a1: bits32;
  902. a2: bits32;
  903. count: int16;
  904. VAR z0Ptr: bits32;
  905. VAR z1Ptr: bits32;
  906. VAR z2Ptr: bits32
  907. );
  908. Var
  909. z0, z1, z2: bits32;
  910. negCount: int8;
  911. Begin
  912. z2 := a2 shl count;
  913. z1 := a1 shl count;
  914. z0 := a0 shl count;
  915. if ( 0 < count ) then
  916. Begin
  917. negCount := ( ( - count ) AND 31 );
  918. z1 := z1 or (a2 shr negCount);
  919. z0 := z0 or (a1 shr negCount);
  920. End;
  921. z2Ptr := z2;
  922. z1Ptr := z1;
  923. z0Ptr := z0;
  924. End;
  925. {*----------------------------------------------------------------------------
  926. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  927. | number of bits given in `count'. Any bits shifted off are lost. The value
  928. | of `count' must be less than 64. The result is broken into two 64-bit
  929. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  930. *----------------------------------------------------------------------------*}
  931. procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
  932. begin
  933. z1Ptr := a1 shl count;
  934. if count=0 then
  935. z0Ptr:=a0
  936. else
  937. z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
  938. end;
  939. {*
  940. -------------------------------------------------------------------------------
  941. Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
  942. value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
  943. any carry out is lost. The result is broken into two 32-bit pieces which
  944. are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  945. -------------------------------------------------------------------------------
  946. *}
  947. Procedure
  948. add64(
  949. a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
  950. Var
  951. z1: bits32;
  952. Begin
  953. z1 := a1 + b1;
  954. z1Ptr := z1;
  955. z0Ptr := a0 + b0 + bits32( z1 < a1 );
  956. End;
  957. {*
  958. -------------------------------------------------------------------------------
  959. Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
  960. 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  961. modulo 2^96, so any carry out is lost. The result is broken into three
  962. 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
  963. `z1Ptr', and `z2Ptr'.
  964. -------------------------------------------------------------------------------
  965. *}
  966. Procedure
  967. add96(
  968. a0: bits32;
  969. a1: bits32;
  970. a2: bits32;
  971. b0: bits32;
  972. b1: bits32;
  973. b2: bits32;
  974. VAR z0Ptr: bits32;
  975. VAR z1Ptr: bits32;
  976. VAR z2Ptr: bits32
  977. );
  978. var
  979. z0, z1, z2: bits32;
  980. carry0, carry1: int8;
  981. Begin
  982. z2 := a2 + b2;
  983. carry1 := int8( z2 < a2 );
  984. z1 := a1 + b1;
  985. carry0 := int8( z1 < a1 );
  986. z0 := a0 + b0;
  987. z1 := z1 + carry1;
  988. z0 := z0 + bits32( z1 < carry1 );
  989. z0 := z0 + carry0;
  990. z2Ptr := z2;
  991. z1Ptr := z1;
  992. z0Ptr := z0;
  993. End;
  994. {*----------------------------------------------------------------------------
  995. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  996. | by the number of bits given in `count'. Any bits shifted off are lost.
  997. | The value of `count' must be less than 64. The result is broken into three
  998. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  999. | `z1Ptr', and `z2Ptr'.
  1000. *----------------------------------------------------------------------------*}
  1001. procedure shortShift192Left(a0,a1,a2 : bits64;count : int16;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1002. var
  1003. z0, z1, z2 : bits64;
  1004. negCount : int8;
  1005. begin
  1006. z2 := a2 shl count;
  1007. z1 := a1 shl count;
  1008. z0 := a0 shl count;
  1009. if ( 0 < count ) then
  1010. begin
  1011. negCount := ( ( - count ) and 63 );
  1012. z1 := z1 or (a2 shr negCount);
  1013. z0 := z0 or (a1 shr negCount);
  1014. end;
  1015. z2Ptr := z2;
  1016. z1Ptr := z1;
  1017. z0Ptr := z0;
  1018. end;
  1019. {*----------------------------------------------------------------------------
  1020. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  1021. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  1022. | any carry out is lost. The result is broken into two 64-bit pieces which
  1023. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  1024. *----------------------------------------------------------------------------*}
  1025. procedure add128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);inline;
  1026. var
  1027. z1 : bits64;
  1028. begin
  1029. z1 := a1 + b1;
  1030. z1Ptr := z1;
  1031. z0Ptr := a0 + b0 + ord( z1 < a1 );
  1032. end;
  1033. {*----------------------------------------------------------------------------
  1034. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  1035. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  1036. | modulo 2^192, so any carry out is lost. The result is broken into three
  1037. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  1038. | `z1Ptr', and `z2Ptr'.
  1039. *----------------------------------------------------------------------------*}
  1040. procedure add192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1041. var
  1042. z0, z1, z2 : bits64;
  1043. carry0, carry1 : int8;
  1044. begin
  1045. z2 := a2 + b2;
  1046. carry1 := ord( z2 < a2 );
  1047. z1 := a1 + b1;
  1048. carry0 := ord( z1 < a1 );
  1049. z0 := a0 + b0;
  1050. inc(z1, carry1);
  1051. inc(z0, ord( z1 < carry1 ));
  1052. inc(z0, carry0);
  1053. z2Ptr := z2;
  1054. z1Ptr := z1;
  1055. z0Ptr := z0;
  1056. end;
  1057. {*
  1058. -------------------------------------------------------------------------------
  1059. Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
  1060. 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1061. 2^64, so any borrow out (carry out) is lost. The result is broken into two
  1062. 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1063. `z1Ptr'.
  1064. -------------------------------------------------------------------------------
  1065. *}
  1066. Procedure
  1067. sub64(
  1068. a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
  1069. Begin
  1070. z1Ptr := a1 - b1;
  1071. z0Ptr := a0 - b0 - bits32( a1 < b1 );
  1072. End;
  1073. {*
  1074. -------------------------------------------------------------------------------
  1075. Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
  1076. the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
  1077. is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
  1078. into three 32-bit pieces which are stored at the locations pointed to by
  1079. `z0Ptr', `z1Ptr', and `z2Ptr'.
  1080. -------------------------------------------------------------------------------
  1081. *}
  1082. Procedure
  1083. sub96(
  1084. a0:bits32;
  1085. a1:bits32;
  1086. a2:bits32;
  1087. b0:bits32;
  1088. b1:bits32;
  1089. b2:bits32;
  1090. VAR z0Ptr:bits32;
  1091. VAR z1Ptr:bits32;
  1092. VAR z2Ptr:bits32
  1093. );
  1094. Var
  1095. z0, z1, z2: bits32;
  1096. borrow0, borrow1: int8;
  1097. Begin
  1098. z2 := a2 - b2;
  1099. borrow1 := int8( a2 < b2 );
  1100. z1 := a1 - b1;
  1101. borrow0 := int8( a1 < b1 );
  1102. z0 := a0 - b0;
  1103. z0 := z0 - bits32( z1 < borrow1 );
  1104. z1 := z1 - borrow1;
  1105. z0 := z0 -borrow0;
  1106. z2Ptr := z2;
  1107. z1Ptr := z1;
  1108. z0Ptr := z0;
  1109. End;
  1110. {*----------------------------------------------------------------------------
  1111. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  1112. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  1113. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  1114. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  1115. | `z1Ptr'.
  1116. *----------------------------------------------------------------------------*}
  1117. procedure sub128( a0, a1, b0, b1 : bits64; var z0Ptr, z1Ptr : bits64);
  1118. begin
  1119. z1Ptr := a1 - b1;
  1120. z0Ptr := a0 - b0 - ord( a1 < b1 );
  1121. end;
  1122. {*----------------------------------------------------------------------------
  1123. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  1124. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  1125. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  1126. | result is broken into three 64-bit pieces which are stored at the locations
  1127. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  1128. *----------------------------------------------------------------------------*}
  1129. procedure sub192(a0,a1,a2,b0,b1,b2: bits64; var z0Ptr,z1Ptr,z2Ptr : bits64);
  1130. var
  1131. z0, z1, z2 : bits64;
  1132. borrow0, borrow1 : int8;
  1133. begin
  1134. z2 := a2 - b2;
  1135. borrow1 := ord( a2 < b2 );
  1136. z1 := a1 - b1;
  1137. borrow0 := ord( a1 < b1 );
  1138. z0 := a0 - b0;
  1139. dec(z0, ord( z1 < borrow1 ));
  1140. dec(z1, borrow1);
  1141. dec(z0, borrow0);
  1142. z2Ptr := z2;
  1143. z1Ptr := z1;
  1144. z0Ptr := z0;
  1145. end;
  1146. {*
  1147. -------------------------------------------------------------------------------
  1148. Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
  1149. into two 32-bit pieces which are stored at the locations pointed to by
  1150. `z0Ptr' and `z1Ptr'.
  1151. -------------------------------------------------------------------------------
  1152. *}
  1153. Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
  1154. :bits32 );
  1155. Var
  1156. aHigh, aLow, bHigh, bLow: bits16;
  1157. z0, zMiddleA, zMiddleB, z1: bits32;
  1158. Begin
  1159. aLow := a and $ffff;
  1160. aHigh := a shr 16;
  1161. bLow := b and $ffff;
  1162. bHigh := b shr 16;
  1163. z1 := ( bits32( aLow) ) * bLow;
  1164. zMiddleA := ( bits32 (aLow) ) * bHigh;
  1165. zMiddleB := ( bits32 (aHigh) ) * bLow;
  1166. z0 := ( bits32 (aHigh) ) * bHigh;
  1167. zMiddleA := zMiddleA + zMiddleB;
  1168. z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
  1169. zMiddleA := zmiddleA shl 16;
  1170. z1 := z1 + zMiddleA;
  1171. z0 := z0 + bits32( z1 < zMiddleA );
  1172. z1Ptr := z1;
  1173. z0Ptr := z0;
  1174. End;
  1175. {*
  1176. -------------------------------------------------------------------------------
  1177. Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
  1178. to obtain a 96-bit product. The product is broken into three 32-bit pieces
  1179. which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1180. `z2Ptr'.
  1181. -------------------------------------------------------------------------------
  1182. *}
  1183. Procedure
  1184. mul64By32To96(
  1185. a0:bits32;
  1186. a1:bits32;
  1187. b:bits32;
  1188. VAR z0Ptr:bits32;
  1189. VAR z1Ptr:bits32;
  1190. VAR z2Ptr:bits32
  1191. );
  1192. Var
  1193. z0, z1, z2, more1: bits32;
  1194. Begin
  1195. mul32To64( a1, b, z1, z2 );
  1196. mul32To64( a0, b, z0, more1 );
  1197. add64( z0, more1, 0, z1, z0, z1 );
  1198. z2Ptr := z2;
  1199. z1Ptr := z1;
  1200. z0Ptr := z0;
  1201. End;
  1202. {*
  1203. -------------------------------------------------------------------------------
  1204. Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
  1205. 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
  1206. product. The product is broken into four 32-bit pieces which are stored at
  1207. the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1208. -------------------------------------------------------------------------------
  1209. *}
  1210. Procedure
  1211. mul64To128(
  1212. a0:bits32;
  1213. a1:bits32;
  1214. b0:bits32;
  1215. b1:bits32;
  1216. VAR z0Ptr:bits32;
  1217. VAR z1Ptr:bits32;
  1218. VAR z2Ptr:bits32;
  1219. VAR z3Ptr:bits32
  1220. );
  1221. Var
  1222. z0, z1, z2, z3: bits32;
  1223. more1, more2: bits32;
  1224. Begin
  1225. mul32To64( a1, b1, z2, z3 );
  1226. mul32To64( a1, b0, z1, more2 );
  1227. add64( z1, more2, 0, z2, z1, z2 );
  1228. mul32To64( a0, b0, z0, more1 );
  1229. add64( z0, more1, 0, z1, z0, z1 );
  1230. mul32To64( a0, b1, more1, more2 );
  1231. add64( more1, more2, 0, z2, more1, z2 );
  1232. add64( z0, z1, 0, more1, z0, z1 );
  1233. z3Ptr := z3;
  1234. z2Ptr := z2;
  1235. z1Ptr := z1;
  1236. z0Ptr := z0;
  1237. End;
  1238. {*----------------------------------------------------------------------------
  1239. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  1240. | into two 64-bit pieces which are stored at the locations pointed to by
  1241. | `z0Ptr' and `z1Ptr'.
  1242. *----------------------------------------------------------------------------*}
  1243. procedure mul64To128( a, b : bits64; var z0Ptr, z1Ptr : bits64);
  1244. var
  1245. aHigh, aLow, bHigh, bLow : bits32;
  1246. z0, zMiddleA, zMiddleB, z1 : bits64;
  1247. begin
  1248. aLow := a;
  1249. aHigh := a shr 32;
  1250. bLow := b;
  1251. bHigh := b shr 32;
  1252. z1 := ( bits64(aLow) ) * bLow;
  1253. zMiddleA := ( bits64( aLow )) * bHigh;
  1254. zMiddleB := ( bits64( aHigh )) * bLow;
  1255. z0 := ( bits64(aHigh) ) * bHigh;
  1256. inc(zMiddleA, zMiddleB);
  1257. inc(z0 ,( ( bits64( zMiddleA < zMiddleB ) ) shl 32 ) + ( zMiddleA shr 32 ));
  1258. zMiddleA := zMiddleA shl 32;
  1259. inc(z1, zMiddleA);
  1260. inc(z0, ord( z1 < zMiddleA ));
  1261. z1Ptr := z1;
  1262. z0Ptr := z0;
  1263. end;
  1264. {*----------------------------------------------------------------------------
  1265. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  1266. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  1267. | product. The product is broken into four 64-bit pieces which are stored at
  1268. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  1269. *----------------------------------------------------------------------------*}
  1270. procedure mul128To256(a0,a1,b0,b1 : bits64;var z0Ptr,z1Ptr,z2Ptr,z3Ptr : bits64);
  1271. var
  1272. z0,z1,z2,z3,more1,more2 : bits64;
  1273. begin
  1274. mul64To128( a1, b1, z2, z3 );
  1275. mul64To128( a1, b0, z1, more2 );
  1276. add128( z1, more2, 0, z2, z1, z2 );
  1277. mul64To128( a0, b0, z0, more1 );
  1278. add128( z0, more1, 0, z1, z0, z1 );
  1279. mul64To128( a0, b1, more1, more2 );
  1280. add128( more1, more2, 0, z2, more1, z2 );
  1281. add128( z0, z1, 0, more1, z0, z1 );
  1282. z3Ptr := z3;
  1283. z2Ptr := z2;
  1284. z1Ptr := z1;
  1285. z0Ptr := z0;
  1286. end;
  1287. {*----------------------------------------------------------------------------
  1288. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  1289. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  1290. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  1291. | `z2Ptr'.
  1292. *----------------------------------------------------------------------------*}
  1293. procedure mul128By64To192(a0,a1,b : bits64;var z0Ptr,z1Ptr,z2Ptr : bits64);
  1294. var
  1295. z0, z1, z2, more1 : bits64;
  1296. begin
  1297. mul64To128( a1, b, z1, z2 );
  1298. mul64To128( a0, b, z0, more1 );
  1299. add128( z0, more1, 0, z1, z0, z1 );
  1300. z2Ptr := z2;
  1301. z1Ptr := z1;
  1302. z0Ptr := z0;
  1303. end;
  1304. {*----------------------------------------------------------------------------
  1305. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  1306. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  1307. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  1308. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  1309. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  1310. | unsigned integer is returned.
  1311. *----------------------------------------------------------------------------*}
  1312. Function estimateDiv128To64( a0:bits64; a1: bits64; b:bits64): bits64;
  1313. var
  1314. b0, b1, rem0, rem1, term0, term1, z : bits64;
  1315. begin
  1316. if ( b <= a0 ) then
  1317. begin
  1318. result:=qword( $FFFFFFFFFFFFFFFF );
  1319. exit;
  1320. end;
  1321. b0 := b shr 32;
  1322. if ( b0 shl 32 <= a0 ) then
  1323. z:=qword( $FFFFFFFF00000000 )
  1324. else
  1325. z:=( a0 div b0 ) shl 32;
  1326. mul64To128( b, z, term0, term1 );
  1327. sub128( a0, a1, term0, term1, rem0, rem1 );
  1328. while ( ( sbits64(rem0) ) < 0 ) do begin
  1329. dec(z,qword( $100000000 ));
  1330. b1 := b shl 32;
  1331. add128( rem0, rem1, b0, b1, rem0, rem1 );
  1332. end;
  1333. rem0 := ( rem0 shl 32 ) or ( rem1 shr 32 );
  1334. if ( b0 shl 32 <= rem0 ) then
  1335. z:=z or $FFFFFFFF
  1336. else
  1337. z:=z or rem0 div b0;
  1338. result:=z;
  1339. end;
  1340. {*
  1341. -------------------------------------------------------------------------------
  1342. Returns an approximation to the 32-bit integer quotient obtained by dividing
  1343. `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
  1344. divisor `b' must be at least 2^31. If q is the exact quotient truncated
  1345. toward zero, the approximation returned lies between q and q + 2 inclusive.
  1346. If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
  1347. unsigned integer is returned.
  1348. -------------------------------------------------------------------------------
  1349. *}
  1350. Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
  1351. Var
  1352. b0, b1: bits32;
  1353. rem0, rem1, term0, term1: bits32;
  1354. z: bits32;
  1355. Begin
  1356. if ( b <= a0 ) then
  1357. Begin
  1358. estimateDiv64To32 := $FFFFFFFF;
  1359. exit;
  1360. End;
  1361. b0 := b shr 16;
  1362. if ( b0 shl 16 <= a0 ) then
  1363. z:= $FFFF0000
  1364. else
  1365. z:= ( a0 div b0 ) shl 16;
  1366. mul32To64( b, z, term0, term1 );
  1367. sub64( a0, a1, term0, term1, rem0, rem1 );
  1368. while ( ( sbits32 (rem0) ) < 0 ) do
  1369. Begin
  1370. z := z - $10000;
  1371. b1 := b shl 16;
  1372. add64( rem0, rem1, b0, b1, rem0, rem1 );
  1373. End;
  1374. rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
  1375. if ( b0 shl 16 <= rem0 ) then
  1376. z := z or $FFFF
  1377. else
  1378. z := z or (rem0 div b0);
  1379. estimateDiv64To32 := z;
  1380. End;
  1381. {*
  1382. -------------------------------------------------------------------------------
  1383. Returns an approximation to the square root of the 32-bit significand given
  1384. by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  1385. `aExp' (the least significant bit) is 1, the integer returned approximates
  1386. 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  1387. is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  1388. case, the approximation returned lies strictly within +/-2 of the exact
  1389. value.
  1390. -------------------------------------------------------------------------------
  1391. *}
  1392. Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
  1393. const sqrtOddAdjustments: array[0..15] of bits16 = (
  1394. $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
  1395. $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
  1396. );
  1397. const sqrtEvenAdjustments: array[0..15] of bits16 = (
  1398. $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
  1399. $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
  1400. );
  1401. Var
  1402. index: int8;
  1403. z: bits32;
  1404. Begin
  1405. index := ( a shr 27 ) AND 15;
  1406. if ( aExp AND 1 ) <> 0 then
  1407. Begin
  1408. z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
  1409. z := ( ( a div z ) shl 14 ) + ( z shl 15 );
  1410. a := a shr 1;
  1411. End
  1412. else
  1413. Begin
  1414. z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
  1415. z := a div z + z;
  1416. if ( $20000 <= z ) then
  1417. z := $FFFF8000
  1418. else
  1419. z := ( z shl 15 );
  1420. if ( z <= a ) then
  1421. Begin
  1422. estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
  1423. exit;
  1424. End;
  1425. End;
  1426. estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
  1427. End;
  1428. {*
  1429. -------------------------------------------------------------------------------
  1430. Returns the number of leading 0 bits before the most-significant 1 bit of
  1431. `a'. If `a' is zero, 32 is returned.
  1432. -------------------------------------------------------------------------------
  1433. *}
  1434. Function countLeadingZeros32( a:bits32 ): int8;
  1435. const countLeadingZerosHigh:array[0..255] of int8 = (
  1436. 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  1437. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1438. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1439. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1440. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1441. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1442. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1443. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1444. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1445. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1446. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1447. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1448. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1449. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1450. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1451. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  1452. );
  1453. Var
  1454. shiftCount: int8;
  1455. Begin
  1456. shiftCount := 0;
  1457. if ( a < $10000 ) then
  1458. Begin
  1459. shiftCount := shiftcount + 16;
  1460. a := a shl 16;
  1461. End;
  1462. if ( a < $1000000 ) then
  1463. Begin
  1464. shiftCount := shiftcount + 8;
  1465. a := a shl 8;
  1466. end;
  1467. shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
  1468. countLeadingZeros32:= shiftCount;
  1469. End;
  1470. {*----------------------------------------------------------------------------
  1471. | Returns the number of leading 0 bits before the most-significant 1 bit of
  1472. | `a'. If `a' is zero, 64 is returned.
  1473. *----------------------------------------------------------------------------*}
  1474. function countLeadingZeros64( a : bits64): int8;
  1475. var
  1476. shiftcount : int8;
  1477. Begin
  1478. shiftCount := 0;
  1479. if ( a < bits64(bits64(1) shl 32 )) then
  1480. shiftCount := shiftcount + 32
  1481. else
  1482. a := a shr 32;
  1483. shiftCount := shiftCount + countLeadingZeros32( a );
  1484. countLeadingZeros64:= shiftCount;
  1485. End;
  1486. {*
  1487. -------------------------------------------------------------------------------
  1488. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
  1489. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1490. returns 0.
  1491. -------------------------------------------------------------------------------
  1492. *}
  1493. Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1494. Begin
  1495. eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
  1496. End;
  1497. {*
  1498. -------------------------------------------------------------------------------
  1499. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1500. than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
  1501. Otherwise, returns 0.
  1502. -------------------------------------------------------------------------------
  1503. *}
  1504. Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1505. Begin
  1506. le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
  1507. End;
  1508. {*
  1509. -------------------------------------------------------------------------------
  1510. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
  1511. than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1512. returns 0.
  1513. -------------------------------------------------------------------------------
  1514. *}
  1515. Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1516. Begin
  1517. lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
  1518. End;
  1519. {*
  1520. -------------------------------------------------------------------------------
  1521. Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
  1522. equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1523. returns 0.
  1524. -------------------------------------------------------------------------------
  1525. *}
  1526. Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
  1527. Begin
  1528. ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
  1529. End;
  1530. const
  1531. float128_default_nan_high = qword($FFFFFFFFFFFFFFFF);
  1532. float128_default_nan_low = qword($FFFFFFFFFFFFFFFF);
  1533. (*****************************************************************************)
  1534. (* End Low-Level arithmetic *)
  1535. (*****************************************************************************)
  1536. {*
  1537. -------------------------------------------------------------------------------
  1538. Functions and definitions to determine: (1) whether tininess for underflow
  1539. is detected before or after rounding by default, (2) what (if anything)
  1540. happens when exceptions are raised, (3) how signaling NaNs are distinguished
  1541. from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
  1542. are propagated from function inputs to output. These details are ENDIAN
  1543. specific
  1544. -------------------------------------------------------------------------------
  1545. *}
  1546. {$IFDEF ENDIAN_LITTLE}
  1547. {*
  1548. -------------------------------------------------------------------------------
  1549. Internal canonical NaN format.
  1550. -------------------------------------------------------------------------------
  1551. *}
  1552. TYPE
  1553. commonNaNT = packed record
  1554. sign: flag;
  1555. high, low : bits32;
  1556. end;
  1557. {*
  1558. -------------------------------------------------------------------------------
  1559. The pattern for a default generated single-precision NaN.
  1560. -------------------------------------------------------------------------------
  1561. *}
  1562. const float32_default_nan = $FFC00000;
  1563. {*
  1564. -------------------------------------------------------------------------------
  1565. Returns 1 if the single-precision floating-point value `a' is a NaN;
  1566. otherwise returns 0.
  1567. -------------------------------------------------------------------------------
  1568. *}
  1569. Function float32_is_nan( a : float32 ): flag;
  1570. Begin
  1571. float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
  1572. End;
  1573. {*
  1574. -------------------------------------------------------------------------------
  1575. Returns 1 if the single-precision floating-point value `a' is a signaling
  1576. NaN; otherwise returns 0.
  1577. -------------------------------------------------------------------------------
  1578. *}
  1579. Function float32_is_signaling_nan( a : float32 ): flag;
  1580. Begin
  1581. float32_is_signaling_nan := flag
  1582. ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
  1583. End;
  1584. {*
  1585. -------------------------------------------------------------------------------
  1586. Returns the result of converting the single-precision floating-point NaN
  1587. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1588. exception is raised.
  1589. -------------------------------------------------------------------------------
  1590. *}
  1591. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1592. var
  1593. z : commonNaNT ;
  1594. Begin
  1595. if ( float32_is_signaling_nan( a ) <> 0) then
  1596. float_raise( float_flag_invalid );
  1597. z.sign := a shr 31;
  1598. z.low := 0;
  1599. z.high := a shl 9;
  1600. c := z;
  1601. End;
  1602. {*
  1603. -------------------------------------------------------------------------------
  1604. Returns the result of converting the canonical NaN `a' to the single-
  1605. precision floating-point format.
  1606. -------------------------------------------------------------------------------
  1607. *}
  1608. Function commonNaNToFloat32( a : commonNaNT ): float32;
  1609. Begin
  1610. commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
  1611. End;
  1612. {*
  1613. -------------------------------------------------------------------------------
  1614. Takes two single-precision floating-point values `a' and `b', one of which
  1615. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1616. signaling NaN, the invalid exception is raised.
  1617. -------------------------------------------------------------------------------
  1618. *}
  1619. Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
  1620. Var
  1621. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1622. label returnLargerSignificand;
  1623. Begin
  1624. aIsNaN := float32_is_nan( a );
  1625. aIsSignalingNaN := float32_is_signaling_nan( a );
  1626. bIsNaN := float32_is_nan( b );
  1627. bIsSignalingNaN := float32_is_signaling_nan( b );
  1628. a := a or $00400000;
  1629. b := b or $00400000;
  1630. if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
  1631. float_raise( float_flag_invalid );
  1632. if ( aIsSignalingNaN )<> 0 then
  1633. Begin
  1634. if ( bIsSignalingNaN ) <> 0 then
  1635. goto returnLargerSignificand;
  1636. if bIsNan <> 0 then
  1637. propagateFloat32NaN := b
  1638. else
  1639. propagateFloat32NaN := a;
  1640. exit;
  1641. End
  1642. else if ( aIsNaN <> 0) then
  1643. Begin
  1644. if ( bIsSignalingNaN or not bIsNaN )<> 0 then
  1645. Begin
  1646. propagateFloat32NaN := a;
  1647. exit;
  1648. End;
  1649. returnLargerSignificand:
  1650. if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
  1651. Begin
  1652. propagateFloat32NaN := b;
  1653. exit;
  1654. End;
  1655. if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
  1656. Begin
  1657. propagateFloat32NaN := a;
  1658. End;
  1659. if a < b then
  1660. propagateFloat32NaN := a
  1661. else
  1662. propagateFloat32NaN := b;
  1663. exit;
  1664. End
  1665. else
  1666. Begin
  1667. propagateFloat32NaN := b;
  1668. exit;
  1669. End;
  1670. End;
  1671. {*
  1672. -------------------------------------------------------------------------------
  1673. The pattern for a default generated double-precision NaN. The `high' and
  1674. `low' values hold the most- and least-significant bits, respectively.
  1675. -------------------------------------------------------------------------------
  1676. *}
  1677. const
  1678. float64_default_nan_high = $FFF80000;
  1679. float64_default_nan_low = $00000000;
  1680. {*
  1681. -------------------------------------------------------------------------------
  1682. Returns 1 if the double-precision floating-point value `a' is a NaN;
  1683. otherwise returns 0.
  1684. -------------------------------------------------------------------------------
  1685. *}
  1686. Function float64_is_nan( a : float64 ) : flag;
  1687. Begin
  1688. float64_is_nan :=
  1689. flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
  1690. and ( a.low or ( a.high and $000FFFFF ) );
  1691. End;
  1692. {*
  1693. -------------------------------------------------------------------------------
  1694. Returns 1 if the double-precision floating-point value `a' is a signaling
  1695. NaN; otherwise returns 0.
  1696. -------------------------------------------------------------------------------
  1697. *}
  1698. Function float64_is_signaling_nan( a : float64 ): flag;
  1699. Begin
  1700. float64_is_signaling_nan :=
  1701. flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  1702. and ( a.low or ( a.high and $0007FFFF ) );
  1703. End;
  1704. {*
  1705. -------------------------------------------------------------------------------
  1706. Returns the result of converting the double-precision floating-point NaN
  1707. `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1708. exception is raised.
  1709. -------------------------------------------------------------------------------
  1710. *}
  1711. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  1712. Var
  1713. z : commonNaNT;
  1714. Begin
  1715. if ( float64_is_signaling_nan( a )<>0 ) then
  1716. float_raise( float_flag_invalid );
  1717. z.sign := a.high shr 31;
  1718. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1719. c := z;
  1720. End;
  1721. function float64ToCommonNaN( a : float64 ) : commonNaNT;
  1722. Var
  1723. z : commonNaNT;
  1724. Begin
  1725. if ( float64_is_signaling_nan( a )<>0 ) then
  1726. float_raise( float_flag_invalid );
  1727. z.sign := a.high shr 31;
  1728. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  1729. result := z;
  1730. End;
  1731. {*
  1732. -------------------------------------------------------------------------------
  1733. Returns the result of converting the canonical NaN `a' to the double-
  1734. precision floating-point format.
  1735. -------------------------------------------------------------------------------
  1736. *}
  1737. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  1738. Var
  1739. z: float64;
  1740. Begin
  1741. shift64Right( a.high, a.low, 12, z.high, z.low );
  1742. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  1743. c := z;
  1744. End;
  1745. {*
  1746. -------------------------------------------------------------------------------
  1747. Takes two double-precision floating-point values `a' and `b', one of which
  1748. is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1749. signaling NaN, the invalid exception is raised.
  1750. -------------------------------------------------------------------------------
  1751. *}
  1752. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  1753. Var
  1754. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1755. label returnLargerSignificand;
  1756. Begin
  1757. aIsNaN := float64_is_nan( a );
  1758. aIsSignalingNaN := float64_is_signaling_nan( a );
  1759. bIsNaN := float64_is_nan( b );
  1760. bIsSignalingNaN := float64_is_signaling_nan( b );
  1761. a.high := a.high or $00080000;
  1762. b.high := b.high or $00080000;
  1763. if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
  1764. float_raise( float_flag_invalid );
  1765. if ( aIsSignalingNaN )<>0 then
  1766. Begin
  1767. if ( bIsSignalingNaN )<>0 then
  1768. goto returnLargerSignificand;
  1769. if bIsNan <> 0 then
  1770. c := b
  1771. else
  1772. c := a;
  1773. exit;
  1774. End
  1775. else if ( aIsNaN )<> 0 then
  1776. Begin
  1777. if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
  1778. Begin
  1779. c := a;
  1780. exit;
  1781. End;
  1782. returnLargerSignificand:
  1783. if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
  1784. Begin
  1785. c := b;
  1786. exit;
  1787. End;
  1788. if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
  1789. Begin
  1790. c := a;
  1791. exit;
  1792. End;
  1793. if a.high < b.high then
  1794. c := a
  1795. else
  1796. c := b;
  1797. exit;
  1798. End
  1799. else
  1800. Begin
  1801. c := b;
  1802. exit;
  1803. End;
  1804. End;
  1805. {*----------------------------------------------------------------------------
  1806. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  1807. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  1808. | returns 0.
  1809. *----------------------------------------------------------------------------*}
  1810. function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  1811. begin
  1812. result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
  1813. end;
  1814. {*----------------------------------------------------------------------------
  1815. | Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  1816. | otherwise returns 0.
  1817. *----------------------------------------------------------------------------*}
  1818. function float128_is_nan( a : float128): flag;
  1819. begin
  1820. result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
  1821. and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
  1822. end;
  1823. {*----------------------------------------------------------------------------
  1824. | Returns 1 if the quadruple-precision floating-point value `a' is a
  1825. | signaling NaN; otherwise returns 0.
  1826. *----------------------------------------------------------------------------*}
  1827. function float128_is_signaling_nan( a : float128): flag;
  1828. begin
  1829. result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
  1830. ( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
  1831. end;
  1832. {*----------------------------------------------------------------------------
  1833. | Returns the result of converting the quadruple-precision floating-point NaN
  1834. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1835. | exception is raised.
  1836. *----------------------------------------------------------------------------*}
  1837. function float128ToCommonNaN( a : float128): commonNaNT;
  1838. var
  1839. z: commonNaNT;
  1840. qhigh,qlow : qword;
  1841. begin
  1842. if ( float128_is_signaling_nan( a )<>0) then
  1843. float_raise( float_flag_invalid );
  1844. z.sign := a.high shr 63;
  1845. shortShift128Left( a.high, a.low, 16, qhigh, qlow );
  1846. z.high:=qhigh shr 32;
  1847. z.low:=qhigh and $ffffffff;
  1848. result:=z;
  1849. end;
  1850. {*----------------------------------------------------------------------------
  1851. | Returns the result of converting the canonical NaN `a' to the quadruple-
  1852. | precision floating-point format.
  1853. *----------------------------------------------------------------------------*}
  1854. function commonNaNToFloat128( a : commonNaNT): float128;
  1855. var
  1856. z: float128;
  1857. begin
  1858. shift128Right( a.high, a.low, 16, z.high, z.low );
  1859. z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
  1860. result:=z;
  1861. end;
  1862. {*----------------------------------------------------------------------------
  1863. | Takes two quadruple-precision floating-point values `a' and `b', one of
  1864. | which is a NaN, and returns the appropriate NaN result. If either `a' or
  1865. | `b' is a signaling NaN, the invalid exception is raised.
  1866. *----------------------------------------------------------------------------*}
  1867. function propagateFloat128NaN( a: float128; b : float128): float128;
  1868. var
  1869. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1870. label
  1871. returnLargerSignificand;
  1872. begin
  1873. aIsNaN := float128_is_nan( a );
  1874. aIsSignalingNaN := float128_is_signaling_nan( a );
  1875. bIsNaN := float128_is_nan( b );
  1876. bIsSignalingNaN := float128_is_signaling_nan( b );
  1877. a.high := a.high or int64( $0000800000000000 );
  1878. b.high := b.high or int64( $0000800000000000 );
  1879. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1880. float_raise( float_flag_invalid );
  1881. if ( aIsSignalingNaN )<>0 then
  1882. begin
  1883. if ( bIsSignalingNaN )<>0 then
  1884. goto returnLargerSignificand;
  1885. if bIsNaN<>0 then
  1886. result := b
  1887. else
  1888. result := a;
  1889. exit;
  1890. end
  1891. else if ( aIsNaN )<>0 then
  1892. begin
  1893. if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
  1894. begin
  1895. result := a;
  1896. exit;
  1897. end;
  1898. returnLargerSignificand:
  1899. if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
  1900. begin
  1901. result := b;
  1902. exit;
  1903. end;
  1904. if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
  1905. begin
  1906. result := a;
  1907. exit
  1908. end;
  1909. if ( a.high < b.high ) then
  1910. result := a
  1911. else
  1912. result := b;
  1913. exit;
  1914. end
  1915. else
  1916. result:=b;
  1917. end;
  1918. {$ELSE}
  1919. { Big endian code }
  1920. (*----------------------------------------------------------------------------
  1921. | Internal canonical NaN format.
  1922. *----------------------------------------------------------------------------*)
  1923. type
  1924. commonNANT = packed record
  1925. sign : flag;
  1926. high, low : bits32;
  1927. end;
  1928. (*----------------------------------------------------------------------------
  1929. | The pattern for a default generated single-precision NaN.
  1930. *----------------------------------------------------------------------------*)
  1931. const float32_default_nan = $7FFFFFFF;
  1932. (*----------------------------------------------------------------------------
  1933. | Returns 1 if the single-precision floating-point value `a' is a NaN;
  1934. | otherwise returns 0.
  1935. *----------------------------------------------------------------------------*)
  1936. function float32_is_nan(a: float32): flag;
  1937. begin
  1938. float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
  1939. end;
  1940. (*----------------------------------------------------------------------------
  1941. | Returns 1 if the single-precision floating-point value `a' is a signaling
  1942. | NaN; otherwise returns 0.
  1943. *----------------------------------------------------------------------------*)
  1944. function float32_is_signaling_nan(a: float32):flag;
  1945. begin
  1946. float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
  1947. end;
  1948. (*----------------------------------------------------------------------------
  1949. | Returns the result of converting the single-precision floating-point NaN
  1950. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  1951. | exception is raised.
  1952. *----------------------------------------------------------------------------*)
  1953. Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
  1954. var
  1955. z: commonNANT;
  1956. begin
  1957. if float32_is_signaling_nan(a)<>0 then
  1958. float_raise(float_flag_invalid);
  1959. z.sign := a shr 31;
  1960. z.low := 0;
  1961. z.high := a shl 9;
  1962. c:=z;
  1963. end;
  1964. (*----------------------------------------------------------------------------
  1965. | Returns the result of converting the canonical NaN `a' to the single-
  1966. | precision floating-point format.
  1967. *----------------------------------------------------------------------------*)
  1968. function CommonNanToFloat32(a : CommonNaNT): float32;
  1969. begin
  1970. CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
  1971. end;
  1972. (*----------------------------------------------------------------------------
  1973. | Takes two single-precision floating-point values `a' and `b', one of which
  1974. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  1975. | signaling NaN, the invalid exception is raised.
  1976. *----------------------------------------------------------------------------*)
  1977. function propagateFloat32NaN( a: float32 ; b: float32): float32;
  1978. var
  1979. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
  1980. begin
  1981. aIsNaN := float32_is_nan( a );
  1982. aIsSignalingNaN := float32_is_signaling_nan( a );
  1983. bIsNaN := float32_is_nan( b );
  1984. bIsSignalingNaN := float32_is_signaling_nan( b );
  1985. a := a or $00400000;
  1986. b := b or $00400000;
  1987. if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
  1988. float_raise( float_flag_invalid );
  1989. if bIsSignalingNaN<>0 then
  1990. propagateFloat32Nan := b
  1991. else if aIsSignalingNan<>0 then
  1992. propagateFloat32Nan := a
  1993. else if bIsNan<>0 then
  1994. propagateFloat32Nan := b
  1995. else
  1996. propagateFloat32Nan := a;
  1997. end;
  1998. (*----------------------------------------------------------------------------
  1999. | The pattern for a default generated double-precision NaN. The `high' and
  2000. | `low' values hold the most- and least-significant bits, respectively.
  2001. *----------------------------------------------------------------------------*)
  2002. const
  2003. float64_default_nan_high = $7FFFFFFF;
  2004. float64_default_nan_low = $FFFFFFFF;
  2005. (*----------------------------------------------------------------------------
  2006. | Returns 1 if the double-precision floating-point value `a' is a NaN;
  2007. | otherwise returns 0.
  2008. *----------------------------------------------------------------------------*)
  2009. function float64_is_nan(a: float64): flag;
  2010. begin
  2011. float64_is_nan := flag (
  2012. ( $FFE00000 <= bits32 ( a.high shl 1 ) )
  2013. and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
  2014. end;
  2015. (*----------------------------------------------------------------------------
  2016. | Returns 1 if the double-precision floating-point value `a' is a signaling
  2017. | NaN; otherwise returns 0.
  2018. *----------------------------------------------------------------------------*)
  2019. function float64_is_signaling_nan( a:float64): flag;
  2020. begin
  2021. float64_is_signaling_nan := flag(
  2022. ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
  2023. and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
  2024. end;
  2025. (*----------------------------------------------------------------------------
  2026. | Returns the result of converting the double-precision floating-point NaN
  2027. | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  2028. | exception is raised.
  2029. *----------------------------------------------------------------------------*)
  2030. Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
  2031. var
  2032. z : commonNaNT;
  2033. begin
  2034. if ( float64_is_signaling_nan( a )<>0 ) then
  2035. float_raise( float_flag_invalid );
  2036. z.sign := a.high shr 31;
  2037. shortShift64Left( a.high, a.low, 12, z.high, z.low );
  2038. c:=z;
  2039. end;
  2040. (*----------------------------------------------------------------------------
  2041. | Returns the result of converting the canonical NaN `a' to the double-
  2042. | precision floating-point format.
  2043. *----------------------------------------------------------------------------*)
  2044. Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
  2045. var
  2046. z: float64;
  2047. begin
  2048. shift64Right( a.high, a.low, 12, z.high, z.low );
  2049. z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
  2050. c:=z;
  2051. end;
  2052. (*----------------------------------------------------------------------------
  2053. | Takes two double-precision floating-point values `a' and `b', one of which
  2054. | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  2055. | signaling NaN, the invalid exception is raised.
  2056. *----------------------------------------------------------------------------*)
  2057. Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
  2058. var
  2059. aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
  2060. begin
  2061. aIsNaN := float64_is_nan( a );
  2062. aIsSignalingNaN := float64_is_signaling_nan( a );
  2063. bIsNaN := float64_is_nan( b );
  2064. bIsSignalingNaN := float64_is_signaling_nan( b );
  2065. a.high := a.high or $00080000;
  2066. b.high := b.high or $00080000;
  2067. if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
  2068. float_raise( float_flag_invalid );
  2069. if bIsSignalingNaN<>0 then
  2070. c := b
  2071. else if aIsSignalingNan<>0 then
  2072. c := a
  2073. else if bIsNan<>0 then
  2074. c := b
  2075. else
  2076. c := a;
  2077. end;
  2078. {$ENDIF}
  2079. (****************************************************************************)
  2080. (* END ENDIAN SPECIFIC CODE *)
  2081. (****************************************************************************)
  2082. {*
  2083. -------------------------------------------------------------------------------
  2084. Returns the fraction bits of the single-precision floating-point value `a'.
  2085. -------------------------------------------------------------------------------
  2086. *}
  2087. Function ExtractFloat32Frac(a : Float32) : Bits32;
  2088. Begin
  2089. ExtractFloat32Frac := A AND $007FFFFF;
  2090. End;
  2091. {*
  2092. -------------------------------------------------------------------------------
  2093. Returns the exponent bits of the single-precision floating-point value `a'.
  2094. -------------------------------------------------------------------------------
  2095. *}
  2096. Function extractFloat32Exp( a: float32 ): Int16;
  2097. Begin
  2098. extractFloat32Exp := (a shr 23) AND $FF;
  2099. End;
  2100. {*
  2101. -------------------------------------------------------------------------------
  2102. Returns the sign bit of the single-precision floating-point value `a'.
  2103. -------------------------------------------------------------------------------
  2104. *}
  2105. Function extractFloat32Sign( a: float32 ): Flag;
  2106. Begin
  2107. extractFloat32Sign := a shr 31;
  2108. End;
  2109. {*
  2110. -------------------------------------------------------------------------------
  2111. Normalizes the subnormal single-precision floating-point value represented
  2112. by the denormalized significand `aSig'. The normalized exponent and
  2113. significand are stored at the locations pointed to by `zExpPtr' and
  2114. `zSigPtr', respectively.
  2115. -------------------------------------------------------------------------------
  2116. *}
  2117. Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
  2118. Var
  2119. ShiftCount : BYTE;
  2120. Begin
  2121. shiftCount := countLeadingZeros32( aSig ) - 8;
  2122. zSigPtr := aSig shl shiftCount;
  2123. zExpPtr := 1 - shiftCount;
  2124. End;
  2125. {*
  2126. -------------------------------------------------------------------------------
  2127. Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2128. single-precision floating-point value, returning the result. After being
  2129. shifted into the proper positions, the three fields are simply added
  2130. together to form the result. This means that any integer portion of `zSig'
  2131. will be added into the exponent. Since a properly normalized significand
  2132. will have an integer portion equal to 1, the `zExp' input should be 1 less
  2133. than the desired result exponent whenever `zSig' is a complete, normalized
  2134. significand.
  2135. -------------------------------------------------------------------------------
  2136. *}
  2137. Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
  2138. Begin
  2139. packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
  2140. + zSig;
  2141. End;
  2142. {*
  2143. -------------------------------------------------------------------------------
  2144. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2145. and significand `zSig', and returns the proper single-precision floating-
  2146. point value corresponding to the abstract input. Ordinarily, the abstract
  2147. value is simply rounded and packed into the single-precision format, with
  2148. the inexact exception raised if the abstract input cannot be represented
  2149. exactly. However, if the abstract value is too large, the overflow and
  2150. inexact exceptions are raised and an infinity or maximal finite value is
  2151. returned. If the abstract value is too small, the input value is rounded to
  2152. a subnormal number, and the underflow and inexact exceptions are raised if
  2153. the abstract input cannot be represented exactly as a subnormal single-
  2154. precision floating-point number.
  2155. The input significand `zSig' has its binary point between bits 30
  2156. and 29, which is 7 bits to the left of the usual location. This shifted
  2157. significand must be normalized or smaller. If `zSig' is not normalized,
  2158. `zExp' must be 0; in that case, the result returned is a subnormal number,
  2159. and it must not require rounding. In the usual case that `zSig' is
  2160. normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2161. The handling of underflow and overflow follows the IEC/IEEE Standard for
  2162. Binary Floating-Point Arithmetic.
  2163. -------------------------------------------------------------------------------
  2164. *}
  2165. Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
  2166. Var
  2167. roundingMode : BYTE;
  2168. roundNearestEven : Flag;
  2169. roundIncrement, roundBits : BYTE;
  2170. IsTiny : Flag;
  2171. Begin
  2172. roundingMode := softfloat_rounding_mode;
  2173. if (roundingMode = float_round_nearest_even) then
  2174. Begin
  2175. roundNearestEven := Flag(TRUE);
  2176. end
  2177. else
  2178. roundNearestEven := Flag(FALSE);
  2179. roundIncrement := $40;
  2180. if ( Boolean(roundNearestEven) = FALSE) then
  2181. Begin
  2182. if ( roundingMode = float_round_to_zero ) Then
  2183. Begin
  2184. roundIncrement := 0;
  2185. End
  2186. else
  2187. Begin
  2188. roundIncrement := $7F;
  2189. if ( zSign <> 0 ) then
  2190. Begin
  2191. if roundingMode = float_round_up then roundIncrement := 0;
  2192. End
  2193. else
  2194. Begin
  2195. if roundingMode = float_round_down then roundIncrement := 0;
  2196. End;
  2197. End
  2198. End;
  2199. roundBits := zSig AND $7F;
  2200. if ($FD <= bits16 (zExp) ) then
  2201. Begin
  2202. if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
  2203. Begin
  2204. float_raise( float_flag_overflow OR float_flag_inexact );
  2205. roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
  2206. exit;
  2207. End;
  2208. if ( zExp < 0 ) then
  2209. Begin
  2210. isTiny :=
  2211. flag(( float_detect_tininess = float_tininess_before_rounding )
  2212. OR ( zExp < -1 )
  2213. OR ( (zSig + roundIncrement) < $80000000 ));
  2214. shift32RightJamming( zSig, - zExp, zSig );
  2215. zExp := 0;
  2216. roundBits := zSig AND $7F;
  2217. if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
  2218. float_raise( float_flag_underflow );
  2219. End;
  2220. End;
  2221. if ( roundBits )<> 0 then
  2222. softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
  2223. zSig := ( zSig + roundIncrement ) shr 7;
  2224. zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
  2225. if ( zSig = 0 ) then zExp := 0;
  2226. roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
  2227. exit;
  2228. End;
  2229. {*
  2230. -------------------------------------------------------------------------------
  2231. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2232. and significand `zSig', and returns the proper single-precision floating-
  2233. point value corresponding to the abstract input. This routine is just like
  2234. `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  2235. Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  2236. floating-point exponent.
  2237. -------------------------------------------------------------------------------
  2238. *}
  2239. Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
  2240. Var
  2241. ShiftCount : int8;
  2242. Begin
  2243. shiftCount := countLeadingZeros32( zSig ) - 1;
  2244. normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
  2245. End;
  2246. {*
  2247. -------------------------------------------------------------------------------
  2248. Returns the most-significant 20 fraction bits of the double-precision
  2249. floating-point value `a'.
  2250. -------------------------------------------------------------------------------
  2251. *}
  2252. Function extractFloat64Frac0(a: float64): bits32;
  2253. Begin
  2254. extractFloat64Frac0 := a.high and $000FFFFF;
  2255. End;
  2256. {*
  2257. -------------------------------------------------------------------------------
  2258. Returns the least-significant 32 fraction bits of the double-precision
  2259. floating-point value `a'.
  2260. -------------------------------------------------------------------------------
  2261. *}
  2262. Function extractFloat64Frac1(a: float64): bits32;
  2263. Begin
  2264. extractFloat64Frac1 := a.low;
  2265. End;
  2266. {$define FPC_SYSTEM_HAS_extractFloat64Frac}
  2267. Function extractFloat64Frac(a: float64): bits64;
  2268. Begin
  2269. extractFloat64Frac := bits64(a) and $000FFFFFFFFFFFFF;
  2270. End;
  2271. {*
  2272. -------------------------------------------------------------------------------
  2273. Returns the exponent bits of the double-precision floating-point value `a'.
  2274. -------------------------------------------------------------------------------
  2275. *}
  2276. Function extractFloat64Exp(a: float64): int16;
  2277. Begin
  2278. extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
  2279. End;
  2280. {*
  2281. -------------------------------------------------------------------------------
  2282. Returns the sign bit of the double-precision floating-point value `a'.
  2283. -------------------------------------------------------------------------------
  2284. *}
  2285. Function extractFloat64Sign(a: float64) : flag;
  2286. Begin
  2287. extractFloat64Sign := a.high shr 31;
  2288. End;
  2289. {*
  2290. -------------------------------------------------------------------------------
  2291. Normalizes the subnormal double-precision floating-point value represented
  2292. by the denormalized significand formed by the concatenation of `aSig0' and
  2293. `aSig1'. The normalized exponent is stored at the location pointed to by
  2294. `zExpPtr'. The most significant 21 bits of the normalized significand are
  2295. stored at the location pointed to by `zSig0Ptr', and the least significant
  2296. 32 bits of the normalized significand are stored at the location pointed to
  2297. by `zSig1Ptr'.
  2298. -------------------------------------------------------------------------------
  2299. *}
  2300. Procedure normalizeFloat64Subnormal(
  2301. aSig0: bits32;
  2302. aSig1: bits32;
  2303. VAR zExpPtr : Int16;
  2304. VAR zSig0Ptr : Bits32;
  2305. VAR zSig1Ptr : Bits32
  2306. );
  2307. Var
  2308. ShiftCount : Int8;
  2309. Begin
  2310. if ( aSig0 = 0 ) then
  2311. Begin
  2312. shiftCount := countLeadingZeros32( aSig1 ) - 11;
  2313. if ( shiftCount < 0 ) then
  2314. Begin
  2315. zSig0Ptr := aSig1 shr ( - shiftCount );
  2316. zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
  2317. End
  2318. else
  2319. Begin
  2320. zSig0Ptr := aSig1 shl shiftCount;
  2321. zSig1Ptr := 0;
  2322. End;
  2323. zExpPtr := - shiftCount - 31;
  2324. End
  2325. else
  2326. Begin
  2327. shiftCount := countLeadingZeros32( aSig0 ) - 11;
  2328. shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  2329. zExpPtr := 1 - shiftCount;
  2330. End;
  2331. End;
  2332. procedure normalizeFloat64Subnormal(aSig : bits64;var zExpPtr : int16; var zSigPtr : bits64);
  2333. var
  2334. shiftCount : int8;
  2335. begin
  2336. shiftCount := countLeadingZeros64( aSig ) - 11;
  2337. zSigPtr := aSig shl shiftCount;
  2338. zExpPtr := 1 - shiftCount;
  2339. end;
  2340. {*
  2341. -------------------------------------------------------------------------------
  2342. Packs the sign `zSign', the exponent `zExp', and the significand formed by
  2343. the concatenation of `zSig0' and `zSig1' into a double-precision floating-
  2344. point value, returning the result. After being shifted into the proper
  2345. positions, the three fields `zSign', `zExp', and `zSig0' are simply added
  2346. together to form the most significant 32 bits of the result. This means
  2347. that any integer portion of `zSig0' will be added into the exponent. Since
  2348. a properly normalized significand will have an integer portion equal to 1,
  2349. the `zExp' input should be 1 less than the desired result exponent whenever
  2350. `zSig0' and `zSig1' concatenated form a complete, normalized significand.
  2351. -------------------------------------------------------------------------------
  2352. *}
  2353. Procedure
  2354. packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
  2355. var
  2356. z: Float64;
  2357. Begin
  2358. z.low := zSig1;
  2359. z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
  2360. c := z;
  2361. End;
  2362. {*----------------------------------------------------------------------------
  2363. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  2364. | double-precision floating-point value, returning the result. After being
  2365. | shifted into the proper positions, the three fields are simply added
  2366. | together to form the result. This means that any integer portion of `zSig'
  2367. | will be added into the exponent. Since a properly normalized significand
  2368. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  2369. | than the desired result exponent whenever `zSig' is a complete, normalized
  2370. | significand.
  2371. *----------------------------------------------------------------------------*}
  2372. function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
  2373. begin
  2374. result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
  2375. end;
  2376. {*
  2377. -------------------------------------------------------------------------------
  2378. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2379. and extended significand formed by the concatenation of `zSig0', `zSig1',
  2380. and `zSig2', and returns the proper double-precision floating-point value
  2381. corresponding to the abstract input. Ordinarily, the abstract value is
  2382. simply rounded and packed into the double-precision format, with the inexact
  2383. exception raised if the abstract input cannot be represented exactly.
  2384. However, if the abstract value is too large, the overflow and inexact
  2385. exceptions are raised and an infinity or maximal finite value is returned.
  2386. If the abstract value is too small, the input value is rounded to a
  2387. subnormal number, and the underflow and inexact exceptions are raised if the
  2388. abstract input cannot be represented exactly as a subnormal double-precision
  2389. floating-point number.
  2390. The input significand must be normalized or smaller. If the input
  2391. significand is not normalized, `zExp' must be 0; in that case, the result
  2392. returned is a subnormal number, and it must not require rounding. In the
  2393. usual case that the input significand is normalized, `zExp' must be 1 less
  2394. than the ``true'' floating-point exponent. The handling of underflow and
  2395. overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2396. -------------------------------------------------------------------------------
  2397. *}
  2398. Procedure
  2399. roundAndPackFloat64(
  2400. zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
  2401. Var
  2402. roundingMode : Int8;
  2403. roundNearestEven, increment, isTiny : Flag;
  2404. Begin
  2405. roundingMode := softfloat_rounding_mode;
  2406. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  2407. increment := flag( sbits32 (zSig2) < 0 );
  2408. if ( roundNearestEven = flag(FALSE) ) then
  2409. Begin
  2410. if ( roundingMode = float_round_to_zero ) then
  2411. increment := 0
  2412. else
  2413. Begin
  2414. if ( zSign )<> 0 then
  2415. Begin
  2416. increment := flag( roundingMode = float_round_down ) and zSig2;
  2417. End
  2418. else
  2419. Begin
  2420. increment := flag( roundingMode = float_round_up ) and zSig2;
  2421. End
  2422. End
  2423. End;
  2424. if ( $7FD <= bits16 (zExp) ) then
  2425. Begin
  2426. if (( $7FD < zExp )
  2427. or (( zExp = $7FD )
  2428. and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
  2429. and (increment<>0)
  2430. )
  2431. ) then
  2432. Begin
  2433. float_raise( float_flag_overflow OR float_flag_inexact );
  2434. if (( roundingMode = float_round_to_zero )
  2435. or ( (zSign<>0) and ( roundingMode = float_round_up ) )
  2436. or ( (zSign = 0) and ( roundingMode = float_round_down ) )
  2437. ) then
  2438. Begin
  2439. packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
  2440. exit;
  2441. End;
  2442. packFloat64( zSign, $7FF, 0, 0, c );
  2443. exit;
  2444. End;
  2445. if ( zExp < 0 ) then
  2446. Begin
  2447. isTiny :=
  2448. flag( float_detect_tininess = float_tininess_before_rounding )
  2449. or flag( zExp < -1 )
  2450. or flag(increment = 0)
  2451. or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
  2452. shift64ExtraRightJamming(
  2453. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  2454. zExp := 0;
  2455. if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
  2456. if ( roundNearestEven )<>0 then
  2457. Begin
  2458. increment := flag( sbits32 (zSig2) < 0 );
  2459. End
  2460. else
  2461. Begin
  2462. if ( zSign )<>0 then
  2463. Begin
  2464. increment := flag( roundingMode = float_round_down ) and zSig2;
  2465. End
  2466. else
  2467. Begin
  2468. increment := flag( roundingMode = float_round_up ) and zSig2;
  2469. End
  2470. End;
  2471. End;
  2472. End;
  2473. if ( zSig2 )<>0 then
  2474. softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
  2475. if ( increment )<>0 then
  2476. Begin
  2477. add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  2478. zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
  2479. End
  2480. else
  2481. Begin
  2482. if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
  2483. End;
  2484. packFloat64( zSign, zExp, zSig0, zSig1, c );
  2485. End;
  2486. {*----------------------------------------------------------------------------
  2487. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2488. | and significand `zSig', and returns the proper double-precision floating-
  2489. | point value corresponding to the abstract input. Ordinarily, the abstract
  2490. | value is simply rounded and packed into the double-precision format, with
  2491. | the inexact exception raised if the abstract input cannot be represented
  2492. | exactly. However, if the abstract value is too large, the overflow and
  2493. | inexact exceptions are raised and an infinity or maximal finite value is
  2494. | returned. If the abstract value is too small, the input value is rounded
  2495. | to a subnormal number, and the underflow and inexact exceptions are raised
  2496. | if the abstract input cannot be represented exactly as a subnormal double-
  2497. | precision floating-point number.
  2498. | The input significand `zSig' has its binary point between bits 62
  2499. | and 61, which is 10 bits to the left of the usual location. This shifted
  2500. | significand must be normalized or smaller. If `zSig' is not normalized,
  2501. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  2502. | and it must not require rounding. In the usual case that `zSig' is
  2503. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  2504. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  2505. | Binary Floating-Point Arithmetic.
  2506. *----------------------------------------------------------------------------*}
  2507. function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
  2508. var
  2509. roundingMode: int8;
  2510. roundNearestEven: flag;
  2511. roundIncrement, roundBits: int16;
  2512. isTiny: flag;
  2513. begin
  2514. roundingMode := softfloat_rounding_mode;
  2515. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  2516. roundIncrement := $200;
  2517. if ( roundNearestEven=0 ) then
  2518. begin
  2519. if ( roundingMode = float_round_to_zero ) then
  2520. begin
  2521. roundIncrement := 0;
  2522. end
  2523. else begin
  2524. roundIncrement := $3FF;
  2525. if ( zSign<>0 ) then
  2526. begin
  2527. if ( roundingMode = float_round_up ) then
  2528. roundIncrement := 0;
  2529. end
  2530. else begin
  2531. if ( roundingMode = float_round_down ) then
  2532. roundIncrement := 0;
  2533. end
  2534. end
  2535. end;
  2536. roundBits := zSig and $3FF;
  2537. if ( $7FD <= bits16(zExp) ) then
  2538. begin
  2539. if ( ( $7FD < zExp )
  2540. or ( ( zExp = $7FD )
  2541. and ( sbits64( zSig + roundIncrement ) < 0 ) )
  2542. ) then
  2543. begin
  2544. float_raise( float_flag_overflow or float_flag_inexact );
  2545. result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
  2546. exit;
  2547. end;
  2548. if ( zExp < 0 ) then
  2549. begin
  2550. isTiny := ord(
  2551. ( float_detect_tininess = float_tininess_before_rounding )
  2552. or ( zExp < -1 )
  2553. or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
  2554. shift64RightJamming( zSig, - zExp, zSig );
  2555. zExp := 0;
  2556. roundBits := zSig and $3FF;
  2557. if ( isTiny and roundBits )<>0 then
  2558. float_raise( float_flag_underflow );
  2559. end
  2560. end;
  2561. if ( roundBits<>0 ) then
  2562. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2563. zSig := ( zSig + roundIncrement ) shr 10;
  2564. zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
  2565. if ( zSig = 0 ) then
  2566. zExp := 0;
  2567. result:=packFloat64( zSign, zExp, zSig );
  2568. end;
  2569. {*
  2570. -------------------------------------------------------------------------------
  2571. Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  2572. and significand formed by the concatenation of `zSig0' and `zSig1', and
  2573. returns the proper double-precision floating-point value corresponding
  2574. to the abstract input. This routine is just like `roundAndPackFloat64'
  2575. except that the input significand has fewer bits and does not have to be
  2576. normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  2577. point exponent.
  2578. -------------------------------------------------------------------------------
  2579. *}
  2580. Procedure
  2581. normalizeRoundAndPackFloat64(
  2582. zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
  2583. Var
  2584. shiftCount : int8;
  2585. zSig2 : bits32;
  2586. Begin
  2587. if ( zSig0 = 0 ) then
  2588. Begin
  2589. zSig0 := zSig1;
  2590. zSig1 := 0;
  2591. zExp := zExp -32;
  2592. End;
  2593. shiftCount := countLeadingZeros32( zSig0 ) - 11;
  2594. if ( 0 <= shiftCount ) then
  2595. Begin
  2596. zSig2 := 0;
  2597. shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  2598. End
  2599. else
  2600. Begin
  2601. shift64ExtraRightJamming
  2602. (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  2603. End;
  2604. zExp := zExp - shiftCount;
  2605. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
  2606. End;
  2607. {*
  2608. -------------------------------------------------------------------------------
  2609. Returns the result of converting the 32-bit two's complement integer `a' to
  2610. the single-precision floating-point format. The conversion is performed
  2611. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2612. -------------------------------------------------------------------------------
  2613. *}
  2614. Function int32_to_float32( a: int32): float32rec; compilerproc;
  2615. Var
  2616. zSign : Flag;
  2617. Begin
  2618. if ( a = 0 ) then
  2619. Begin
  2620. int32_to_float32.float32 := 0;
  2621. exit;
  2622. End;
  2623. if ( a = sbits32 ($80000000) ) then
  2624. Begin
  2625. int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
  2626. exit;
  2627. end;
  2628. zSign := flag( a < 0 );
  2629. If zSign<>0 then
  2630. a := -a;
  2631. int32_to_float32.float32:=
  2632. normalizeRoundAndPackFloat32( zSign, $9C, a );
  2633. End;
  2634. {*
  2635. -------------------------------------------------------------------------------
  2636. Returns the result of converting the 32-bit two's complement integer `a' to
  2637. the double-precision floating-point format. The conversion is performed
  2638. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2639. -------------------------------------------------------------------------------
  2640. *}
  2641. Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
  2642. var
  2643. zSign : flag;
  2644. absA : bits32;
  2645. shiftCount : int8;
  2646. zSig0, zSig1 : bits32;
  2647. Begin
  2648. if ( a = 0 ) then
  2649. Begin
  2650. packFloat64( 0, 0, 0, 0, result );
  2651. exit;
  2652. end;
  2653. zSign := flag( a < 0 );
  2654. if ZSign<>0 then
  2655. AbsA := -a
  2656. else
  2657. AbsA := a;
  2658. shiftCount := countLeadingZeros32( absA ) - 11;
  2659. if ( 0 <= shiftCount ) then
  2660. Begin
  2661. zSig0 := absA shl shiftCount;
  2662. zSig1 := 0;
  2663. End
  2664. else
  2665. Begin
  2666. shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
  2667. End;
  2668. packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
  2669. End;
  2670. {*
  2671. -------------------------------------------------------------------------------
  2672. Returns the result of converting the single-precision floating-point value
  2673. `a' to the 32-bit two's complement integer format. The conversion is
  2674. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2675. Arithmetic---which means in particular that the conversion is rounded
  2676. according to the current rounding mode. If `a' is a NaN, the largest
  2677. positive integer is returned. Otherwise, if the conversion overflows, the
  2678. largest integer with the same sign as `a' is returned.
  2679. -------------------------------------------------------------------------------
  2680. *}
  2681. Function float32_to_int32( a : float32rec) : int32;compilerproc;
  2682. Var
  2683. aSign: flag;
  2684. aExp, shiftCount: int16;
  2685. aSig, aSigExtra: bits32;
  2686. z: int32;
  2687. roundingMode: int8;
  2688. Begin
  2689. aSig := extractFloat32Frac( a.float32 );
  2690. aExp := extractFloat32Exp( a.float32 );
  2691. aSign := extractFloat32Sign( a.float32 );
  2692. shiftCount := aExp - $96;
  2693. if ( 0 <= shiftCount ) then
  2694. Begin
  2695. if ( $9E <= aExp ) then
  2696. Begin
  2697. if ( a.float32 <> $CF000000 ) then
  2698. Begin
  2699. float_raise( float_flag_invalid );
  2700. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2701. Begin
  2702. float32_to_int32 := $7FFFFFFF;
  2703. exit;
  2704. End;
  2705. End;
  2706. float32_to_int32 := sbits32 ($80000000);
  2707. exit;
  2708. End;
  2709. z := ( aSig or $00800000 ) shl shiftCount;
  2710. if ( aSign<>0 ) then z := - z;
  2711. End
  2712. else
  2713. Begin
  2714. if ( aExp < $7E ) then
  2715. Begin
  2716. aSigExtra := aExp OR aSig;
  2717. z := 0;
  2718. End
  2719. else
  2720. Begin
  2721. aSig := aSig OR $00800000;
  2722. aSigExtra := aSig shl ( shiftCount and 31 );
  2723. z := aSig shr ( - shiftCount );
  2724. End;
  2725. if ( aSigExtra<>0 ) then
  2726. softfloat_exception_flags := softfloat_exception_flags
  2727. or float_flag_inexact;
  2728. roundingMode := softfloat_rounding_mode;
  2729. if ( roundingMode = float_round_nearest_even ) then
  2730. Begin
  2731. if ( sbits32 (aSigExtra) < 0 ) then
  2732. Begin
  2733. Inc(z);
  2734. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  2735. z := z and not 1;
  2736. End;
  2737. if ( aSign<>0 ) then
  2738. z := - z;
  2739. End
  2740. else
  2741. Begin
  2742. aSigExtra := flag( aSigExtra <> 0 );
  2743. if ( aSign<>0 ) then
  2744. Begin
  2745. z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
  2746. z := - z;
  2747. End
  2748. else
  2749. Begin
  2750. z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
  2751. End
  2752. End;
  2753. End;
  2754. float32_to_int32 := z;
  2755. End;
  2756. {*
  2757. -------------------------------------------------------------------------------
  2758. Returns the result of converting the single-precision floating-point value
  2759. `a' to the 32-bit two's complement integer format. The conversion is
  2760. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2761. Arithmetic, except that the conversion is always rounded toward zero.
  2762. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2763. the conversion overflows, the largest integer with the same sign as `a' is
  2764. returned.
  2765. -------------------------------------------------------------------------------
  2766. *}
  2767. Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
  2768. Var
  2769. aSign : flag;
  2770. aExp, shiftCount : int16;
  2771. aSig : bits32;
  2772. z : int32;
  2773. Begin
  2774. aSig := extractFloat32Frac( a.float32 );
  2775. aExp := extractFloat32Exp( a.float32 );
  2776. aSign := extractFloat32Sign( a.float32 );
  2777. shiftCount := aExp - $9E;
  2778. if ( 0 <= shiftCount ) then
  2779. Begin
  2780. if ( a.float32 <> $CF000000 ) then
  2781. Begin
  2782. float_raise( float_flag_invalid );
  2783. if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
  2784. Begin
  2785. float32_to_int32_round_to_zero := $7FFFFFFF;
  2786. exit;
  2787. end;
  2788. End;
  2789. float32_to_int32_round_to_zero:= sbits32 ($80000000);
  2790. exit;
  2791. End
  2792. else
  2793. if ( aExp <= $7E ) then
  2794. Begin
  2795. if ( aExp or aSig )<>0 then
  2796. softfloat_exception_flags :=
  2797. softfloat_exception_flags or float_flag_inexact;
  2798. float32_to_int32_round_to_zero := 0;
  2799. exit;
  2800. End;
  2801. aSig := ( aSig or $00800000 ) shl 8;
  2802. z := aSig shr ( - shiftCount );
  2803. if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
  2804. Begin
  2805. softfloat_exception_flags :=
  2806. softfloat_exception_flags or float_flag_inexact;
  2807. End;
  2808. if ( aSign<>0 ) then z := - z;
  2809. float32_to_int32_round_to_zero := z;
  2810. End;
  2811. {*
  2812. -------------------------------------------------------------------------------
  2813. Returns the result of converting the single-precision floating-point value
  2814. `a' to the double-precision floating-point format. The conversion is
  2815. performed according to the IEC/IEEE Standard for Binary Floating-Point
  2816. Arithmetic.
  2817. -------------------------------------------------------------------------------
  2818. *}
  2819. Function float32_to_float64( a : float32rec) : Float64;compilerproc;
  2820. Var
  2821. aSign : flag;
  2822. aExp : int16;
  2823. aSig, zSig0, zSig1: bits32;
  2824. tmp : CommonNanT;
  2825. Begin
  2826. aSig := extractFloat32Frac( a.float32 );
  2827. aExp := extractFloat32Exp( a.float32 );
  2828. aSign := extractFloat32Sign( a.float32 );
  2829. if ( aExp = $FF ) then
  2830. Begin
  2831. if ( aSig<>0 ) then
  2832. Begin
  2833. float32ToCommonNaN(a.float32, tmp);
  2834. commonNaNToFloat64(tmp , result);
  2835. exit;
  2836. End;
  2837. packFloat64( aSign, $7FF, 0, 0, result);
  2838. exit;
  2839. End;
  2840. if ( aExp = 0 ) then
  2841. Begin
  2842. if ( aSig = 0 ) then
  2843. Begin
  2844. packFloat64( aSign, 0, 0, 0, result );
  2845. exit;
  2846. end;
  2847. normalizeFloat32Subnormal( aSig, aExp, aSig );
  2848. Dec(aExp);
  2849. End;
  2850. shift64Right( aSig, 0, 3, zSig0, zSig1 );
  2851. packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
  2852. End;
  2853. {*
  2854. -------------------------------------------------------------------------------
  2855. Rounds the single-precision floating-point value `a' to an integer,
  2856. and returns the result as a single-precision floating-point value. The
  2857. operation is performed according to the IEC/IEEE Standard for Binary
  2858. Floating-Point Arithmetic.
  2859. -------------------------------------------------------------------------------
  2860. *}
  2861. Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
  2862. Var
  2863. aSign: flag;
  2864. aExp: int16;
  2865. lastBitMask, roundBitsMask: bits32;
  2866. roundingMode: int8;
  2867. z: float32;
  2868. Begin
  2869. aExp := extractFloat32Exp( a.float32 );
  2870. if ( $96 <= aExp ) then
  2871. Begin
  2872. if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2873. Begin
  2874. float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
  2875. exit;
  2876. End;
  2877. float32_round_to_int:=a;
  2878. exit;
  2879. End;
  2880. if ( aExp <= $7E ) then
  2881. Begin
  2882. if ( bits32 ( a.float32 shl 1 ) = 0 ) then
  2883. Begin
  2884. float32_round_to_int:=a;
  2885. exit;
  2886. end;
  2887. softfloat_exception_flags
  2888. := softfloat_exception_flags OR float_flag_inexact;
  2889. aSign := extractFloat32Sign( a.float32 );
  2890. case ( softfloat_rounding_mode ) of
  2891. float_round_nearest_even:
  2892. Begin
  2893. if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
  2894. Begin
  2895. float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
  2896. exit;
  2897. End;
  2898. End;
  2899. float_round_down:
  2900. Begin
  2901. if aSign <> 0 then
  2902. float32_round_to_int.float32 := $BF800000
  2903. else
  2904. float32_round_to_int.float32 := 0;
  2905. exit;
  2906. End;
  2907. float_round_up:
  2908. Begin
  2909. if aSign <> 0 then
  2910. float32_round_to_int.float32 := $80000000
  2911. else
  2912. float32_round_to_int.float32 := $3F800000;
  2913. exit;
  2914. End;
  2915. end;
  2916. float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
  2917. End;
  2918. lastBitMask := 1;
  2919. {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
  2920. lastBitMask := lastBitMask shl ($96 - aExp);
  2921. roundBitsMask := lastBitMask - 1;
  2922. z := a.float32;
  2923. roundingMode := softfloat_rounding_mode;
  2924. if ( roundingMode = float_round_nearest_even ) then
  2925. Begin
  2926. z := z + (lastBitMask shr 1);
  2927. if ( ( z and roundBitsMask ) = 0 ) then
  2928. z := z and not lastBitMask;
  2929. End
  2930. else if ( roundingMode <> float_round_to_zero ) then
  2931. Begin
  2932. if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
  2933. Begin
  2934. z := z + roundBitsMask;
  2935. End;
  2936. End;
  2937. z := z and not roundBitsMask;
  2938. if ( z <> a.float32 ) then
  2939. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  2940. float32_round_to_int.float32 := z;
  2941. End;
  2942. {*
  2943. -------------------------------------------------------------------------------
  2944. Returns the result of adding the absolute values of the single-precision
  2945. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2946. before being returned. `zSign' is ignored if the result is a NaN.
  2947. The addition is performed according to the IEC/IEEE Standard for Binary
  2948. Floating-Point Arithmetic.
  2949. -------------------------------------------------------------------------------
  2950. *}
  2951. Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
  2952. Var
  2953. aExp, bExp, zExp: int16;
  2954. aSig, bSig, zSig: bits32;
  2955. expDiff: int16;
  2956. label roundAndPack;
  2957. Begin
  2958. aSig:=extractFloat32Frac( a );
  2959. aExp:=extractFloat32Exp( a );
  2960. bSig:=extractFloat32Frac( b );
  2961. bExp := extractFloat32Exp( b );
  2962. expDiff := aExp - bExp;
  2963. aSig := aSig shl 6;
  2964. bSig := bSig shl 6;
  2965. if ( 0 < expDiff ) then
  2966. Begin
  2967. if ( aExp = $FF ) then
  2968. Begin
  2969. if ( aSig <> 0) then
  2970. Begin
  2971. addFloat32Sigs := propagateFloat32NaN( a, b );
  2972. exit;
  2973. End;
  2974. addFloat32Sigs := a;
  2975. exit;
  2976. End;
  2977. if ( bExp = 0 ) then
  2978. Begin
  2979. Dec(expDiff);
  2980. End
  2981. else
  2982. Begin
  2983. bSig := bSig or $20000000;
  2984. End;
  2985. shift32RightJamming( bSig, expDiff, bSig );
  2986. zExp := aExp;
  2987. End
  2988. else
  2989. If ( expDiff < 0 ) then
  2990. Begin
  2991. if ( bExp = $FF ) then
  2992. Begin
  2993. if ( bSig<>0 ) then
  2994. Begin
  2995. addFloat32Sigs := propagateFloat32NaN( a, b );
  2996. exit;
  2997. end;
  2998. addFloat32Sigs := packFloat32( zSign, $FF, 0 );
  2999. exit;
  3000. End;
  3001. if ( aExp = 0 ) then
  3002. Begin
  3003. Inc(expDiff);
  3004. End
  3005. else
  3006. Begin
  3007. aSig := aSig OR $20000000;
  3008. End;
  3009. shift32RightJamming( aSig, - expDiff, aSig );
  3010. zExp := bExp;
  3011. End
  3012. else
  3013. Begin
  3014. if ( aExp = $FF ) then
  3015. Begin
  3016. if ( aSig OR bSig )<> 0 then
  3017. Begin
  3018. addFloat32Sigs := propagateFloat32NaN( a, b );
  3019. exit;
  3020. end;
  3021. addFloat32Sigs := a;
  3022. exit;
  3023. End;
  3024. if ( aExp = 0 ) then
  3025. Begin
  3026. addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
  3027. exit;
  3028. end;
  3029. zSig := $40000000 + aSig + bSig;
  3030. zExp := aExp;
  3031. goto roundAndPack;
  3032. End;
  3033. aSig := aSig OR $20000000;
  3034. zSig := ( aSig + bSig ) shl 1;
  3035. Dec(zExp);
  3036. if ( sbits32 (zSig) < 0 ) then
  3037. Begin
  3038. zSig := aSig + bSig;
  3039. Inc(zExp);
  3040. End;
  3041. roundAndPack:
  3042. addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
  3043. End;
  3044. {*
  3045. -------------------------------------------------------------------------------
  3046. Returns the result of subtracting the absolute values of the single-
  3047. precision floating-point values `a' and `b'. If `zSign' is 1, the
  3048. difference is negated before being returned. `zSign' is ignored if the
  3049. result is a NaN. The subtraction is performed according to the IEC/IEEE
  3050. Standard for Binary Floating-Point Arithmetic.
  3051. -------------------------------------------------------------------------------
  3052. *}
  3053. Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
  3054. Var
  3055. aExp, bExp, zExp: int16;
  3056. aSig, bSig, zSig: bits32;
  3057. expDiff : int16;
  3058. label aExpBigger;
  3059. label bExpBigger;
  3060. label aBigger;
  3061. label bBigger;
  3062. label normalizeRoundAndPack;
  3063. Begin
  3064. aSig := extractFloat32Frac( a );
  3065. aExp := extractFloat32Exp( a );
  3066. bSig := extractFloat32Frac( b );
  3067. bExp := extractFloat32Exp( b );
  3068. expDiff := aExp - bExp;
  3069. aSig := aSig shl 7;
  3070. bSig := bSig shl 7;
  3071. if ( 0 < expDiff ) then goto aExpBigger;
  3072. if ( expDiff < 0 ) then goto bExpBigger;
  3073. if ( aExp = $FF ) then
  3074. Begin
  3075. if ( aSig OR bSig )<> 0 then
  3076. Begin
  3077. subFloat32Sigs := propagateFloat32NaN( a, b );
  3078. exit;
  3079. End;
  3080. float_raise( float_flag_invalid );
  3081. subFloat32Sigs := float32_default_nan;
  3082. exit;
  3083. End;
  3084. if ( aExp = 0 ) then
  3085. Begin
  3086. aExp := 1;
  3087. bExp := 1;
  3088. End;
  3089. if ( bSig < aSig ) Then goto aBigger;
  3090. if ( aSig < bSig ) Then goto bBigger;
  3091. subFloat32Sigs := packFloat32( flag(softfloat_rounding_mode = float_round_down), 0, 0 );
  3092. exit;
  3093. bExpBigger:
  3094. if ( bExp = $FF ) then
  3095. Begin
  3096. if ( bSig<>0 ) then
  3097. Begin
  3098. subFloat32Sigs := propagateFloat32NaN( a, b );
  3099. exit;
  3100. End;
  3101. subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
  3102. exit;
  3103. End;
  3104. if ( aExp = 0 ) then
  3105. Begin
  3106. Inc(expDiff);
  3107. End
  3108. else
  3109. Begin
  3110. aSig := aSig OR $40000000;
  3111. End;
  3112. shift32RightJamming( aSig, - expDiff, aSig );
  3113. bSig := bSig OR $40000000;
  3114. bBigger:
  3115. zSig := bSig - aSig;
  3116. zExp := bExp;
  3117. zSign := zSign xor 1;
  3118. goto normalizeRoundAndPack;
  3119. aExpBigger:
  3120. if ( aExp = $FF ) then
  3121. Begin
  3122. if ( aSig <> 0) then
  3123. Begin
  3124. subFloat32Sigs := propagateFloat32NaN( a, b );
  3125. exit;
  3126. End;
  3127. subFloat32Sigs := a;
  3128. exit;
  3129. End;
  3130. if ( bExp = 0 ) then
  3131. Begin
  3132. Dec(expDiff);
  3133. End
  3134. else
  3135. Begin
  3136. bSig := bSig OR $40000000;
  3137. End;
  3138. shift32RightJamming( bSig, expDiff, bSig );
  3139. aSig := aSig OR $40000000;
  3140. aBigger:
  3141. zSig := aSig - bSig;
  3142. zExp := aExp;
  3143. normalizeRoundAndPack:
  3144. Dec(zExp);
  3145. subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
  3146. End;
  3147. {*
  3148. -------------------------------------------------------------------------------
  3149. Returns the result of adding the single-precision floating-point values `a'
  3150. and `b'. The operation is performed according to the IEC/IEEE Standard for
  3151. Binary Floating-Point Arithmetic.
  3152. -------------------------------------------------------------------------------
  3153. *}
  3154. Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
  3155. Var
  3156. aSign, bSign: Flag;
  3157. Begin
  3158. aSign := extractFloat32Sign( a.float32 );
  3159. bSign := extractFloat32Sign( b.float32 );
  3160. if ( aSign = bSign ) then
  3161. Begin
  3162. float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3163. End
  3164. else
  3165. Begin
  3166. float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3167. End;
  3168. End;
  3169. {*
  3170. -------------------------------------------------------------------------------
  3171. Returns the result of subtracting the single-precision floating-point values
  3172. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3173. for Binary Floating-Point Arithmetic.
  3174. -------------------------------------------------------------------------------
  3175. *}
  3176. Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
  3177. Var
  3178. aSign, bSign: flag;
  3179. Begin
  3180. aSign := extractFloat32Sign( a.float32 );
  3181. bSign := extractFloat32Sign( b.float32 );
  3182. if ( aSign = bSign ) then
  3183. Begin
  3184. float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
  3185. End
  3186. else
  3187. Begin
  3188. float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
  3189. End;
  3190. End;
  3191. {*
  3192. -------------------------------------------------------------------------------
  3193. Returns the result of multiplying the single-precision floating-point values
  3194. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3195. for Binary Floating-Point Arithmetic.
  3196. -------------------------------------------------------------------------------
  3197. *}
  3198. Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
  3199. Var
  3200. aSign, bSign, zSign: flag;
  3201. aExp, bExp, zExp : int16;
  3202. aSig, bSig, zSig0, zSig1: bits32;
  3203. Begin
  3204. aSig := extractFloat32Frac( a.float32 );
  3205. aExp := extractFloat32Exp( a.float32 );
  3206. aSign := extractFloat32Sign( a.float32 );
  3207. bSig := extractFloat32Frac( b.float32 );
  3208. bExp := extractFloat32Exp( b.float32 );
  3209. bSign := extractFloat32Sign( b.float32 );
  3210. zSign := aSign xor bSign;
  3211. if ( aExp = $FF ) then
  3212. Begin
  3213. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
  3214. Begin
  3215. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3216. End;
  3217. if ( ( bExp OR bSig ) = 0 ) then
  3218. Begin
  3219. float_raise( float_flag_invalid );
  3220. float32_mul.float32 := float32_default_nan;
  3221. exit;
  3222. End;
  3223. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3224. exit;
  3225. End;
  3226. if ( bExp = $FF ) then
  3227. Begin
  3228. if ( bSig <> 0 ) then
  3229. Begin
  3230. float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3231. exit;
  3232. End;
  3233. if ( ( aExp OR aSig ) = 0 ) then
  3234. Begin
  3235. float_raise( float_flag_invalid );
  3236. float32_mul.float32 := float32_default_nan;
  3237. exit;
  3238. End;
  3239. float32_mul.float32 := packFloat32( zSign, $FF, 0 );
  3240. exit;
  3241. End;
  3242. if ( aExp = 0 ) then
  3243. Begin
  3244. if ( aSig = 0 ) then
  3245. Begin
  3246. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3247. exit;
  3248. End;
  3249. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3250. End;
  3251. if ( bExp = 0 ) then
  3252. Begin
  3253. if ( bSig = 0 ) then
  3254. Begin
  3255. float32_mul.float32 := packFloat32( zSign, 0, 0 );
  3256. exit;
  3257. End;
  3258. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3259. End;
  3260. zExp := aExp + bExp - $7F;
  3261. aSig := ( aSig OR $00800000 ) shl 7;
  3262. bSig := ( bSig OR $00800000 ) shl 8;
  3263. mul32To64( aSig, bSig, zSig0, zSig1 );
  3264. zSig0 := zSig0 OR bits32( zSig1 <> 0 );
  3265. if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
  3266. Begin
  3267. zSig0 := zSig0 shl 1;
  3268. Dec(zExp);
  3269. End;
  3270. float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
  3271. End;
  3272. {*
  3273. -------------------------------------------------------------------------------
  3274. Returns the result of dividing the single-precision floating-point value `a'
  3275. by the corresponding value `b'. The operation is performed according to the
  3276. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3277. -------------------------------------------------------------------------------
  3278. *}
  3279. Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
  3280. Var
  3281. aSign, bSign, zSign: flag;
  3282. aExp, bExp, zExp: int16;
  3283. aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
  3284. Begin
  3285. aSig := extractFloat32Frac( a.float32 );
  3286. aExp := extractFloat32Exp( a.float32 );
  3287. aSign := extractFloat32Sign( a.float32 );
  3288. bSig := extractFloat32Frac( b.float32 );
  3289. bExp := extractFloat32Exp( b.float32 );
  3290. bSign := extractFloat32Sign( b.float32 );
  3291. zSign := aSign xor bSign;
  3292. if ( aExp = $FF ) then
  3293. Begin
  3294. if ( aSig <> 0 ) then
  3295. Begin
  3296. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3297. exit;
  3298. End;
  3299. if ( bExp = $FF ) then
  3300. Begin
  3301. if ( bSig <> 0) then
  3302. Begin
  3303. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3304. End;
  3305. float_raise( float_flag_invalid );
  3306. float32_div.float32 := float32_default_nan;
  3307. exit;
  3308. End;
  3309. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3310. exit;
  3311. End;
  3312. if ( bExp = $FF ) then
  3313. Begin
  3314. if ( bSig <> 0) then
  3315. Begin
  3316. float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3317. exit;
  3318. End;
  3319. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3320. exit;
  3321. End;
  3322. if ( bExp = 0 ) Then
  3323. Begin
  3324. if ( bSig = 0 ) Then
  3325. Begin
  3326. if ( ( aExp OR aSig ) = 0 ) then
  3327. Begin
  3328. float_raise( float_flag_invalid );
  3329. float32_div.float32 := float32_default_nan;
  3330. exit;
  3331. End;
  3332. float_raise( float_flag_divbyzero );
  3333. float32_div.float32 := packFloat32( zSign, $FF, 0 );
  3334. exit;
  3335. End;
  3336. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3337. End;
  3338. if ( aExp = 0 ) Then
  3339. Begin
  3340. if ( aSig = 0 ) Then
  3341. Begin
  3342. float32_div.float32 := packFloat32( zSign, 0, 0 );
  3343. exit;
  3344. End;
  3345. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3346. End;
  3347. zExp := aExp - bExp + $7D;
  3348. aSig := ( aSig OR $00800000 ) shl 7;
  3349. bSig := ( bSig OR $00800000 ) shl 8;
  3350. if ( bSig <= ( aSig + aSig ) ) then
  3351. Begin
  3352. aSig := aSig shr 1;
  3353. Inc(zExp);
  3354. End;
  3355. zSig := estimateDiv64To32( aSig, 0, bSig );
  3356. if ( ( zSig and $3F ) <= 2 ) then
  3357. Begin
  3358. mul32To64( bSig, zSig, term0, term1 );
  3359. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3360. while ( sbits32 (rem0) < 0 ) do
  3361. Begin
  3362. Dec(zSig);
  3363. add64( rem0, rem1, 0, bSig, rem0, rem1 );
  3364. End;
  3365. zSig := zSig or bits32( rem1 <> 0 );
  3366. End;
  3367. float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
  3368. End;
  3369. {*
  3370. -------------------------------------------------------------------------------
  3371. Returns the remainder of the single-precision floating-point value `a'
  3372. with respect to the corresponding value `b'. The operation is performed
  3373. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3374. -------------------------------------------------------------------------------
  3375. *}
  3376. Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
  3377. Var
  3378. aSign, bSign, zSign: flag;
  3379. aExp, bExp, expDiff: int16;
  3380. aSig, bSig, q, allZero, alternateASig: bits32;
  3381. sigMean: sbits32;
  3382. Begin
  3383. aSig := extractFloat32Frac( a.float32 );
  3384. aExp := extractFloat32Exp( a.float32 );
  3385. aSign := extractFloat32Sign( a.float32 );
  3386. bSig := extractFloat32Frac( b.float32 );
  3387. bExp := extractFloat32Exp( b.float32 );
  3388. bSign := extractFloat32Sign( b.float32 );
  3389. if ( aExp = $FF ) then
  3390. Begin
  3391. if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
  3392. Begin
  3393. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3394. exit;
  3395. End;
  3396. float_raise( float_flag_invalid );
  3397. float32_rem.float32 := float32_default_nan;
  3398. exit;
  3399. End;
  3400. if ( bExp = $FF ) then
  3401. Begin
  3402. if ( bSig <> 0 ) then
  3403. Begin
  3404. float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
  3405. exit;
  3406. End;
  3407. float32_rem := a;
  3408. exit;
  3409. End;
  3410. if ( bExp = 0 ) then
  3411. Begin
  3412. if ( bSig = 0 ) then
  3413. Begin
  3414. float_raise( float_flag_invalid );
  3415. float32_rem.float32 := float32_default_nan;
  3416. exit;
  3417. End;
  3418. normalizeFloat32Subnormal( bSig, bExp, bSig );
  3419. End;
  3420. if ( aExp = 0 ) then
  3421. Begin
  3422. if ( aSig = 0 ) then
  3423. Begin
  3424. float32_rem := a;
  3425. exit;
  3426. End;
  3427. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3428. End;
  3429. expDiff := aExp - bExp;
  3430. aSig := ( aSig OR $00800000 ) shl 8;
  3431. bSig := ( bSig OR $00800000 ) shl 8;
  3432. if ( expDiff < 0 ) then
  3433. Begin
  3434. if ( expDiff < -1 ) then
  3435. Begin
  3436. float32_rem := a;
  3437. exit;
  3438. End;
  3439. aSig := aSig shr 1;
  3440. End;
  3441. q := bits32( bSig <= aSig );
  3442. if ( q <> 0) then
  3443. aSig := aSig - bSig;
  3444. expDiff := expDiff - 32;
  3445. while ( 0 < expDiff ) do
  3446. Begin
  3447. q := estimateDiv64To32( aSig, 0, bSig );
  3448. if (2 < q) then
  3449. q := q - 2
  3450. else
  3451. q := 0;
  3452. aSig := - ( ( bSig shr 2 ) * q );
  3453. expDiff := expDiff - 30;
  3454. End;
  3455. expDiff := expDiff + 32;
  3456. if ( 0 < expDiff ) then
  3457. Begin
  3458. q := estimateDiv64To32( aSig, 0, bSig );
  3459. if (2 < q) then
  3460. q := q - 2
  3461. else
  3462. q := 0;
  3463. q := q shr (32 - expDiff);
  3464. bSig := bSig shr 2;
  3465. aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
  3466. End
  3467. else
  3468. Begin
  3469. aSig := aSig shr 2;
  3470. bSig := bSig shr 2;
  3471. End;
  3472. Repeat
  3473. alternateASig := aSig;
  3474. Inc(q);
  3475. aSig := aSig - bSig;
  3476. Until not ( 0 <= sbits32 (aSig) );
  3477. sigMean := aSig + alternateASig;
  3478. if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
  3479. Begin
  3480. aSig := alternateASig;
  3481. End;
  3482. zSign := flag( sbits32 (aSig) < 0 );
  3483. if ( zSign<>0 ) then
  3484. aSig := - aSig;
  3485. float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
  3486. End;
  3487. {*
  3488. -------------------------------------------------------------------------------
  3489. Returns the square root of the single-precision floating-point value `a'.
  3490. The operation is performed according to the IEC/IEEE Standard for Binary
  3491. Floating-Point Arithmetic.
  3492. -------------------------------------------------------------------------------
  3493. *}
  3494. Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
  3495. Var
  3496. aSign : flag;
  3497. aExp, zExp : int16;
  3498. aSig, zSig, rem0, rem1, term0, term1: bits32;
  3499. label roundAndPack;
  3500. Begin
  3501. aSig := extractFloat32Frac( a.float32 );
  3502. aExp := extractFloat32Exp( a.float32 );
  3503. aSign := extractFloat32Sign( a.float32 );
  3504. if ( aExp = $FF ) then
  3505. Begin
  3506. if ( aSig <> 0) then
  3507. Begin
  3508. float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
  3509. exit;
  3510. End;
  3511. if ( aSign = 0) then
  3512. Begin
  3513. float32_sqrt := a;
  3514. exit;
  3515. End;
  3516. float_raise( float_flag_invalid );
  3517. float32_sqrt.float32 := float32_default_nan;
  3518. exit;
  3519. End;
  3520. if ( aSign <> 0) then
  3521. Begin
  3522. if ( ( aExp OR aSig ) = 0 ) then
  3523. Begin
  3524. float32_sqrt := a;
  3525. exit;
  3526. End;
  3527. float_raise( float_flag_invalid );
  3528. float32_sqrt.float32 := float32_default_nan;
  3529. exit;
  3530. End;
  3531. if ( aExp = 0 ) then
  3532. Begin
  3533. if ( aSig = 0 ) then
  3534. Begin
  3535. float32_sqrt.float32 := 0;
  3536. exit;
  3537. End;
  3538. normalizeFloat32Subnormal( aSig, aExp, aSig );
  3539. End;
  3540. zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
  3541. aSig := ( aSig OR $00800000 ) shl 8;
  3542. zSig := estimateSqrt32( aExp, aSig ) + 2;
  3543. if ( ( zSig and $7F ) <= 5 ) then
  3544. Begin
  3545. if ( zSig < 2 ) then
  3546. Begin
  3547. zSig := $7FFFFFFF;
  3548. goto roundAndPack;
  3549. End
  3550. else
  3551. Begin
  3552. aSig := aSig shr (aExp and 1);
  3553. mul32To64( zSig, zSig, term0, term1 );
  3554. sub64( aSig, 0, term0, term1, rem0, rem1 );
  3555. while ( sbits32 (rem0) < 0 ) do
  3556. Begin
  3557. Dec(zSig);
  3558. shortShift64Left( 0, zSig, 1, term0, term1 );
  3559. term1 := term1 or 1;
  3560. add64( rem0, rem1, term0, term1, rem0, rem1 );
  3561. End;
  3562. zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
  3563. End;
  3564. End;
  3565. shift32RightJamming( zSig, 1, zSig );
  3566. roundAndPack:
  3567. float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
  3568. End;
  3569. {*
  3570. -------------------------------------------------------------------------------
  3571. Returns 1 if the single-precision floating-point value `a' is equal to
  3572. the corresponding value `b', and 0 otherwise. The comparison is performed
  3573. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3574. -------------------------------------------------------------------------------
  3575. *}
  3576. Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
  3577. Begin
  3578. if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
  3579. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3580. ) then
  3581. Begin
  3582. if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
  3583. Begin
  3584. float_raise( float_flag_invalid );
  3585. End;
  3586. float32_eq := 0;
  3587. exit;
  3588. End;
  3589. float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3590. End;
  3591. {*
  3592. -------------------------------------------------------------------------------
  3593. Returns 1 if the single-precision floating-point value `a' is less than
  3594. or equal to the corresponding value `b', and 0 otherwise. The comparison
  3595. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3596. Arithmetic.
  3597. -------------------------------------------------------------------------------
  3598. *}
  3599. Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
  3600. var
  3601. aSign, bSign: flag;
  3602. Begin
  3603. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
  3604. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
  3605. ) then
  3606. Begin
  3607. float_raise( float_flag_invalid );
  3608. float32_le := 0;
  3609. exit;
  3610. End;
  3611. aSign := extractFloat32Sign( a.float32 );
  3612. bSign := extractFloat32Sign( b.float32 );
  3613. if ( aSign <> bSign ) then
  3614. Begin
  3615. float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
  3616. exit;
  3617. End;
  3618. float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
  3619. End;
  3620. {*
  3621. -------------------------------------------------------------------------------
  3622. Returns 1 if the single-precision floating-point value `a' is less than
  3623. the corresponding value `b', and 0 otherwise. The comparison is performed
  3624. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3625. -------------------------------------------------------------------------------
  3626. *}
  3627. Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
  3628. var
  3629. aSign, bSign: flag;
  3630. Begin
  3631. if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
  3632. OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
  3633. ) then
  3634. Begin
  3635. float_raise( float_flag_invalid );
  3636. float32_lt :=0;
  3637. exit;
  3638. End;
  3639. aSign := extractFloat32Sign( a.float32 );
  3640. bSign := extractFloat32Sign( b.float32 );
  3641. if ( aSign <> bSign ) then
  3642. Begin
  3643. float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
  3644. exit;
  3645. End;
  3646. float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
  3647. End;
  3648. {*
  3649. -------------------------------------------------------------------------------
  3650. Returns 1 if the single-precision floating-point value `a' is equal to
  3651. the corresponding value `b', and 0 otherwise. The invalid exception is
  3652. raised if either operand is a NaN. Otherwise, the comparison is performed
  3653. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3654. -------------------------------------------------------------------------------
  3655. *}
  3656. Function float32_eq_signaling( a: float32; b: float32) : flag;
  3657. Begin
  3658. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
  3659. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
  3660. ) then
  3661. Begin
  3662. float_raise( float_flag_invalid );
  3663. float32_eq_signaling := 0;
  3664. exit;
  3665. End;
  3666. float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
  3667. End;
  3668. {*
  3669. -------------------------------------------------------------------------------
  3670. Returns 1 if the single-precision floating-point value `a' is less than or
  3671. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3672. cause an exception. Otherwise, the comparison is performed according to the
  3673. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3674. -------------------------------------------------------------------------------
  3675. *}
  3676. Function float32_le_quiet( a: float32 ; b : float32 ): flag;
  3677. Var
  3678. aSign, bSign: flag;
  3679. aExp, bExp: int16;
  3680. Begin
  3681. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3682. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3683. ) then
  3684. Begin
  3685. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3686. Begin
  3687. float_raise( float_flag_invalid );
  3688. End;
  3689. float32_le_quiet := 0;
  3690. exit;
  3691. End;
  3692. aSign := extractFloat32Sign( a );
  3693. bSign := extractFloat32Sign( b );
  3694. if ( aSign <> bSign ) then
  3695. Begin
  3696. float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
  3697. exit;
  3698. End;
  3699. float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
  3700. End;
  3701. {*
  3702. -------------------------------------------------------------------------------
  3703. Returns 1 if the single-precision floating-point value `a' is less than
  3704. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3705. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3706. Standard for Binary Floating-Point Arithmetic.
  3707. -------------------------------------------------------------------------------
  3708. *}
  3709. Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
  3710. Var
  3711. aSign, bSign: flag;
  3712. Begin
  3713. if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
  3714. OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
  3715. ) then
  3716. Begin
  3717. if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
  3718. Begin
  3719. float_raise( float_flag_invalid );
  3720. End;
  3721. float32_lt_quiet := 0;
  3722. exit;
  3723. End;
  3724. aSign := extractFloat32Sign( a );
  3725. bSign := extractFloat32Sign( b );
  3726. if ( aSign <> bSign ) then
  3727. Begin
  3728. float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
  3729. exit;
  3730. End;
  3731. float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
  3732. End;
  3733. {*
  3734. -------------------------------------------------------------------------------
  3735. Returns the result of converting the double-precision floating-point value
  3736. `a' to the 32-bit two's complement integer format. The conversion is
  3737. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3738. Arithmetic---which means in particular that the conversion is rounded
  3739. according to the current rounding mode. If `a' is a NaN, the largest
  3740. positive integer is returned. Otherwise, if the conversion overflows, the
  3741. largest integer with the same sign as `a' is returned.
  3742. -------------------------------------------------------------------------------
  3743. *}
  3744. Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
  3745. var
  3746. aSign: flag;
  3747. aExp, shiftCount: int16;
  3748. aSig0, aSig1, absZ, aSigExtra: bits32;
  3749. z: int32;
  3750. roundingMode: int8;
  3751. label invalid;
  3752. Begin
  3753. aSig1 := extractFloat64Frac1( a );
  3754. aSig0 := extractFloat64Frac0( a );
  3755. aExp := extractFloat64Exp( a );
  3756. aSign := extractFloat64Sign( a );
  3757. shiftCount := aExp - $413;
  3758. if ( 0 <= shiftCount ) then
  3759. Begin
  3760. if ( $41E < aExp ) then
  3761. Begin
  3762. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3763. aSign := 0;
  3764. goto invalid;
  3765. End;
  3766. shortShift64Left(
  3767. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3768. if ( $80000000 < absZ ) then
  3769. goto invalid;
  3770. End
  3771. else
  3772. Begin
  3773. aSig1 := flag( aSig1 <> 0 );
  3774. if ( aExp < $3FE ) then
  3775. Begin
  3776. aSigExtra := aExp OR aSig0 OR aSig1;
  3777. absZ := 0;
  3778. End
  3779. else
  3780. Begin
  3781. aSig0 := aSig0 OR $00100000;
  3782. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3783. absZ := aSig0 shr ( - shiftCount );
  3784. End;
  3785. End;
  3786. roundingMode := softfloat_rounding_mode;
  3787. if ( roundingMode = float_round_nearest_even ) then
  3788. Begin
  3789. if ( sbits32(aSigExtra) < 0 ) then
  3790. Begin
  3791. Inc(absZ);
  3792. if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
  3793. absZ := absZ and not 1;
  3794. End;
  3795. if aSign <> 0 then
  3796. z := - absZ
  3797. else
  3798. z := absZ;
  3799. End
  3800. else
  3801. Begin
  3802. aSigExtra := bits32( aSigExtra <> 0 );
  3803. if ( aSign <> 0) then
  3804. Begin
  3805. z := - ( absZ
  3806. + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
  3807. End
  3808. else
  3809. Begin
  3810. z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
  3811. End
  3812. End;
  3813. if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
  3814. Begin
  3815. invalid:
  3816. float_raise( float_flag_invalid );
  3817. if (aSign <> 0 ) then
  3818. float64_to_int32 := sbits32 ($80000000)
  3819. else
  3820. float64_to_int32 := $7FFFFFFF;
  3821. exit;
  3822. End;
  3823. if ( aSigExtra <> 0) then
  3824. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3825. float64_to_int32 := z;
  3826. End;
  3827. {*
  3828. -------------------------------------------------------------------------------
  3829. Returns the result of converting the double-precision floating-point value
  3830. `a' to the 32-bit two's complement integer format. The conversion is
  3831. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3832. Arithmetic, except that the conversion is always rounded toward zero.
  3833. If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  3834. the conversion overflows, the largest integer with the same sign as `a' is
  3835. returned.
  3836. -------------------------------------------------------------------------------
  3837. *}
  3838. Function float64_to_int32_round_to_zero(a: float64 ): int32;
  3839. {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
  3840. Var
  3841. aSign: flag;
  3842. aExp, shiftCount: int16;
  3843. aSig0, aSig1, absZ, aSigExtra: bits32;
  3844. z: int32;
  3845. label invalid;
  3846. Begin
  3847. aSig1 := extractFloat64Frac1( a );
  3848. aSig0 := extractFloat64Frac0( a );
  3849. aExp := extractFloat64Exp( a );
  3850. aSign := extractFloat64Sign( a );
  3851. shiftCount := aExp - $413;
  3852. if ( 0 <= shiftCount ) then
  3853. Begin
  3854. if ( $41E < aExp ) then
  3855. Begin
  3856. if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
  3857. aSign := 0;
  3858. goto invalid;
  3859. End;
  3860. shortShift64Left(
  3861. aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
  3862. End
  3863. else
  3864. Begin
  3865. if ( aExp < $3FF ) then
  3866. Begin
  3867. if ( aExp OR aSig0 OR aSig1 )<>0 then
  3868. Begin
  3869. softfloat_exception_flags :=
  3870. softfloat_exception_flags or float_flag_inexact;
  3871. End;
  3872. float64_to_int32_round_to_zero := 0;
  3873. exit;
  3874. End;
  3875. aSig0 := aSig0 or $00100000;
  3876. aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
  3877. absZ := aSig0 shr ( - shiftCount );
  3878. End;
  3879. if aSign <> 0 then
  3880. z := - absZ
  3881. else
  3882. z := absZ;
  3883. if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
  3884. Begin
  3885. invalid:
  3886. float_raise( float_flag_invalid );
  3887. if (aSign <> 0) then
  3888. float64_to_int32_round_to_zero := sbits32 ($80000000)
  3889. else
  3890. float64_to_int32_round_to_zero := $7FFFFFFF;
  3891. exit;
  3892. End;
  3893. if ( aSigExtra <> 0) then
  3894. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  3895. float64_to_int32_round_to_zero := z;
  3896. End;
  3897. {*
  3898. -------------------------------------------------------------------------------
  3899. Returns the result of converting the double-precision floating-point value
  3900. `a' to the single-precision floating-point format. The conversion is
  3901. performed according to the IEC/IEEE Standard for Binary Floating-Point
  3902. Arithmetic.
  3903. -------------------------------------------------------------------------------
  3904. *}
  3905. Function float64_to_float32(a: float64 ): float32rec;compilerproc;
  3906. Var
  3907. aSign: flag;
  3908. aExp: int16;
  3909. aSig0, aSig1, zSig: bits32;
  3910. allZero: bits32;
  3911. tmp : CommonNanT;
  3912. Begin
  3913. aSig1 := extractFloat64Frac1( a );
  3914. aSig0 := extractFloat64Frac0( a );
  3915. aExp := extractFloat64Exp( a );
  3916. aSign := extractFloat64Sign( a );
  3917. if ( aExp = $7FF ) then
  3918. Begin
  3919. if ( aSig0 OR aSig1 ) <> 0 then
  3920. Begin
  3921. float64ToCommonNaN( a, tmp );
  3922. float64_to_float32.float32 := commonNaNToFloat32( tmp );
  3923. exit;
  3924. End;
  3925. float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
  3926. exit;
  3927. End;
  3928. shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
  3929. if ( aExp <> 0) then
  3930. zSig := zSig OR $40000000;
  3931. float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
  3932. End;
  3933. {*
  3934. -------------------------------------------------------------------------------
  3935. Rounds the double-precision floating-point value `a' to an integer,
  3936. and returns the result as a double-precision floating-point value. The
  3937. operation is performed according to the IEC/IEEE Standard for Binary
  3938. Floating-Point Arithmetic.
  3939. -------------------------------------------------------------------------------
  3940. *}
  3941. function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
  3942. Var
  3943. aSign: flag;
  3944. aExp: int16;
  3945. lastBitMask, roundBitsMask: bits32;
  3946. roundingMode: int8;
  3947. z: float64;
  3948. Begin
  3949. aExp := extractFloat64Exp( a );
  3950. if ( $413 <= aExp ) then
  3951. Begin
  3952. if ( $433 <= aExp ) then
  3953. Begin
  3954. if ( ( aExp = $7FF )
  3955. AND
  3956. (
  3957. ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
  3958. ) <>0)
  3959. ) then
  3960. Begin
  3961. propagateFloat64NaN( a, a, result );
  3962. exit;
  3963. End;
  3964. result := a;
  3965. exit;
  3966. End;
  3967. lastBitMask := 1;
  3968. lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
  3969. roundBitsMask := lastBitMask - 1;
  3970. z := a;
  3971. roundingMode := softfloat_rounding_mode;
  3972. if ( roundingMode = float_round_nearest_even ) then
  3973. Begin
  3974. if ( lastBitMask <> 0) then
  3975. Begin
  3976. add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  3977. if ( ( z.low and roundBitsMask ) = 0 ) then
  3978. z.low := z.low and not lastBitMask;
  3979. End
  3980. else
  3981. Begin
  3982. if ( sbits32 (z.low) < 0 ) then
  3983. Begin
  3984. Inc(z.high);
  3985. if ( bits32 ( z.low shl 1 ) = 0 ) then
  3986. z.high := z.high and not 1;
  3987. End;
  3988. End;
  3989. End
  3990. else if ( roundingMode <> float_round_to_zero ) then
  3991. Begin
  3992. if ( extractFloat64Sign( z )
  3993. xor flag( roundingMode = float_round_up ) )<> 0 then
  3994. Begin
  3995. add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  3996. End;
  3997. End;
  3998. z.low := z.low and not roundBitsMask;
  3999. End
  4000. else
  4001. Begin
  4002. if ( aExp <= $3FE ) then
  4003. Begin
  4004. if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
  4005. Begin
  4006. result := a;
  4007. exit;
  4008. End;
  4009. softfloat_exception_flags := softfloat_exception_flags or
  4010. float_flag_inexact;
  4011. aSign := extractFloat64Sign( a );
  4012. case ( softfloat_rounding_mode ) of
  4013. float_round_nearest_even:
  4014. Begin
  4015. if ( ( aExp = $3FE )
  4016. AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
  4017. ) then
  4018. Begin
  4019. packFloat64( aSign, $3FF, 0, 0, result );
  4020. exit;
  4021. End;
  4022. End;
  4023. float_round_down:
  4024. Begin
  4025. if aSign<>0 then
  4026. packFloat64( 1, $3FF, 0, 0, result )
  4027. else
  4028. packFloat64( 0, 0, 0, 0, result );
  4029. exit;
  4030. End;
  4031. float_round_up:
  4032. Begin
  4033. if aSign <> 0 then
  4034. packFloat64( 1, 0, 0, 0, result )
  4035. else
  4036. packFloat64( 0, $3FF, 0, 0, result );
  4037. exit;
  4038. End;
  4039. end;
  4040. packFloat64( aSign, 0, 0, 0, result );
  4041. exit;
  4042. End;
  4043. lastBitMask := 1;
  4044. lastBitMask := lastBitMask shl ($413 - aExp);
  4045. roundBitsMask := lastBitMask - 1;
  4046. z.low := 0;
  4047. z.high := a.high;
  4048. roundingMode := softfloat_rounding_mode;
  4049. if ( roundingMode = float_round_nearest_even ) then
  4050. Begin
  4051. z.high := z.high + lastBitMask shr 1;
  4052. if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
  4053. Begin
  4054. z.high := z.high and not lastBitMask;
  4055. End;
  4056. End
  4057. else if ( roundingMode <> float_round_to_zero ) then
  4058. Begin
  4059. if ( extractFloat64Sign( z )
  4060. xor flag( roundingMode = float_round_up ) )<> 0 then
  4061. Begin
  4062. z.high := z.high or bits32( a.low <> 0 );
  4063. z.high := z.high + roundBitsMask;
  4064. End;
  4065. End;
  4066. z.high := z.high and not roundBitsMask;
  4067. End;
  4068. if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
  4069. Begin
  4070. softfloat_exception_flags :=
  4071. softfloat_exception_flags or float_flag_inexact;
  4072. End;
  4073. result := z;
  4074. End;
  4075. {*
  4076. -------------------------------------------------------------------------------
  4077. Returns the result of adding the absolute values of the double-precision
  4078. floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4079. before being returned. `zSign' is ignored if the result is a NaN.
  4080. The addition is performed according to the IEC/IEEE Standard for Binary
  4081. Floating-Point Arithmetic.
  4082. -------------------------------------------------------------------------------
  4083. *}
  4084. Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
  4085. Var
  4086. aExp, bExp, zExp: int16;
  4087. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4088. expDiff: int16;
  4089. label shiftRight1;
  4090. label roundAndPack;
  4091. Begin
  4092. aSig1 := extractFloat64Frac1( a );
  4093. aSig0 := extractFloat64Frac0( a );
  4094. aExp := extractFloat64Exp( a );
  4095. bSig1 := extractFloat64Frac1( b );
  4096. bSig0 := extractFloat64Frac0( b );
  4097. bExp := extractFloat64Exp( b );
  4098. expDiff := aExp - bExp;
  4099. if ( 0 < expDiff ) then
  4100. Begin
  4101. if ( aExp = $7FF ) then
  4102. Begin
  4103. if ( aSig0 OR aSig1 ) <> 0 then
  4104. Begin
  4105. propagateFloat64NaN( a, b, out );
  4106. exit;
  4107. end;
  4108. out := a;
  4109. exit;
  4110. End;
  4111. if ( bExp = 0 ) then
  4112. Begin
  4113. Dec(expDiff);
  4114. End
  4115. else
  4116. Begin
  4117. bSig0 := bSig0 or $00100000;
  4118. End;
  4119. shift64ExtraRightJamming(
  4120. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  4121. zExp := aExp;
  4122. End
  4123. else if ( expDiff < 0 ) then
  4124. Begin
  4125. if ( bExp = $7FF ) then
  4126. Begin
  4127. if ( bSig0 OR bSig1 ) <> 0 then
  4128. Begin
  4129. propagateFloat64NaN( a, b, out );
  4130. exit;
  4131. End;
  4132. packFloat64( zSign, $7FF, 0, 0, out );
  4133. End;
  4134. if ( aExp = 0 ) then
  4135. Begin
  4136. Inc(expDiff);
  4137. End
  4138. else
  4139. Begin
  4140. aSig0 := aSig0 or $00100000;
  4141. End;
  4142. shift64ExtraRightJamming(
  4143. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  4144. zExp := bExp;
  4145. End
  4146. else
  4147. Begin
  4148. if ( aExp = $7FF ) then
  4149. Begin
  4150. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4151. Begin
  4152. propagateFloat64NaN( a, b, out );
  4153. exit;
  4154. End;
  4155. out := a;
  4156. exit;
  4157. End;
  4158. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4159. if ( aExp = 0 ) then
  4160. Begin
  4161. packFloat64( zSign, 0, zSig0, zSig1, out );
  4162. exit;
  4163. End;
  4164. zSig2 := 0;
  4165. zSig0 := zSig0 or $00200000;
  4166. zExp := aExp;
  4167. goto shiftRight1;
  4168. End;
  4169. aSig0 := aSig0 or $00100000;
  4170. add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4171. Dec(zExp);
  4172. if ( zSig0 < $00200000 ) then
  4173. goto roundAndPack;
  4174. Inc(zExp);
  4175. shiftRight1:
  4176. shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4177. roundAndPack:
  4178. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
  4179. End;
  4180. {*
  4181. -------------------------------------------------------------------------------
  4182. Returns the result of subtracting the absolute values of the double-
  4183. precision floating-point values `a' and `b'. If `zSign' is 1, the
  4184. difference is negated before being returned. `zSign' is ignored if the
  4185. result is a NaN. The subtraction is performed according to the IEC/IEEE
  4186. Standard for Binary Floating-Point Arithmetic.
  4187. -------------------------------------------------------------------------------
  4188. *}
  4189. Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
  4190. Var
  4191. aExp, bExp, zExp: int16;
  4192. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
  4193. expDiff: int16;
  4194. z: float64;
  4195. label aExpBigger;
  4196. label bExpBigger;
  4197. label aBigger;
  4198. label bBigger;
  4199. label normalizeRoundAndPack;
  4200. Begin
  4201. aSig1 := extractFloat64Frac1( a );
  4202. aSig0 := extractFloat64Frac0( a );
  4203. aExp := extractFloat64Exp( a );
  4204. bSig1 := extractFloat64Frac1( b );
  4205. bSig0 := extractFloat64Frac0( b );
  4206. bExp := extractFloat64Exp( b );
  4207. expDiff := aExp - bExp;
  4208. shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
  4209. shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
  4210. if ( 0 < expDiff ) then goto aExpBigger;
  4211. if ( expDiff < 0 ) then goto bExpBigger;
  4212. if ( aExp = $7FF ) then
  4213. Begin
  4214. if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
  4215. Begin
  4216. propagateFloat64NaN( a, b, out );
  4217. exit;
  4218. End;
  4219. float_raise( float_flag_invalid );
  4220. z.low := float64_default_nan_low;
  4221. z.high := float64_default_nan_high;
  4222. out := z;
  4223. exit;
  4224. End;
  4225. if ( aExp = 0 ) then
  4226. Begin
  4227. aExp := 1;
  4228. bExp := 1;
  4229. End;
  4230. if ( bSig0 < aSig0 ) then goto aBigger;
  4231. if ( aSig0 < bSig0 ) then goto bBigger;
  4232. if ( bSig1 < aSig1 ) then goto aBigger;
  4233. if ( aSig1 < bSig1 ) then goto bBigger;
  4234. packFloat64( flag(softfloat_rounding_mode = float_round_down), 0, 0, 0 , out);
  4235. exit;
  4236. bExpBigger:
  4237. if ( bExp = $7FF ) then
  4238. Begin
  4239. if ( bSig0 OR bSig1 ) <> 0 then
  4240. Begin
  4241. propagateFloat64NaN( a, b, out );
  4242. exit;
  4243. End;
  4244. packFloat64( zSign xor 1, $7FF, 0, 0, out );
  4245. exit;
  4246. End;
  4247. if ( aExp = 0 ) then
  4248. Begin
  4249. Inc(expDiff);
  4250. End
  4251. else
  4252. Begin
  4253. aSig0 := aSig0 or $40000000;
  4254. End;
  4255. shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4256. bSig0 := bSig0 or $40000000;
  4257. bBigger:
  4258. sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  4259. zExp := bExp;
  4260. zSign := zSign xor 1;
  4261. goto normalizeRoundAndPack;
  4262. aExpBigger:
  4263. if ( aExp = $7FF ) then
  4264. Begin
  4265. if ( aSig0 OR aSig1 ) <> 0 then
  4266. Begin
  4267. propagateFloat64NaN( a, b, out );
  4268. exit;
  4269. End;
  4270. out := a;
  4271. exit;
  4272. End;
  4273. if ( bExp = 0 ) then
  4274. Begin
  4275. Dec(expDiff);
  4276. End
  4277. else
  4278. Begin
  4279. bSig0 := bSig0 or $40000000;
  4280. End;
  4281. shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  4282. aSig0 := aSig0 or $40000000;
  4283. aBigger:
  4284. sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  4285. zExp := aExp;
  4286. normalizeRoundAndPack:
  4287. Dec(zExp);
  4288. normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
  4289. End;
  4290. {*
  4291. -------------------------------------------------------------------------------
  4292. Returns the result of adding the double-precision floating-point values `a'
  4293. and `b'. The operation is performed according to the IEC/IEEE Standard for
  4294. Binary Floating-Point Arithmetic.
  4295. -------------------------------------------------------------------------------
  4296. *}
  4297. Function float64_add( a: float64; b : float64) : Float64;
  4298. {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
  4299. Var
  4300. aSign, bSign: flag;
  4301. Begin
  4302. aSign := extractFloat64Sign( a );
  4303. bSign := extractFloat64Sign( b );
  4304. if ( aSign = bSign ) then
  4305. Begin
  4306. addFloat64Sigs( a, b, aSign, result );
  4307. End
  4308. else
  4309. Begin
  4310. subFloat64Sigs( a, b, aSign, result );
  4311. End;
  4312. End;
  4313. {*
  4314. -------------------------------------------------------------------------------
  4315. Returns the result of subtracting the double-precision floating-point values
  4316. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4317. for Binary Floating-Point Arithmetic.
  4318. -------------------------------------------------------------------------------
  4319. *}
  4320. Function float64_sub(a: float64; b : float64) : Float64;
  4321. {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
  4322. Var
  4323. aSign, bSign: flag;
  4324. Begin
  4325. aSign := extractFloat64Sign( a );
  4326. bSign := extractFloat64Sign( b );
  4327. if ( aSign = bSign ) then
  4328. Begin
  4329. subFloat64Sigs( a, b, aSign, result );
  4330. End
  4331. else
  4332. Begin
  4333. addFloat64Sigs( a, b, aSign, result );
  4334. End;
  4335. End;
  4336. {*
  4337. -------------------------------------------------------------------------------
  4338. Returns the result of multiplying the double-precision floating-point values
  4339. `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4340. for Binary Floating-Point Arithmetic.
  4341. -------------------------------------------------------------------------------
  4342. *}
  4343. Function float64_mul( a: float64; b:float64) : Float64;
  4344. {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
  4345. Var
  4346. aSign, bSign, zSign: flag;
  4347. aExp, bExp, zExp: int16;
  4348. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
  4349. z: float64;
  4350. label invalid;
  4351. Begin
  4352. aSig1 := extractFloat64Frac1( a );
  4353. aSig0 := extractFloat64Frac0( a );
  4354. aExp := extractFloat64Exp( a );
  4355. aSign := extractFloat64Sign( a );
  4356. bSig1 := extractFloat64Frac1( b );
  4357. bSig0 := extractFloat64Frac0( b );
  4358. bExp := extractFloat64Exp( b );
  4359. bSign := extractFloat64Sign( b );
  4360. zSign := aSign xor bSign;
  4361. if ( aExp = $7FF ) then
  4362. Begin
  4363. if ( (( aSig0 OR aSig1 ) <>0)
  4364. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4365. Begin
  4366. propagateFloat64NaN( a, b, result );
  4367. exit;
  4368. End;
  4369. if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
  4370. packFloat64( zSign, $7FF, 0, 0, result );
  4371. exit;
  4372. End;
  4373. if ( bExp = $7FF ) then
  4374. Begin
  4375. if ( bSig0 OR bSig1 )<> 0 then
  4376. Begin
  4377. propagateFloat64NaN( a, b, result );
  4378. exit;
  4379. End;
  4380. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4381. Begin
  4382. invalid:
  4383. float_raise( float_flag_invalid );
  4384. z.low := float64_default_nan_low;
  4385. z.high := float64_default_nan_high;
  4386. result := z;
  4387. exit;
  4388. End;
  4389. packFloat64( zSign, $7FF, 0, 0, result );
  4390. exit;
  4391. End;
  4392. if ( aExp = 0 ) then
  4393. Begin
  4394. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4395. Begin
  4396. packFloat64( zSign, 0, 0, 0, result );
  4397. exit;
  4398. End;
  4399. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4400. End;
  4401. if ( bExp = 0 ) then
  4402. Begin
  4403. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4404. Begin
  4405. packFloat64( zSign, 0, 0, 0, result );
  4406. exit;
  4407. End;
  4408. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4409. End;
  4410. zExp := aExp + bExp - $400;
  4411. aSig0 := aSig0 or $00100000;
  4412. shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
  4413. mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  4414. add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  4415. zSig2 := zSig2 or flag( zSig3 <> 0 );
  4416. if ( $00200000 <= zSig0 ) then
  4417. Begin
  4418. shift64ExtraRightJamming(
  4419. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  4420. Inc(zExp);
  4421. End;
  4422. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4423. End;
  4424. {*
  4425. -------------------------------------------------------------------------------
  4426. Returns the result of dividing the double-precision floating-point value `a'
  4427. by the corresponding value `b'. The operation is performed according to the
  4428. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4429. -------------------------------------------------------------------------------
  4430. *}
  4431. Function float64_div(a: float64; b : float64) : Float64;
  4432. {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
  4433. Var
  4434. aSign, bSign, zSign: flag;
  4435. aExp, bExp, zExp: int16;
  4436. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
  4437. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4438. z: float64;
  4439. label invalid;
  4440. Begin
  4441. aSig1 := extractFloat64Frac1( a );
  4442. aSig0 := extractFloat64Frac0( a );
  4443. aExp := extractFloat64Exp( a );
  4444. aSign := extractFloat64Sign( a );
  4445. bSig1 := extractFloat64Frac1( b );
  4446. bSig0 := extractFloat64Frac0( b );
  4447. bExp := extractFloat64Exp( b );
  4448. bSign := extractFloat64Sign( b );
  4449. zSign := aSign xor bSign;
  4450. if ( aExp = $7FF ) then
  4451. Begin
  4452. if ( aSig0 OR aSig1 )<> 0 then
  4453. Begin
  4454. propagateFloat64NaN( a, b, result );
  4455. exit;
  4456. end;
  4457. if ( bExp = $7FF ) then
  4458. Begin
  4459. if ( bSig0 OR bSig1 )<>0 then
  4460. Begin
  4461. propagateFloat64NaN( a, b, result );
  4462. exit;
  4463. End;
  4464. goto invalid;
  4465. End;
  4466. packFloat64( zSign, $7FF, 0, 0, result );
  4467. exit;
  4468. End;
  4469. if ( bExp = $7FF ) then
  4470. Begin
  4471. if ( bSig0 OR bSig1 )<> 0 then
  4472. Begin
  4473. propagateFloat64NaN( a, b, result );
  4474. exit;
  4475. End;
  4476. packFloat64( zSign, 0, 0, 0, result );
  4477. exit;
  4478. End;
  4479. if ( bExp = 0 ) then
  4480. Begin
  4481. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4482. Begin
  4483. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4484. Begin
  4485. invalid:
  4486. float_raise( float_flag_invalid );
  4487. z.low := float64_default_nan_low;
  4488. z.high := float64_default_nan_high;
  4489. result := z;
  4490. exit;
  4491. End;
  4492. float_raise( float_flag_divbyzero );
  4493. packFloat64( zSign, $7FF, 0, 0, result );
  4494. exit;
  4495. End;
  4496. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4497. End;
  4498. if ( aExp = 0 ) then
  4499. Begin
  4500. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4501. Begin
  4502. packFloat64( zSign, 0, 0, 0, result );
  4503. exit;
  4504. End;
  4505. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4506. End;
  4507. zExp := aExp - bExp + $3FD;
  4508. shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
  4509. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4510. if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
  4511. Begin
  4512. shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
  4513. Inc(zExp);
  4514. End;
  4515. zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4516. mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
  4517. sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  4518. while ( sbits32 (rem0) < 0 ) do
  4519. Begin
  4520. Dec(zSig0);
  4521. add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  4522. End;
  4523. zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
  4524. if ( ( zSig1 and $3FF ) <= 4 ) then
  4525. Begin
  4526. mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
  4527. sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  4528. while ( sbits32 (rem1) < 0 ) do
  4529. Begin
  4530. Dec(zSig1);
  4531. add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  4532. End;
  4533. zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4534. End;
  4535. shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
  4536. roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
  4537. End;
  4538. {*
  4539. -------------------------------------------------------------------------------
  4540. Returns the remainder of the double-precision floating-point value `a'
  4541. with respect to the corresponding value `b'. The operation is performed
  4542. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4543. -------------------------------------------------------------------------------
  4544. *}
  4545. Function float64_rem(a: float64; b : float64) : float64;
  4546. {$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
  4547. Var
  4548. aSign, bSign, zSign: flag;
  4549. aExp, bExp, expDiff: int16;
  4550. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
  4551. allZero, alternateASig0, alternateASig1, sigMean1: bits32;
  4552. sigMean0: sbits32;
  4553. z: float64;
  4554. label invalid;
  4555. Begin
  4556. aSig1 := extractFloat64Frac1( a );
  4557. aSig0 := extractFloat64Frac0( a );
  4558. aExp := extractFloat64Exp( a );
  4559. aSign := extractFloat64Sign( a );
  4560. bSig1 := extractFloat64Frac1( b );
  4561. bSig0 := extractFloat64Frac0( b );
  4562. bExp := extractFloat64Exp( b );
  4563. bSign := extractFloat64Sign( b );
  4564. if ( aExp = $7FF ) then
  4565. Begin
  4566. if ((( aSig0 OR aSig1 )<>0)
  4567. OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
  4568. Begin
  4569. propagateFloat64NaN( a, b, result );
  4570. exit;
  4571. End;
  4572. goto invalid;
  4573. End;
  4574. if ( bExp = $7FF ) then
  4575. Begin
  4576. if ( bSig0 OR bSig1 ) <> 0 then
  4577. Begin
  4578. propagateFloat64NaN( a, b, result );
  4579. exit;
  4580. End;
  4581. result := a;
  4582. exit;
  4583. End;
  4584. if ( bExp = 0 ) then
  4585. Begin
  4586. if ( ( bSig0 OR bSig1 ) = 0 ) then
  4587. Begin
  4588. invalid:
  4589. float_raise( float_flag_invalid );
  4590. z.low := float64_default_nan_low;
  4591. z.high := float64_default_nan_high;
  4592. result := z;
  4593. exit;
  4594. End;
  4595. normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  4596. End;
  4597. if ( aExp = 0 ) then
  4598. Begin
  4599. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4600. Begin
  4601. result := a;
  4602. exit;
  4603. End;
  4604. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4605. End;
  4606. expDiff := aExp - bExp;
  4607. if ( expDiff < -1 ) then
  4608. Begin
  4609. result := a;
  4610. exit;
  4611. End;
  4612. shortShift64Left(
  4613. aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
  4614. shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
  4615. q := le64( bSig0, bSig1, aSig0, aSig1 );
  4616. if ( q )<>0 then
  4617. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4618. expDiff := expDiff - 32;
  4619. while ( 0 < expDiff ) do
  4620. Begin
  4621. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4622. if 4 < q then
  4623. q:= q - 4
  4624. else
  4625. q := 0;
  4626. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4627. shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
  4628. shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
  4629. sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
  4630. expDiff := expDiff - 29;
  4631. End;
  4632. if ( -32 < expDiff ) then
  4633. Begin
  4634. q := estimateDiv64To32( aSig0, aSig1, bSig0 );
  4635. if 4 < q then
  4636. q := q - 4
  4637. else
  4638. q := 0;
  4639. q := q shr (- expDiff);
  4640. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4641. expDiff := expDiff + 24;
  4642. if ( expDiff < 0 ) then
  4643. Begin
  4644. shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  4645. End
  4646. else
  4647. Begin
  4648. shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  4649. End;
  4650. mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
  4651. sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  4652. End
  4653. else
  4654. Begin
  4655. shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
  4656. shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
  4657. End;
  4658. Repeat
  4659. alternateASig0 := aSig0;
  4660. alternateASig1 := aSig1;
  4661. Inc(q);
  4662. sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  4663. Until not ( 0 <= sbits32 (aSig0) );
  4664. add64(
  4665. aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
  4666. if ( ( sigMean0 < 0 )
  4667. OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
  4668. Begin
  4669. aSig0 := alternateASig0;
  4670. aSig1 := alternateASig1;
  4671. End;
  4672. zSign := flag( sbits32 (aSig0) < 0 );
  4673. if ( zSign <> 0 ) then
  4674. sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  4675. normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
  4676. End;
  4677. {*
  4678. -------------------------------------------------------------------------------
  4679. Returns the square root of the double-precision floating-point value `a'.
  4680. The operation is performed according to the IEC/IEEE Standard for Binary
  4681. Floating-Point Arithmetic.
  4682. -------------------------------------------------------------------------------
  4683. *}
  4684. Procedure float64_sqrt( a: float64; var out: float64 );
  4685. {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
  4686. Var
  4687. aSign: flag;
  4688. aExp, zExp: int16;
  4689. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
  4690. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
  4691. z: float64;
  4692. label invalid;
  4693. Begin
  4694. aSig1 := extractFloat64Frac1( a );
  4695. aSig0 := extractFloat64Frac0( a );
  4696. aExp := extractFloat64Exp( a );
  4697. aSign := extractFloat64Sign( a );
  4698. if ( aExp = $7FF ) then
  4699. Begin
  4700. if ( aSig0 OR aSig1 ) <> 0 then
  4701. Begin
  4702. propagateFloat64NaN( a, a, out );
  4703. exit;
  4704. End;
  4705. if ( aSign = 0) then
  4706. Begin
  4707. out := a;
  4708. exit;
  4709. End;
  4710. goto invalid;
  4711. End;
  4712. if ( aSign <> 0 ) then
  4713. Begin
  4714. if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
  4715. Begin
  4716. out := a;
  4717. exit;
  4718. End;
  4719. invalid:
  4720. float_raise( float_flag_invalid );
  4721. z.low := float64_default_nan_low;
  4722. z.high := float64_default_nan_high;
  4723. out := z;
  4724. exit;
  4725. End;
  4726. if ( aExp = 0 ) then
  4727. Begin
  4728. if ( ( aSig0 OR aSig1 ) = 0 ) then
  4729. Begin
  4730. packFloat64( 0, 0, 0, 0, out );
  4731. exit;
  4732. End;
  4733. normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  4734. End;
  4735. zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
  4736. aSig0 := aSig0 or $00100000;
  4737. shortShift64Left( aSig0, aSig1, 11, term0, term1 );
  4738. zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
  4739. if ( zSig0 = 0 ) then
  4740. zSig0 := $7FFFFFFF;
  4741. doubleZSig0 := zSig0 + zSig0;
  4742. shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
  4743. mul32To64( zSig0, zSig0, term0, term1 );
  4744. sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
  4745. while ( sbits32 (rem0) < 0 ) do
  4746. Begin
  4747. Dec(zSig0);
  4748. doubleZSig0 := doubleZSig0 - 2;
  4749. add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
  4750. End;
  4751. zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
  4752. if ( ( zSig1 and $1FF ) <= 5 ) then
  4753. Begin
  4754. if ( zSig1 = 0 ) then
  4755. zSig1 := 1;
  4756. mul32To64( doubleZSig0, zSig1, term1, term2 );
  4757. sub64( rem1, 0, term1, term2, rem1, rem2 );
  4758. mul32To64( zSig1, zSig1, term2, term3 );
  4759. sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  4760. while ( sbits32 (rem1) < 0 ) do
  4761. Begin
  4762. Dec(zSig1);
  4763. shortShift64Left( 0, zSig1, 1, term2, term3 );
  4764. term3 := term3 or 1;
  4765. term2 := term2 or doubleZSig0;
  4766. add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  4767. End;
  4768. zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
  4769. End;
  4770. shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
  4771. roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
  4772. End;
  4773. {*
  4774. -------------------------------------------------------------------------------
  4775. Returns 1 if the double-precision floating-point value `a' is equal to
  4776. the corresponding value `b', and 0 otherwise. The comparison is performed
  4777. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4778. -------------------------------------------------------------------------------
  4779. *}
  4780. Function float64_eq(a: float64; b: float64): flag;
  4781. {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
  4782. Begin
  4783. if
  4784. (
  4785. ( extractFloat64Exp( a ) = $7FF )
  4786. AND
  4787. (
  4788. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4789. )
  4790. )
  4791. OR (
  4792. ( extractFloat64Exp( b ) = $7FF )
  4793. AND (
  4794. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4795. )
  4796. )
  4797. ) then
  4798. Begin
  4799. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4800. float_raise( float_flag_invalid );
  4801. float64_eq := 0;
  4802. exit;
  4803. End;
  4804. float64_eq := flag(
  4805. ( a.low = b.low )
  4806. AND ( ( a.high = b.high )
  4807. OR ( ( a.low = 0 )
  4808. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4809. ));
  4810. End;
  4811. {*
  4812. -------------------------------------------------------------------------------
  4813. Returns 1 if the double-precision floating-point value `a' is less than
  4814. or equal to the corresponding value `b', and 0 otherwise. The comparison
  4815. is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4816. Arithmetic.
  4817. -------------------------------------------------------------------------------
  4818. *}
  4819. Function float64_le(a: float64;b: float64): flag;
  4820. {$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
  4821. Var
  4822. aSign, bSign: flag;
  4823. Begin
  4824. if
  4825. (
  4826. ( extractFloat64Exp( a ) = $7FF )
  4827. AND
  4828. (
  4829. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4830. )
  4831. )
  4832. OR (
  4833. ( extractFloat64Exp( b ) = $7FF )
  4834. AND (
  4835. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4836. )
  4837. )
  4838. ) then
  4839. Begin
  4840. float_raise( float_flag_invalid );
  4841. float64_le := 0;
  4842. exit;
  4843. End;
  4844. aSign := extractFloat64Sign( a );
  4845. bSign := extractFloat64Sign( b );
  4846. if ( aSign <> bSign ) then
  4847. Begin
  4848. float64_le := flag(
  4849. (aSign <> 0)
  4850. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4851. = 0 ));
  4852. exit;
  4853. End;
  4854. if aSign <> 0 then
  4855. float64_le := le64( b.high, b.low, a.high, a.low )
  4856. else
  4857. float64_le := le64( a.high, a.low, b.high, b.low );
  4858. End;
  4859. {*
  4860. -------------------------------------------------------------------------------
  4861. Returns 1 if the double-precision floating-point value `a' is less than
  4862. the corresponding value `b', and 0 otherwise. The comparison is performed
  4863. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4864. -------------------------------------------------------------------------------
  4865. *}
  4866. Function float64_lt(a: float64;b: float64): flag;
  4867. {$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
  4868. Var
  4869. aSign, bSign: flag;
  4870. Begin
  4871. if
  4872. (
  4873. ( extractFloat64Exp( a ) = $7FF )
  4874. AND
  4875. (
  4876. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4877. )
  4878. )
  4879. OR (
  4880. ( extractFloat64Exp( b ) = $7FF )
  4881. AND (
  4882. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4883. )
  4884. )
  4885. ) then
  4886. Begin
  4887. float_raise( float_flag_invalid );
  4888. float64_lt := 0;
  4889. exit;
  4890. End;
  4891. aSign := extractFloat64Sign( a );
  4892. bSign := extractFloat64Sign( b );
  4893. if ( aSign <> bSign ) then
  4894. Begin
  4895. float64_lt := flag(
  4896. (aSign <> 0)
  4897. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4898. <> 0 ));
  4899. exit;
  4900. End;
  4901. if aSign <> 0 then
  4902. float64_lt := lt64( b.high, b.low, a.high, a.low )
  4903. else
  4904. float64_lt := lt64( a.high, a.low, b.high, b.low );
  4905. End;
  4906. {*
  4907. -------------------------------------------------------------------------------
  4908. Returns 1 if the double-precision floating-point value `a' is equal to
  4909. the corresponding value `b', and 0 otherwise. The invalid exception is
  4910. raised if either operand is a NaN. Otherwise, the comparison is performed
  4911. according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4912. -------------------------------------------------------------------------------
  4913. *}
  4914. Function float64_eq_signaling( a: float64; b: float64): flag;
  4915. Begin
  4916. if
  4917. (
  4918. ( extractFloat64Exp( a ) = $7FF )
  4919. AND
  4920. (
  4921. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4922. )
  4923. )
  4924. OR (
  4925. ( extractFloat64Exp( b ) = $7FF )
  4926. AND (
  4927. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4928. )
  4929. )
  4930. ) then
  4931. Begin
  4932. float_raise( float_flag_invalid );
  4933. float64_eq_signaling := 0;
  4934. exit;
  4935. End;
  4936. float64_eq_signaling := flag(
  4937. ( a.low = b.low )
  4938. AND ( ( a.high = b.high )
  4939. OR ( ( a.low = 0 )
  4940. AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
  4941. ));
  4942. End;
  4943. {*
  4944. -------------------------------------------------------------------------------
  4945. Returns 1 if the double-precision floating-point value `a' is less than or
  4946. equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4947. cause an exception. Otherwise, the comparison is performed according to the
  4948. IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4949. -------------------------------------------------------------------------------
  4950. *}
  4951. Function float64_le_quiet(a: float64 ; b: float64 ): flag;
  4952. Var
  4953. aSign, bSign : flag;
  4954. Begin
  4955. if
  4956. (
  4957. ( extractFloat64Exp( a ) = $7FF )
  4958. AND
  4959. (
  4960. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  4961. )
  4962. )
  4963. OR (
  4964. ( extractFloat64Exp( b ) = $7FF )
  4965. AND (
  4966. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  4967. )
  4968. )
  4969. ) then
  4970. Begin
  4971. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  4972. float_raise( float_flag_invalid );
  4973. float64_le_quiet := 0;
  4974. exit;
  4975. End;
  4976. aSign := extractFloat64Sign( a );
  4977. bSign := extractFloat64Sign( b );
  4978. if ( aSign <> bSign ) then
  4979. Begin
  4980. float64_le_quiet := flag
  4981. ((aSign <> 0)
  4982. OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  4983. = 0 ));
  4984. exit;
  4985. End;
  4986. if aSign <> 0 then
  4987. float64_le_quiet := le64( b.high, b.low, a.high, a.low )
  4988. else
  4989. float64_le_quiet := le64( a.high, a.low, b.high, b.low );
  4990. End;
  4991. {*
  4992. -------------------------------------------------------------------------------
  4993. Returns 1 if the double-precision floating-point value `a' is less than
  4994. the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4995. exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4996. Standard for Binary Floating-Point Arithmetic.
  4997. -------------------------------------------------------------------------------
  4998. *}
  4999. Function float64_lt_quiet(a: float64; b: float64 ): Flag;
  5000. Var
  5001. aSign, bSign: flag;
  5002. Begin
  5003. if
  5004. (
  5005. ( extractFloat64Exp( a ) = $7FF )
  5006. AND
  5007. (
  5008. (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
  5009. )
  5010. )
  5011. OR (
  5012. ( extractFloat64Exp( b ) = $7FF )
  5013. AND (
  5014. (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
  5015. )
  5016. )
  5017. ) then
  5018. Begin
  5019. if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
  5020. float_raise( float_flag_invalid );
  5021. float64_lt_quiet := 0;
  5022. exit;
  5023. End;
  5024. aSign := extractFloat64Sign( a );
  5025. bSign := extractFloat64Sign( b );
  5026. if ( aSign <> bSign ) then
  5027. Begin
  5028. float64_lt_quiet := flag(
  5029. (aSign<>0)
  5030. AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
  5031. <> 0 ));
  5032. exit;
  5033. End;
  5034. If aSign <> 0 then
  5035. float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
  5036. else
  5037. float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
  5038. End;
  5039. {*----------------------------------------------------------------------------
  5040. | Returns the result of converting the 64-bit two's complement integer `a'
  5041. | to the single-precision floating-point format. The conversion is performed
  5042. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5043. *----------------------------------------------------------------------------*}
  5044. function int64_to_float32( a: int64 ): float32rec; compilerproc;
  5045. var
  5046. zSign : flag;
  5047. absA : uint64;
  5048. shiftCount: int8;
  5049. zSig : bits32;
  5050. intval : int64rec;
  5051. Begin
  5052. if ( a = 0 ) then
  5053. begin
  5054. int64_to_float32.float32 := 0;
  5055. exit;
  5056. end;
  5057. if a < 0 then
  5058. zSign := flag(TRUE)
  5059. else
  5060. zSign := flag(FALSE);
  5061. if zSign<>0 then
  5062. absA := -a
  5063. else
  5064. absA := a;
  5065. shiftCount := countLeadingZeros64( absA ) - 40;
  5066. if ( 0 <= shiftCount ) then
  5067. begin
  5068. int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5069. end
  5070. else
  5071. begin
  5072. shiftCount := shiftCount + 7;
  5073. if ( shiftCount < 0 ) then
  5074. begin
  5075. intval.low := int64rec(AbsA).low;
  5076. intval.high := int64rec(AbsA).high;
  5077. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5078. intval.low, intval.high);
  5079. int64rec(absA).low := intval.low;
  5080. int64rec(absA).high := intval.high;
  5081. end
  5082. else
  5083. absA := absA shl shiftCount;
  5084. int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5085. end;
  5086. End;
  5087. {*----------------------------------------------------------------------------
  5088. | Returns the result of converting the 64-bit two's complement integer `a'
  5089. | to the single-precision floating-point format. The conversion is performed
  5090. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5091. | Unisgned version.
  5092. *----------------------------------------------------------------------------*}
  5093. function qword_to_float32( a: qword ): float32rec; compilerproc;
  5094. var
  5095. zSign : flag;
  5096. absA : uint64;
  5097. shiftCount: int8;
  5098. zSig : bits32;
  5099. intval : int64rec;
  5100. Begin
  5101. if ( a = 0 ) then
  5102. begin
  5103. qword_to_float32.float32 := 0;
  5104. exit;
  5105. end;
  5106. zSign := flag(FALSE);
  5107. absA := a;
  5108. shiftCount := countLeadingZeros64( absA ) - 40;
  5109. if ( 0 <= shiftCount ) then
  5110. begin
  5111. qword_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
  5112. end
  5113. else
  5114. begin
  5115. shiftCount := shiftCount + 7;
  5116. if ( shiftCount < 0 ) then
  5117. begin
  5118. intval.low := int64rec(AbsA).low;
  5119. intval.high := int64rec(AbsA).high;
  5120. shift64RightJamming( intval.low, intval.high, - shiftCount,
  5121. intval.low, intval.high);
  5122. int64rec(absA).low := intval.low;
  5123. int64rec(absA).high := intval.high;
  5124. end
  5125. else
  5126. absA := absA shl shiftCount;
  5127. qword_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
  5128. end;
  5129. End;
  5130. {*----------------------------------------------------------------------------
  5131. | Returns the result of converting the 64-bit two's complement integer `a'
  5132. | to the double-precision floating-point format. The conversion is performed
  5133. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5134. *----------------------------------------------------------------------------*}
  5135. function qword_to_float64( a: qword ): float64;
  5136. {$ifdef fpc}[public,Alias:'QWORD_TO_FLOAT64'];compilerproc;{$endif}
  5137. var
  5138. zSign : flag;
  5139. float_result : float64;
  5140. intval : int64rec;
  5141. AbsA : bits64;
  5142. shiftcount : int8;
  5143. zSig0, zSig1 : bits32;
  5144. Begin
  5145. if ( a = 0 ) then
  5146. Begin
  5147. packFloat64( 0, 0, 0, 0, result );
  5148. exit;
  5149. end;
  5150. zSign := flag(FALSE);
  5151. AbsA := a;
  5152. shiftCount := countLeadingZeros64( absA ) - 11;
  5153. if ( 0 <= shiftCount ) then
  5154. Begin
  5155. absA := absA shl shiftcount;
  5156. zSig0:=int64rec(absA).high;
  5157. zSig1:=int64rec(absA).low;
  5158. End
  5159. else
  5160. Begin
  5161. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5162. - shiftCount, zSig0, zSig1 );
  5163. End;
  5164. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5165. qword_to_float64:= float_result;
  5166. End;
  5167. {*----------------------------------------------------------------------------
  5168. | Returns the result of converting the 64-bit two's complement integer `a'
  5169. | to the double-precision floating-point format. The conversion is performed
  5170. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5171. *----------------------------------------------------------------------------*}
  5172. function int64_to_float64( a: int64 ): float64;
  5173. {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
  5174. var
  5175. zSign : flag;
  5176. float_result : float64;
  5177. intval : int64rec;
  5178. AbsA : bits64;
  5179. shiftcount : int8;
  5180. zSig0, zSig1 : bits32;
  5181. Begin
  5182. if ( a = 0 ) then
  5183. Begin
  5184. packFloat64( 0, 0, 0, 0, result );
  5185. exit;
  5186. end;
  5187. zSign := flag( a < 0 );
  5188. if ZSign<>0 then
  5189. AbsA := -a
  5190. else
  5191. AbsA := a;
  5192. shiftCount := countLeadingZeros64( absA ) - 11;
  5193. if ( 0 <= shiftCount ) then
  5194. Begin
  5195. absA := absA shl shiftcount;
  5196. zSig0:=int64rec(absA).high;
  5197. zSig1:=int64rec(absA).low;
  5198. End
  5199. else
  5200. Begin
  5201. shift64Right( int64rec(absA).high, int64rec(absA).low,
  5202. - shiftCount, zSig0, zSig1 );
  5203. End;
  5204. packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
  5205. int64_to_float64:= float_result;
  5206. End;
  5207. {*----------------------------------------------------------------------------
  5208. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  5209. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5210. | Otherwise, returns 0.
  5211. *----------------------------------------------------------------------------*}
  5212. function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5213. begin
  5214. result := ord(( a0 = b0 ) and ( a1 = b1 ));
  5215. end;
  5216. {*----------------------------------------------------------------------------
  5217. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  5218. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  5219. | Otherwise, returns 0.
  5220. *----------------------------------------------------------------------------*}
  5221. function le128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
  5222. begin
  5223. result:=ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 <= b1 ) ));
  5224. end;
  5225. {*----------------------------------------------------------------------------
  5226. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  5227. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  5228. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  5229. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  5230. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  5231. | the most-significant bit of the extra result, and the other 63 bits of the
  5232. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  5233. | were all zero. This extra result is stored in the location pointed to by
  5234. | `z2Ptr'. The value of `count' can be arbitrarily large.
  5235. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  5236. | to form a fixed-point value with binary point between `a1' and `a2'. This
  5237. | fixed-point value is shifted right by the number of bits given in `count',
  5238. | and the integer part of the result is returned at the locations pointed to
  5239. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  5240. | corrupted as described above, and is returned at the location pointed to by
  5241. | `z2Ptr'.)
  5242. *----------------------------------------------------------------------------*}
  5243. procedure shift128ExtraRightJamming(
  5244. a0: bits64;
  5245. a1: bits64;
  5246. a2: bits64;
  5247. count: int16;
  5248. var z0Ptr: bits64;
  5249. var z1Ptr: bits64;
  5250. var z2Ptr: bits64);
  5251. var
  5252. z0, z1, z2: bits64;
  5253. negCount: int8;
  5254. begin
  5255. negCount := ( - count ) and 63;
  5256. if ( count = 0 ) then
  5257. begin
  5258. z2 := a2;
  5259. z1 := a1;
  5260. z0 := a0;
  5261. end
  5262. else begin
  5263. if ( count < 64 ) then
  5264. begin
  5265. z2 := a1 shr negCount;
  5266. z1 := ( a0 shl negCount ) or ( a1 shr count );
  5267. z0 := a0 shr count;
  5268. end
  5269. else begin
  5270. if ( count = 64 ) then
  5271. begin
  5272. z2 := a1;
  5273. z1 := a0;
  5274. end
  5275. else begin
  5276. a2 := a2 or a1;
  5277. if ( count < 128 ) then
  5278. begin
  5279. z2 := a0 shl negCount;
  5280. z1 := a0 shr ( count and 63 );
  5281. end
  5282. else begin
  5283. if ( count = 128 ) then
  5284. z2 := a0
  5285. else
  5286. z2 := ord( a0 <> 0 );
  5287. z1 := 0;
  5288. end;
  5289. end;
  5290. z0 := 0;
  5291. end;
  5292. z2 := z2 or ord( a2 <> 0 );
  5293. end;
  5294. z2Ptr := z2;
  5295. z1Ptr := z1;
  5296. z0Ptr := z0;
  5297. end;
  5298. {*----------------------------------------------------------------------------
  5299. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  5300. | _plus_ the number of bits given in `count'. The shifted result is at most
  5301. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  5302. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  5303. | shifted off is the most-significant bit of the extra result, and the other
  5304. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  5305. | bits shifted off were all zero. This extra result is stored in the location
  5306. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  5307. | (This routine makes more sense if `a0' and `a1' are considered to form
  5308. | a fixed-point value with binary point between `a0' and `a1'. This fixed-
  5309. | point value is shifted right by the number of bits given in `count', and
  5310. | the integer part of the result is returned at the location pointed to by
  5311. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  5312. | described above, and is returned at the location pointed to by `z1Ptr'.)
  5313. *----------------------------------------------------------------------------*}
  5314. procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
  5315. var
  5316. z0, z1: bits64;
  5317. negCount: int8;
  5318. begin
  5319. negCount := ( - count ) and 63;
  5320. if ( count = 0 ) then
  5321. begin
  5322. z1 := a1;
  5323. z0 := a0;
  5324. end
  5325. else if ( count < 64 ) then
  5326. begin
  5327. z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
  5328. z0 := a0 shr count;
  5329. end
  5330. else begin
  5331. if ( count = 64 ) then
  5332. begin
  5333. z1 := a0 or ord( a1 <> 0 );
  5334. end
  5335. else begin
  5336. z1 := ord( ( a0 or a1 ) <> 0 );
  5337. end;
  5338. z0 := 0;
  5339. end;
  5340. z1Ptr := z1;
  5341. z0Ptr := z0;
  5342. end;
  5343. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  5344. {*----------------------------------------------------------------------------
  5345. | Returns the fraction bits of the extended double-precision floating-point
  5346. | value `a'.
  5347. *----------------------------------------------------------------------------*}
  5348. function extractFloatx80Frac(a : floatx80): bits64;inline;
  5349. begin
  5350. result:=a.low;
  5351. end;
  5352. {*----------------------------------------------------------------------------
  5353. | Returns the exponent bits of the extended double-precision floating-point
  5354. | value `a'.
  5355. *----------------------------------------------------------------------------*}
  5356. function extractFloatx80Exp(a : floatx80): int32;inline;
  5357. begin
  5358. result:=a.high and $7FFF;
  5359. end;
  5360. {*----------------------------------------------------------------------------
  5361. | Returns the sign bit of the extended double-precision floating-point value
  5362. | `a'.
  5363. *----------------------------------------------------------------------------*}
  5364. function extractFloatx80Sign(a : floatx80): flag;inline;
  5365. begin
  5366. result:=a.high shr 15;
  5367. end;
  5368. {*----------------------------------------------------------------------------
  5369. | Normalizes the subnormal extended double-precision floating-point value
  5370. | represented by the denormalized significand `aSig'. The normalized exponent
  5371. | and significand are stored at the locations pointed to by `zExpPtr' and
  5372. | `zSigPtr', respectively.
  5373. *----------------------------------------------------------------------------*}
  5374. procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
  5375. var
  5376. shiftCount: int8;
  5377. begin
  5378. shiftCount := countLeadingZeros64( aSig );
  5379. zSigPtr := aSig shl shiftCount;
  5380. zExpPtr := 1 - shiftCount;
  5381. end;
  5382. {*----------------------------------------------------------------------------
  5383. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  5384. | extended double-precision floating-point value, returning the result.
  5385. *----------------------------------------------------------------------------*}
  5386. function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
  5387. var
  5388. z: floatx80;
  5389. begin
  5390. z.low := zSig;
  5391. z.high := ( bits16(zSign) shl 15 ) + zExp;
  5392. result:=z;
  5393. end;
  5394. {*----------------------------------------------------------------------------
  5395. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  5396. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  5397. | and returns the proper extended double-precision floating-point value
  5398. | corresponding to the abstract input. Ordinarily, the abstract value is
  5399. | rounded and packed into the extended double-precision format, with the
  5400. | inexact exception raised if the abstract input cannot be represented
  5401. | exactly. However, if the abstract value is too large, the overflow and
  5402. | inexact exceptions are raised and an infinity or maximal finite value is
  5403. | returned. If the abstract value is too small, the input value is rounded to
  5404. | a subnormal number, and the underflow and inexact exceptions are raised if
  5405. | the abstract input cannot be represented exactly as a subnormal extended
  5406. | double-precision floating-point number.
  5407. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  5408. | number of bits as single or double precision, respectively. Otherwise, the
  5409. | result is rounded to the full precision of the extended double-precision
  5410. | format.
  5411. | The input significand must be normalized or smaller. If the input
  5412. | significand is not normalized, `zExp' must be 0; in that case, the result
  5413. | returned is a subnormal number, and it must not require rounding. The
  5414. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  5415. | Floating-Point Arithmetic.
  5416. *----------------------------------------------------------------------------*}
  5417. function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5418. var
  5419. roundingMode: int8;
  5420. roundNearestEven, increment, isTiny: flag;
  5421. roundIncrement, roundMask, roundBits: int64;
  5422. label
  5423. precision80;
  5424. begin
  5425. roundingMode := softfloat_rounding_mode;
  5426. roundNearestEven := flag( roundingMode = float_round_nearest_even );
  5427. if ( roundingPrecision = 80 ) then
  5428. goto precision80;
  5429. if ( roundingPrecision = 64 ) then
  5430. begin
  5431. roundIncrement := int64( $0000000000000400 );
  5432. roundMask := int64( $00000000000007FF );
  5433. end
  5434. else if ( roundingPrecision = 32 ) then
  5435. begin
  5436. roundIncrement := int64( $0000008000000000 );
  5437. roundMask := int64( $000000FFFFFFFFFF );
  5438. end
  5439. else begin
  5440. goto precision80;
  5441. end;
  5442. zSig0 := zSig0 or ord( zSig1 <> 0 );
  5443. if ( not (roundNearestEven<>0) ) then
  5444. begin
  5445. if ( roundingMode = float_round_to_zero ) then
  5446. begin
  5447. roundIncrement := 0;
  5448. end
  5449. else begin
  5450. roundIncrement := roundMask;
  5451. if ( zSign<>0 ) then
  5452. begin
  5453. if ( roundingMode = float_round_up ) then
  5454. roundIncrement := 0;
  5455. end
  5456. else begin
  5457. if ( roundingMode = float_round_down ) then
  5458. roundIncrement := 0;
  5459. end;
  5460. end;
  5461. end;
  5462. roundBits := zSig0 and roundMask;
  5463. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5464. if ( ( $7FFE < zExp )
  5465. or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
  5466. ) begin
  5467. goto overflow;
  5468. end;
  5469. if ( zExp <= 0 ) begin
  5470. isTiny =
  5471. ( float_detect_tininess = float_tininess_before_rounding )
  5472. or ( zExp < 0 )
  5473. or ( zSig0 <= zSig0 + roundIncrement );
  5474. shift64RightJamming( zSig0, 1 - zExp, zSig0 );
  5475. zExp := 0;
  5476. roundBits := zSig0 and roundMask;
  5477. if ( isTiny and roundBits ) float_raise( float_flag_underflow );
  5478. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5479. zSig0 += roundIncrement;
  5480. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5481. roundIncrement := roundMask + 1;
  5482. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5483. roundMask |= roundIncrement;
  5484. end;
  5485. zSig0 = ~ roundMask;
  5486. result:=packFloatx80( zSign, zExp, zSig0 );
  5487. end;
  5488. end;
  5489. if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
  5490. zSig0 += roundIncrement;
  5491. if ( zSig0 < roundIncrement ) begin
  5492. ++zExp;
  5493. zSig0 := LIT64( $8000000000000000 );
  5494. end;
  5495. roundIncrement := roundMask + 1;
  5496. if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
  5497. roundMask |= roundIncrement;
  5498. end;
  5499. zSig0 = ~ roundMask;
  5500. if ( zSig0 = 0 ) zExp := 0;
  5501. result:=packFloatx80( zSign, zExp, zSig0 );
  5502. precision80:
  5503. increment := ( (sbits64) zSig1 < 0 );
  5504. if ( ! roundNearestEven ) begin
  5505. if ( roundingMode = float_round_to_zero ) begin
  5506. increment := 0;
  5507. end;
  5508. else begin
  5509. if ( zSign ) begin
  5510. increment := ( roundingMode = float_round_down ) and zSig1;
  5511. end;
  5512. else begin
  5513. increment := ( roundingMode = float_round_up ) and zSig1;
  5514. end;
  5515. end;
  5516. end;
  5517. if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
  5518. if ( ( $7FFE < zExp )
  5519. or ( ( zExp = $7FFE )
  5520. and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
  5521. and increment
  5522. )
  5523. ) begin
  5524. roundMask := 0;
  5525. overflow:
  5526. float_raise( float_flag_overflow or float_flag_inexact );
  5527. if ( ( roundingMode = float_round_to_zero )
  5528. or ( zSign and ( roundingMode = float_round_up ) )
  5529. or ( ! zSign and ( roundingMode = float_round_down ) )
  5530. ) begin
  5531. result:=packFloatx80( zSign, $7FFE, ~ roundMask );
  5532. end;
  5533. result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5534. end;
  5535. if ( zExp <= 0 ) begin
  5536. isTiny =
  5537. ( float_detect_tininess = float_tininess_before_rounding )
  5538. or ( zExp < 0 )
  5539. or ! increment
  5540. or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
  5541. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, zSig0, zSig1 );
  5542. zExp := 0;
  5543. if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
  5544. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5545. if ( roundNearestEven ) begin
  5546. increment := ( (sbits64) zSig1 < 0 );
  5547. end;
  5548. else begin
  5549. if ( zSign ) begin
  5550. increment := ( roundingMode = float_round_down ) and zSig1;
  5551. end;
  5552. else begin
  5553. increment := ( roundingMode = float_round_up ) and zSig1;
  5554. end;
  5555. end;
  5556. if ( increment ) begin
  5557. ++zSig0;
  5558. zSig0 =
  5559. ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5560. if ( (sbits64) zSig0 < 0 ) zExp := 1;
  5561. end;
  5562. result:=packFloatx80( zSign, zExp, zSig0 );
  5563. end;
  5564. end;
  5565. if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
  5566. if ( increment ) begin
  5567. ++zSig0;
  5568. if ( zSig0 = 0 ) begin
  5569. ++zExp;
  5570. zSig0 := LIT64( $8000000000000000 );
  5571. end;
  5572. else begin
  5573. zSig0 = ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
  5574. end;
  5575. end;
  5576. else begin
  5577. if ( zSig0 = 0 ) zExp := 0;
  5578. end;
  5579. result:=packFloatx80( zSign, zExp, zSig0 );
  5580. end;
  5581. {*----------------------------------------------------------------------------
  5582. | Takes an abstract floating-point value having sign `zSign', exponent
  5583. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  5584. | and returns the proper extended double-precision floating-point value
  5585. | corresponding to the abstract input. This routine is just like
  5586. | `roundAndPackFloatx80' except that the input significand does not have to be
  5587. | normalized.
  5588. *----------------------------------------------------------------------------*}
  5589. function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
  5590. var
  5591. shiftCount: int8;
  5592. begin
  5593. if ( zSig0 = 0 ) begin
  5594. zSig0 := zSig1;
  5595. zSig1 := 0;
  5596. zExp -= 64;
  5597. end;
  5598. shiftCount := countLeadingZeros64( zSig0 );
  5599. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  5600. zExp := eExp - shiftCount;
  5601. return
  5602. roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  5603. end;
  5604. {*----------------------------------------------------------------------------
  5605. | Returns the result of converting the extended double-precision floating-
  5606. | point value `a' to the 32-bit two's complement integer format. The
  5607. | conversion is performed according to the IEC/IEEE Standard for Binary
  5608. | Floating-Point Arithmetic---which means in particular that the conversion
  5609. | is rounded according to the current rounding mode. If `a' is a NaN, the
  5610. | largest positive integer is returned. Otherwise, if the conversion
  5611. | overflows, the largest integer with the same sign as `a' is returned.
  5612. *----------------------------------------------------------------------------*}
  5613. function floatx80_to_int32(a: floatx80): int32;
  5614. var
  5615. aSign: flag;
  5616. aExp, shiftCount: int32;
  5617. aSig: bits64;
  5618. begin
  5619. aSig := extractFloatx80Frac( a );
  5620. aExp := extractFloatx80Exp( a );
  5621. aSign := extractFloatx80Sign( a );
  5622. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5623. shiftCount := $4037 - aExp;
  5624. if ( shiftCount <= 0 ) shiftCount := 1;
  5625. shift64RightJamming( aSig, shiftCount, aSig );
  5626. result := roundAndPackInt32( aSign, aSig );
  5627. end;
  5628. {*----------------------------------------------------------------------------
  5629. | Returns the result of converting the extended double-precision floating-
  5630. | point value `a' to the 32-bit two's complement integer format. The
  5631. | conversion is performed according to the IEC/IEEE Standard for Binary
  5632. | Floating-Point Arithmetic, except that the conversion is always rounded
  5633. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5634. | Otherwise, if the conversion overflows, the largest integer with the same
  5635. | sign as `a' is returned.
  5636. *----------------------------------------------------------------------------*}
  5637. function floatx80_to_int32_round_to_zero(a: floatx80): int32;
  5638. var
  5639. aSign: flag;
  5640. aExp, shiftCount: int32;
  5641. aSig, savedASig: bits64;
  5642. z: int32;
  5643. begin
  5644. aSig := extractFloatx80Frac( a );
  5645. aExp := extractFloatx80Exp( a );
  5646. aSign := extractFloatx80Sign( a );
  5647. if ( $401E < aExp ) begin
  5648. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
  5649. goto invalid;
  5650. end;
  5651. else if ( aExp < $3FFF ) begin
  5652. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5653. result := 0;
  5654. end;
  5655. shiftCount := $403E - aExp;
  5656. savedASig := aSig;
  5657. aSig >>= shiftCount;
  5658. z := aSig;
  5659. if ( aSign ) z := - z;
  5660. if ( ( z < 0 ) xor aSign ) begin
  5661. invalid:
  5662. float_raise( float_flag_invalid );
  5663. result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
  5664. end;
  5665. if ( ( aSig shl shiftCount ) <> savedASig ) begin
  5666. softfloat_exception_flags or= float_flag_inexact;
  5667. end;
  5668. result := z;
  5669. end;
  5670. {*----------------------------------------------------------------------------
  5671. | Returns the result of converting the extended double-precision floating-
  5672. | point value `a' to the 64-bit two's complement integer format. The
  5673. | conversion is performed according to the IEC/IEEE Standard for Binary
  5674. | Floating-Point Arithmetic---which means in particular that the conversion
  5675. | is rounded according to the current rounding mode. If `a' is a NaN,
  5676. | the largest positive integer is returned. Otherwise, if the conversion
  5677. | overflows, the largest integer with the same sign as `a' is returned.
  5678. *----------------------------------------------------------------------------*}
  5679. function floatx80_to_int64(a: floatx80): int64;
  5680. var
  5681. aSign: flag;
  5682. aExp, shiftCount: int32;
  5683. aSig, aSigExtra: bits64;
  5684. begin
  5685. aSig := extractFloatx80Frac( a );
  5686. aExp := extractFloatx80Exp( a );
  5687. aSign := extractFloatx80Sign( a );
  5688. shiftCount := $403E - aExp;
  5689. if ( shiftCount <= 0 ) begin
  5690. if ( shiftCount ) begin
  5691. float_raise( float_flag_invalid );
  5692. if ( ! aSign
  5693. or ( ( aExp = $7FFF )
  5694. and ( aSig <> LIT64( $8000000000000000 ) ) )
  5695. ) begin
  5696. result := LIT64( $7FFFFFFFFFFFFFFF );
  5697. end;
  5698. result := (sbits64) LIT64( $8000000000000000 );
  5699. end;
  5700. aSigExtra := 0;
  5701. end;
  5702. else begin
  5703. shift64ExtraRightJamming( aSig, 0, shiftCount, aSig, aSigExtra );
  5704. end;
  5705. result := roundAndPackInt64( aSign, aSig, aSigExtra );
  5706. end;
  5707. {*----------------------------------------------------------------------------
  5708. | Returns the result of converting the extended double-precision floating-
  5709. | point value `a' to the 64-bit two's complement integer format. The
  5710. | conversion is performed according to the IEC/IEEE Standard for Binary
  5711. | Floating-Point Arithmetic, except that the conversion is always rounded
  5712. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  5713. | Otherwise, if the conversion overflows, the largest integer with the same
  5714. | sign as `a' is returned.
  5715. *----------------------------------------------------------------------------*}
  5716. function floatx80_to_int64_round_to_zero(a: floatx80): int64;
  5717. var
  5718. aSign: flag;
  5719. aExp, shiftCount: int32;
  5720. aSig: bits64;
  5721. z: int64;
  5722. begin
  5723. aSig := extractFloatx80Frac( a );
  5724. aExp := extractFloatx80Exp( a );
  5725. aSign := extractFloatx80Sign( a );
  5726. shiftCount := aExp - $403E;
  5727. if ( 0 <= shiftCount ) begin
  5728. aSig = LIT64( $7FFFFFFFFFFFFFFF );
  5729. if ( ( a.high <> $C03E ) or aSig ) begin
  5730. float_raise( float_flag_invalid );
  5731. if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
  5732. result := LIT64( $7FFFFFFFFFFFFFFF );
  5733. end;
  5734. end;
  5735. result := (sbits64) LIT64( $8000000000000000 );
  5736. end;
  5737. else if ( aExp < $3FFF ) begin
  5738. if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
  5739. result := 0;
  5740. end;
  5741. z := aSig>>( - shiftCount );
  5742. if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
  5743. softfloat_exception_flags or= float_flag_inexact;
  5744. end;
  5745. if ( aSign ) z := - z;
  5746. result := z;
  5747. end;
  5748. {*----------------------------------------------------------------------------
  5749. | Returns the result of converting the extended double-precision floating-
  5750. | point value `a' to the single-precision floating-point format. The
  5751. | conversion is performed according to the IEC/IEEE Standard for Binary
  5752. | Floating-Point Arithmetic.
  5753. *----------------------------------------------------------------------------*}
  5754. function floatx80_to_float32(a: floatx80): float32;
  5755. var
  5756. aSign: flag;
  5757. aExp: int32;
  5758. aSig: bits64;
  5759. begin
  5760. aSig := extractFloatx80Frac( a );
  5761. aExp := extractFloatx80Exp( a );
  5762. aSign := extractFloatx80Sign( a );
  5763. if ( aExp = $7FFF ) begin
  5764. if ( (bits64) ( aSig shl 1 ) ) begin
  5765. result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
  5766. end;
  5767. result := packFloat32( aSign, $FF, 0 );
  5768. end;
  5769. shift64RightJamming( aSig, 33, aSig );
  5770. if ( aExp or aSig ) aExp -= $3F81;
  5771. result := roundAndPackFloat32( aSign, aExp, aSig );
  5772. end;
  5773. {*----------------------------------------------------------------------------
  5774. | Returns the result of converting the extended double-precision floating-
  5775. | point value `a' to the double-precision floating-point format. The
  5776. | conversion is performed according to the IEC/IEEE Standard for Binary
  5777. | Floating-Point Arithmetic.
  5778. *----------------------------------------------------------------------------*}
  5779. function floatx80_to_float64(a: floatx80): float64;
  5780. var
  5781. aSign: flag;
  5782. aExp: int32;
  5783. aSig, zSig: bits64;
  5784. begin
  5785. aSig := extractFloatx80Frac( a );
  5786. aExp := extractFloatx80Exp( a );
  5787. aSign := extractFloatx80Sign( a );
  5788. if ( aExp = $7FFF ) begin
  5789. if ( (bits64) ( aSig shl 1 ) ) begin
  5790. result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
  5791. end;
  5792. result := packFloat64( aSign, $7FF, 0 );
  5793. end;
  5794. shift64RightJamming( aSig, 1, zSig );
  5795. if ( aExp or aSig ) aExp -= $3C01;
  5796. result := roundAndPackFloat64( aSign, aExp, zSig );
  5797. end;
  5798. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  5799. {*----------------------------------------------------------------------------
  5800. | Returns the result of converting the extended double-precision floating-
  5801. | point value `a' to the quadruple-precision floating-point format. The
  5802. | conversion is performed according to the IEC/IEEE Standard for Binary
  5803. | Floating-Point Arithmetic.
  5804. *----------------------------------------------------------------------------*}
  5805. function floatx80_to_float128(a: floatx80): float128;
  5806. var
  5807. aSign: flag;
  5808. aExp: int16;
  5809. aSig, zSig0, zSig1: bits64;
  5810. begin
  5811. aSig := extractFloatx80Frac( a );
  5812. aExp := extractFloatx80Exp( a );
  5813. aSign := extractFloatx80Sign( a );
  5814. if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
  5815. result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
  5816. end;
  5817. shift128Right( aSig shl 1, 0, 16, zSig0, zSig1 );
  5818. result := packFloat128( aSign, aExp, zSig0, zSig1 );
  5819. end;
  5820. {$endif FPC_SOFTFLOAT_FLOAT128}
  5821. {*----------------------------------------------------------------------------
  5822. | Rounds the extended double-precision floating-point value `a' to an integer,
  5823. | and Returns the result as an extended quadruple-precision floating-point
  5824. | value. The operation is performed according to the IEC/IEEE Standard for
  5825. | Binary Floating-Point Arithmetic.
  5826. *----------------------------------------------------------------------------*}
  5827. function floatx80_round_to_int(a: floatx80): floatx80;
  5828. var
  5829. aSign: flag;
  5830. aExp: int32;
  5831. lastBitMask, roundBitsMask: bits64;
  5832. roundingMode: int8;
  5833. z: floatx80;
  5834. begin
  5835. aExp := extractFloatx80Exp( a );
  5836. if ( $403E <= aExp ) begin
  5837. if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
  5838. result := propagateFloatx80NaN( a, a );
  5839. end;
  5840. result := a;
  5841. end;
  5842. if ( aExp < $3FFF ) begin
  5843. if ( ( aExp = 0 )
  5844. and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
  5845. result := a;
  5846. end;
  5847. softfloat_exception_flags or= float_flag_inexact;
  5848. aSign := extractFloatx80Sign( a );
  5849. switch ( softfloat_rounding_mode ) begin
  5850. case float_round_nearest_even:
  5851. if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
  5852. ) begin
  5853. result :=
  5854. packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
  5855. end;
  5856. break;
  5857. case float_round_down:
  5858. result :=
  5859. aSign ?
  5860. packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
  5861. : packFloatx80( 0, 0, 0 );
  5862. case float_round_up:
  5863. result :=
  5864. aSign ? packFloatx80( 1, 0, 0 )
  5865. : packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
  5866. end;
  5867. result := packFloatx80( aSign, 0, 0 );
  5868. end;
  5869. lastBitMask := 1;
  5870. lastBitMask shl = $403E - aExp;
  5871. roundBitsMask := lastBitMask - 1;
  5872. z := a;
  5873. roundingMode := softfloat_rounding_mode;
  5874. if ( roundingMode = float_round_nearest_even ) begin
  5875. z.low += lastBitMask>>1;
  5876. if ( ( z.low and roundBitsMask ) = 0 ) z.low = ~ lastBitMask;
  5877. end;
  5878. else if ( roundingMode <> float_round_to_zero ) begin
  5879. if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
  5880. z.low += roundBitsMask;
  5881. end;
  5882. end;
  5883. z.low = ~ roundBitsMask;
  5884. if ( z.low = 0 ) begin
  5885. ++z.high;
  5886. z.low := LIT64( $8000000000000000 );
  5887. end;
  5888. if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
  5889. result := z;
  5890. end;
  5891. {*----------------------------------------------------------------------------
  5892. | Returns the result of adding the absolute values of the extended double-
  5893. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  5894. | negated before being returned. `zSign' is ignored if the result is a NaN.
  5895. | The addition is performed according to the IEC/IEEE Standard for Binary
  5896. | Floating-Point Arithmetic.
  5897. *----------------------------------------------------------------------------*}
  5898. function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5899. var
  5900. aExp, bExp, zExp: int32;
  5901. aSig, bSig, zSig0, zSig1: bits64;
  5902. expDiff: int32;
  5903. begin
  5904. aSig := extractFloatx80Frac( a );
  5905. aExp := extractFloatx80Exp( a );
  5906. bSig := extractFloatx80Frac( b );
  5907. bExp := extractFloatx80Exp( b );
  5908. expDiff := aExp - bExp;
  5909. if ( 0 < expDiff ) begin
  5910. if ( aExp = $7FFF ) begin
  5911. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5912. result := a;
  5913. end;
  5914. if ( bExp = 0 ) --expDiff;
  5915. shift64ExtraRightJamming( bSig, 0, expDiff, bSig, zSig1 );
  5916. zExp := aExp;
  5917. end;
  5918. else if ( expDiff < 0 ) begin
  5919. if ( bExp = $7FFF ) begin
  5920. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5921. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  5922. end;
  5923. if ( aExp = 0 ) ++expDiff;
  5924. shift64ExtraRightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5925. zExp := bExp;
  5926. end;
  5927. else begin
  5928. if ( aExp = $7FFF ) begin
  5929. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5930. result := propagateFloatx80NaN( a, b );
  5931. end;
  5932. result := a;
  5933. end;
  5934. zSig1 := 0;
  5935. zSig0 := aSig + bSig;
  5936. if ( aExp = 0 ) begin
  5937. normalizeFloatx80Subnormal( zSig0, zExp, zSig0 );
  5938. goto roundAndPack;
  5939. end;
  5940. zExp := aExp;
  5941. goto shiftRight1;
  5942. end;
  5943. zSig0 := aSig + bSig;
  5944. if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  5945. shiftRight1:
  5946. shift64ExtraRightJamming( zSig0, zSig1, 1, zSig0, zSig1 );
  5947. zSig0 or= LIT64( $8000000000000000 );
  5948. ++zExp;
  5949. roundAndPack:
  5950. result :=
  5951. roundAndPackFloatx80(
  5952. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  5953. end;
  5954. {*----------------------------------------------------------------------------
  5955. | Returns the result of subtracting the absolute values of the extended
  5956. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  5957. | difference is negated before being returned. `zSign' is ignored if the
  5958. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5959. | Standard for Binary Floating-Point Arithmetic.
  5960. *----------------------------------------------------------------------------*}
  5961. function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
  5962. var
  5963. aExp, bExp, zExp: int32;
  5964. aSig, bSig, zSig0, zSig1: bits64;
  5965. expDiff: int32;
  5966. z: floatx80;
  5967. begin
  5968. aSig := extractFloatx80Frac( a );
  5969. aExp := extractFloatx80Exp( a );
  5970. bSig := extractFloatx80Frac( b );
  5971. bExp := extractFloatx80Exp( b );
  5972. expDiff := aExp - bExp;
  5973. if ( 0 < expDiff ) goto aExpBigger;
  5974. if ( expDiff < 0 ) goto bExpBigger;
  5975. if ( aExp = $7FFF ) begin
  5976. if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
  5977. result := propagateFloatx80NaN( a, b );
  5978. end;
  5979. float_raise( float_flag_invalid );
  5980. z.low := floatx80_default_nan_low;
  5981. z.high := floatx80_default_nan_high;
  5982. result := z;
  5983. end;
  5984. if ( aExp = 0 ) begin
  5985. aExp := 1;
  5986. bExp := 1;
  5987. end;
  5988. zSig1 := 0;
  5989. if ( bSig < aSig ) goto aBigger;
  5990. if ( aSig < bSig ) goto bBigger;
  5991. result := packFloatx80( softfloat_rounding_mode = float_round_down, 0, 0 );
  5992. bExpBigger:
  5993. if ( bExp = $7FFF ) begin
  5994. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  5995. result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
  5996. end;
  5997. if ( aExp = 0 ) ++expDiff;
  5998. shift128RightJamming( aSig, 0, - expDiff, aSig, zSig1 );
  5999. bBigger:
  6000. sub128( bSig, 0, aSig, zSig1, zSig0, zSig1 );
  6001. zExp := bExp;
  6002. zSign xor = 1;
  6003. goto normalizeRoundAndPack;
  6004. aExpBigger:
  6005. if ( aExp = $7FFF ) begin
  6006. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6007. result := a;
  6008. end;
  6009. if ( bExp = 0 ) --expDiff;
  6010. shift128RightJamming( bSig, 0, expDiff, bSig, zSig1 );
  6011. aBigger:
  6012. sub128( aSig, 0, bSig, zSig1, zSig0, zSig1 );
  6013. zExp := aExp;
  6014. normalizeRoundAndPack:
  6015. result :=
  6016. normalizeRoundAndPackFloatx80(
  6017. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6018. end;
  6019. {*----------------------------------------------------------------------------
  6020. | Returns the result of adding the extended double-precision floating-point
  6021. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6022. | Standard for Binary Floating-Point Arithmetic.
  6023. *----------------------------------------------------------------------------*}
  6024. function floatx80_add(a: floatx80; b: floatx80): floatx80;
  6025. var
  6026. aSign, bSign: flag;
  6027. begin
  6028. aSign := extractFloatx80Sign( a );
  6029. bSign := extractFloatx80Sign( b );
  6030. if ( aSign = bSign ) begin
  6031. result := addFloatx80Sigs( a, b, aSign );
  6032. end;
  6033. else begin
  6034. result := subFloatx80Sigs( a, b, aSign );
  6035. end;
  6036. end;
  6037. {*----------------------------------------------------------------------------
  6038. | Returns the result of subtracting the extended double-precision floating-
  6039. | point values `a' and `b'. The operation is performed according to the
  6040. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6041. *----------------------------------------------------------------------------*}
  6042. function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
  6043. var
  6044. aSign, bSign: flag;
  6045. begin
  6046. aSign := extractFloatx80Sign( a );
  6047. bSign := extractFloatx80Sign( b );
  6048. if ( aSign = bSign ) begin
  6049. result := subFloatx80Sigs( a, b, aSign );
  6050. end;
  6051. else begin
  6052. result := addFloatx80Sigs( a, b, aSign );
  6053. end;
  6054. end;
  6055. {*----------------------------------------------------------------------------
  6056. | Returns the result of multiplying the extended double-precision floating-
  6057. | point values `a' and `b'. The operation is performed according to the
  6058. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6059. *----------------------------------------------------------------------------*}
  6060. function floatx80_mul(a: floatx80; b: floatx80): floatx80;
  6061. var
  6062. aSign, bSign, zSign: flag;
  6063. aExp, bExp, zExp: int32;
  6064. aSig, bSig, zSig0, zSig1: bits64;
  6065. z: floatx80;
  6066. begin
  6067. aSig := extractFloatx80Frac( a );
  6068. aExp := extractFloatx80Exp( a );
  6069. aSign := extractFloatx80Sign( a );
  6070. bSig := extractFloatx80Frac( b );
  6071. bExp := extractFloatx80Exp( b );
  6072. bSign := extractFloatx80Sign( b );
  6073. zSign := aSign xor bSign;
  6074. if ( aExp = $7FFF ) begin
  6075. if ( (bits64) ( aSig shl 1 )
  6076. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6077. result := propagateFloatx80NaN( a, b );
  6078. end;
  6079. if ( ( bExp or bSig ) = 0 ) goto invalid;
  6080. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6081. end;
  6082. if ( bExp = $7FFF ) begin
  6083. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6084. if ( ( aExp or aSig ) = 0 ) begin
  6085. invalid:
  6086. float_raise( float_flag_invalid );
  6087. z.low := floatx80_default_nan_low;
  6088. z.high := floatx80_default_nan_high;
  6089. result := z;
  6090. end;
  6091. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6092. end;
  6093. if ( aExp = 0 ) begin
  6094. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6095. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6096. end;
  6097. if ( bExp = 0 ) begin
  6098. if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6099. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6100. end;
  6101. zExp := aExp + bExp - $3FFE;
  6102. mul64To128( aSig, bSig, zSig0, zSig1 );
  6103. if ( 0 < (sbits64) zSig0 ) begin
  6104. shortShift128Left( zSig0, zSig1, 1, zSig0, zSig1 );
  6105. --zExp;
  6106. end;
  6107. result :=
  6108. roundAndPackFloatx80(
  6109. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6110. end;
  6111. {*----------------------------------------------------------------------------
  6112. | Returns the result of dividing the extended double-precision floating-point
  6113. | value `a' by the corresponding value `b'. The operation is performed
  6114. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6115. *----------------------------------------------------------------------------*}
  6116. function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
  6117. var
  6118. aSign, bSign, zSign: flag;
  6119. aExp, bExp, zExp: int32;
  6120. aSig, bSig, zSig0, zSig1: bits64;
  6121. rem0, rem1, rem2, term0, term1, term2: bits64;
  6122. z: floatx80;
  6123. begin
  6124. aSig := extractFloatx80Frac( a );
  6125. aExp := extractFloatx80Exp( a );
  6126. aSign := extractFloatx80Sign( a );
  6127. bSig := extractFloatx80Frac( b );
  6128. bExp := extractFloatx80Exp( b );
  6129. bSign := extractFloatx80Sign( b );
  6130. zSign := aSign xor bSign;
  6131. if ( aExp = $7FFF ) begin
  6132. if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6133. if ( bExp = $7FFF ) begin
  6134. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6135. goto invalid;
  6136. end;
  6137. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6138. end;
  6139. if ( bExp = $7FFF ) begin
  6140. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6141. result := packFloatx80( zSign, 0, 0 );
  6142. end;
  6143. if ( bExp = 0 ) begin
  6144. if ( bSig = 0 ) begin
  6145. if ( ( aExp or aSig ) = 0 ) begin
  6146. invalid:
  6147. float_raise( float_flag_invalid );
  6148. z.low := floatx80_default_nan_low;
  6149. z.high := floatx80_default_nan_high;
  6150. result := z;
  6151. end;
  6152. float_raise( float_flag_divbyzero );
  6153. result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
  6154. end;
  6155. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6156. end;
  6157. if ( aExp = 0 ) begin
  6158. if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
  6159. normalizeFloatx80Subnormal( aSig, aExp, aSig );
  6160. end;
  6161. zExp := aExp - bExp + $3FFE;
  6162. rem1 := 0;
  6163. if ( bSig <= aSig ) begin
  6164. shift128Right( aSig, 0, 1, aSig, rem1 );
  6165. ++zExp;
  6166. end;
  6167. zSig0 := estimateDiv128To64( aSig, rem1, bSig );
  6168. mul64To128( bSig, zSig0, term0, term1 );
  6169. sub128( aSig, rem1, term0, term1, rem0, rem1 );
  6170. while ( (sbits64) rem0 < 0 ) begin
  6171. --zSig0;
  6172. add128( rem0, rem1, 0, bSig, rem0, rem1 );
  6173. end;
  6174. zSig1 := estimateDiv128To64( rem1, 0, bSig );
  6175. if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
  6176. mul64To128( bSig, zSig1, term1, term2 );
  6177. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6178. while ( (sbits64) rem1 < 0 ) begin
  6179. --zSig1;
  6180. add128( rem1, rem2, 0, bSig, rem1, rem2 );
  6181. end;
  6182. zSig1 or= ( ( rem1 or rem2 ) <> 0 );
  6183. end;
  6184. result :=
  6185. roundAndPackFloatx80(
  6186. floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
  6187. end;
  6188. {*----------------------------------------------------------------------------
  6189. | Returns the remainder of the extended double-precision floating-point value
  6190. | `a' with respect to the corresponding value `b'. The operation is performed
  6191. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6192. *----------------------------------------------------------------------------*}
  6193. function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
  6194. var
  6195. aSign, bSign, zSign: flag;
  6196. aExp, bExp, expDiff: int32;
  6197. aSig0, aSig1, bSig: bits64;
  6198. q, term0, term1, alternateASig0, alternateASig1: bits64;
  6199. z: floatx80;
  6200. begin
  6201. aSig0 := extractFloatx80Frac( a );
  6202. aExp := extractFloatx80Exp( a );
  6203. aSign := extractFloatx80Sign( a );
  6204. bSig := extractFloatx80Frac( b );
  6205. bExp := extractFloatx80Exp( b );
  6206. bSign := extractFloatx80Sign( b );
  6207. if ( aExp = $7FFF ) begin
  6208. if ( (bits64) ( aSig0 shl 1 )
  6209. or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
  6210. result := propagateFloatx80NaN( a, b );
  6211. end;
  6212. goto invalid;
  6213. end;
  6214. if ( bExp = $7FFF ) begin
  6215. if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
  6216. result := a;
  6217. end;
  6218. if ( bExp = 0 ) begin
  6219. if ( bSig = 0 ) begin
  6220. invalid:
  6221. float_raise( float_flag_invalid );
  6222. z.low := floatx80_default_nan_low;
  6223. z.high := floatx80_default_nan_high;
  6224. result := z;
  6225. end;
  6226. normalizeFloatx80Subnormal( bSig, bExp, bSig );
  6227. end;
  6228. if ( aExp = 0 ) begin
  6229. if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
  6230. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6231. end;
  6232. bSig or= LIT64( $8000000000000000 );
  6233. zSign := aSign;
  6234. expDiff := aExp - bExp;
  6235. aSig1 := 0;
  6236. if ( expDiff < 0 ) begin
  6237. if ( expDiff < -1 ) result := a;
  6238. shift128Right( aSig0, 0, 1, aSig0, aSig1 );
  6239. expDiff := 0;
  6240. end;
  6241. q := ( bSig <= aSig0 );
  6242. if ( q ) aSig0 -= bSig;
  6243. expDiff -= 64;
  6244. while ( 0 < expDiff ) begin
  6245. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6246. q := ( 2 < q ) ? q - 2 : 0;
  6247. mul64To128( bSig, q, term0, term1 );
  6248. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6249. shortShift128Left( aSig0, aSig1, 62, aSig0, aSig1 );
  6250. expDiff -= 62;
  6251. end;
  6252. expDiff += 64;
  6253. if ( 0 < expDiff ) begin
  6254. q := estimateDiv128To64( aSig0, aSig1, bSig );
  6255. q := ( 2 < q ) ? q - 2 : 0;
  6256. q >>= 64 - expDiff;
  6257. mul64To128( bSig, q shl ( 64 - expDiff ), term0, term1 );
  6258. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6259. shortShift128Left( 0, bSig, 64 - expDiff, term0, term1 );
  6260. while ( le128( term0, term1, aSig0, aSig1 ) ) begin
  6261. ++q;
  6262. sub128( aSig0, aSig1, term0, term1, aSig0, aSig1 );
  6263. end;
  6264. end;
  6265. else begin
  6266. term1 := 0;
  6267. term0 := bSig;
  6268. end;
  6269. sub128( term0, term1, aSig0, aSig1, alternateASig0, alternateASig1 );
  6270. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6271. or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  6272. and ( q and 1 ) )
  6273. ) begin
  6274. aSig0 := alternateASig0;
  6275. aSig1 := alternateASig1;
  6276. zSign := ! zSign;
  6277. end;
  6278. result :=
  6279. normalizeRoundAndPackFloatx80(
  6280. 80, zSign, bExp + expDiff, aSig0, aSig1 );
  6281. end;
  6282. {*----------------------------------------------------------------------------
  6283. | Returns the square root of the extended double-precision floating-point
  6284. | value `a'. The operation is performed according to the IEC/IEEE Standard
  6285. | for Binary Floating-Point Arithmetic.
  6286. *----------------------------------------------------------------------------*}
  6287. function floatx80_sqrt(a: floatx80): floatx80;
  6288. var
  6289. aSign: flag;
  6290. aExp, zExp: int32;
  6291. aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
  6292. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  6293. z: floatx80;
  6294. label
  6295. invalid;
  6296. begin
  6297. aSig0 := extractFloatx80Frac( a );
  6298. aExp := extractFloatx80Exp( a );
  6299. aSign := extractFloatx80Sign( a );
  6300. if ( aExp = $7FFF ) begin
  6301. if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
  6302. if ( ! aSign ) result := a;
  6303. goto invalid;
  6304. end;
  6305. if ( aSign ) begin
  6306. if ( ( aExp or aSig0 ) = 0 ) result := a;
  6307. invalid:
  6308. float_raise( float_flag_invalid );
  6309. z.low := floatx80_default_nan_low;
  6310. z.high := floatx80_default_nan_high;
  6311. result := z;
  6312. end;
  6313. if ( aExp = 0 ) begin
  6314. if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
  6315. normalizeFloatx80Subnormal( aSig0, aExp, aSig0 );
  6316. end;
  6317. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
  6318. zSig0 := estimateSqrt32( aExp, aSig0>>32 );
  6319. shift128Right( aSig0, 0, 2 + ( aExp and 1 ), aSig0, aSig1 );
  6320. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  6321. doubleZSig0 := zSig0 shl 1;
  6322. mul64To128( zSig0, zSig0, term0, term1 );
  6323. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  6324. while ( (sbits64) rem0 < 0 ) begin
  6325. --zSig0;
  6326. doubleZSig0 -= 2;
  6327. add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, rem0, rem1 );
  6328. end;
  6329. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  6330. if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
  6331. if ( zSig1 = 0 ) zSig1 := 1;
  6332. mul64To128( doubleZSig0, zSig1, term1, term2 );
  6333. sub128( rem1, 0, term1, term2, rem1, rem2 );
  6334. mul64To128( zSig1, zSig1, term2, term3 );
  6335. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  6336. while ( (sbits64) rem1 < 0 ) begin
  6337. --zSig1;
  6338. shortShift128Left( 0, zSig1, 1, term2, term3 );
  6339. term3 or= 1;
  6340. term2 or= doubleZSig0;
  6341. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  6342. end;
  6343. zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
  6344. end;
  6345. shortShift128Left( 0, zSig1, 1, zSig0, zSig1 );
  6346. zSig0 or= doubleZSig0;
  6347. result :=
  6348. roundAndPackFloatx80(
  6349. floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
  6350. end;
  6351. {*----------------------------------------------------------------------------
  6352. | Returns 1 if the extended double-precision floating-point value `a' is
  6353. | equal to the corresponding value `b', and 0 otherwise. The comparison is
  6354. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6355. | Arithmetic.
  6356. *----------------------------------------------------------------------------*}
  6357. function floatx80_eq(a: floatx80; b: floatx80 ): flag;
  6358. begin
  6359. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6360. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6361. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6362. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6363. ) begin
  6364. if ( floatx80_is_signaling_nan( a )
  6365. or floatx80_is_signaling_nan( b ) ) begin
  6366. float_raise( float_flag_invalid );
  6367. end;
  6368. result := 0;
  6369. end;
  6370. result :=
  6371. ( a.low = b.low )
  6372. and ( ( a.high = b.high )
  6373. or ( ( a.low = 0 )
  6374. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6375. );
  6376. end;
  6377. {*----------------------------------------------------------------------------
  6378. | Returns 1 if the extended double-precision floating-point value `a' is
  6379. | less than or equal to the corresponding value `b', and 0 otherwise. The
  6380. | comparison is performed according to the IEC/IEEE Standard for Binary
  6381. | Floating-Point Arithmetic.
  6382. *----------------------------------------------------------------------------*}
  6383. function floatx80_le(a: floatx80; b: floatx80 ): flag;
  6384. var
  6385. aSign, bSign: flag;
  6386. begin
  6387. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6388. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6389. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6390. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6391. ) begin
  6392. float_raise( float_flag_invalid );
  6393. result := 0;
  6394. end;
  6395. aSign := extractFloatx80Sign( a );
  6396. bSign := extractFloatx80Sign( b );
  6397. if ( aSign <> bSign ) begin
  6398. result :=
  6399. aSign
  6400. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6401. = 0 );
  6402. end;
  6403. result :=
  6404. aSign ? le128( b.high, b.low, a.high, a.low )
  6405. : le128( a.high, a.low, b.high, b.low );
  6406. end;
  6407. {*----------------------------------------------------------------------------
  6408. | Returns 1 if the extended double-precision floating-point value `a' is
  6409. | less than the corresponding value `b', and 0 otherwise. The comparison
  6410. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6411. | Arithmetic.
  6412. *----------------------------------------------------------------------------*}
  6413. function floatx80_lt(a: floatx80; b: floatx80 ): flag;
  6414. var
  6415. aSign, bSign: flag;
  6416. begin
  6417. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6418. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6419. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6420. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6421. ) begin
  6422. float_raise( float_flag_invalid );
  6423. result := 0;
  6424. end;
  6425. aSign := extractFloatx80Sign( a );
  6426. bSign := extractFloatx80Sign( b );
  6427. if ( aSign <> bSign ) begin
  6428. result :=
  6429. aSign
  6430. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6431. <> 0 );
  6432. end;
  6433. result :=
  6434. aSign ? lt128( b.high, b.low, a.high, a.low )
  6435. : lt128( a.high, a.low, b.high, b.low );
  6436. end;
  6437. {*----------------------------------------------------------------------------
  6438. | Returns 1 if the extended double-precision floating-point value `a' is equal
  6439. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  6440. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6441. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6442. *----------------------------------------------------------------------------*}
  6443. function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
  6444. begin
  6445. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6446. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6447. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6448. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6449. ) begin
  6450. float_raise( float_flag_invalid );
  6451. result := 0;
  6452. end;
  6453. result :=
  6454. ( a.low = b.low )
  6455. and ( ( a.high = b.high )
  6456. or ( ( a.low = 0 )
  6457. and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
  6458. );
  6459. end;
  6460. {*----------------------------------------------------------------------------
  6461. | Returns 1 if the extended double-precision floating-point value `a' is less
  6462. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  6463. | do not cause an exception. Otherwise, the comparison is performed according
  6464. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6465. *----------------------------------------------------------------------------*}
  6466. function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
  6467. var
  6468. aSign, bSign: flag;
  6469. begin
  6470. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6471. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6472. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6473. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6474. ) begin
  6475. if ( floatx80_is_signaling_nan( a )
  6476. or floatx80_is_signaling_nan( b ) ) begin
  6477. float_raise( float_flag_invalid );
  6478. end;
  6479. result := 0;
  6480. end;
  6481. aSign := extractFloatx80Sign( a );
  6482. bSign := extractFloatx80Sign( b );
  6483. if ( aSign <> bSign ) begin
  6484. result :=
  6485. aSign
  6486. or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6487. = 0 );
  6488. end;
  6489. result :=
  6490. aSign ? le128( b.high, b.low, a.high, a.low )
  6491. : le128( a.high, a.low, b.high, b.low );
  6492. end;
  6493. {*----------------------------------------------------------------------------
  6494. | Returns 1 if the extended double-precision floating-point value `a' is less
  6495. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  6496. | an exception. Otherwise, the comparison is performed according to the
  6497. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6498. *----------------------------------------------------------------------------*}
  6499. function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
  6500. var
  6501. aSign, bSign: flag;
  6502. begin
  6503. if ( ( ( extractFloatx80Exp( a ) = $7FFF )
  6504. and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
  6505. or ( ( extractFloatx80Exp( b ) = $7FFF )
  6506. and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
  6507. ) begin
  6508. if ( floatx80_is_signaling_nan( a )
  6509. or floatx80_is_signaling_nan( b ) ) begin
  6510. float_raise( float_flag_invalid );
  6511. end;
  6512. result := 0;
  6513. end;
  6514. aSign := extractFloatx80Sign( a );
  6515. bSign := extractFloatx80Sign( b );
  6516. if ( aSign <> bSign ) begin
  6517. result :=
  6518. aSign
  6519. and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  6520. <> 0 );
  6521. end;
  6522. result :=
  6523. aSign ? lt128( b.high, b.low, a.high, a.low )
  6524. : lt128( a.high, a.low, b.high, b.low );
  6525. end;
  6526. {$endif FPC_SOFTFLOAT_FLOATX80}
  6527. {$ifdef FPC_SOFTFLOAT_FLOAT128}
  6528. {*----------------------------------------------------------------------------
  6529. | Returns the least-significant 64 fraction bits of the quadruple-precision
  6530. | floating-point value `a'.
  6531. *----------------------------------------------------------------------------*}
  6532. function extractFloat128Frac1(a : float128): bits64;
  6533. begin
  6534. result:=a.low;
  6535. end;
  6536. {*----------------------------------------------------------------------------
  6537. | Returns the most-significant 48 fraction bits of the quadruple-precision
  6538. | floating-point value `a'.
  6539. *----------------------------------------------------------------------------*}
  6540. function extractFloat128Frac0(a : float128): bits64;
  6541. begin
  6542. result:=a.high and int64($0000FFFFFFFFFFFF);
  6543. end;
  6544. {*----------------------------------------------------------------------------
  6545. | Returns the exponent bits of the quadruple-precision floating-point value
  6546. | `a'.
  6547. *----------------------------------------------------------------------------*}
  6548. function extractFloat128Exp(a : float128): int32;
  6549. begin
  6550. result:=( a.high shr 48 ) and $7FFF;
  6551. end;
  6552. {*----------------------------------------------------------------------------
  6553. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  6554. *----------------------------------------------------------------------------*}
  6555. function extractFloat128Sign(a : float128): flag;
  6556. begin
  6557. result:=a.high shr 63;
  6558. end;
  6559. {*----------------------------------------------------------------------------
  6560. | Normalizes the subnormal quadruple-precision floating-point value
  6561. | represented by the denormalized significand formed by the concatenation of
  6562. | `aSig0' and `aSig1'. The normalized exponent is stored at the location
  6563. | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  6564. | significand are stored at the location pointed to by `zSig0Ptr', and the
  6565. | least significant 64 bits of the normalized significand are stored at the
  6566. | location pointed to by `zSig1Ptr'.
  6567. *----------------------------------------------------------------------------*}
  6568. procedure normalizeFloat128Subnormal(
  6569. aSig0: bits64;
  6570. aSig1: bits64;
  6571. var zExpPtr: int32;
  6572. var zSig0Ptr: bits64;
  6573. var zSig1Ptr: bits64);
  6574. var
  6575. shiftCount: int8;
  6576. begin
  6577. if ( aSig0 = 0 ) then
  6578. begin
  6579. shiftCount := countLeadingZeros64( aSig1 ) - 15;
  6580. if ( shiftCount < 0 ) then
  6581. begin
  6582. zSig0Ptr := aSig1 shr ( - shiftCount );
  6583. zSig1Ptr := aSig1 shl ( shiftCount and 63 );
  6584. end
  6585. else begin
  6586. zSig0Ptr := aSig1 shl shiftCount;
  6587. zSig1Ptr := 0;
  6588. end;
  6589. zExpPtr := - shiftCount - 63;
  6590. end
  6591. else begin
  6592. shiftCount := countLeadingZeros64( aSig0 ) - 15;
  6593. shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  6594. zExpPtr := 1 - shiftCount;
  6595. end;
  6596. end;
  6597. {*----------------------------------------------------------------------------
  6598. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  6599. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  6600. | floating-point value, returning the result. After being shifted into the
  6601. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  6602. | added together to form the most significant 32 bits of the result. This
  6603. | means that any integer portion of `zSig0' will be added into the exponent.
  6604. | Since a properly normalized significand will have an integer portion equal
  6605. | to 1, the `zExp' input should be 1 less than the desired result exponent
  6606. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  6607. | significand.
  6608. *----------------------------------------------------------------------------*}
  6609. function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
  6610. var
  6611. z: float128;
  6612. begin
  6613. z.low := zSig1;
  6614. z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
  6615. result:=z;
  6616. end;
  6617. {*----------------------------------------------------------------------------
  6618. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6619. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  6620. | and `zSig2', and returns the proper quadruple-precision floating-point value
  6621. | corresponding to the abstract input. Ordinarily, the abstract value is
  6622. | simply rounded and packed into the quadruple-precision format, with the
  6623. | inexact exception raised if the abstract input cannot be represented
  6624. | exactly. However, if the abstract value is too large, the overflow and
  6625. | inexact exceptions are raised and an infinity or maximal finite value is
  6626. | returned. If the abstract value is too small, the input value is rounded to
  6627. | a subnormal number, and the underflow and inexact exceptions are raised if
  6628. | the abstract input cannot be represented exactly as a subnormal quadruple-
  6629. | precision floating-point number.
  6630. | The input significand must be normalized or smaller. If the input
  6631. | significand is not normalized, `zExp' must be 0; in that case, the result
  6632. | returned is a subnormal number, and it must not require rounding. In the
  6633. | usual case that the input significand is normalized, `zExp' must be 1 less
  6634. | than the ``true'' floating-point exponent. The handling of underflow and
  6635. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6636. *----------------------------------------------------------------------------*}
  6637. function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
  6638. var
  6639. roundingMode: int8;
  6640. roundNearestEven, increment, isTiny: flag;
  6641. begin
  6642. roundingMode := softfloat_rounding_mode;
  6643. roundNearestEven := ord( roundingMode = float_round_nearest_even );
  6644. increment := ord( sbits64(zSig2) < 0 );
  6645. if ( roundNearestEven=0 ) then
  6646. begin
  6647. if ( roundingMode = float_round_to_zero ) then
  6648. begin
  6649. increment := 0;
  6650. end
  6651. else begin
  6652. if ( zSign<>0 ) then
  6653. begin
  6654. increment := ord( roundingMode = float_round_down ) and zSig2;
  6655. end
  6656. else begin
  6657. increment := ord( roundingMode = float_round_up ) and zSig2;
  6658. end;
  6659. end;
  6660. end;
  6661. if ( $7FFD <= bits32(zExp) ) then
  6662. begin
  6663. if ( ord( $7FFD < zExp )
  6664. or ( ord( zExp = $7FFD )
  6665. and eq128(
  6666. int64( $0001FFFFFFFFFFFF ),
  6667. int64( $FFFFFFFFFFFFFFFF ),
  6668. zSig0,
  6669. zSig1
  6670. )
  6671. and increment
  6672. )
  6673. )<>0 then
  6674. begin
  6675. float_raise( float_flag_overflow or float_flag_inexact );
  6676. if ( ord( roundingMode = float_round_to_zero )
  6677. or ( zSign and ord( roundingMode = float_round_up ) )
  6678. or ( not(zSign) and ord( roundingMode = float_round_down ) )
  6679. )<>0 then
  6680. begin
  6681. result :=
  6682. packFloat128(
  6683. zSign,
  6684. $7FFE,
  6685. int64( $0000FFFFFFFFFFFF ),
  6686. int64( $FFFFFFFFFFFFFFFF )
  6687. );
  6688. end;
  6689. result:=packFloat128( zSign, $7FFF, 0, 0 );
  6690. end;
  6691. if ( zExp < 0 ) then
  6692. begin
  6693. isTiny :=
  6694. ord(( float_detect_tininess = float_tininess_before_rounding )
  6695. or ( zExp < -1 )
  6696. or not( increment<>0 )
  6697. or boolean(lt128(
  6698. zSig0,
  6699. zSig1,
  6700. int64( $0001FFFFFFFFFFFF ),
  6701. int64( $FFFFFFFFFFFFFFFF )
  6702. )));
  6703. shift128ExtraRightJamming(
  6704. zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
  6705. zExp := 0;
  6706. if ( isTiny and zSig2 )<>0 then
  6707. float_raise( float_flag_underflow );
  6708. if ( roundNearestEven<>0 ) then
  6709. begin
  6710. increment := ord( sbits64(zSig2) < 0 );
  6711. end
  6712. else begin
  6713. if ( zSign<>0 ) then
  6714. begin
  6715. increment := ord( roundingMode = float_round_down ) and zSig2;
  6716. end
  6717. else begin
  6718. increment := ord( roundingMode = float_round_up ) and zSig2;
  6719. end;
  6720. end;
  6721. end;
  6722. end;
  6723. if ( zSig2<>0 ) then
  6724. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6725. if ( increment<>0 ) then
  6726. begin
  6727. add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
  6728. zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
  6729. end
  6730. else begin
  6731. if ( ( zSig0 or zSig1 ) = 0 ) then
  6732. zExp := 0;
  6733. end;
  6734. result:=packFloat128( zSign, zExp, zSig0, zSig1 );
  6735. end;
  6736. {*----------------------------------------------------------------------------
  6737. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  6738. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  6739. | returns the proper quadruple-precision floating-point value corresponding
  6740. | to the abstract input. This routine is just like `roundAndPackFloat128'
  6741. | except that the input significand has fewer bits and does not have to be
  6742. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  6743. | point exponent.
  6744. *----------------------------------------------------------------------------*}
  6745. function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
  6746. var
  6747. shiftCount: int8;
  6748. zSig2: bits64;
  6749. begin
  6750. if ( zSig0 = 0 ) then
  6751. begin
  6752. zSig0 := zSig1;
  6753. zSig1 := 0;
  6754. dec(zExp, 64);
  6755. end;
  6756. shiftCount := countLeadingZeros64( zSig0 ) - 15;
  6757. if ( 0 <= shiftCount ) then
  6758. begin
  6759. zSig2 := 0;
  6760. shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
  6761. end
  6762. else begin
  6763. shift128ExtraRightJamming(
  6764. zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
  6765. end;
  6766. dec(zExp, shiftCount);
  6767. result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  6768. end;
  6769. {*----------------------------------------------------------------------------
  6770. | Returns the result of converting the quadruple-precision floating-point
  6771. | value `a' to the 32-bit two's complement integer format. The conversion
  6772. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6773. | Arithmetic---which means in particular that the conversion is rounded
  6774. | according to the current rounding mode. If `a' is a NaN, the largest
  6775. | positive integer is returned. Otherwise, if the conversion overflows, the
  6776. | largest integer with the same sign as `a' is returned.
  6777. *----------------------------------------------------------------------------*}
  6778. function float128_to_int32(a: float128): int32;
  6779. var
  6780. aSign: flag;
  6781. aExp, shiftCount: int32;
  6782. aSig0, aSig1: bits64;
  6783. begin
  6784. aSig1 := extractFloat128Frac1( a );
  6785. aSig0 := extractFloat128Frac0( a );
  6786. aExp := extractFloat128Exp( a );
  6787. aSign := extractFloat128Sign( a );
  6788. if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
  6789. aSign := 0;
  6790. if ( aExp<>0 ) then
  6791. aSig0 := aSig0 or int64( $0001000000000000 );
  6792. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6793. shiftCount := $4028 - aExp;
  6794. if ( 0 < shiftCount ) then
  6795. shift64RightJamming( aSig0, shiftCount, aSig0 );
  6796. result := roundAndPackInt32( aSign, aSig0 );
  6797. end;
  6798. {*----------------------------------------------------------------------------
  6799. | Returns the result of converting the quadruple-precision floating-point
  6800. | value `a' to the 32-bit two's complement integer format. The conversion
  6801. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6802. | Arithmetic, except that the conversion is always rounded toward zero. If
  6803. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  6804. | conversion overflows, the largest integer with the same sign as `a' is
  6805. | returned.
  6806. *----------------------------------------------------------------------------*}
  6807. function float128_to_int32_round_to_zero(a: float128): int32;
  6808. var
  6809. aSign: flag;
  6810. aExp, shiftCount: int32;
  6811. aSig0, aSig1, savedASig: bits64;
  6812. z: int32;
  6813. label
  6814. invalid;
  6815. begin
  6816. aSig1 := extractFloat128Frac1( a );
  6817. aSig0 := extractFloat128Frac0( a );
  6818. aExp := extractFloat128Exp( a );
  6819. aSign := extractFloat128Sign( a );
  6820. aSig0 := aSig0 or ord( aSig1 <> 0 );
  6821. if ( $401E < aExp ) then
  6822. begin
  6823. if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
  6824. aSign := 0;
  6825. goto invalid;
  6826. end
  6827. else if ( aExp < $3FFF ) then
  6828. begin
  6829. if ( aExp or aSig0 )<>0 then
  6830. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6831. result := 0;
  6832. exit;
  6833. end;
  6834. aSig0 := aSig0 or int64( $0001000000000000 );
  6835. shiftCount := $402F - aExp;
  6836. savedASig := aSig0;
  6837. aSig0 := aSig0 shr shiftCount;
  6838. z := aSig0;
  6839. if ( aSign )<>0 then
  6840. z := - z;
  6841. if ( ord( z < 0 ) xor aSign )<>0 then
  6842. begin
  6843. invalid:
  6844. float_raise( float_flag_invalid );
  6845. if aSign<>0 then
  6846. result:=$80000000
  6847. else
  6848. result:=$7FFFFFFF;
  6849. exit;
  6850. end;
  6851. if ( ( aSig0 shl shiftCount ) <> savedASig ) then
  6852. begin
  6853. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6854. end;
  6855. result := z;
  6856. end;
  6857. {*----------------------------------------------------------------------------
  6858. | Returns the result of converting the quadruple-precision floating-point
  6859. | value `a' to the 64-bit two's complement integer format. The conversion
  6860. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6861. | Arithmetic---which means in particular that the conversion is rounded
  6862. | according to the current rounding mode. If `a' is a NaN, the largest
  6863. | positive integer is returned. Otherwise, if the conversion overflows, the
  6864. | largest integer with the same sign as `a' is returned.
  6865. *----------------------------------------------------------------------------*}
  6866. function float128_to_int64(a: float128): int64;
  6867. var
  6868. aSign: flag;
  6869. aExp, shiftCount: int32;
  6870. aSig0, aSig1: bits64;
  6871. begin
  6872. aSig1 := extractFloat128Frac1( a );
  6873. aSig0 := extractFloat128Frac0( a );
  6874. aExp := extractFloat128Exp( a );
  6875. aSign := extractFloat128Sign( a );
  6876. if ( aExp<>0 ) then
  6877. aSig0 := aSig0 or int64( $0001000000000000 );
  6878. shiftCount := $402F - aExp;
  6879. if ( shiftCount <= 0 ) then
  6880. begin
  6881. if ( $403E < aExp ) then
  6882. begin
  6883. float_raise( float_flag_invalid );
  6884. if ( (aSign=0)
  6885. or ( ( aExp = $7FFF )
  6886. and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
  6887. )
  6888. ) then
  6889. begin
  6890. result := int64( $7FFFFFFFFFFFFFFF );
  6891. end;
  6892. result := int64( $8000000000000000 );
  6893. end;
  6894. shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
  6895. end
  6896. else begin
  6897. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
  6898. end;
  6899. result := roundAndPackInt64( aSign, aSig0, aSig1 );
  6900. end;
  6901. {*----------------------------------------------------------------------------
  6902. | Returns the result of converting the quadruple-precision floating-point
  6903. | value `a' to the 64-bit two's complement integer format. The conversion
  6904. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6905. | Arithmetic, except that the conversion is always rounded toward zero.
  6906. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  6907. | the conversion overflows, the largest integer with the same sign as `a' is
  6908. | returned.
  6909. *----------------------------------------------------------------------------*}
  6910. function float128_to_int64_round_to_zero(a: float128): int64;
  6911. var
  6912. aSign: flag;
  6913. aExp, shiftCount: int32;
  6914. aSig0, aSig1: bits64;
  6915. z: int64;
  6916. begin
  6917. aSig1 := extractFloat128Frac1( a );
  6918. aSig0 := extractFloat128Frac0( a );
  6919. aExp := extractFloat128Exp( a );
  6920. aSign := extractFloat128Sign( a );
  6921. if ( aExp<>0 ) then
  6922. aSig0 := aSig0 or int64( $0001000000000000 );
  6923. shiftCount := aExp - $402F;
  6924. if ( 0 < shiftCount ) then
  6925. begin
  6926. if ( $403E <= aExp ) then
  6927. begin
  6928. aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
  6929. if ( ( a.high = int64( $C03E000000000000 ) )
  6930. and ( aSig1 < int64( $0002000000000000 ) ) ) then
  6931. begin
  6932. if ( aSig1<>0 ) then
  6933. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6934. end
  6935. else begin
  6936. float_raise( float_flag_invalid );
  6937. if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
  6938. begin
  6939. result := int64( $7FFFFFFFFFFFFFFF );
  6940. exit;
  6941. end;
  6942. end;
  6943. result := int64( $8000000000000000 );
  6944. exit;
  6945. end;
  6946. z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
  6947. if ( int64( aSig1 shl shiftCount )<>0 ) then
  6948. begin
  6949. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6950. end;
  6951. end
  6952. else begin
  6953. if ( aExp < $3FFF ) then
  6954. begin
  6955. if ( aExp or aSig0 or aSig1 )<>0 then
  6956. begin
  6957. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6958. end;
  6959. result := 0;
  6960. exit;
  6961. end;
  6962. z := aSig0 shr ( - shiftCount );
  6963. if ( (aSig1<>0)
  6964. or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
  6965. begin
  6966. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  6967. end;
  6968. end;
  6969. if ( aSign<>0 ) then
  6970. z := - z;
  6971. result := z;
  6972. end;
  6973. {*----------------------------------------------------------------------------
  6974. | Returns the result of converting the quadruple-precision floating-point
  6975. | value `a' to the single-precision floating-point format. The conversion
  6976. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  6977. | Arithmetic.
  6978. *----------------------------------------------------------------------------*}
  6979. function float128_to_float32(a: float128): float32;
  6980. var
  6981. aSign: flag;
  6982. aExp: int32;
  6983. aSig0, aSig1: bits64;
  6984. zSig: bits32;
  6985. begin
  6986. aSig1 := extractFloat128Frac1( a );
  6987. aSig0 := extractFloat128Frac0( a );
  6988. aExp := extractFloat128Exp( a );
  6989. aSign := extractFloat128Sign( a );
  6990. if ( aExp = $7FFF ) then
  6991. begin
  6992. if ( aSig0 or aSig1 )<>0 then
  6993. begin
  6994. result := commonNaNToFloat32( float128ToCommonNaN( a ) );
  6995. exit;
  6996. end;
  6997. result := packFloat32( aSign, $FF, 0 );
  6998. exit;
  6999. end;
  7000. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7001. shift64RightJamming( aSig0, 18, aSig0 );
  7002. zSig := aSig0;
  7003. if ( aExp or zSig )<>0 then
  7004. begin
  7005. zSig := zSig or $40000000;
  7006. dec(aExp,$3F81);
  7007. end;
  7008. result := roundAndPackFloat32( aSign, aExp, zSig );
  7009. end;
  7010. {*----------------------------------------------------------------------------
  7011. | Returns the result of converting the quadruple-precision floating-point
  7012. | value `a' to the double-precision floating-point format. The conversion
  7013. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7014. | Arithmetic.
  7015. *----------------------------------------------------------------------------*}
  7016. function float128_to_float64(a: float128): float64;
  7017. var
  7018. aSign: flag;
  7019. aExp: int32;
  7020. aSig0, aSig1: bits64;
  7021. begin
  7022. aSig1 := extractFloat128Frac1( a );
  7023. aSig0 := extractFloat128Frac0( a );
  7024. aExp := extractFloat128Exp( a );
  7025. aSign := extractFloat128Sign( a );
  7026. if ( aExp = $7FFF ) then
  7027. begin
  7028. if ( aSig0 or aSig1 )<>0 then
  7029. begin
  7030. commonNaNToFloat64( float128ToCommonNaN( a ),result);
  7031. exit;
  7032. end;
  7033. result:=packFloat64( aSign, $7FF, 0);
  7034. exit;
  7035. end;
  7036. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7037. aSig0 := aSig0 or ord( aSig1 <> 0 );
  7038. if ( aExp or aSig0 )<>0 then
  7039. begin
  7040. aSig0 := aSig0 or int64( $4000000000000000 );
  7041. dec(aExp,$3C01);
  7042. end;
  7043. result := roundAndPackFloat64( aSign, aExp, aSig0 );
  7044. end;
  7045. {$ifdef FPC_SOFTFLOAT_FLOATX80}
  7046. {*----------------------------------------------------------------------------
  7047. | Returns the result of converting the quadruple-precision floating-point
  7048. | value `a' to the extended double-precision floating-point format. The
  7049. | conversion is performed according to the IEC/IEEE Standard for Binary
  7050. | Floating-Point Arithmetic.
  7051. *----------------------------------------------------------------------------*}
  7052. function float128_to_floatx80(a: float128): floatx80;
  7053. var
  7054. aSign: flag;
  7055. aExp: int32;
  7056. aSig0, aSig1: bits64;
  7057. begin
  7058. aSig1 := extractFloat128Frac1( a );
  7059. aSig0 := extractFloat128Frac0( a );
  7060. aExp := extractFloat128Exp( a );
  7061. aSign := extractFloat128Sign( a );
  7062. if ( aExp = $7FFF ) begin
  7063. if ( aSig0 or aSig1 ) begin
  7064. result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
  7065. exit;
  7066. end;
  7067. result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
  7068. exit;
  7069. end;
  7070. if ( aExp = 0 ) begin
  7071. if ( ( aSig0 or aSig1 ) = 0 ) then
  7072. begin
  7073. result := packFloatx80( aSign, 0, 0 );
  7074. exit;
  7075. end;
  7076. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7077. end;
  7078. else begin
  7079. aSig0 or= int64( $0001000000000000 );
  7080. end;
  7081. shortShift128Left( aSig0, aSig1, 15, aSig0, aSig1 );
  7082. result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
  7083. end;
  7084. {$endif FPC_SOFTFLOAT_FLOATX80}
  7085. {*----------------------------------------------------------------------------
  7086. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  7087. | Returns the result as a quadruple-precision floating-point value. The
  7088. | operation is performed according to the IEC/IEEE Standard for Binary
  7089. | Floating-Point Arithmetic.
  7090. *----------------------------------------------------------------------------*}
  7091. function float128_round_to_int(a: float128): float128;
  7092. var
  7093. aSign: flag;
  7094. aExp: int32;
  7095. lastBitMask, roundBitsMask: bits64;
  7096. roundingMode: int8;
  7097. z: float128;
  7098. begin
  7099. aExp := extractFloat128Exp( a );
  7100. if ( $402F <= aExp ) then
  7101. begin
  7102. if ( $406F <= aExp ) then
  7103. begin
  7104. if ( ( aExp = $7FFF )
  7105. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
  7106. ) then
  7107. begin
  7108. result := propagateFloat128NaN( a, a );
  7109. exit;
  7110. end;
  7111. result := a;
  7112. exit;
  7113. end;
  7114. lastBitMask := 1;
  7115. lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
  7116. roundBitsMask := lastBitMask - 1;
  7117. z := a;
  7118. roundingMode := softfloat_rounding_mode;
  7119. if ( roundingMode = float_round_nearest_even ) then
  7120. begin
  7121. if ( lastBitMask )<>0 then
  7122. begin
  7123. add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
  7124. if ( ( z.low and roundBitsMask ) = 0 ) then
  7125. z.low := z.low and not(lastBitMask);
  7126. end
  7127. else begin
  7128. if ( sbits64(z.low) < 0 ) then
  7129. begin
  7130. inc(z.high);
  7131. if ( bits64( z.low shl 1 ) = 0 ) then
  7132. z.high := z.high and not(1);
  7133. end;
  7134. end;
  7135. end
  7136. else if ( roundingMode <> float_round_to_zero ) then
  7137. begin
  7138. if ( extractFloat128Sign( z )
  7139. xor ord( roundingMode = float_round_up ) )<>0 then
  7140. begin
  7141. add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
  7142. end;
  7143. end;
  7144. z.low := z.low and not(roundBitsMask);
  7145. end
  7146. else begin
  7147. if ( aExp < $3FFF ) then
  7148. begin
  7149. if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
  7150. begin
  7151. result := a;
  7152. exit;
  7153. end;
  7154. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7155. aSign := extractFloat128Sign( a );
  7156. case softfloat_rounding_mode of
  7157. float_round_nearest_even:
  7158. if ( ( aExp = $3FFE )
  7159. and ( (extractFloat128Frac0( a )<>0)
  7160. or (extractFloat128Frac1( a )<>0) )
  7161. ) then begin
  7162. begin
  7163. result := packFloat128( aSign, $3FFF, 0, 0 );
  7164. exit;
  7165. end;
  7166. end;
  7167. float_round_down:
  7168. begin
  7169. if aSign<>0 then
  7170. result:=packFloat128( 1, $3FFF, 0, 0 )
  7171. else
  7172. result:=packFloat128( 0, 0, 0, 0 );
  7173. exit;
  7174. end;
  7175. float_round_up:
  7176. begin
  7177. if aSign<>0 then
  7178. result := packFloat128( 1, 0, 0, 0 )
  7179. else
  7180. result:=packFloat128( 0, $3FFF, 0, 0 );
  7181. exit;
  7182. end;
  7183. end;
  7184. result := packFloat128( aSign, 0, 0, 0 );
  7185. exit;
  7186. end;
  7187. lastBitMask := 1;
  7188. lastBitMask := lastBitMask shl ($402F - aExp);
  7189. roundBitsMask := lastBitMask - 1;
  7190. z.low := 0;
  7191. z.high := a.high;
  7192. roundingMode := softfloat_rounding_mode;
  7193. if ( roundingMode = float_round_nearest_even ) then begin
  7194. inc(z.high,lastBitMask shr 1);
  7195. if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) then begin
  7196. z.high := z.high and not(lastBitMask);
  7197. end;
  7198. end
  7199. else if ( roundingMode <> float_round_to_zero ) then begin
  7200. if ( (extractFloat128Sign( z )<>0)
  7201. xor ( roundingMode = float_round_up ) ) then begin
  7202. z.high := z.high or ord( a.low <> 0 );
  7203. z.high := z.high+roundBitsMask;
  7204. end;
  7205. end;
  7206. z.high := z.high and not(roundBitsMask);
  7207. end;
  7208. if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) then begin
  7209. softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
  7210. end;
  7211. result := z;
  7212. end;
  7213. {*----------------------------------------------------------------------------
  7214. | Returns the result of adding the absolute values of the quadruple-precision
  7215. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  7216. | before being returned. `zSign' is ignored if the result is a NaN.
  7217. | The addition is performed according to the IEC/IEEE Standard for Binary
  7218. | Floating-Point Arithmetic.
  7219. *----------------------------------------------------------------------------*}
  7220. function addFloat128Sigs(a,b : float128; zSign : flag ): float128;
  7221. var
  7222. aExp, bExp, zExp: int32;
  7223. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7224. expDiff: int32;
  7225. label
  7226. shiftRight1,roundAndPack;
  7227. begin
  7228. aSig1 := extractFloat128Frac1( a );
  7229. aSig0 := extractFloat128Frac0( a );
  7230. aExp := extractFloat128Exp( a );
  7231. bSig1 := extractFloat128Frac1( b );
  7232. bSig0 := extractFloat128Frac0( b );
  7233. bExp := extractFloat128Exp( b );
  7234. expDiff := aExp - bExp;
  7235. if ( 0 < expDiff ) then begin
  7236. if ( aExp = $7FFF ) then begin
  7237. if ( aSig0 or aSig1 )<>0 then
  7238. begin
  7239. result := propagateFloat128NaN( a, b );
  7240. exit;
  7241. end;
  7242. result := a;
  7243. exit;
  7244. end;
  7245. if ( bExp = 0 ) then begin
  7246. dec(expDiff);
  7247. end
  7248. else begin
  7249. bSig0 := bSig0 or int64( $0001000000000000 );
  7250. end;
  7251. shift128ExtraRightJamming(
  7252. bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
  7253. zExp := aExp;
  7254. end
  7255. else if ( expDiff < 0 ) then begin
  7256. if ( bExp = $7FFF ) then begin
  7257. if ( bSig0 or bSig1 )<>0 then
  7258. begin
  7259. result := propagateFloat128NaN( a, b );
  7260. exit;
  7261. end;
  7262. result := packFloat128( zSign, $7FFF, 0, 0 );
  7263. exit;
  7264. end;
  7265. if ( aExp = 0 ) then begin
  7266. inc(expDiff);
  7267. end
  7268. else begin
  7269. aSig0 := aSig0 or int64( $0001000000000000 );
  7270. end;
  7271. shift128ExtraRightJamming(
  7272. aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
  7273. zExp := bExp;
  7274. end
  7275. else begin
  7276. if ( aExp = $7FFF ) then begin
  7277. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7278. result := propagateFloat128NaN( a, b );
  7279. exit;
  7280. end;
  7281. result := a;
  7282. exit;
  7283. end;
  7284. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7285. if ( aExp = 0 ) then
  7286. begin
  7287. result := packFloat128( zSign, 0, zSig0, zSig1 );
  7288. exit;
  7289. end;
  7290. zSig2 := 0;
  7291. zSig0 := zSig0 or int64( $0002000000000000 );
  7292. zExp := aExp;
  7293. goto shiftRight1;
  7294. end;
  7295. aSig0 := aSig0 or int64( $0001000000000000 );
  7296. add128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7297. dec(zExp);
  7298. if ( zSig0 < int64( $0002000000000000 ) ) then goto roundAndPack;
  7299. inc(zExp);
  7300. shiftRight1:
  7301. shift128ExtraRightJamming(
  7302. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7303. roundAndPack:
  7304. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7305. end;
  7306. {*----------------------------------------------------------------------------
  7307. | Returns the result of subtracting the absolute values of the quadruple-
  7308. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  7309. | difference is negated before being returned. `zSign' is ignored if the
  7310. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  7311. | Standard for Binary Floating-Point Arithmetic.
  7312. *----------------------------------------------------------------------------*}
  7313. function subFloat128Sigs( a, b : float128; zSign : flag): float128;
  7314. var
  7315. aExp, bExp, zExp: int32;
  7316. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
  7317. expDiff: int32;
  7318. z: float128;
  7319. label
  7320. aExpBigger,bExpBigger,aBigger,bBigger,normalizeRoundAndPack;
  7321. begin
  7322. aSig1 := extractFloat128Frac1( a );
  7323. aSig0 := extractFloat128Frac0( a );
  7324. aExp := extractFloat128Exp( a );
  7325. bSig1 := extractFloat128Frac1( b );
  7326. bSig0 := extractFloat128Frac0( b );
  7327. bExp := extractFloat128Exp( b );
  7328. expDiff := aExp - bExp;
  7329. shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
  7330. shortShift128Left( bSig0, bSig1, 14, bSig0, bSig1 );
  7331. if ( 0 < expDiff ) then goto aExpBigger;
  7332. if ( expDiff < 0 ) then goto bExpBigger;
  7333. if ( aExp = $7FFF ) then begin
  7334. if ( aSig0 or aSig1 or bSig0 or bSig1 )<>0 then begin
  7335. result := propagateFloat128NaN( a, b );
  7336. exit;
  7337. end;
  7338. float_raise( float_flag_invalid );
  7339. z.low := float128_default_nan_low;
  7340. z.high := float128_default_nan_high;
  7341. result := z;
  7342. exit;
  7343. end;
  7344. if ( aExp = 0 ) then begin
  7345. aExp := 1;
  7346. bExp := 1;
  7347. end;
  7348. if ( bSig0 < aSig0 ) then goto aBigger;
  7349. if ( aSig0 < bSig0 ) then goto bBigger;
  7350. if ( bSig1 < aSig1 ) then goto aBigger;
  7351. if ( aSig1 < bSig1 ) then goto bBigger;
  7352. result := packFloat128( ord(softfloat_rounding_mode = float_round_down), 0, 0, 0 );
  7353. exit;
  7354. bExpBigger:
  7355. if ( bExp = $7FFF ) then begin
  7356. if ( bSig0 or bSig1 )<>0 then
  7357. begin
  7358. result := propagateFloat128NaN( a, b );
  7359. exit;
  7360. end;
  7361. result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
  7362. exit;
  7363. end;
  7364. if ( aExp = 0 ) then begin
  7365. inc(expDiff);
  7366. end
  7367. else begin
  7368. aSig0 := aSig0 or int64( $4000000000000000 );
  7369. end;
  7370. shift128RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7371. bSig0 := bSig0 or int64( $4000000000000000 );
  7372. bBigger:
  7373. sub128( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
  7374. zExp := bExp;
  7375. zSign := zSign xor 1;
  7376. goto normalizeRoundAndPack;
  7377. aExpBigger:
  7378. if ( aExp = $7FFF ) then begin
  7379. if ( aSig0 or aSig1 )<>0 then
  7380. begin
  7381. result := propagateFloat128NaN( a, b );
  7382. exit;
  7383. end;
  7384. result := a;
  7385. exit;
  7386. end;
  7387. if ( bExp = 0 ) then begin
  7388. dec(expDiff);
  7389. end
  7390. else begin
  7391. bSig0 := bSig0 or int64( $4000000000000000 );
  7392. end;
  7393. shift128RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
  7394. aSig0 := aSig0 or int64( $4000000000000000 );
  7395. aBigger:
  7396. sub128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
  7397. zExp := aExp;
  7398. normalizeRoundAndPack:
  7399. dec(zExp);
  7400. result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
  7401. end;
  7402. {*----------------------------------------------------------------------------
  7403. | Returns the result of adding the quadruple-precision floating-point values
  7404. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  7405. | for Binary Floating-Point Arithmetic.
  7406. *----------------------------------------------------------------------------*}
  7407. function float128_add(a: float128; b: float128): float128;
  7408. var
  7409. aSign, bSign: flag;
  7410. begin
  7411. aSign := extractFloat128Sign( a );
  7412. bSign := extractFloat128Sign( b );
  7413. if ( aSign = bSign ) then begin
  7414. result := addFloat128Sigs( a, b, aSign );
  7415. end
  7416. else begin
  7417. result := subFloat128Sigs( a, b, aSign );
  7418. end;
  7419. end;
  7420. {*----------------------------------------------------------------------------
  7421. | Returns the result of subtracting the quadruple-precision floating-point
  7422. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7423. | Standard for Binary Floating-Point Arithmetic.
  7424. *----------------------------------------------------------------------------*}
  7425. function float128_sub(a: float128; b: float128): float128;
  7426. var
  7427. aSign, bSign: flag;
  7428. begin
  7429. aSign := extractFloat128Sign( a );
  7430. bSign := extractFloat128Sign( b );
  7431. if ( aSign = bSign ) then begin
  7432. result := subFloat128Sigs( a, b, aSign );
  7433. end
  7434. else begin
  7435. result := addFloat128Sigs( a, b, aSign );
  7436. end;
  7437. end;
  7438. {*----------------------------------------------------------------------------
  7439. | Returns the result of multiplying the quadruple-precision floating-point
  7440. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  7441. | Standard for Binary Floating-Point Arithmetic.
  7442. *----------------------------------------------------------------------------*}
  7443. function float128_mul(a: float128; b: float128): float128;
  7444. var
  7445. aSign, bSign, zSign: flag;
  7446. aExp, bExp, zExp: int32;
  7447. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
  7448. z: float128;
  7449. label
  7450. invalid;
  7451. begin
  7452. aSig1 := extractFloat128Frac1( a );
  7453. aSig0 := extractFloat128Frac0( a );
  7454. aExp := extractFloat128Exp( a );
  7455. aSign := extractFloat128Sign( a );
  7456. bSig1 := extractFloat128Frac1( b );
  7457. bSig0 := extractFloat128Frac0( b );
  7458. bExp := extractFloat128Exp( b );
  7459. bSign := extractFloat128Sign( b );
  7460. zSign := aSign xor bSign;
  7461. if ( aExp = $7FFF ) then begin
  7462. if ( (( aSig0 or aSig1 )<>0)
  7463. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7464. result := propagateFloat128NaN( a, b );
  7465. exit;
  7466. end;
  7467. if ( ( bExp or bSig0 or bSig1 ) = 0 ) then goto invalid;
  7468. result := packFloat128( zSign, $7FFF, 0, 0 );
  7469. exit;
  7470. end;
  7471. if ( bExp = $7FFF ) then begin
  7472. if ( bSig0 or bSig1 )<>0 then
  7473. begin
  7474. result := propagateFloat128NaN( a, b );
  7475. exit;
  7476. end;
  7477. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7478. invalid:
  7479. float_raise( float_flag_invalid );
  7480. z.low := float128_default_nan_low;
  7481. z.high := float128_default_nan_high;
  7482. result := z;
  7483. exit;
  7484. end;
  7485. result := packFloat128( zSign, $7FFF, 0, 0 );
  7486. exit;
  7487. end;
  7488. if ( aExp = 0 ) then begin
  7489. if ( ( aSig0 or aSig1 ) = 0 ) then
  7490. begin
  7491. result := packFloat128( zSign, 0, 0, 0 );
  7492. exit;
  7493. end;
  7494. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7495. end;
  7496. if ( bExp = 0 ) then begin
  7497. if ( ( bSig0 or bSig1 ) = 0 ) then
  7498. begin
  7499. result := packFloat128( zSign, 0, 0, 0 );
  7500. exit;
  7501. end;
  7502. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7503. end;
  7504. zExp := aExp + bExp - $4000;
  7505. aSig0 := aSig0 or int64( $0001000000000000 );
  7506. shortShift128Left( bSig0, bSig1, 16, bSig0, bSig1 );
  7507. mul128To256( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
  7508. add128( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
  7509. zSig2 := zSig2 or ord( zSig3 <> 0 );
  7510. if ( int64( $0002000000000000 ) <= zSig0 ) then begin
  7511. shift128ExtraRightJamming(
  7512. zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
  7513. inc(zExp);
  7514. end;
  7515. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7516. end;
  7517. {*----------------------------------------------------------------------------
  7518. | Returns the result of dividing the quadruple-precision floating-point value
  7519. | `a' by the corresponding value `b'. The operation is performed according to
  7520. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7521. *----------------------------------------------------------------------------*}
  7522. function float128_div(a: float128; b: float128): float128;
  7523. var
  7524. aSign, bSign, zSign: flag;
  7525. aExp, bExp, zExp: int32;
  7526. aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
  7527. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7528. z: float128;
  7529. label
  7530. invalid;
  7531. begin
  7532. aSig1 := extractFloat128Frac1( a );
  7533. aSig0 := extractFloat128Frac0( a );
  7534. aExp := extractFloat128Exp( a );
  7535. aSign := extractFloat128Sign( a );
  7536. bSig1 := extractFloat128Frac1( b );
  7537. bSig0 := extractFloat128Frac0( b );
  7538. bExp := extractFloat128Exp( b );
  7539. bSign := extractFloat128Sign( b );
  7540. zSign := aSign xor bSign;
  7541. if ( aExp = $7FFF ) then begin
  7542. if ( aSig0 or aSig1 )<>0 then
  7543. begin
  7544. result := propagateFloat128NaN( a, b );
  7545. exit;
  7546. end;
  7547. if ( bExp = $7FFF ) then begin
  7548. if ( bSig0 or bSig1 )<>0 then
  7549. begin
  7550. result := propagateFloat128NaN( a, b );
  7551. exit;
  7552. end;
  7553. goto invalid;
  7554. end;
  7555. result := packFloat128( zSign, $7FFF, 0, 0 );
  7556. exit;
  7557. end;
  7558. if ( bExp = $7FFF ) then begin
  7559. if ( bSig0 or bSig1 )<>0 then
  7560. begin
  7561. result := propagateFloat128NaN( a, b );
  7562. exit;
  7563. end;
  7564. result := packFloat128( zSign, 0, 0, 0 );
  7565. exit;
  7566. end;
  7567. if ( bExp = 0 ) then begin
  7568. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7569. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then begin
  7570. invalid:
  7571. float_raise( float_flag_invalid );
  7572. z.low := float128_default_nan_low;
  7573. z.high := float128_default_nan_high;
  7574. result := z;
  7575. exit;
  7576. end;
  7577. float_raise( float_flag_divbyzero );
  7578. result := packFloat128( zSign, $7FFF, 0, 0 );
  7579. exit;
  7580. end;
  7581. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7582. end;
  7583. if ( aExp = 0 ) then begin
  7584. if ( ( aSig0 or aSig1 ) = 0 ) then
  7585. begin
  7586. result := packFloat128( zSign, 0, 0, 0 );
  7587. exit;
  7588. end;
  7589. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7590. end;
  7591. zExp := aExp - bExp + $3FFD;
  7592. shortShift128Left(
  7593. aSig0 or int64( $0001000000000000 ), aSig1, 15, aSig0, aSig1 );
  7594. shortShift128Left(
  7595. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7596. if ( le128( bSig0, bSig1, aSig0, aSig1 )<>0 ) then begin
  7597. shift128Right( aSig0, aSig1, 1, aSig0, aSig1 );
  7598. inc(zExp);
  7599. end;
  7600. zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7601. mul128By64To192( bSig0, bSig1, zSig0, term0, term1, term2 );
  7602. sub192( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
  7603. while ( sbits64(rem0) < 0 ) do begin
  7604. dec(zSig0);
  7605. add192( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
  7606. end;
  7607. zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
  7608. if ( ( zSig1 and $3FFF ) <= 4 ) then begin
  7609. mul128By64To192( bSig0, bSig1, zSig1, term1, term2, term3 );
  7610. sub192( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
  7611. while ( sbits64(rem1) < 0 ) do begin
  7612. dec(zSig1);
  7613. add192( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
  7614. end;
  7615. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7616. end;
  7617. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, zSig0, zSig1, zSig2 );
  7618. result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
  7619. end;
  7620. {*----------------------------------------------------------------------------
  7621. | Returns the remainder of the quadruple-precision floating-point value `a'
  7622. | with respect to the corresponding value `b'. The operation is performed
  7623. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7624. *----------------------------------------------------------------------------*}
  7625. function float128_rem(a: float128; b: float128): float128;
  7626. var
  7627. aSign, bSign, zSign: flag;
  7628. aExp, bExp, expDiff: int32;
  7629. aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
  7630. allZero, alternateASig0, alternateASig1, sigMean1: bits64;
  7631. sigMean0: sbits64;
  7632. z: float128;
  7633. label
  7634. invalid;
  7635. begin
  7636. aSig1 := extractFloat128Frac1( a );
  7637. aSig0 := extractFloat128Frac0( a );
  7638. aExp := extractFloat128Exp( a );
  7639. aSign := extractFloat128Sign( a );
  7640. bSig1 := extractFloat128Frac1( b );
  7641. bSig0 := extractFloat128Frac0( b );
  7642. bExp := extractFloat128Exp( b );
  7643. bSign := extractFloat128Sign( b );
  7644. if ( aExp = $7FFF ) then begin
  7645. if ( (( aSig0 or aSig1 )<>0)
  7646. or ( ( bExp = $7FFF ) and (( bSig0 or bSig1 )<>0) ) ) then begin
  7647. result := propagateFloat128NaN( a, b );
  7648. exit;
  7649. end;
  7650. goto invalid;
  7651. end;
  7652. if ( bExp = $7FFF ) then begin
  7653. if ( bSig0 or bSig1 )<>0 then
  7654. begin
  7655. result := propagateFloat128NaN( a, b );
  7656. exit;
  7657. end;
  7658. result := a;
  7659. exit;
  7660. end;
  7661. if ( bExp = 0 ) then begin
  7662. if ( ( bSig0 or bSig1 ) = 0 ) then begin
  7663. invalid:
  7664. float_raise( float_flag_invalid );
  7665. z.low := float128_default_nan_low;
  7666. z.high := float128_default_nan_high;
  7667. result := z;
  7668. exit;
  7669. end;
  7670. normalizeFloat128Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
  7671. end;
  7672. if ( aExp = 0 ) then begin
  7673. if ( ( aSig0 or aSig1 ) = 0 ) then
  7674. begin
  7675. result := a;
  7676. exit;
  7677. end;
  7678. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7679. end;
  7680. expDiff := aExp - bExp;
  7681. if ( expDiff < -1 ) then
  7682. begin
  7683. result := a;
  7684. exit;
  7685. end;
  7686. shortShift128Left(
  7687. aSig0 or int64( $0001000000000000 ),
  7688. aSig1,
  7689. 15 - ord( expDiff < 0 ),
  7690. aSig0,
  7691. aSig1
  7692. );
  7693. shortShift128Left(
  7694. bSig0 or int64( $0001000000000000 ), bSig1, 15, bSig0, bSig1 );
  7695. q := le128( bSig0, bSig1, aSig0, aSig1 );
  7696. if ( q )<>0 then sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7697. dec(expDiff,64);
  7698. while ( 0 < expDiff ) do begin
  7699. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7700. if ( 4 < q ) then
  7701. q := q - 4
  7702. else
  7703. q := 0;
  7704. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7705. shortShift192Left( term0, term1, term2, 61, term1, term2, allZero );
  7706. shortShift128Left( aSig0, aSig1, 61, aSig0, allZero );
  7707. sub128( aSig0, 0, term1, term2, aSig0, aSig1 );
  7708. dec(expDiff,61);
  7709. end;
  7710. if ( -64 < expDiff ) then begin
  7711. q := estimateDiv128To64( aSig0, aSig1, bSig0 );
  7712. if ( 4 < q ) then
  7713. q := q - 4
  7714. else
  7715. q := 0;
  7716. q := q shr (- expDiff);
  7717. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7718. inc(expDiff,52);
  7719. if ( expDiff < 0 ) then begin
  7720. shift128Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
  7721. end
  7722. else begin
  7723. shortShift128Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
  7724. end;
  7725. mul128By64To192( bSig0, bSig1, q, term0, term1, term2 );
  7726. sub128( aSig0, aSig1, term1, term2, aSig0, aSig1 );
  7727. end
  7728. else begin
  7729. shift128Right( aSig0, aSig1, 12, aSig0, aSig1 );
  7730. shift128Right( bSig0, bSig1, 12, bSig0, bSig1 );
  7731. end;
  7732. repeat
  7733. alternateASig0 := aSig0;
  7734. alternateASig1 := aSig1;
  7735. inc(q);
  7736. sub128( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
  7737. until not( 0 <= sbits64(aSig0) );
  7738. add128(
  7739. aSig0, aSig1, alternateASig0, alternateASig1, bits64(sigMean0), sigMean1 );
  7740. if ( ( sigMean0 < 0 )
  7741. or ( ( ( sigMean0 or sigMean1 ) = 0 ) and (( q and 1 )<>0) ) ) then begin
  7742. aSig0 := alternateASig0;
  7743. aSig1 := alternateASig1;
  7744. end;
  7745. zSign := ord( sbits64(aSig0) < 0 );
  7746. if ( zSign<>0 ) then sub128( 0, 0, aSig0, aSig1, aSig0, aSig1 );
  7747. result :=
  7748. normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
  7749. end;
  7750. {*----------------------------------------------------------------------------
  7751. | Returns the square root of the quadruple-precision floating-point value `a'.
  7752. | The operation is performed according to the IEC/IEEE Standard for Binary
  7753. | Floating-Point Arithmetic.
  7754. *----------------------------------------------------------------------------*}
  7755. function float128_sqrt(a: float128): float128;
  7756. var
  7757. aSign: flag;
  7758. aExp, zExp: int32;
  7759. aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
  7760. rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
  7761. z: float128;
  7762. label
  7763. invalid;
  7764. begin
  7765. aSig1 := extractFloat128Frac1( a );
  7766. aSig0 := extractFloat128Frac0( a );
  7767. aExp := extractFloat128Exp( a );
  7768. aSign := extractFloat128Sign( a );
  7769. if ( aExp = $7FFF ) then begin
  7770. if ( aSig0 or aSig1 )<>0 then
  7771. begin
  7772. result := propagateFloat128NaN( a, a );
  7773. exit;
  7774. end;
  7775. if ( aSign=0 ) then
  7776. begin
  7777. result := a;
  7778. exit;
  7779. end;
  7780. goto invalid;
  7781. end;
  7782. if ( aSign<>0 ) then begin
  7783. if ( ( aExp or aSig0 or aSig1 ) = 0 ) then
  7784. begin
  7785. result := a;
  7786. exit;
  7787. end;
  7788. invalid:
  7789. float_raise( float_flag_invalid );
  7790. z.low := float128_default_nan_low;
  7791. z.high := float128_default_nan_high;
  7792. result := z;
  7793. exit;
  7794. end;
  7795. if ( aExp = 0 ) then begin
  7796. if ( ( aSig0 or aSig1 ) = 0 ) then
  7797. begin
  7798. result := packFloat128( 0, 0, 0, 0 );
  7799. exit;
  7800. end;
  7801. normalizeFloat128Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
  7802. end;
  7803. zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
  7804. aSig0 := aSig0 or int64( $0001000000000000 );
  7805. zSig0 := estimateSqrt32( aExp, aSig0>>17 );
  7806. shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), aSig0, aSig1 );
  7807. zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
  7808. doubleZSig0 := zSig0 shl 1;
  7809. mul64To128( zSig0, zSig0, term0, term1 );
  7810. sub128( aSig0, aSig1, term0, term1, rem0, rem1 );
  7811. while ( sbits64(rem0) < 0 ) do begin
  7812. dec(zSig0);
  7813. dec(doubleZSig0,2);
  7814. add128( rem0, rem1, zSig0 shr 63, doubleZSig0 or 1, rem0, rem1 );
  7815. end;
  7816. zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
  7817. if ( ( zSig1 and $1FFF ) <= 5 ) then begin
  7818. if ( zSig1 = 0 ) then zSig1 := 1;
  7819. mul64To128( doubleZSig0, zSig1, term1, term2 );
  7820. sub128( rem1, 0, term1, term2, rem1, rem2 );
  7821. mul64To128( zSig1, zSig1, term2, term3 );
  7822. sub192( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
  7823. while ( sbits64(rem1) < 0 ) do begin
  7824. dec(zSig1);
  7825. shortShift128Left( 0, zSig1, 1, term2, term3 );
  7826. term3 := term3 or 1;
  7827. term2 := term2 or doubleZSig0;
  7828. add192( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
  7829. end;
  7830. zSig1 := zSig1 or ord( ( rem1 or rem2 or rem3 ) <> 0 );
  7831. end;
  7832. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, zSig0, zSig1, zSig2 );
  7833. result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
  7834. end;
  7835. {*----------------------------------------------------------------------------
  7836. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7837. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7838. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7839. *----------------------------------------------------------------------------*}
  7840. function float128_eq(a: float128; b: float128): flag;
  7841. begin
  7842. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7843. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7844. or ( ( extractFloat128Exp( b ) = $7FFF )
  7845. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7846. ) then begin
  7847. if ( (float128_is_signaling_nan( a )<>0)
  7848. or (float128_is_signaling_nan( b )<>0) ) then begin
  7849. float_raise( float_flag_invalid );
  7850. end;
  7851. result := 0;
  7852. exit;
  7853. end;
  7854. result := ord(
  7855. ( a.low = b.low )
  7856. and ( ( a.high = b.high )
  7857. or ( ( a.low = 0 )
  7858. and ( bits64( ( a.high or b.high ) shl 1 ) = 0 ) )
  7859. ));
  7860. end;
  7861. {*----------------------------------------------------------------------------
  7862. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7863. | or equal to the corresponding value `b', and 0 otherwise. The comparison
  7864. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  7865. | Arithmetic.
  7866. *----------------------------------------------------------------------------*}
  7867. function float128_le(a: float128; b: float128): flag;
  7868. var
  7869. aSign, bSign: flag;
  7870. begin
  7871. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7872. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7873. or ( ( extractFloat128Exp( b ) = $7FFF )
  7874. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7875. ) then begin
  7876. float_raise( float_flag_invalid );
  7877. result := 0;
  7878. exit;
  7879. end;
  7880. aSign := extractFloat128Sign( a );
  7881. bSign := extractFloat128Sign( b );
  7882. if ( aSign <> bSign ) then begin
  7883. result := ord(
  7884. (aSign<>0)
  7885. or ( ( ( bits64 ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7886. = 0 ));
  7887. exit;
  7888. end;
  7889. if aSign<>0 then
  7890. result := le128( b.high, b.low, a.high, a.low )
  7891. else
  7892. result := le128( a.high, a.low, b.high, b.low );
  7893. end;
  7894. {*----------------------------------------------------------------------------
  7895. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7896. | the corresponding value `b', and 0 otherwise. The comparison is performed
  7897. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7898. *----------------------------------------------------------------------------*}
  7899. function float128_lt(a: float128; b: float128): flag;
  7900. var
  7901. aSign, bSign: flag;
  7902. begin
  7903. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7904. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7905. or ( ( extractFloat128Exp( b ) = $7FFF )
  7906. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7907. ) then begin
  7908. float_raise( float_flag_invalid );
  7909. result := 0;
  7910. exit;
  7911. end;
  7912. aSign := extractFloat128Sign( a );
  7913. bSign := extractFloat128Sign( b );
  7914. if ( aSign <> bSign ) then begin
  7915. result := ord(
  7916. (aSign<>0)
  7917. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7918. <> 0 ));
  7919. exit;
  7920. end;
  7921. if aSign<>0 then
  7922. result := lt128( b.high, b.low, a.high, a.low )
  7923. else
  7924. result := lt128( a.high, a.low, b.high, b.low );
  7925. end;
  7926. {*----------------------------------------------------------------------------
  7927. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  7928. | the corresponding value `b', and 0 otherwise. The invalid exception is
  7929. | raised if either operand is a NaN. Otherwise, the comparison is performed
  7930. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7931. *----------------------------------------------------------------------------*}
  7932. function float128_eq_signaling(a: float128; b: float128): flag;
  7933. begin
  7934. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7935. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7936. or ( ( extractFloat128Exp( b ) = $7FFF )
  7937. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7938. ) then begin
  7939. float_raise( float_flag_invalid );
  7940. result := 0;
  7941. exit;
  7942. end;
  7943. result := ord(
  7944. ( a.low = b.low )
  7945. and ( ( a.high = b.high )
  7946. or ( ( a.low = 0 )
  7947. and ( bits64 ( ( a.high or b.high ) shl 1 ) = 0 ) )
  7948. ));
  7949. end;
  7950. {*----------------------------------------------------------------------------
  7951. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7952. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  7953. | cause an exception. Otherwise, the comparison is performed according to the
  7954. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  7955. *----------------------------------------------------------------------------*}
  7956. function float128_le_quiet(a: float128; b: float128): flag;
  7957. var
  7958. aSign, bSign: flag;
  7959. begin
  7960. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7961. and ( ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7962. or ( ( extractFloat128Exp( b ) = $7FFF )
  7963. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  7964. ) then begin
  7965. if ( (float128_is_signaling_nan( a )<>0)
  7966. or (float128_is_signaling_nan( b )<>0) ) then begin
  7967. float_raise( float_flag_invalid );
  7968. end;
  7969. result := 0;
  7970. exit;
  7971. end;
  7972. aSign := extractFloat128Sign( a );
  7973. bSign := extractFloat128Sign( b );
  7974. if ( aSign <> bSign ) then begin
  7975. result := ord(
  7976. (aSign<>0)
  7977. or ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  7978. = 0 ));
  7979. exit;
  7980. end;
  7981. if aSign<>0 then
  7982. result := le128( b.high, b.low, a.high, a.low )
  7983. else
  7984. result := le128( a.high, a.low, b.high, b.low );
  7985. end;
  7986. {*----------------------------------------------------------------------------
  7987. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  7988. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  7989. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  7990. | Standard for Binary Floating-Point Arithmetic.
  7991. *----------------------------------------------------------------------------*}
  7992. function float128_lt_quiet(a: float128; b: float128): flag;
  7993. var
  7994. aSign, bSign: flag;
  7995. begin
  7996. if ( ( ( extractFloat128Exp( a ) = $7FFF )
  7997. and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ))<>0 ) )
  7998. or ( ( extractFloat128Exp( b ) = $7FFF )
  7999. and ( (extractFloat128Frac0( b ) or extractFloat128Frac1( b ))<>0 ) )
  8000. ) then begin
  8001. if ( (float128_is_signaling_nan( a )<>0)
  8002. or (float128_is_signaling_nan( b )<>0) ) then begin
  8003. float_raise( float_flag_invalid );
  8004. end;
  8005. result := 0;
  8006. exit;
  8007. end;
  8008. aSign := extractFloat128Sign( a );
  8009. bSign := extractFloat128Sign( b );
  8010. if ( aSign <> bSign ) then begin
  8011. result := ord(
  8012. (aSign<>0)
  8013. and ( ( ( bits64( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
  8014. <> 0 ));
  8015. exit;
  8016. end;
  8017. if aSign<>0 then
  8018. result:=lt128( b.high, b.low, a.high, a.low )
  8019. else
  8020. result:=lt128( a.high, a.low, b.high, b.low );
  8021. end;
  8022. {----------------------------------------------------------------------------
  8023. | Returns the result of converting the double-precision floating-point value
  8024. | `a' to the quadruple-precision floating-point format. The conversion is
  8025. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  8026. | Arithmetic.
  8027. *----------------------------------------------------------------------------}
  8028. function float64_to_float128( a : float64) : float128;
  8029. var
  8030. aSign : flag;
  8031. aExp : int16;
  8032. aSig, zSig0, zSig1 : bits64;
  8033. begin
  8034. aSig := extractFloat64Frac( a );
  8035. aExp := extractFloat64Exp( a );
  8036. aSign := extractFloat64Sign( a );
  8037. if ( aExp = $7FF ) then begin
  8038. if ( aSig<>0 ) then
  8039. result:=commonNaNToFloat128( float64ToCommonNaN( a ) );
  8040. result:=packFloat128( aSign, $7FFF, 0, 0 );
  8041. exit;
  8042. end;
  8043. if ( aExp = 0 ) then begin
  8044. if ( aSig = 0 ) then
  8045. begin
  8046. result:=packFloat128( aSign, 0, 0, 0 );
  8047. exit;
  8048. end;
  8049. normalizeFloat64Subnormal( aSig, aExp, aSig );
  8050. dec(aExp);
  8051. end;
  8052. shift128Right( aSig, 0, 4, zSig0, zSig1 );
  8053. result:=packFloat128( aSign, aExp + $3C00, zSig0, zSig1 );
  8054. end;
  8055. {$endif FPC_SOFTFLOAT_FLOAT128}
  8056. {$endif not(defined(fpc_softfpu_interface))}
  8057. {$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
  8058. end.
  8059. {$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}