12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685 |
- {*
- ===============================================================================
- The original notice of the softfloat package is shown below. The conversion
- to pascal was done by Carl Eric Codere in 2002 ([email protected]).
- ===============================================================================
- This C source file is part of the SoftFloat IEC/IEEE Floating-Point
- Arithmetic Package, Release 2a.
- Written by John R. Hauser. This work was made possible in part by the
- International Computer Science Institute, located at Suite 600, 1947 Center
- Street, Berkeley, California 94704. Funding was partially provided by the
- National Science Foundation under grant MIP-9311980. The original version
- of this code was written as part of a project to build a fixed-point vector
- processor in collaboration with the University of California at Berkeley,
- overseen by Profs. Nelson Morgan and John Wawrzynek. More information
- is available through the Web page
- `http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
- THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
- has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
- TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
- PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
- AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
- Derivative works are acceptable, even for commercial purposes, so long as
- (1) they include prominent notice that the work is derivative, and (2) they
- include prominent notice akin to these four paragraphs for those parts of
- this code that are retained.
- ===============================================================================
- *}
- unit softfpu;
- { Overflow checking must be disabled,
- since some operations expect overflow!
- }
- {$Q-}
- {$ifndef ver1_0}
- {$ifdef fpc}
- {$define hascompilerproc}
- {$endif}
- {$endif}
- {$ifdef fpc}
- {$goto on}
- {$endif}
- interface
- {
- -------------------------------------------------------------------------------
- Software IEC/IEEE floating-point types.
- -------------------------------------------------------------------------------
- }
- TYPE
- float32 = longword;
- flag = byte;
- uint8 = byte;
- int8 = shortint;
- uint16 = word;
- int16 = integer;
- uint32 = longword;
- int32 = longint;
- bits8 = byte;
- sbits8 = shortint;
- bits16 = word;
- sbits16 = integer;
- sbits32 = longint;
- bits32 = longword;
- {$ifndef fpc}
- qword = int64;
- {$endif}
- uint64 = qword;
- bits64 = qword;
- sbits64 = int64;
- {$ifdef ENDIAN_LITTLE}
- float64 = packed record
- low: bits32;
- high: bits32;
- end;
- int64rec = packed record
- low: bits32;
- high: bits32;
- end;
- {$else}
- float64 = packed record
- high,low : bits32;
- end;
- int64rec = packed record
- high,low : bits32;
- end;
- {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_lt(a: float64;b: float64): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_le(a: float64;b: float64): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_eq(a: float64;b: float64): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the double-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_sqrt( a: float64; var out: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the double-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_rem(a: float64; b : float64; var out: float64); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the double-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_div(a: float64; b : float64 ; var out: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_mul( a: float64; b:float64; Var out: float64); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_sub(a: float64; b : float64; var out: float64); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the double-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_add( a: float64; b : float64; Var out : float64); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Rounds the double-precision floating-point value `a' to an integer,
- and returns the result as a double-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_round_to_int(a: float64; var out: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the single-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_float32(a: float64 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32_round_to_zero(a: float64 ): int32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32(a: float64): int32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_lt( a:float32 ; b : float32): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_le( a: float32; b : float32 ):flag; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_eq( a:float32; b:float32): flag; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the single-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sqrt(a: float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the single-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_rem(a: float32; b: float32 ):float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the single-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_div(a: float32;b: float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_mul(a: float32; b: float32 ) : float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sub( a: float32 ; b:float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the single-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_add( a: float32; b:float32 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Rounds the single-precision floating-point value `a' to an integer,
- and returns the result as a single-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_round_to_int( a: float32): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the double-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float32_to_float64( a : float32; var out: Float64); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32_round_to_zero( a: Float32 ): int32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32( a : float32) : int32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the double-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure int32_to_float64( a: int32; var c: float64 ); {$ifdef hascompilerproc} compilerproc; {$endif}
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the single-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function int32_to_float32( a: int32): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the double-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_float64( a: int64 ): float64; {$ifdef hascompilerproc} compilerproc; {$endif}
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the single-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_float32( a: int64 ): float32; {$ifdef hascompilerproc} compilerproc; {$endif}
- CONST
- {-------------------------------------------------------------------------------
- Software IEC/IEEE floating-point underflow tininess-detection mode.
- -------------------------------------------------------------------------------
- *}
- float_tininess_after_rounding = 0;
- float_tininess_before_rounding = 1;
- {*
- -------------------------------------------------------------------------------
- Software IEC/IEEE floating-point rounding mode.
- -------------------------------------------------------------------------------
- *}
- {
- Round to nearest.
- This is the default mode. It should be used unless there is a specific
- need for one of the others. In this mode results are rounded to the
- nearest representable value. If the result is midway between two
- representable values, the even representable is chosen. Even here
- means the lowest-order bit is zero. This rounding mode prevents
- statistical bias and guarantees numeric stability: round-off errors
- in a lengthy calculation will remain smaller than half of FLT_EPSILON.
- Round toward plus Infinity.
- All results are rounded to the smallest representable value which is
- greater than the result.
- Round toward minus Infinity.
- All results are rounded to the largest representable value which is
- less than the result.
- Round toward zero.
- All results are rounded to the largest representable value whose
- magnitude is less than that of the result. In other words, if the
- result is negative it is rounded up; if it is positive, it is
- rounded down.
- }
- float_round_nearest_even = 0;
- float_round_down = 1;
- float_round_up = 2;
- float_round_to_zero = 3;
- {*
- -------------------------------------------------------------------------------
- Software IEC/IEEE floating-point exception flags.
- -------------------------------------------------------------------------------
- *}
- float_flag_invalid = 1;
- float_flag_divbyzero = 4;
- float_flag_overflow = 8;
- float_flag_underflow = 16;
- float_flag_inexact = 32;
- {*
- -------------------------------------------------------------------------------
- Floating-point rounding mode and exception flags.
- -------------------------------------------------------------------------------
- *}
- const
- float_rounding_mode : Byte = float_round_nearest_even;
- float_exception_flags : Byte = 0;
- {*
- -------------------------------------------------------------------------------
- Underflow tininess-detection mode, statically initialized to default value.
- (The declaration in `softfloat.h' must match the `int8' type here.)
- -------------------------------------------------------------------------------
- *}
- const float_detect_tininess: int8 = float_tininess_after_rounding;
- implementation
- {*
- -------------------------------------------------------------------------------
- Raises the exceptions specified by `flags'. Floating-point traps can be
- defined here if desired. It is currently not possible for such a trap
- to substitute a result value. If traps are not implemented, this routine
- should be simply `float_exception_flags |= flags;'.
- -------------------------------------------------------------------------------
- *}
- procedure float_raise( i: shortint );
- Begin
- float_exception_flags := float_exception_flags or i;
- if (float_exception_flags and float_flag_invalid) <> 0 then
- RunError(207)
- else
- if (float_exception_flags and float_flag_divbyzero) <> 0 then
- RunError(200)
- else
- if (float_exception_flags and float_flag_overflow) <> 0 then
- RunError(205)
- else
- if (float_exception_flags and float_flag_underflow) <> 0 then
- RunError(206);
- end;
- (*****************************************************************************)
- (*----------------------------------------------------------------------------*)
- (* Primitive arithmetic functions, including multi-word arithmetic, and *)
- (* division and square root approximations. (Can be specialized to target if *)
- (* desired.) *)
- (* ---------------------------------------------------------------------------*)
- (*****************************************************************************)
- {*
- -------------------------------------------------------------------------------
- Shifts `a' right by the number of bits given in `count'. If any nonzero
- bits are shifted off, they are ``jammed'' into the least significant bit of
- the result by setting the least significant bit to 1. The value of `count'
- can be arbitrarily large; in particular, if `count' is greater than 32, the
- result will be either 0 or 1, depending on whether `a' is zero or nonzero.
- The result is stored in the location pointed to by `zPtr'.
- -------------------------------------------------------------------------------
- *}
- Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
- var
- z: Bits32;
- Begin
- if ( count = 0 ) then
- z := a
- else
- if ( count < 32 ) then
- Begin
- z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
- End
- else
- Begin
- z := bits32( a <> 0 );
- End;
- zPtr := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
- number of bits given in `count'. Any bits shifted off are lost. The value
- of `count' can be arbitrarily large; in particular, if `count' is greater
- than 64, the result will be 0. The result is broken into two 32-bit pieces
- which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shift64Right(
- a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
- Var
- z0, z1: bits32;
- negCount : int8;
- Begin
- negCount := ( - count ) AND 31;
- if ( count = 0 ) then
- Begin
- z1 := a1;
- z0 := a0;
- End
- else if ( count < 32 ) then
- Begin
- z1 := ( a0 shl negCount ) OR ( a1 shr count );
- z0 := a0 shr count;
- End
- else
- Begin
- if (count < 64) then
- z1 := ( a0 shr ( count AND 31 ) )
- else
- z1 := 0;
- z0 := 0;
- End;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
- number of bits given in `count'. If any nonzero bits are shifted off, they
- are ``jammed'' into the least significant bit of the result by setting the
- least significant bit to 1. The value of `count' can be arbitrarily large;
- in particular, if `count' is greater than 64, the result will be either 0
- or 1, depending on whether the concatenation of `a0' and `a1' is zero or
- nonzero. The result is broken into two 32-bit pieces which are stored at
- the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shift64RightJamming(
- a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
- VAR
- z0, z1 : bits32;
- negCount : int8;
- Begin
- negCount := ( - count ) AND 31;
- if ( count = 0 ) then
- Begin
- z1 := a1;
- z0 := a0;
- End
- else
- if ( count < 32 ) then
- Begin
- z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
- z0 := a0 shr count;
- End
- else
- Begin
- if ( count = 32 ) then
- Begin
- z1 := a0 OR bits32( a1 <> 0 );
- End
- else
- if ( count < 64 ) Then
- Begin
- z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
- End
- else
- Begin
- z1 := bits32( ( a0 OR a1 ) <> 0 );
- End;
- z0 := 0;
- End;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
- by 32 _plus_ the number of bits given in `count'. The shifted result is
- at most 64 nonzero bits; these are broken into two 32-bit pieces which are
- stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
- off form a third 32-bit result as follows: The _last_ bit shifted off is
- the most-significant bit of the extra result, and the other 31 bits of the
- extra result are all zero if and only if _all_but_the_last_ bits shifted off
- were all zero. This extra result is stored in the location pointed to by
- `z2Ptr'. The value of `count' can be arbitrarily large.
- (This routine makes more sense if `a0', `a1', and `a2' are considered
- to form a fixed-point value with binary point between `a1' and `a2'. This
- fixed-point value is shifted right by the number of bits given in `count',
- and the integer part of the result is returned at the locations pointed to
- by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
- corrupted as described above, and is returned at the location pointed to by
- `z2Ptr'.)
- -------------------------------------------------------------------------------
- }
- Procedure
- shift64ExtraRightJamming(
- a0: bits32;
- a1: bits32;
- a2: bits32;
- count: int16;
- VAR z0Ptr: bits32;
- VAR z1Ptr: bits32;
- VAR z2Ptr: bits32
- );
- Var
- z0, z1, z2: bits32;
- negCount : int8;
- Begin
- negCount := ( - count ) AND 31;
- if ( count = 0 ) then
- Begin
- z2 := a2;
- z1 := a1;
- z0 := a0;
- End
- else
- Begin
- if ( count < 32 ) Then
- Begin
- z2 := a1 shl negCount;
- z1 := ( a0 shl negCount ) OR ( a1 shr count );
- z0 := a0 shr count;
- End
- else
- Begin
- if ( count = 32 ) then
- Begin
- z2 := a1;
- z1 := a0;
- End
- else
- Begin
- a2 := a2 or a1;
- if ( count < 64 ) then
- Begin
- z2 := a0 shl negCount;
- z1 := a0 shr ( count AND 31 );
- End
- else
- Begin
- if count = 64 then
- z2 := a0
- else
- z2 := bits32(a0 <> 0);
- z1 := 0;
- End;
- End;
- z0 := 0;
- End;
- z2 := z2 or bits32( a2 <> 0 );
- End;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
- number of bits given in `count'. Any bits shifted off are lost. The value
- of `count' must be less than 32. The result is broken into two 32-bit
- pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shortShift64Left(
- a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
- Begin
- z1Ptr := a1 shl count;
- if count = 0 then
- z0Ptr := a0
- else
- z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
- by the number of bits given in `count'. Any bits shifted off are lost.
- The value of `count' must be less than 32. The result is broken into three
- 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
- `z1Ptr', and `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- shortShift96Left(
- a0: bits32;
- a1: bits32;
- a2: bits32;
- count: int16;
- VAR z0Ptr: bits32;
- VAR z1Ptr: bits32;
- VAR z2Ptr: bits32
- );
- Var
- z0, z1, z2: bits32;
- negCount: int8;
- Begin
- z2 := a2 shl count;
- z1 := a1 shl count;
- z0 := a0 shl count;
- if ( 0 < count ) then
- Begin
- negCount := ( ( - count ) AND 31 );
- z1 := z1 or (a2 shr negCount);
- z0 := z0 or (a1 shr negCount);
- End;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
- value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
- any carry out is lost. The result is broken into two 32-bit pieces which
- are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- add64(
- a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
- Var
- z1: bits32;
- Begin
- z1 := a1 + b1;
- z1Ptr := z1;
- z0Ptr := a0 + b0 + bits32( z1 < a1 );
- End;
- {*
- -------------------------------------------------------------------------------
- Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
- 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
- modulo 2^96, so any carry out is lost. The result is broken into three
- 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
- `z1Ptr', and `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- add96(
- a0: bits32;
- a1: bits32;
- a2: bits32;
- b0: bits32;
- b1: bits32;
- b2: bits32;
- VAR z0Ptr: bits32;
- VAR z1Ptr: bits32;
- VAR z2Ptr: bits32
- );
- var
- z0, z1, z2: bits32;
- carry0, carry1: int8;
- Begin
- z2 := a2 + b2;
- carry1 := int8( z2 < a2 );
- z1 := a1 + b1;
- carry0 := int8( z1 < a1 );
- z0 := a0 + b0;
- z1 := z1 + carry1;
- z0 := z0 + bits32( z1 < carry1 );
- z0 := z0 + carry0;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
- 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
- 2^64, so any borrow out (carry out) is lost. The result is broken into two
- 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
- `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- sub64(
- a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
- Begin
- z1Ptr := a1 - b1;
- z0Ptr := a0 - b0 - bits32( a1 < b1 );
- End;
- {*
- -------------------------------------------------------------------------------
- Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
- the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
- is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
- into three 32-bit pieces which are stored at the locations pointed to by
- `z0Ptr', `z1Ptr', and `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- sub96(
- a0:bits32;
- a1:bits32;
- a2:bits32;
- b0:bits32;
- b1:bits32;
- b2:bits32;
- VAR z0Ptr:bits32;
- VAR z1Ptr:bits32;
- VAR z2Ptr:bits32
- );
- Var
- z0, z1, z2: bits32;
- borrow0, borrow1: int8;
- Begin
- z2 := a2 - b2;
- borrow1 := int8( a2 < b2 );
- z1 := a1 - b1;
- borrow0 := int8( a1 < b1 );
- z0 := a0 - b0;
- z0 := z0 - bits32( z1 < borrow1 );
- z1 := z1 - borrow1;
- z0 := z0 -borrow0;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
- into two 32-bit pieces which are stored at the locations pointed to by
- `z0Ptr' and `z1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
- :bits32 );
- Var
- aHigh, aLow, bHigh, bLow: bits16;
- z0, zMiddleA, zMiddleB, z1: bits32;
- Begin
- aLow := a and $ffff;
- aHigh := a shr 16;
- bLow := b and $ffff;
- bHigh := b shr 16;
- z1 := ( bits32( aLow) ) * bLow;
- zMiddleA := ( bits32 (aLow) ) * bHigh;
- zMiddleB := ( bits32 (aHigh) ) * bLow;
- z0 := ( bits32 (aHigh) ) * bHigh;
- zMiddleA := zMiddleA + zMiddleB;
- z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
- zMiddleA := zmiddleA shl 16;
- z1 := z1 + zMiddleA;
- z0 := z0 + bits32( z1 < zMiddleA );
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
- to obtain a 96-bit product. The product is broken into three 32-bit pieces
- which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
- `z2Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- mul64By32To96(
- a0:bits32;
- a1:bits32;
- b:bits32;
- VAR z0Ptr:bits32;
- VAR z1Ptr:bits32;
- VAR z2Ptr:bits32
- );
- Var
- z0, z1, z2, more1: bits32;
- Begin
- mul32To64( a1, b, z1, z2 );
- mul32To64( a0, b, z0, more1 );
- add64( z0, more1, 0, z1, z0, z1 );
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
- 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
- product. The product is broken into four 32-bit pieces which are stored at
- the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure
- mul64To128(
- a0:bits32;
- a1:bits32;
- b0:bits32;
- b1:bits32;
- VAR z0Ptr:bits32;
- VAR z1Ptr:bits32;
- VAR z2Ptr:bits32;
- VAR z3Ptr:bits32
- );
- Var
- z0, z1, z2, z3: bits32;
- more1, more2: bits32;
- Begin
- mul32To64( a1, b1, z2, z3 );
- mul32To64( a1, b0, z1, more2 );
- add64( z1, more2, 0, z2, z1, z2 );
- mul32To64( a0, b0, z0, more1 );
- add64( z0, more1, 0, z1, z0, z1 );
- mul32To64( a0, b1, more1, more2 );
- add64( more1, more2, 0, z2, more1, z2 );
- add64( z0, z1, 0, more1, z0, z1 );
- z3Ptr := z3;
- z2Ptr := z2;
- z1Ptr := z1;
- z0Ptr := z0;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns an approximation to the 32-bit integer quotient obtained by dividing
- `b' into the 64-bit value formed by concatenating `a0' and `a1'. The
- divisor `b' must be at least 2^31. If q is the exact quotient truncated
- toward zero, the approximation returned lies between q and q + 2 inclusive.
- If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
- unsigned integer is returned.
- -------------------------------------------------------------------------------
- *}
- Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
- Var
- b0, b1: bits32;
- rem0, rem1, term0, term1: bits32;
- z: bits32;
- Begin
- if ( b <= a0 ) then
- Begin
- estimateDiv64To32 := $FFFFFFFF;
- exit;
- End;
- b0 := b shr 16;
- if ( b0 shl 16 <= a0 ) then
- z:= $FFFF0000
- else
- z:= ( a0 div b0 ) shl 16;
- mul32To64( b, z, term0, term1 );
- sub64( a0, a1, term0, term1, rem0, rem1 );
- while ( ( sbits32 (rem0) ) < 0 ) do
- Begin
- z := z - $10000;
- b1 := b shl 16;
- add64( rem0, rem1, b0, b1, rem0, rem1 );
- End;
- rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
- if ( b0 shl 16 <= rem0 ) then
- z := z or $FFFF
- else
- z := z or (rem0 div b0);
- estimateDiv64To32 := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns an approximation to the square root of the 32-bit significand given
- by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
- `aExp' (the least significant bit) is 1, the integer returned approximates
- 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
- is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
- case, the approximation returned lies strictly within +/-2 of the exact
- value.
- -------------------------------------------------------------------------------
- *}
- Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
- const sqrtOddAdjustments: array[0..15] of bits16 = (
- $0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
- $039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
- );
- const sqrtEvenAdjustments: array[0..15] of bits16 = (
- $0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
- $0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
- );
- Var
- index: int8;
- z: bits32;
- Begin
- index := ( a shr 27 ) AND 15;
- if ( aExp AND 1 ) <> 0 then
- Begin
- z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
- z := ( ( a div z ) shl 14 ) + ( z shl 15 );
- a := a shr 1;
- End
- else
- Begin
- z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
- z := a div z + z;
- if ( $20000 <= z ) then
- z := $FFFF8000
- else
- z := ( z shl 15 );
- if ( z <= a ) then
- Begin
- estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
- exit;
- End;
- End;
- estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the number of leading 0 bits before the most-significant 1 bit of
- `a'. If `a' is zero, 32 is returned.
- -------------------------------------------------------------------------------
- *}
- Function countLeadingZeros32( a:bits32 ): int8;
- const countLeadingZerosHigh:array[0..255] of int8 = (
- 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- );
- Var
- shiftCount: int8;
- Begin
- shiftCount := 0;
- if ( a < $10000 ) then
- Begin
- shiftCount := shiftcount + 16;
- a := a shl 16;
- End;
- if ( a < $1000000 ) then
- Begin
- shiftCount := shiftcount + 8;
- a := a shl 8;
- end;
- shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
- countLeadingZeros32:= shiftCount;
- End;
- {*----------------------------------------------------------------------------
- | Returns the number of leading 0 bits before the most-significant 1 bit of
- | `a'. If `a' is zero, 64 is returned.
- *----------------------------------------------------------------------------*}
- function countLeadingZeros64( a : bits64): int8;
- var
- shiftcount : int8;
- Begin
- shiftCount := 0;
- if ( a < (bits64(1) shl 32 )) then
- shiftCount := shiftcount + 32
- else
- a := a shr 32;
- shiftCount := shiftCount + countLeadingZeros32( a );
- countLeadingZeros64:= shiftCount;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
- equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
- returns 0.
- -------------------------------------------------------------------------------
- *}
- Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
- Begin
- eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
- than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
- Otherwise, returns 0.
- -------------------------------------------------------------------------------
- *}
- Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
- Begin
- le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
- than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
- returns 0.
- -------------------------------------------------------------------------------
- *}
- Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
- Begin
- lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
- equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
- returns 0.
- -------------------------------------------------------------------------------
- *}
- Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
- Begin
- ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
- End;
- (*****************************************************************************)
- (* End Low-Level arithmetic *)
- (*****************************************************************************)
- {*
- -------------------------------------------------------------------------------
- Functions and definitions to determine: (1) whether tininess for underflow
- is detected before or after rounding by default, (2) what (if anything)
- happens when exceptions are raised, (3) how signaling NaNs are distinguished
- from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
- are propagated from function inputs to output. These details are ENDIAN
- specific
- -------------------------------------------------------------------------------
- *}
- {$IFDEF ENDIAN_LITTLE}
- {*
- -------------------------------------------------------------------------------
- Internal canonical NaN format.
- -------------------------------------------------------------------------------
- *}
- TYPE
- commonNaNT = packed record
- sign: flag;
- high, low : bits32;
- end;
- {*
- -------------------------------------------------------------------------------
- The pattern for a default generated single-precision NaN.
- -------------------------------------------------------------------------------
- *}
- const float32_default_nan = $FFC00000;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is a NaN;
- otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float32_is_nan( a : float32 ): flag;
- Begin
- float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is a signaling
- NaN; otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float32_is_signaling_nan( a : float32 ): flag;
- Begin
- float32_is_signaling_nan := flag
- ( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point NaN
- `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- exception is raised.
- -------------------------------------------------------------------------------
- *}
- Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
- var
- z : commonNaNT ;
- Begin
- if ( float32_is_signaling_nan( a ) <> 0) then
- float_raise( float_flag_invalid );
- z.sign := a shr 31;
- z.low := 0;
- z.high := a shl 9;
- c := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the canonical NaN `a' to the single-
- precision floating-point format.
- -------------------------------------------------------------------------------
- *}
- Function commonNaNToFloat32( a : commonNaNT ): float32;
- Begin
- commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
- End;
- {*
- -------------------------------------------------------------------------------
- Takes two single-precision floating-point values `a' and `b', one of which
- is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- signaling NaN, the invalid exception is raised.
- -------------------------------------------------------------------------------
- *}
- Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
- Var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- label returnLargerSignificand;
- Begin
- aIsNaN := float32_is_nan( a );
- aIsSignalingNaN := float32_is_signaling_nan( a );
- bIsNaN := float32_is_nan( b );
- bIsSignalingNaN := float32_is_signaling_nan( b );
- a := a or $00400000;
- b := b or $00400000;
- if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
- float_raise( float_flag_invalid );
- if ( aIsSignalingNaN )<> 0 then
- Begin
- if ( bIsSignalingNaN ) <> 0 then
- goto returnLargerSignificand;
- if bIsNan <> 0 then
- propagateFloat32NaN := b
- else
- propagateFloat32NaN := a;
- exit;
- End
- else if ( aIsNaN <> 0) then
- Begin
- if ( bIsSignalingNaN or not bIsNaN )<> 0 then
- Begin
- propagateFloat32NaN := a;
- exit;
- End;
- returnLargerSignificand:
- if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
- Begin
- propagateFloat32NaN := b;
- exit;
- End;
- if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
- Begin
- propagateFloat32NaN := a;
- End;
- if a < b then
- propagateFloat32NaN := a
- else
- propagateFloat32NaN := b;
- exit;
- End
- else
- Begin
- propagateFloat32NaN := b;
- exit;
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- The pattern for a default generated double-precision NaN. The `high' and
- `low' values hold the most- and least-significant bits, respectively.
- -------------------------------------------------------------------------------
- *}
- const
- float64_default_nan_high = $FFF80000;
- float64_default_nan_low = $00000000;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is a NaN;
- otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float64_is_nan( a : float64 ) : flag;
- Begin
- float64_is_nan :=
- flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
- and ( a.low or ( a.high and $000FFFFF ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is a signaling
- NaN; otherwise returns 0.
- -------------------------------------------------------------------------------
- *}
- Function float64_is_signaling_nan( a : float64 ): flag;
- Begin
- float64_is_signaling_nan :=
- flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
- and ( a.low or ( a.high and $0007FFFF ) );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point NaN
- `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- exception is raised.
- -------------------------------------------------------------------------------
- *}
- Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
- Var
- z : commonNaNT;
- Begin
- if ( float64_is_signaling_nan( a )<>0 ) then
- float_raise( float_flag_invalid );
- z.sign := a.high shr 31;
- shortShift64Left( a.high, a.low, 12, z.high, z.low );
- c := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the canonical NaN `a' to the double-
- precision floating-point format.
- -------------------------------------------------------------------------------
- *}
- Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
- Var
- z: float64;
- Begin
- shift64Right( a.high, a.low, 12, z.high, z.low );
- z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
- c := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Takes two double-precision floating-point values `a' and `b', one of which
- is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- signaling NaN, the invalid exception is raised.
- -------------------------------------------------------------------------------
- *}
- Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
- Var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- label returnLargerSignificand;
- Begin
- aIsNaN := float64_is_nan( a );
- aIsSignalingNaN := float64_is_signaling_nan( a );
- bIsNaN := float64_is_nan( b );
- bIsSignalingNaN := float64_is_signaling_nan( b );
- a.high := a.high or $00080000;
- b.high := b.high or $00080000;
- if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
- float_raise( float_flag_invalid );
- if ( aIsSignalingNaN )<>0 then
- Begin
- if ( bIsSignalingNaN )<>0 then
- goto returnLargerSignificand;
- if bIsNan <> 0 then
- c := b
- else
- c := a;
- exit;
- End
- else if ( aIsNaN )<> 0 then
- Begin
- if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
- Begin
- c := a;
- exit;
- End;
- returnLargerSignificand:
- if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
- Begin
- c := b;
- exit;
- End;
- if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
- Begin
- c := a;
- exit;
- End;
- if a.high < b.high then
- c := a
- else
- c := b;
- exit;
- End
- else
- Begin
- c := b;
- exit;
- End;
- End;
- {$ELSE}
- { Big endian code }
- (*----------------------------------------------------------------------------
- | Internal canonical NaN format.
- *----------------------------------------------------------------------------*)
- type
- commonNANT = packed record
- sign : flag;
- high, low : bits32;
- end;
- (*----------------------------------------------------------------------------
- | The pattern for a default generated single-precision NaN.
- *----------------------------------------------------------------------------*)
- const float32_default_nan = $7FFFFFFF;
- (*----------------------------------------------------------------------------
- | Returns 1 if the single-precision floating-point value `a' is a NaN;
- | otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float32_is_nan(a: float32): flag;
- begin
- float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
- end;
- (*----------------------------------------------------------------------------
- | Returns 1 if the single-precision floating-point value `a' is a signaling
- | NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float32_is_signaling_nan(a: float32):flag;
- begin
- float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the single-precision floating-point NaN
- | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- | exception is raised.
- *----------------------------------------------------------------------------*)
- Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
- var
- z: commonNANT;
- begin
- if float32_is_signaling_nan(a)<>0 then
- float_raise(float_flag_invalid);
- z.sign := a shr 31;
- z.low := 0;
- z.high := a shl 9;
- c:=z;
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the canonical NaN `a' to the single-
- | precision floating-point format.
- *----------------------------------------------------------------------------*)
- function CommonNanToFloat32(a : CommonNaNT): float32;
- begin
- CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
- end;
- (*----------------------------------------------------------------------------
- | Takes two single-precision floating-point values `a' and `b', one of which
- | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- | signaling NaN, the invalid exception is raised.
- *----------------------------------------------------------------------------*)
- function propagateFloat32NaN( a: float32 ; b: float32): float32;
- var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
- begin
- aIsNaN := float32_is_nan( a );
- aIsSignalingNaN := float32_is_signaling_nan( a );
- bIsNaN := float32_is_nan( b );
- bIsSignalingNaN := float32_is_signaling_nan( b );
- a := a or $00400000;
- b := b or $00400000;
- if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
- float_raise( float_flag_invalid );
- if bIsSignalingNaN<>0 then
- propagateFloat32Nan := b
- else if aIsSignalingNan<>0 then
- propagateFloat32Nan := a
- else if bIsNan<>0 then
- propagateFloat32Nan := b
- else
- propagateFloat32Nan := a;
- end;
- (*----------------------------------------------------------------------------
- | The pattern for a default generated double-precision NaN. The `high' and
- | `low' values hold the most- and least-significant bits, respectively.
- *----------------------------------------------------------------------------*)
- const
- float64_default_nan_high = $7FFFFFFF;
- float64_default_nan_low = $FFFFFFFF;
- (*----------------------------------------------------------------------------
- | Returns 1 if the double-precision floating-point value `a' is a NaN;
- | otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float64_is_nan(a: float64): flag;
- begin
- float64_is_nan := flag (
- ( $FFE00000 <= bits32 ( a.high shl 1 ) )
- and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
- end;
- (*----------------------------------------------------------------------------
- | Returns 1 if the double-precision floating-point value `a' is a signaling
- | NaN; otherwise returns 0.
- *----------------------------------------------------------------------------*)
- function float64_is_signaling_nan( a:float64): flag;
- begin
- float64_is_signaling_nan := flag
- ( ( ( a.high shr 19 ) and $FFF ) = $FFE )
- and ( (a.low<>0) or ( boolean(( a.high and $0007FFFF )<>0)) );
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the double-precision floating-point NaN
- | `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
- | exception is raised.
- *----------------------------------------------------------------------------*)
- Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
- var
- z : commonNaNT;
- begin
- if ( float64_is_signaling_nan( a )<>0 ) then
- float_raise( float_flag_invalid );
- z.sign := a.high shr 31;
- shortShift64Left( a.high, a.low, 12, z.high, z.low );
- c:=z;
- end;
- (*----------------------------------------------------------------------------
- | Returns the result of converting the canonical NaN `a' to the double-
- | precision floating-point format.
- *----------------------------------------------------------------------------*)
- Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
- var
- z: float64;
- begin
- shift64Right( a.high, a.low, 12, z.high, z.low );
- z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
- c:=z;
- end;
- (*----------------------------------------------------------------------------
- | Takes two double-precision floating-point values `a' and `b', one of which
- | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
- | signaling NaN, the invalid exception is raised.
- *----------------------------------------------------------------------------*)
- Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
- var
- aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
- begin
- aIsNaN := float64_is_nan( a );
- aIsSignalingNaN := float64_is_signaling_nan( a );
- bIsNaN := float64_is_nan( b );
- bIsSignalingNaN := float64_is_signaling_nan( b );
- a.high := a.high or $00080000;
- b.high := b.high or $00080000;
- if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
- float_raise( float_flag_invalid );
- if bIsSignalingNaN<>0 then
- c := b
- else if aIsSignalingNan<>0 then
- c := a
- else if bIsNan<>0 then
- c := b
- else
- c := a;
- end;
- {$ENDIF}
- (****************************************************************************)
- (* END ENDIAN SPECIFIC CODE *)
- (****************************************************************************)
- {*
- -------------------------------------------------------------------------------
- Returns the fraction bits of the single-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function ExtractFloat32Frac(a : Float32) : Bits32;
- Begin
- ExtractFloat32Frac := A AND $007FFFFF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the exponent bits of the single-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat32Exp( a: float32 ): Int16;
- Begin
- extractFloat32Exp := (a shr 23) AND $FF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the sign bit of the single-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat32Sign( a: float32 ): Flag;
- Begin
- extractFloat32Sign := a shr 31;
- End;
- {*
- -------------------------------------------------------------------------------
- Normalizes the subnormal single-precision floating-point value represented
- by the denormalized significand `aSig'. The normalized exponent and
- significand are stored at the locations pointed to by `zExpPtr' and
- `zSigPtr', respectively.
- -------------------------------------------------------------------------------
- *}
- Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
- Var
- ShiftCount : BYTE;
- Begin
- shiftCount := countLeadingZeros32( aSig ) - 8;
- zSigPtr := aSig shl shiftCount;
- zExpPtr := 1 - shiftCount;
- End;
- {*
- -------------------------------------------------------------------------------
- Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
- single-precision floating-point value, returning the result. After being
- shifted into the proper positions, the three fields are simply added
- together to form the result. This means that any integer portion of `zSig'
- will be added into the exponent. Since a properly normalized significand
- will have an integer portion equal to 1, the `zExp' input should be 1 less
- than the desired result exponent whenever `zSig' is a complete, normalized
- significand.
- -------------------------------------------------------------------------------
- *}
- Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
- Begin
- packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
- + zSig;
- End;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and significand `zSig', and returns the proper single-precision floating-
- point value corresponding to the abstract input. Ordinarily, the abstract
- value is simply rounded and packed into the single-precision format, with
- the inexact exception raised if the abstract input cannot be represented
- exactly. However, if the abstract value is too large, the overflow and
- inexact exceptions are raised and an infinity or maximal finite value is
- returned. If the abstract value is too small, the input value is rounded to
- a subnormal number, and the underflow and inexact exceptions are raised if
- the abstract input cannot be represented exactly as a subnormal single-
- precision floating-point number.
- The input significand `zSig' has its binary point between bits 30
- and 29, which is 7 bits to the left of the usual location. This shifted
- significand must be normalized or smaller. If `zSig' is not normalized,
- `zExp' must be 0; in that case, the result returned is a subnormal number,
- and it must not require rounding. In the usual case that `zSig' is
- normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
- The handling of underflow and overflow follows the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
- Var
- roundingMode : BYTE;
- roundNearestEven : Flag;
- roundIncrement, roundBits : BYTE;
- IsTiny : Flag;
- Begin
- roundingMode := float_rounding_mode;
- if (roundingMode = float_round_nearest_even) then
- Begin
- roundNearestEven := Flag(TRUE);
- end
- else
- roundNearestEven := Flag(FALSE);
- roundIncrement := $40;
- if ( Boolean(roundNearestEven) = FALSE) then
- Begin
- if ( roundingMode = float_round_to_zero ) Then
- Begin
- roundIncrement := 0;
- End
- else
- Begin
- roundIncrement := $7F;
- if ( zSign <> 0 ) then
- Begin
- if roundingMode = float_round_up then roundIncrement := 0;
- End
- else
- Begin
- if roundingMode = float_round_down then roundIncrement := 0;
- End;
- End
- End;
- roundBits := zSig AND $7F;
- if ($FD <= bits16 (zExp) ) then
- Begin
- if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
- Begin
- float_raise( float_flag_overflow OR float_flag_inexact );
- roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
- exit;
- End;
- if ( zExp < 0 ) then
- Begin
- isTiny :=
- flag(( float_detect_tininess = float_tininess_before_rounding )
- OR ( zExp < -1 )
- OR ( (zSig + roundIncrement) < $80000000 ));
- shift32RightJamming( zSig, - zExp, zSig );
- zExp := 0;
- roundBits := zSig AND $7F;
- if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
- float_raise( float_flag_underflow );
- End;
- End;
- if ( roundBits )<> 0 then
- float_exception_flags := float_flag_inexact OR float_exception_flags;
- zSig := ( zSig + roundIncrement ) shr 7;
- zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
- if ( zSig = 0 ) then zExp := 0;
- roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
- exit;
- End;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and significand `zSig', and returns the proper single-precision floating-
- point value corresponding to the abstract input. This routine is just like
- `roundAndPackFloat32' except that `zSig' does not have to be normalized.
- Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
- floating-point exponent.
- -------------------------------------------------------------------------------
- *}
- Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
- Var
- ShiftCount : int8;
- Begin
- shiftCount := countLeadingZeros32( zSig ) - 1;
- normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the least-significant 32 fraction bits of the double-precision
- floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Frac( a: float64 ): bits32;
- Begin
- extractFloat64Frac := a.low;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the most-significant 20 fraction bits of the double-precision
- floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Frac0(a: float64): bits32;
- Begin
- extractFloat64Frac0 := a.high and $000FFFFF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the least-significant 32 fraction bits of the double-precision
- floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Frac1(a: float64): bits32;
- Begin
- extractFloat64Frac1 := a.low;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the exponent bits of the double-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Exp(a: float64): int16;
- Begin
- extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the sign bit of the double-precision floating-point value `a'.
- -------------------------------------------------------------------------------
- *}
- Function extractFloat64Sign(a: float64) : flag;
- Begin
- extractFloat64Sign := a.high shr 31;
- End;
- {*
- -------------------------------------------------------------------------------
- Normalizes the subnormal double-precision floating-point value represented
- by the denormalized significand formed by the concatenation of `aSig0' and
- `aSig1'. The normalized exponent is stored at the location pointed to by
- `zExpPtr'. The most significant 21 bits of the normalized significand are
- stored at the location pointed to by `zSig0Ptr', and the least significant
- 32 bits of the normalized significand are stored at the location pointed to
- by `zSig1Ptr'.
- -------------------------------------------------------------------------------
- *}
- Procedure normalizeFloat64Subnormal(
- aSig0: bits32;
- aSig1: bits32;
- VAR zExpPtr : Int16;
- VAR zSig0Ptr : Bits32;
- VAR zSig1Ptr : Bits32
- );
- Var
- ShiftCount : Int8;
- Begin
- if ( aSig0 = 0 ) then
- Begin
- shiftCount := countLeadingZeros32( aSig1 ) - 11;
- if ( shiftCount < 0 ) then
- Begin
- zSig0Ptr := aSig1 shr ( - shiftCount );
- zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
- End
- else
- Begin
- zSig0Ptr := aSig1 shl shiftCount;
- zSig1Ptr := 0;
- End;
- zExpPtr := - shiftCount - 31;
- End
- else
- Begin
- shiftCount := countLeadingZeros32( aSig0 ) - 11;
- shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
- zExpPtr := 1 - shiftCount;
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Packs the sign `zSign', the exponent `zExp', and the significand formed by
- the concatenation of `zSig0' and `zSig1' into a double-precision floating-
- point value, returning the result. After being shifted into the proper
- positions, the three fields `zSign', `zExp', and `zSig0' are simply added
- together to form the most significant 32 bits of the result. This means
- that any integer portion of `zSig0' will be added into the exponent. Since
- a properly normalized significand will have an integer portion equal to 1,
- the `zExp' input should be 1 less than the desired result exponent whenever
- `zSig0' and `zSig1' concatenated form a complete, normalized significand.
- -------------------------------------------------------------------------------
- *}
- Procedure
- packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
- var
- z: Float64;
- Begin
- z.low := zSig1;
- z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
- c := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and extended significand formed by the concatenation of `zSig0', `zSig1',
- and `zSig2', and returns the proper double-precision floating-point value
- corresponding to the abstract input. Ordinarily, the abstract value is
- simply rounded and packed into the double-precision format, with the inexact
- exception raised if the abstract input cannot be represented exactly.
- However, if the abstract value is too large, the overflow and inexact
- exceptions are raised and an infinity or maximal finite value is returned.
- If the abstract value is too small, the input value is rounded to a
- subnormal number, and the underflow and inexact exceptions are raised if the
- abstract input cannot be represented exactly as a subnormal double-precision
- floating-point number.
- The input significand must be normalized or smaller. If the input
- significand is not normalized, `zExp' must be 0; in that case, the result
- returned is a subnormal number, and it must not require rounding. In the
- usual case that the input significand is normalized, `zExp' must be 1 less
- than the ``true'' floating-point exponent. The handling of underflow and
- overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure
- roundAndPackFloat64(
- zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
- Var
- roundingMode : Int8;
- roundNearestEven, increment, isTiny : Flag;
- Begin
- roundingMode := float_rounding_mode;
- roundNearestEven := flag( roundingMode = float_round_nearest_even );
- increment := flag( sbits32 (zSig2) < 0 );
- if ( roundNearestEven = flag(FALSE) ) then
- Begin
- if ( roundingMode = float_round_to_zero ) then
- increment := 0
- else
- Begin
- if ( zSign )<> 0 then
- Begin
- increment := flag( roundingMode = float_round_down ) and zSig2;
- End
- else
- Begin
- increment := flag( roundingMode = float_round_up ) and zSig2;
- End
- End
- End;
- if ( $7FD <= bits16 (zExp) ) then
- Begin
- if (( $7FD < zExp )
- or (( zExp = $7FD )
- and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
- and (increment<>0)
- )
- ) then
- Begin
- float_raise( float_flag_overflow OR float_flag_inexact );
- if (( roundingMode = float_round_to_zero )
- or ( (zSign<>0) and ( roundingMode = float_round_up ) )
- or ( (zSign = 0) and ( roundingMode = float_round_down ) )
- ) then
- Begin
- packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
- exit;
- End;
- packFloat64( zSign, $7FF, 0, 0, c );
- exit;
- End;
- if ( zExp < 0 ) then
- Begin
- isTiny :=
- flag( float_detect_tininess = float_tininess_before_rounding )
- or flag( zExp < -1 )
- or flag(increment = 0)
- or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
- shift64ExtraRightJamming(
- zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
- zExp := 0;
- if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
- if ( roundNearestEven )<>0 then
- Begin
- increment := flag( sbits32 (zSig2) < 0 );
- End
- else
- Begin
- if ( zSign )<>0 then
- Begin
- increment := flag( roundingMode = float_round_down ) and zSig2;
- End
- else
- Begin
- increment := flag( roundingMode = float_round_up ) and zSig2;
- End
- End;
- End;
- End;
- if ( zSig2 )<>0 then
- float_exception_flags := float_exception_flags OR float_flag_inexact;
- if ( increment )<>0 then
- Begin
- add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
- zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
- End
- else
- Begin
- if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
- End;
- packFloat64( zSign, zExp, zSig0, zSig1, c );
- End;
- {*
- -------------------------------------------------------------------------------
- Takes an abstract floating-point value having sign `zSign', exponent `zExp',
- and significand formed by the concatenation of `zSig0' and `zSig1', and
- returns the proper double-precision floating-point value corresponding
- to the abstract input. This routine is just like `roundAndPackFloat64'
- except that the input significand has fewer bits and does not have to be
- normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
- point exponent.
- -------------------------------------------------------------------------------
- *}
- Procedure
- normalizeRoundAndPackFloat64(
- zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
- Var
- shiftCount : int8;
- zSig2 : bits32;
- Begin
- if ( zSig0 = 0 ) then
- Begin
- zSig0 := zSig1;
- zSig1 := 0;
- zExp := zExp -32;
- End;
- shiftCount := countLeadingZeros32( zSig0 ) - 11;
- if ( 0 <= shiftCount ) then
- Begin
- zSig2 := 0;
- shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
- End
- else
- Begin
- shift64ExtraRightJamming
- (zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
- End;
- zExp := zExp - shiftCount;
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the single-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function int32_to_float32( a: int32): float32; {$ifdef fpc}[public,Alias:'INT32_TO_FLOAT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- zSign : Flag;
- Begin
- if ( a = 0 ) then
- Begin
- int32_to_float32 := 0;
- exit;
- End;
- if ( a = sbits32 ($80000000) ) then
- Begin
- int32_to_float32 := packFloat32( 1, $9E, 0 );
- exit;
- end;
- zSign := flag( a < 0 );
- If zSign<>0 then
- a := -a;
- int32_to_float32:=
- normalizeRoundAndPackFloat32( zSign, $9C, a );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the 32-bit two's complement integer `a' to
- the double-precision floating-point format. The conversion is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure int32_to_float64( a: int32; var c: float64 );{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- var
- zSign : flag;
- absA : bits32;
- shiftCount : int8;
- zSig0, zSig1 : bits32;
- Begin
- if ( a = 0 ) then
- Begin
- packFloat64( 0, 0, 0, 0, c );
- exit;
- end;
- zSign := flag( a < 0 );
- if ZSign<>0 then
- AbsA := -a
- else
- AbsA := a;
- shiftCount := countLeadingZeros32( absA ) - 11;
- if ( 0 <= shiftCount ) then
- Begin
- zSig0 := absA shl shiftCount;
- zSig1 := 0;
- End
- else
- Begin
- shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
- End;
- packFloat64( zSign, $412 - shiftCount, zSig0, zSig1,c );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32( a : float32) : int32;{$ifdef fpc} [public,Alias:'FLOAT32_TO_INT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig, aSigExtra: bits32;
- z: int32;
- roundingMode: int8;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- shiftCount := aExp - $96;
- if ( 0 <= shiftCount ) then
- Begin
- if ( $9E <= aExp ) then
- Begin
- if ( a <> $CF000000 ) then
- Begin
- float_raise( float_flag_invalid );
- if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
- Begin
- float32_to_int32 := $7FFFFFFF;
- exit;
- End;
- End;
- float32_to_int32 := sbits32 ($80000000);
- exit;
- End;
- z := ( aSig or $00800000 ) shl shiftCount;
- if ( aSign<>0 ) then z := - z;
- End
- else
- Begin
- if ( aExp < $7E ) then
- Begin
- aSigExtra := aExp OR aSig;
- z := 0;
- End
- else
- Begin
- aSig := aSig OR $00800000;
- aSigExtra := aSig shl ( shiftCount and 31 );
- z := aSig shr ( - shiftCount );
- End;
- if ( aSigExtra<>0 ) then
- float_exception_flags := float_exception_flags
- or float_flag_inexact;
- roundingMode := float_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- if ( sbits32 (aSigExtra) < 0 ) then
- Begin
- Inc(z);
- if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
- z := z and not 1;
- End;
- if ( aSign<>0 ) then
- z := - z;
- End
- else
- Begin
- aSigExtra := flag( aSigExtra <> 0 );
- if ( aSign<>0 ) then
- Begin
- z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
- z := - z;
- End
- else
- Begin
- z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
- End
- End;
- End;
- float32_to_int32 := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float32_to_int32_round_to_zero( a: Float32 ): int32;
- {$ifdef fpc}[public,Alias:'FLOAT32_TO_INT32_ROUND_TO_ZERO'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign : flag;
- aExp, shiftCount : int16;
- aSig : bits32;
- z : int32;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- shiftCount := aExp - $9E;
- if ( 0 <= shiftCount ) then
- Begin
- if ( a <> $CF000000 ) then
- Begin
- float_raise( float_flag_invalid );
- if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
- Begin
- float32_to_int32_round_to_zero := $7FFFFFFF;
- exit;
- end;
- End;
- float32_to_int32_round_to_zero:= sbits32 ($80000000);
- exit;
- End
- else
- if ( aExp <= $7E ) then
- Begin
- if ( aExp or aSig )<>0 then
- float_exception_flags :=
- float_exception_flags or float_flag_inexact;
- float32_to_int32_round_to_zero := 0;
- exit;
- End;
- aSig := ( aSig or $00800000 ) shl 8;
- z := aSig shr ( - shiftCount );
- if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
- Begin
- float_exception_flags :=
- float_exception_flags or float_flag_inexact;
- End;
- if ( aSign<>0 ) then z := - z;
- float32_to_int32_round_to_zero := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the single-precision floating-point value
- `a' to the double-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float32_to_float64( a : float32; var out: Float64);
- {$ifdef fpc}[public,Alias:'FLOAT32_TO_FLOAT64'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign : flag;
- aExp : int16;
- aSig, zSig0, zSig1: bits32;
- tmp : CommonNanT;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- if ( aExp = $FF ) then
- Begin
- if ( aSig<>0 ) then
- Begin
- float32ToCommonNaN(a, tmp);
- commonNaNToFloat64(tmp , out);
- exit;
- End;
- packFloat64( aSign, $7FF, 0, 0, out );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- packFloat64( aSign, 0, 0, 0, out );
- exit;
- end;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- Dec(aExp);
- End;
- shift64Right( aSig, 0, 3, zSig0, zSig1 );
- packFloat64( aSign, aExp + $380, zSig0, zSig1, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Rounds the single-precision floating-point value `a' to an integer,
- and returns the result as a single-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_round_to_int( a: float32): float32;
- {$ifdef fpc}[public,Alias:'FLOAT32_ROUND_TO_INT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign: flag;
- aExp: int16;
- lastBitMask, roundBitsMask: bits32;
- roundingMode: int8;
- z: float32;
- Begin
- aExp := extractFloat32Exp( a );
- if ( $96 <= aExp ) then
- Begin
- if ( ( aExp = $FF ) and (extractFloat32Frac( a )<>0) ) then
- Begin
- float32_round_to_int:= propagateFloat32NaN( a, a );
- exit;
- End;
- float32_round_to_int:=a;
- exit;
- End;
- if ( aExp <= $7E ) then
- Begin
- if ( bits32 ( a shl 1 ) = 0 ) then
- Begin
- float32_round_to_int:=a;
- exit;
- end;
- float_exception_flags
- := float_exception_flags OR float_flag_inexact;
- aSign := extractFloat32Sign( a );
- case ( float_rounding_mode ) of
- float_round_nearest_even:
- Begin
- if ( ( aExp = $7E ) and (extractFloat32Frac( a )<>0) ) then
- Begin
- float32_round_to_int := packFloat32( aSign, $7F, 0 );
- exit;
- End;
- End;
- float_round_down:
- Begin
- if aSign <> 0 then
- float32_round_to_int := $BF800000
- else
- float32_round_to_int := 0;
- exit;
- End;
- float_round_up:
- Begin
- if aSign <> 0 then
- float32_round_to_int := $80000000
- else
- float32_round_to_int := $3F800000;
- exit;
- End;
- end;
- float32_round_to_int := packFloat32( aSign, 0, 0 );
- End;
- lastBitMask := 1;
- {_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
- lastBitMask := lastBitMask shl ($96 - aExp);
- roundBitsMask := lastBitMask - 1;
- z := a;
- roundingMode := float_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- z := z + (lastBitMask shr 1);
- if ( ( z and roundBitsMask ) = 0 ) then
- z := z and not lastBitMask;
- End
- else if ( roundingMode <> float_round_to_zero ) then
- Begin
- if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
- Begin
- z := z + roundBitsMask;
- End;
- End;
- z := z and not roundBitsMask;
- if ( z <> a ) then
- float_exception_flags := float_exception_flags or float_flag_inexact;
- float32_round_to_int := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the absolute values of the single-precision
- floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
- before being returned. `zSign' is ignored if the result is a NaN.
- The addition is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
- Var
- aExp, bExp, zExp: int16;
- aSig, bSig, zSig: bits32;
- expDiff: int16;
- label roundAndPack;
- Begin
- aSig:=extractFloat32Frac( a );
- aExp:=extractFloat32Exp( a );
- bSig:=extractFloat32Frac( b );
- bExp := extractFloat32Exp( b );
- expDiff := aExp - bExp;
- aSig := aSig shl 6;
- bSig := bSig shl 6;
- if ( 0 < expDiff ) then
- Begin
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0) then
- Begin
- addFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- addFloat32Sigs := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig := bSig or $20000000;
- End;
- shift32RightJamming( bSig, expDiff, bSig );
- zExp := aExp;
- End
- else
- If ( expDiff < 0 ) then
- Begin
- if ( bExp = $FF ) then
- Begin
- if ( bSig<>0 ) then
- Begin
- addFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- end;
- addFloat32Sigs := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig := aSig OR $20000000;
- End;
- shift32RightJamming( aSig, - expDiff, aSig );
- zExp := bExp;
- End
- else
- Begin
- if ( aExp = $FF ) then
- Begin
- if ( aSig OR bSig )<> 0 then
- Begin
- addFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- end;
- addFloat32Sigs := a;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
- exit;
- end;
- zSig := $40000000 + aSig + bSig;
- zExp := aExp;
- goto roundAndPack;
- End;
- aSig := aSig OR $20000000;
- zSig := ( aSig + bSig ) shl 1;
- Dec(zExp);
- if ( sbits32 (zSig) < 0 ) then
- Begin
- zSig := aSig + bSig;
- Inc(zExp);
- End;
- roundAndPack:
- addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the absolute values of the single-
- precision floating-point values `a' and `b'. If `zSign' is 1, the
- difference is negated before being returned. `zSign' is ignored if the
- result is a NaN. The subtraction is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
- Var
- aExp, bExp, zExp: int16;
- aSig, bSig, zSig: bits32;
- expDiff : int16;
- label aExpBigger;
- label bExpBigger;
- label aBigger;
- label bBigger;
- label normalizeRoundAndPack;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- bSig := extractFloat32Frac( b );
- bExp := extractFloat32Exp( b );
- expDiff := aExp - bExp;
- aSig := aSig shl 7;
- bSig := bSig shl 7;
- if ( 0 < expDiff ) then goto aExpBigger;
- if ( expDiff < 0 ) then goto bExpBigger;
- if ( aExp = $FF ) then
- Begin
- if ( aSig OR bSig )<> 0 then
- Begin
- subFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- float_raise( float_flag_invalid );
- subFloat32Sigs := float32_default_nan;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- aExp := 1;
- bExp := 1;
- End;
- if ( bSig < aSig ) Then goto aBigger;
- if ( aSig < bSig ) Then goto bBigger;
- subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
- exit;
- bExpBigger:
- if ( bExp = $FF ) then
- Begin
- if ( bSig<>0 ) then
- Begin
- subFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig := aSig OR $40000000;
- End;
- shift32RightJamming( aSig, - expDiff, aSig );
- bSig := bSig OR $40000000;
- bBigger:
- zSig := bSig - aSig;
- zExp := bExp;
- zSign := zSign xor 1;
- goto normalizeRoundAndPack;
- aExpBigger:
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0) then
- Begin
- subFloat32Sigs := propagateFloat32NaN( a, b );
- exit;
- End;
- subFloat32Sigs := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig := bSig OR $40000000;
- End;
- shift32RightJamming( bSig, expDiff, bSig );
- aSig := aSig OR $40000000;
- aBigger:
- zSig := aSig - bSig;
- zExp := aExp;
- normalizeRoundAndPack:
- Dec(zExp);
- subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the single-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_add( a: float32; b:float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_ADD'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign: Flag;
- Begin
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign = bSign ) then
- Begin
- float32_add := addFloat32Sigs( a, b, aSign );
- End
- else
- Begin
- float32_add := subFloat32Sigs( a, b, aSign );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sub( a: float32 ; b:float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_SUB'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign: flag;
- Begin
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign = bSign ) then
- Begin
- float32_sub := subFloat32Sigs( a, b, aSign );
- End
- else
- Begin
- float32_sub := addFloat32Sigs( a, b, aSign );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the single-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_mul(a: float32; b: float32 ) : float32;{$ifdef fpc} [public,Alias:'FLOAT32_MUL'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp : int16;
- aSig, bSig, zSig0, zSig1: bits32;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- bSig := extractFloat32Frac( b );
- bExp := extractFloat32Exp( b );
- bSign := extractFloat32Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $FF ) then
- Begin
- if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
- Begin
- float32_mul := propagateFloat32NaN( a, b );
- End;
- if ( ( bExp OR bSig ) = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_mul := float32_default_nan;
- exit;
- End;
- float32_mul := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0 ) then
- Begin
- float32_mul := propagateFloat32NaN( a, b );
- exit;
- End;
- if ( ( aExp OR aSig ) = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_mul := float32_default_nan;
- exit;
- End;
- float32_mul := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- float32_mul := packFloat32( zSign, 0, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- if ( bExp = 0 ) then
- Begin
- if ( bSig = 0 ) then
- Begin
- float32_mul := packFloat32( zSign, 0, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( bSig, bExp, bSig );
- End;
- zExp := aExp + bExp - $7F;
- aSig := ( aSig OR $00800000 ) shl 7;
- bSig := ( bSig OR $00800000 ) shl 8;
- mul32To64( aSig, bSig, zSig0, zSig1 );
- zSig0 := zSig0 OR bits32( zSig1 <> 0 );
- if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
- Begin
- zSig0 := zSig0 shl 1;
- Dec(zExp);
- End;
- float32_mul := roundAndPackFloat32( zSign, zExp, zSig0 );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the single-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_div(a: float32;b: float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_DIV'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int16;
- aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- bSig := extractFloat32Frac( b );
- bExp := extractFloat32Exp( b );
- bSign := extractFloat32Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0 ) then
- Begin
- float32_div := propagateFloat32NaN( a, b );
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0) then
- Begin
- float32_div := propagateFloat32NaN( a, b );
- End;
- float_raise( float_flag_invalid );
- float32_div := float32_default_nan;
- exit;
- End;
- float32_div := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0) then
- Begin
- float32_div := propagateFloat32NaN( a, b );
- exit;
- End;
- float32_div := packFloat32( zSign, 0, 0 );
- exit;
- End;
- if ( bExp = 0 ) Then
- Begin
- if ( bSig = 0 ) Then
- Begin
- if ( ( aExp OR aSig ) = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_div := float32_default_nan;
- exit;
- End;
- float_raise( float_flag_divbyzero );
- float32_div := packFloat32( zSign, $FF, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( bSig, bExp, bSig );
- End;
- if ( aExp = 0 ) Then
- Begin
- if ( aSig = 0 ) Then
- Begin
- float32_div := packFloat32( zSign, 0, 0 );
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- zExp := aExp - bExp + $7D;
- aSig := ( aSig OR $00800000 ) shl 7;
- bSig := ( bSig OR $00800000 ) shl 8;
- if ( bSig <= ( aSig + aSig ) ) then
- Begin
- aSig := aSig shr 1;
- Inc(zExp);
- End;
- zSig := estimateDiv64To32( aSig, 0, bSig );
- if ( ( zSig and $3F ) <= 2 ) then
- Begin
- mul32To64( bSig, zSig, term0, term1 );
- sub64( aSig, 0, term0, term1, rem0, rem1 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig);
- add64( rem0, rem1, 0, bSig, rem0, rem1 );
- End;
- zSig := zSig or bits32( rem1 <> 0 );
- End;
- float32_div := roundAndPackFloat32( zSign, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the single-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_rem(a: float32; b: float32 ):float32;{$ifdef fpc} [public,Alias:'FLOAT32_REM'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, expDiff: int16;
- aSig, bSig, q, allZero, alternateASig: bits32;
- sigMean: sbits32;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- bSig := extractFloat32Frac( b );
- bExp := extractFloat32Exp( b );
- bSign := extractFloat32Sign( b );
- if ( aExp = $FF ) then
- Begin
- if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
- Begin
- float32_rem := propagateFloat32NaN( a, b );
- exit;
- End;
- float_raise( float_flag_invalid );
- float32_rem := float32_default_nan;
- exit;
- End;
- if ( bExp = $FF ) then
- Begin
- if ( bSig <> 0 ) then
- Begin
- float32_rem := propagateFloat32NaN( a, b );
- exit;
- End;
- float32_rem := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- if ( bSig = 0 ) then
- Begin
- float_raise( float_flag_invalid );
- float32_rem := float32_default_nan;
- exit;
- End;
- normalizeFloat32Subnormal( bSig, bExp, bSig );
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- float32_rem := a;
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- expDiff := aExp - bExp;
- aSig := ( aSig OR $00800000 ) shl 8;
- bSig := ( bSig OR $00800000 ) shl 8;
- if ( expDiff < 0 ) then
- Begin
- if ( expDiff < -1 ) then
- Begin
- float32_rem := a;
- exit;
- End;
- aSig := aSig shr 1;
- End;
- q := bits32( bSig <= aSig );
- if ( q <> 0) then
- aSig := aSig - bSig;
- expDiff := expDiff - 32;
- while ( 0 < expDiff ) do
- Begin
- q := estimateDiv64To32( aSig, 0, bSig );
- if (2 < q) then
- q := q - 2
- else
- q := 0;
- aSig := - ( ( bSig shr 2 ) * q );
- expDiff := expDiff - 30;
- End;
- expDiff := expDiff + 32;
- if ( 0 < expDiff ) then
- Begin
- q := estimateDiv64To32( aSig, 0, bSig );
- if (2 < q) then
- q := q - 2
- else
- q := 0;
- q := q shr (32 - expDiff);
- bSig := bSig shr 2;
- aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
- End
- else
- Begin
- aSig := aSig shr 2;
- bSig := bSig shr 2;
- End;
- Repeat
- alternateASig := aSig;
- Inc(q);
- aSig := aSig - bSig;
- Until not ( 0 <= sbits32 (aSig) );
- sigMean := aSig + alternateASig;
- if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
- Begin
- aSig := alternateASig;
- End;
- zSign := flag( sbits32 (aSig) < 0 );
- if ( zSign<>0 ) then
- aSig := - aSig;
- float32_rem := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the single-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_sqrt(a: float32 ): float32;{$ifdef fpc} [public,Alias:'FLOAT32_SQRT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign : flag;
- aExp, zExp : int16;
- aSig, zSig, rem0, rem1, term0, term1: bits32;
- label roundAndPack;
- Begin
- aSig := extractFloat32Frac( a );
- aExp := extractFloat32Exp( a );
- aSign := extractFloat32Sign( a );
- if ( aExp = $FF ) then
- Begin
- if ( aSig <> 0) then
- Begin
- float32_sqrt := propagateFloat32NaN( a, 0 );
- exit;
- End;
- if ( aSign = 0) then
- Begin
- float32_sqrt := a;
- exit;
- End;
- float_raise( float_flag_invalid );
- float32_sqrt := float32_default_nan;
- exit;
- End;
- if ( aSign <> 0) then
- Begin
- if ( ( aExp OR aSig ) = 0 ) then
- Begin
- float32_sqrt := a;
- exit;
- End;
- float_raise( float_flag_invalid );
- float32_sqrt := float32_default_nan;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( aSig = 0 ) then
- Begin
- float32_sqrt := 0;
- exit;
- End;
- normalizeFloat32Subnormal( aSig, aExp, aSig );
- End;
- zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
- aSig := ( aSig OR $00800000 ) shl 8;
- zSig := estimateSqrt32( aExp, aSig ) + 2;
- if ( ( zSig and $7F ) <= 5 ) then
- Begin
- if ( zSig < 2 ) then
- Begin
- zSig := $7FFFFFFF;
- goto roundAndPack;
- End
- else
- Begin
- aSig := aSig shr (aExp and 1);
- mul32To64( zSig, zSig, term0, term1 );
- sub64( aSig, 0, term0, term1, rem0, rem1 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig);
- shortShift64Left( 0, zSig, 1, term0, term1 );
- term1 := term1 or 1;
- add64( rem0, rem1, term0, term1, rem0, rem1 );
- End;
- zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
- End;
- End;
- shift32RightJamming( zSig, 1, zSig );
- roundAndPack:
- float32_sqrt := roundAndPackFloat32( 0, zExp, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_eq( a:float32; b:float32): flag;{$ifdef fpc} [public,Alias:'FLOAT32_EQ'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Begin
- if ((( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0))
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
- ) then
- Begin
- if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
- Begin
- float_raise( float_flag_invalid );
- End;
- float32_eq := 0;
- exit;
- End;
- float32_eq := flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_le( a: float32; b : float32 ):flag;{$ifdef fpc} [public,Alias:'FLOAT32_LE'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- var
- aSign, bSign: flag;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float32_le := 0;
- exit;
- End;
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float32_le := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
- exit;
- End;
- float32_le := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_lt( a:float32 ; b : float32): flag;{$ifdef fpc} [public,Alias:'FLOAT32_LT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- var
- aSign, bSign: flag;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <>0))
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <>0) )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float32_lt :=0;
- exit;
- End;
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float32_lt := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
- exit;
- End;
- float32_lt := flag(flag( a <> b ) AND flag( aSign xor flag( a < b ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The invalid exception is
- raised if either operand is a NaN. Otherwise, the comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_eq_signaling( a: float32; b: float32) : flag;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
- ) then
- Begin
- float_raise( float_flag_invalid );
- float32_eq_signaling := 0;
- exit;
- End;
- float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than or
- equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
- cause an exception. Otherwise, the comparison is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_le_quiet( a: float32 ; b : float32 ): flag;
- Var
- aSign, bSign: flag;
- aExp, bExp: int16;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
- ) then
- Begin
- if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
- Begin
- float_raise( float_flag_invalid );
- End;
- float32_le_quiet := 0;
- exit;
- End;
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
- exit;
- End;
- float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the single-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
- exception. Otherwise, the comparison is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
- Var
- aSign, bSign: flag;
- Begin
- if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
- OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
- ) then
- Begin
- if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
- Begin
- float_raise( float_flag_invalid );
- End;
- float32_lt_quiet := 0;
- exit;
- End;
- aSign := extractFloat32Sign( a );
- bSign := extractFloat32Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
- exit;
- End;
- float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic---which means in particular that the conversion is rounded
- according to the current rounding mode. If `a' is a NaN, the largest
- positive integer is returned. Otherwise, if the conversion overflows, the
- largest integer with the same sign as `a' is returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig0, aSig1, absZ, aSigExtra: bits32;
- z: int32;
- roundingMode: int8;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- shiftCount := aExp - $413;
- if ( 0 <= shiftCount ) then
- Begin
- if ( $41E < aExp ) then
- Begin
- if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
- aSign := 0;
- goto invalid;
- End;
- shortShift64Left(
- aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
- if ( $80000000 < absZ ) then
- goto invalid;
- End
- else
- Begin
- aSig1 := flag( aSig1 <> 0 );
- if ( aExp < $3FE ) then
- Begin
- aSigExtra := aExp OR aSig0 OR aSig1;
- absZ := 0;
- End
- else
- Begin
- aSig0 := aSig0 OR $00100000;
- aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
- absZ := aSig0 shr ( - shiftCount );
- End;
- End;
- roundingMode := float_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- if ( sbits32(aSigExtra) < 0 ) then
- Begin
- Inc(absZ);
- if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
- absZ := absZ and not 1;
- End;
- if aSign <> 0 then
- z := - absZ
- else
- z := absZ;
- End
- else
- Begin
- aSigExtra := bits32( aSigExtra <> 0 );
- if ( aSign <> 0) then
- Begin
- z := - ( absZ
- + ( int32( roundingMode = float_round_down ) and aSigExtra ) );
- End
- else
- Begin
- z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
- End
- End;
- if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- if (aSign <> 0 ) then
- float64_to_int32 := sbits32 ($80000000)
- else
- float64_to_int32 := $7FFFFFFF;
- exit;
- End;
- if ( aSigExtra <> 0) then
- float_exception_flags := float_exception_flags or float_flag_inexact;
- float64_to_int32 := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the 32-bit two's complement integer format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic, except that the conversion is always rounded toward zero.
- If `a' is a NaN, the largest positive integer is returned. Otherwise, if
- the conversion overflows, the largest integer with the same sign as `a' is
- returned.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_int32_round_to_zero(a: float64 ): int32;
- {$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign: flag;
- aExp, shiftCount: int16;
- aSig0, aSig1, absZ, aSigExtra: bits32;
- z: int32;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- shiftCount := aExp - $413;
- if ( 0 <= shiftCount ) then
- Begin
- if ( $41E < aExp ) then
- Begin
- if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
- aSign := 0;
- goto invalid;
- End;
- shortShift64Left(
- aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
- End
- else
- Begin
- if ( aExp < $3FF ) then
- Begin
- if ( aExp OR aSig0 OR aSig1 )<>0 then
- Begin
- float_exception_flags :=
- float_exception_flags or float_flag_inexact;
- End;
- float64_to_int32_round_to_zero := 0;
- exit;
- End;
- aSig0 := aSig0 or $00100000;
- aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
- absZ := aSig0 shr ( - shiftCount );
- End;
- if aSign <> 0 then
- z := - absZ
- else
- z := absZ;
- if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- if (aSign <> 0) then
- float64_to_int32_round_to_zero := sbits32 ($80000000)
- else
- float64_to_int32_round_to_zero := $7FFFFFFF;
- exit;
- End;
- if ( aSigExtra <> 0) then
- float_exception_flags := float_exception_flags or float_flag_inexact;
- float64_to_int32_round_to_zero := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of converting the double-precision floating-point value
- `a' to the single-precision floating-point format. The conversion is
- performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_to_float32(a: float64 ): float32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_FLOAT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign: flag;
- aExp: int16;
- aSig0, aSig1, zSig: bits32;
- allZero: bits32;
- tmp : CommonNanT;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- float64ToCommonNaN( a, tmp );
- float64_to_float32 := commonNaNToFloat32( tmp );
- exit;
- End;
- float64_to_float32 := packFloat32( aSign, $FF, 0 );
- exit;
- End;
- shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
- if ( aExp <> 0) then
- zSig := zSig OR $40000000;
- float64_to_float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
- End;
- {*
- -------------------------------------------------------------------------------
- Rounds the double-precision floating-point value `a' to an integer,
- and returns the result as a double-precision floating-point value. The
- operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_round_to_int(a: float64; var out: float64 );{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign: flag;
- aExp: int16;
- lastBitMask, roundBitsMask: bits32;
- roundingMode: int8;
- z: float64;
- Begin
- aExp := extractFloat64Exp( a );
- if ( $413 <= aExp ) then
- Begin
- if ( $433 <= aExp ) then
- Begin
- if ( ( aExp = $7FF )
- AND
- (
- ( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
- ) <>0)
- ) then
- Begin
- propagateFloat64NaN( a, a, out );
- exit;
- End;
- out := a;
- exit;
- End;
- lastBitMask := 1;
- lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
- roundBitsMask := lastBitMask - 1;
- z := a;
- roundingMode := float_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- if ( lastBitMask <> 0) then
- Begin
- add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
- if ( ( z.low and roundBitsMask ) = 0 ) then
- z.low := z.low and not lastBitMask;
- End
- else
- Begin
- if ( sbits32 (z.low) < 0 ) then
- Begin
- Inc(z.high);
- if ( bits32 ( z.low shl 1 ) = 0 ) then
- z.high := z.high and not 1;
- End;
- End;
- End
- else if ( roundingMode <> float_round_to_zero ) then
- Begin
- if ( extractFloat64Sign( z )
- xor flag( roundingMode = float_round_up ) )<> 0 then
- Begin
- add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
- End;
- End;
- z.low := z.low and not roundBitsMask;
- End
- else
- Begin
- if ( aExp <= $3FE ) then
- Begin
- if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
- Begin
- out := a;
- exit;
- End;
- float_exception_flags := float_exception_flags or
- float_flag_inexact;
- aSign := extractFloat64Sign( a );
- case ( float_rounding_mode ) of
- float_round_nearest_even:
- Begin
- if ( ( aExp = $3FE )
- AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
- ) then
- Begin
- packFloat64( aSign, $3FF, 0, 0, out );
- exit;
- End;
- End;
- float_round_down:
- Begin
- if aSign<>0 then
- packFloat64( 1, $3FF, 0, 0, out )
- else
- packFloat64( 0, 0, 0, 0, out );
- exit;
- End;
- float_round_up:
- Begin
- if aSign <> 0 then
- packFloat64( 1, 0, 0, 0, out )
- else
- packFloat64( 0, $3FF, 0, 0, out );
- exit;
- End;
- end;
- packFloat64( aSign, 0, 0, 0, out );
- exit;
- End;
- lastBitMask := 1;
- lastBitMask := lastBitMask shl ($413 - aExp);
- roundBitsMask := lastBitMask - 1;
- z.low := 0;
- z.high := a.high;
- roundingMode := float_rounding_mode;
- if ( roundingMode = float_round_nearest_even ) then
- Begin
- z.high := z.high + lastBitMask shr 1;
- if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
- Begin
- z.high := z.high and not lastBitMask;
- End;
- End
- else if ( roundingMode <> float_round_to_zero ) then
- Begin
- if ( extractFloat64Sign( z )
- xor flag( roundingMode = float_round_up ) )<> 0 then
- Begin
- z.high := z.high or bits32( a.low <> 0 );
- z.high := z.high + roundBitsMask;
- End;
- End;
- z.high := z.high and not roundBitsMask;
- End;
- if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
- Begin
- float_exception_flags :=
- float_exception_flags or float_flag_inexact;
- End;
- out := z;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the absolute values of the double-precision
- floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
- before being returned. `zSign' is ignored if the result is a NaN.
- The addition is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
- Var
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
- expDiff: int16;
- label shiftRight1;
- label roundAndPack;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- expDiff := aExp - bExp;
- if ( 0 < expDiff ) then
- Begin
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- end;
- out := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig0 := bSig0 or $00100000;
- End;
- shift64ExtraRightJamming(
- bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
- zExp := aExp;
- End
- else if ( expDiff < 0 ) then
- Begin
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- packFloat64( zSign, $7FF, 0, 0, out );
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig0 := aSig0 or $00100000;
- End;
- shift64ExtraRightJamming(
- aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
- zExp := bExp;
- End
- else
- Begin
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- out := a;
- exit;
- End;
- add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- if ( aExp = 0 ) then
- Begin
- packFloat64( zSign, 0, zSig0, zSig1, out );
- exit;
- End;
- zSig2 := 0;
- zSig0 := zSig0 or $00200000;
- zExp := aExp;
- goto shiftRight1;
- End;
- aSig0 := aSig0 or $00100000;
- add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- Dec(zExp);
- if ( zSig0 < $00200000 ) then
- goto roundAndPack;
- Inc(zExp);
- shiftRight1:
- shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
- roundAndPack:
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the absolute values of the double-
- precision floating-point values `a' and `b'. If `zSign' is 1, the
- difference is negated before being returned. `zSign' is ignored if the
- result is a NaN. The subtraction is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
- Var
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
- expDiff: int16;
- z: float64;
- label aExpBigger;
- label bExpBigger;
- label aBigger;
- label bBigger;
- label normalizeRoundAndPack;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- expDiff := aExp - bExp;
- shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
- shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
- if ( 0 < expDiff ) then goto aExpBigger;
- if ( expDiff < 0 ) then goto bExpBigger;
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- out := z;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- aExp := 1;
- bExp := 1;
- End;
- if ( bSig0 < aSig0 ) then goto aBigger;
- if ( aSig0 < bSig0 ) then goto bBigger;
- if ( bSig1 < aSig1 ) then goto aBigger;
- if ( aSig1 < bSig1 ) then goto bBigger;
- packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
- exit;
- bExpBigger:
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- packFloat64( zSign xor 1, $7FF, 0, 0, out );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- Inc(expDiff);
- End
- else
- Begin
- aSig0 := aSig0 or $40000000;
- End;
- shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
- bSig0 := bSig0 or $40000000;
- bBigger:
- sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
- zExp := bExp;
- zSign := zSign xor 1;
- goto normalizeRoundAndPack;
- aExpBigger:
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- out := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- Dec(expDiff);
- End
- else
- Begin
- bSig0 := bSig0 or $40000000;
- End;
- shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
- aSig0 := aSig0 or $40000000;
- aBigger:
- sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
- zExp := aExp;
- normalizeRoundAndPack:
- Dec(zExp);
- normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of adding the double-precision floating-point values `a'
- and `b'. The operation is performed according to the IEC/IEEE Standard for
- Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_add( a: float64; b : float64; Var out : float64);
- {$ifdef fpc}[public,Alias:'FLOAT64_ADD'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign: flag;
- Begin
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign = bSign ) then
- Begin
- addFloat64Sigs( a, b, aSign, out );
- End
- else
- Begin
- subFloat64Sigs( a, b, aSign, out );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of subtracting the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_sub(a: float64; b : float64; var out: float64);
- {$ifdef fpc}[public,Alias:'FLOAT64_SUB'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign: flag;
- Begin
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign = bSign ) then
- Begin
- subFloat64Sigs( a, b, aSign, out );
- End
- else
- Begin
- addFloat64Sigs( a, b, aSign, out );
- End;
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of multiplying the double-precision floating-point values
- `a' and `b'. The operation is performed according to the IEC/IEEE Standard
- for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_mul( a: float64; b:float64; Var out: float64);
- {$ifdef fpc}[public,Alias:'FLOAT64_MUL'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
- z: float64;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- bSign := extractFloat64Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FF ) then
- Begin
- if ( (( aSig0 OR aSig1 ) <>0)
- OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
- packFloat64( zSign, $7FF, 0, 0, out );
- exit;
- End;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 )<> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- out := z;
- exit;
- End;
- packFloat64( zSign, $7FF, 0, 0, out );
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- packFloat64( zSign, 0, 0, 0, out );
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- if ( bExp = 0 ) then
- Begin
- if ( ( bSig0 OR bSig1 ) = 0 ) then
- Begin
- packFloat64( zSign, 0, 0, 0, out );
- exit;
- End;
- normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- End;
- zExp := aExp + bExp - $400;
- aSig0 := aSig0 or $00100000;
- shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
- mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
- add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
- zSig2 := zSig2 or flag( zSig3 <> 0 );
- if ( $00200000 <= zSig0 ) then
- Begin
- shift64ExtraRightJamming(
- zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
- Inc(zExp);
- End;
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the result of dividing the double-precision floating-point value `a'
- by the corresponding value `b'. The operation is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_div(a: float64; b : float64 ; var out: float64 );
- {$ifdef fpc}[public,Alias:'FLOAT64_DIV'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, zExp: int16;
- aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
- rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
- z: float64;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- bSign := extractFloat64Sign( b );
- zSign := aSign xor bSign;
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 )<> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- end;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 )<>0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- goto invalid;
- End;
- packFloat64( zSign, $7FF, 0, 0, out );
- exit;
- End;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 )<> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- packFloat64( zSign, 0, 0, 0, out );
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- if ( ( bSig0 OR bSig1 ) = 0 ) then
- Begin
- if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- out := z;
- exit;
- End;
- float_raise( float_flag_divbyzero );
- packFloat64( zSign, $7FF, 0, 0, out );
- exit;
- End;
- normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- packFloat64( zSign, 0, 0, 0, out );
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- zExp := aExp - bExp + $3FD;
- shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
- shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
- if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
- Begin
- shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
- Inc(zExp);
- End;
- zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
- mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
- sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig0);
- add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
- End;
- zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
- if ( ( zSig1 and $3FF ) <= 4 ) then
- Begin
- mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
- sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
- while ( sbits32 (rem1) < 0 ) do
- Begin
- Dec(zSig1);
- add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
- End;
- zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
- End;
- shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
- roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the remainder of the double-precision floating-point value `a'
- with respect to the corresponding value `b'. The operation is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_rem(a: float64; b : float64; var out: float64);
- {$ifdef fpc}[public,Alias:'FLOAT64_REM'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign, zSign: flag;
- aExp, bExp, expDiff: int16;
- aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
- allZero, alternateASig0, alternateASig1, sigMean1: bits32;
- sigMean0: sbits32;
- z: float64;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- bSig1 := extractFloat64Frac1( b );
- bSig0 := extractFloat64Frac0( b );
- bExp := extractFloat64Exp( b );
- bSign := extractFloat64Sign( b );
- if ( aExp = $7FF ) then
- Begin
- if ((( aSig0 OR aSig1 )<>0)
- OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- goto invalid;
- End;
- if ( bExp = $7FF ) then
- Begin
- if ( bSig0 OR bSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, b, out );
- exit;
- End;
- out := a;
- exit;
- End;
- if ( bExp = 0 ) then
- Begin
- if ( ( bSig0 OR bSig1 ) = 0 ) then
- Begin
- invalid:
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- out := z;
- exit;
- End;
- normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- out := a;
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- expDiff := aExp - bExp;
- if ( expDiff < -1 ) then
- Begin
- out := a;
- exit;
- End;
- shortShift64Left(
- aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
- shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
- q := le64( bSig0, bSig1, aSig0, aSig1 );
- if ( q )<>0 then
- sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
- expDiff := expDiff - 32;
- while ( 0 < expDiff ) do
- Begin
- q := estimateDiv64To32( aSig0, aSig1, bSig0 );
- if 4 < q then
- q:= q - 4
- else
- q := 0;
- mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
- shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
- shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
- sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
- expDiff := expDiff - 29;
- End;
- if ( -32 < expDiff ) then
- Begin
- q := estimateDiv64To32( aSig0, aSig1, bSig0 );
- if 4 < q then
- q := q - 4
- else
- q := 0;
- q := q shr (- expDiff);
- shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
- expDiff := expDiff + 24;
- if ( expDiff < 0 ) then
- Begin
- shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
- End
- else
- Begin
- shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
- End;
- mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
- sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
- End
- else
- Begin
- shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
- shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
- End;
- Repeat
- alternateASig0 := aSig0;
- alternateASig1 := aSig1;
- Inc(q);
- sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
- Until not ( 0 <= sbits32 (aSig0) );
- add64(
- aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
- if ( ( sigMean0 < 0 )
- OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
- Begin
- aSig0 := alternateASig0;
- aSig1 := alternateASig1;
- End;
- zSign := flag( sbits32 (aSig0) < 0 );
- if ( zSign <> 0 ) then
- sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
- normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns the square root of the double-precision floating-point value `a'.
- The operation is performed according to the IEC/IEEE Standard for Binary
- Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Procedure float64_sqrt( a: float64; var out: float64 );
- {$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign: flag;
- aExp, zExp: int16;
- aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
- rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
- z: float64;
- label invalid;
- Begin
- aSig1 := extractFloat64Frac1( a );
- aSig0 := extractFloat64Frac0( a );
- aExp := extractFloat64Exp( a );
- aSign := extractFloat64Sign( a );
- if ( aExp = $7FF ) then
- Begin
- if ( aSig0 OR aSig1 ) <> 0 then
- Begin
- propagateFloat64NaN( a, a, out );
- exit;
- End;
- if ( aSign = 0) then
- Begin
- out := a;
- exit;
- End;
- goto invalid;
- End;
- if ( aSign <> 0 ) then
- Begin
- if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
- Begin
- out := a;
- exit;
- End;
- invalid:
- float_raise( float_flag_invalid );
- z.low := float64_default_nan_low;
- z.high := float64_default_nan_high;
- out := z;
- exit;
- End;
- if ( aExp = 0 ) then
- Begin
- if ( ( aSig0 OR aSig1 ) = 0 ) then
- Begin
- packFloat64( 0, 0, 0, 0, out );
- exit;
- End;
- normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
- End;
- zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
- aSig0 := aSig0 or $00100000;
- shortShift64Left( aSig0, aSig1, 11, term0, term1 );
- zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
- if ( zSig0 = 0 ) then
- zSig0 := $7FFFFFFF;
- doubleZSig0 := zSig0 + zSig0;
- shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
- mul32To64( zSig0, zSig0, term0, term1 );
- sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
- while ( sbits32 (rem0) < 0 ) do
- Begin
- Dec(zSig0);
- doubleZSig0 := doubleZSig0 - 2;
- add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
- End;
- zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
- if ( ( zSig1 and $1FF ) <= 5 ) then
- Begin
- if ( zSig1 = 0 ) then
- zSig1 := 1;
- mul32To64( doubleZSig0, zSig1, term1, term2 );
- sub64( rem1, 0, term1, term2, rem1, rem2 );
- mul32To64( zSig1, zSig1, term2, term3 );
- sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
- while ( sbits32 (rem1) < 0 ) do
- Begin
- Dec(zSig1);
- shortShift64Left( 0, zSig1, 1, term2, term3 );
- term3 := term3 or 1;
- term2 := term2 or doubleZSig0;
- add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
- End;
- zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
- End;
- shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
- roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_eq(a: float64; b: float64): flag;
- {$ifdef fpc}[public,Alias:'FLOAT64_EQ'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
- float_raise( float_flag_invalid );
- float64_eq := 0;
- exit;
- End;
- float64_eq := flag(
- ( a.low = b.low )
- AND ( ( a.high = b.high )
- OR ( ( a.low = 0 )
- AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
- ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- or equal to the corresponding value `b', and 0 otherwise. The comparison
- is performed according to the IEC/IEEE Standard for Binary Floating-Point
- Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_le(a: float64;b: float64): flag;
- {$ifdef fpc}[public,Alias:'FLOAT64_LE'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign: flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float64_le := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_le := flag(
- (aSign <> 0)
- OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- = 0 ));
- exit;
- End;
- if aSign <> 0 then
- float64_le := le64( b.high, b.low, a.high, a.low )
- else
- float64_le := le64( a.high, a.low, b.high, b.low );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. The comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_lt(a: float64;b: float64): flag;
- {$ifdef fpc}[public,Alias:'FLOAT64_LT'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- Var
- aSign, bSign: flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float64_lt := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_lt := flag(
- (aSign <> 0)
- AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- <> 0 ));
- exit;
- End;
- if aSign <> 0 then
- float64_lt := lt64( b.high, b.low, a.high, a.low )
- else
- float64_lt := lt64( a.high, a.low, b.high, b.low );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is equal to
- the corresponding value `b', and 0 otherwise. The invalid exception is
- raised if either operand is a NaN. Otherwise, the comparison is performed
- according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_eq_signaling( a: float64; b: float64): flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- float_raise( float_flag_invalid );
- float64_eq_signaling := 0;
- exit;
- End;
- float64_eq_signaling := flag(
- ( a.low = b.low )
- AND ( ( a.high = b.high )
- OR ( ( a.low = 0 )
- AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
- ));
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than or
- equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
- cause an exception. Otherwise, the comparison is performed according to the
- IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_le_quiet(a: float64 ; b: float64 ): flag;
- Var
- aSign, bSign : flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
- float_raise( float_flag_invalid );
- float64_le_quiet := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_le_quiet := flag
- ((aSign <> 0)
- OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- = 0 ));
- exit;
- End;
- if aSign <> 0 then
- float64_le_quiet := le64( b.high, b.low, a.high, a.low )
- else
- float64_le_quiet := le64( a.high, a.low, b.high, b.low );
- End;
- {*
- -------------------------------------------------------------------------------
- Returns 1 if the double-precision floating-point value `a' is less than
- the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
- exception. Otherwise, the comparison is performed according to the IEC/IEEE
- Standard for Binary Floating-Point Arithmetic.
- -------------------------------------------------------------------------------
- *}
- Function float64_lt_quiet(a: float64; b: float64 ): Flag;
- Var
- aSign, bSign: flag;
- Begin
- if
- (
- ( extractFloat64Exp( a ) = $7FF )
- AND
- (
- (extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
- )
- )
- OR (
- ( extractFloat64Exp( b ) = $7FF )
- AND (
- (extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
- )
- )
- ) then
- Begin
- if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
- float_raise( float_flag_invalid );
- float64_lt_quiet := 0;
- exit;
- End;
- aSign := extractFloat64Sign( a );
- bSign := extractFloat64Sign( b );
- if ( aSign <> bSign ) then
- Begin
- float64_lt_quiet := flag(
- (aSign<>0)
- AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
- <> 0 ));
- exit;
- End;
- If aSign <> 0 then
- float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
- else
- float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the single-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_float32( a: int64 ): float32;
- {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT32'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- var
- zSign : flag;
- absA : uint64;
- shiftCount: int8;
- zSig : bits32;
- intval : int64rec;
- Begin
- if ( a = 0 ) then
- begin
- int64_to_float32 := 0;
- exit;
- end;
- if a < 0 then
- zSign := flag(TRUE)
- else
- zSign := flag(FALSE);
- if zSign<>0 then
- absA := -a
- else
- absA := a;
- shiftCount := countLeadingZeros64( absA ) - 40;
- if ( 0 <= shiftCount ) then
- begin
- int64_to_float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
- end
- else
- begin
- shiftCount := shiftCount + 7;
- if ( shiftCount < 0 ) then
- begin
- intval.low := int64rec(AbsA).low;
- intval.high := int64rec(AbsA).high;
- shift64RightJamming( intval.low, intval.high, - shiftCount,
- intval.low, intval.high);
- int64rec(absA).low := intval.low;
- int64rec(absA).high := intval.high;
- end
- else
- absA := absA shl shiftCount;
- int64_to_float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
- end;
- End;
- {*----------------------------------------------------------------------------
- | Returns the result of converting the 64-bit two's complement integer `a'
- | to the double-precision floating-point format. The conversion is performed
- | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
- *----------------------------------------------------------------------------*}
- function int64_to_float64( a: int64 ): float64;
- {$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];{$ifdef hascompilerproc} compilerproc; {$endif}{$endif}
- var
- zSign : flag;
- float_result : float64;
- intval : int64rec;
- AbsA : bits64;
- shiftcount : int8;
- zSig0, zSig1 : bits32;
- Begin
- if ( a = 0 ) then
- Begin
- packFloat64( 0, 0, 0, 0, float_result );
- exit;
- end;
- zSign := flag( a < 0 );
- if ZSign<>0 then
- AbsA := -a
- else
- AbsA := a;
- shiftCount := countLeadingZeros64( absA ) - 11;
- if ( 0 <= shiftCount ) then
- Begin
- absA := absA shl shiftcount;
- zSig0:=int64rec(absA).high;
- zSig1:=int64rec(absA).low;
- End
- else
- Begin
- shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
- End;
- packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
- int64_to_float64:= float_result;
- End;
- end.
- {
- $Log$
- Revision 1.6 2002-11-30 23:25:19 carl
- * forgot goto on switch in last commit
- Revision 1.5 2002/11/30 21:34:20 carl
- + compilerproc for softfpu (first step for integration)
- * several bugfixes for big-endian support
- Revision 1.4 2002/10/13 15:47:39 carl
- * bugfix for int64 to float conversion
- Revision 1.3 2002/10/12 20:24:22 carl
- + int64_tof_loat conversion routines
- Revision 1.2 2002/10/08 20:07:08 carl
- * fix range check errors
- - overflow checking must be off always
- * debugged and works as expected
- Revision 1.1 2002/09/16 19:10:17 carl
- * first revision of FPU emulation
- }
|