{ Implementation of mathematical routines for x86_64 This file is part of the Free Pascal run time library. Copyright (c) 1999-2005 by the Free Pascal development team See the file COPYING.FPC, included in this distribution, for details about the copyright. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. **********************************************************************} {------------------------------------------------------------------------- Using functions from AMath/DAMath libraries, which are covered by the following license: (C) Copyright 2009-2013 Wolfgang Ehrhardt This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. ----------------------------------------------------------------------------} {$push} {$codealign constmin=16} const FPC_ABSMASK_SINGLE: array[0..1] of qword=($7fffffff7fffffff,$7fffffff7fffffff); cvar; public; FPC_ABSMASK_DOUBLE: array[0..1] of qword=($7fffffffffffffff,$7fffffffffffffff); cvar; public; {$pop} FPC_LOG2E: Double = 1.4426950408889634073599246810019; { 1/log(2) } FPC_INFINITY_DOUBLE: QWord = $7ff0000000000000; { IEEE 754 bit representation of +oo for binary64 } FPC_MAXLOG_DOUBLE: Double = 709.78271289338399684324569237317; { log(2**1024) } FPC_MINLOG_DOUBLE: Double = -709.08956571282405153382846025171; { log(2**-1023) } {**************************************************************************** FPU Control word ****************************************************************************} procedure Set8087CW(cw:word); begin default8087cw:=cw; asm fnclex fldcw cw end; end; function Get8087CW:word;assembler; var tmp: word; asm fnstcw tmp movw tmp,%ax andl $0xffff,%eax { clears bits 32-63 } end; procedure SetMXCSR(w : dword); begin defaultmxcsr:=w; asm {$ifdef FPUX86_HAS_AVXUNIT} vldmxcsr w {$else FPUX86_HAS_AVXUNIT} ldmxcsr w {$endif FPUX86_HAS_AVXUNIT} end; end; function GetMXCSR : dword;assembler; var _w : dword; asm {$ifdef FPUX86_HAS_AVXUNIT} vstmxcsr _w {$else FPUX86_HAS_AVXUNIT} stmxcsr _w {$endif FPUX86_HAS_AVXUNIT} movl _w,%eax end; function GetNativeFPUControlWord: TNativeFPUControlWord; {$if defined(SYSTEMINLINE)}inline;{$endif} begin result.cw8087:=Get8087CW; result.MXCSR:=GetMXCSR end; procedure SetNativeFPUControlWord(const cw: TNativeFPUControlWord); {$if defined(SYSTEMINLINE)}inline;{$endif} begin Set8087CW(cw.cw8087); SetMXCSR(cw.MXCSR); end; procedure SetSSECSR(w : dword); begin SetMXCSR(w); end; function GetSSECSR: dword; begin result:=GetMXCSR; end; {**************************************************************************** EXTENDED data type routines ****************************************************************************} {$ifndef FPC_SYSTEM_HAS_ABS} {$define FPC_SYSTEM_HAS_ABS} function fpc_abs_real(d : ValReal) : ValReal;compilerproc; {$ifndef cpullvm} begin { Function is handled internal in the compiler } runerror(207); result:=0; {$else not cpullvm} assembler; asm fldt d fabs {$endif not cpullvm} end; {$endif FPC_SYSTEM_HAS_ABS} {$ifndef FPC_SYSTEM_HAS_SQR} {$define FPC_SYSTEM_HAS_SQR} function fpc_sqr_real(d : ValReal) : ValReal;compilerproc; {$ifndef cpullvm} begin { Function is handled internal in the compiler } runerror(207); result:=0; {$else not cpullvm} begin fpc_sqr_real:=d*d; {$endif not cpullvm} end; {$endif FPC_SYSTEM_HAS_SQR} {$ifndef FPC_SYSTEM_HAS_SQRT} {$define FPC_SYSTEM_HAS_SQRT} function fpc_sqrt_real(d : ValReal) : ValReal;compilerproc; {$ifndef cpullvm} begin { Function is handled internal in the compiler } runerror(207); result:=0; {$else not cpullvm} assembler; asm fldt d fsqrt {$endif not cpullvm} end; {$endif FPC_SYSTEM_HAS_SQRT} {$ifdef FPC_HAS_TYPE_EXTENDED} {$ifndef FPC_SYSTEM_HAS_ARCTAN} {$define FPC_SYSTEM_HAS_ARCTAN} function fpc_arctan_real(d : ValReal) : ValReal;compilerproc; {$ifndef cpullvm} begin { Function is handled internal in the compiler } runerror(207); result:=0; {$else not cpullvm} assembler; asm fldt d fld1 fpatan {$endif not cpullvm} end; {$endif FPC_SYSTEM_HAS_ARCTAN} {$ifndef FPC_SYSTEM_HAS_LN} {$define FPC_SYSTEM_HAS_LN} function fpc_ln_real(d : ValReal) : ValReal;compilerproc; {$ifndef cpullvm} begin { Function is handled internal in the compiler } runerror(207); result:=0; {$else not cpullvm} assembler; asm fldln2 fldt d fyl2x {$endif not cpullvm} end; {$endif FPC_SYSTEM_HAS_LN} {$ifndef FPC_SYSTEM_HAS_SIN} {$define FPC_SYSTEM_HAS_SIN} function fpc_sin_real(d : ValReal) : ValReal;compilerproc; {$ifndef cpullvm} begin { Function is handled internal in the compiler } runerror(207); result:=0; {$else not cpullvm} assembler; asm fldt d fsin {$endif not cpullvm} end; {$endif FPC_SYSTEM_HAS_SIN} {$ifndef FPC_SYSTEM_HAS_COS} {$define FPC_SYSTEM_HAS_COS} function fpc_cos_real(d : ValReal) : ValReal;compilerproc; {$ifndef cpullvm} begin { Function is handled internal in the compiler } runerror(207); result:=0; {$else not cpullvm} assembler; asm fldt d fcos {$endif not cpullvm} end; {$endif FPC_SYSTEM_HAS_COS} {$ifndef FPC_SYSTEM_HAS_EXP} {$define FPC_SYSTEM_HAS_EXP} { exp function adapted from AMath library (C) Copyright 2009-2013 Wolfgang Ehrhardt * translated into AT&T syntax + PIC support * return +Inf/0 for +Inf/-Inf input, instead of NaN } function fpc_exp_real(d : ValReal) : ValReal;assembler;compilerproc; const ln2hi: double=6.9314718036912382E-001; ln2lo: double=1.9082149292705877E-010; large: single=24576.0; two: single=2.0; half: single=0.5; asm fldt d fldl2e fmul %st(1),%st { z = d * log2(e) } frndint { Calculate frac(z) using modular arithmetic to avoid precision loss } fldl ln2hi(%rip) fmul %st(1),%st fsubrp %st,%st(2) fldl ln2lo(%rip) fmul %st(1),%st fsubrp %st,%st(2) fxch %st(1) { (d-int(z)*ln2_hi)-int(z)*ln2_lo } fldl2e fmulp %st,%st(1) { frac(z) } { Above calculation can yield |frac(z)|>1, particularly when rounding mode is not "round to nearest". f2xm1 is undefined in that case, so it's necessary to check } fld %st fabs fld1 fcomip %st(1),%st(0) fstp %st jp .L3 { NaN } jae .L1 { |frac(z)| <= 1, good } fld %st(1) fabs flds large(%rip) fcomip %st(1),%st(0) fstp %st jb .L3 { int(z) >= 24576 } .L0: { Calculate 2**frac(z)-1 as N*(N+2), where N=2**(frac(z)/2)-1 } fmuls half(%rip) f2xm1 fld %st fadds two(%rip) fmulp %st,%st(1) jmp .L2 .L3: fstp %st { pop frac(z) and load 0 } fldz .L1: f2xm1 .L2: fld1 faddp %st,%st(1) fscale fstp %st(1) end; {$endif FPC_SYSTEM_HAS_EXP} {$ifndef FPC_SYSTEM_HAS_FRAC} {$define FPC_SYSTEM_HAS_FRAC} function fpc_frac_real(d : ValReal) : ValReal;assembler;compilerproc; var oldcw,newcw: word; asm fnstcw oldcw fldt d movw oldcw,%cx orw $0x0c00,%cx movw %cx,newcw fldcw newcw fld %st frndint fsubrp %st,%st(1) fwait fldcw oldcw end; {$endif FPC_SYSTEM_HAS_FRAC} {$ifndef FPC_SYSTEM_HAS_INT} {$define FPC_SYSTEM_HAS_INT} function fpc_int_real(d : ValReal) : ValReal;assembler;compilerproc; var oldcw,newcw: word; asm fnstcw oldcw movw oldcw,%cx orw $0x0c00,%cx movw %cx,newcw fldcw newcw fldt d frndint fwait fldcw oldcw end; {$endif FPC_SYSTEM_HAS_INT} {$ifndef FPC_SYSTEM_HAS_TRUNC} {$define FPC_SYSTEM_HAS_TRUNC} function fpc_trunc_real(d : ValReal) : int64;assembler;compilerproc; var oldcw, newcw : word; res : int64; asm fnstcw oldcw movw oldcw,%cx orw $0x0c00,%cx movw %cx,newcw fldcw newcw fldt d fistpq res fwait movq res,%rax fldcw oldcw end; {$endif FPC_SYSTEM_HAS_TRUNC} {$ifndef FPC_SYSTEM_HAS_ROUND} {$define FPC_SYSTEM_HAS_ROUND} function fpc_round_real(d : ValReal) : int64;assembler;compilerproc; var res : int64; asm fldt d fistpq res fwait movq res,%rax end; {$endif FPC_SYSTEM_HAS_ROUND} {$else FPC_HAS_TYPE_EXTENDED} {$ifndef FPC_SYSTEM_HAS_INT} {$define FPC_SYSTEM_HAS_INT} function fpc_int_real(d : ValReal) : ValReal;compilerproc; assembler; nostackframe; asm pextrw $3, %xmm0, %eax { eax = d[48:63] } and $0x7ff0,%ax cmp $0x4330,%ax jge .L0 cvttsd2si %xmm0, %rax cvtsi2sd %rax, %xmm0 .L0: end; {$endif FPC_SYSTEM_HAS_INT} {$ifndef FPC_SYSTEM_HAS_TRUNC} {$define FPC_SYSTEM_HAS_TRUNC} function fpc_trunc_real(d : ValReal) : int64;compilerproc; assembler; nostackframe; asm cvttsd2si %xmm0,%rax; end; {$endif FPC_SYSTEM_HAS_TRUNC} {$ifndef FPC_SYSTEM_HAS_ROUND} {$define FPC_SYSTEM_HAS_ROUND} function fpc_round_real(d : ValReal) : int64;compilerproc; assembler; nostackframe; asm cvtsd2si %xmm0,%rax; end; {$endif FPC_SYSTEM_HAS_ROUND} {$ifndef FPC_SYSTEM_HAS_FRAC} {$define FPC_SYSTEM_HAS_FRAC} function fpc_frac_real(d: ValReal) : ValReal;compilerproc; assembler; nostackframe; asm { Windows defines %xmm4 and %xmm5 as first non-parameter volatile registers; on SYSV systems all are considered as such, so use %xmm4 } pextrw $3, %xmm0, %eax { eax = d[48:63] } movapd %xmm0, %xmm4 and $0x7ff0,%ax cmp $0x4330,%ax jge .L0 cvttsd2si %xmm0, %rax cvtsi2sd %rax, %xmm4 .L0: subsd %xmm4, %xmm0 end; {$endif FPC_SYSTEM_HAS_FRAC} {$ifndef FPC_SYSTEM_HAS_EXP} {$define FPC_SYSTEM_HAS_EXP} {$push} {$codealign constmin=16} const FPC_M25_24_ARRAY: array[0..1] of Double = (-25.0, 24.0); FPC_300_252_ARRAY: array[0..1] of Double = (300.0, 252.0); FPC_M2100_1344_ARRAY: array[0..1] of Double = (-2100.0, 1344.0); FPC_8400_3024_ARRAY: array[0..1] of Double = (8400.0, 3024.0); {$pop} FPC_HALF_DOUBLE: Double = 0.5; FPC_ONE_DOUBLE: Double = 1.0; FPC_M15120_DOUBLE: Double = -15120.0; FPC_M5_DOUBLE: Double = -5.0; function fpc_exp_real(d : ValReal) : ValReal; compilerproc; assembler; nostackframe; {*****************************************************************} { Exponential Function } {*****************************************************************} { } { SYNOPSIS: } { } { double d, y, exp(); } { } { y = exp(d); } { } { DESCRIPTION: } { } { Returns e (2.71828...) raised to the d power. } { } { First calculate z = d log2 e, then break z into integer and } { fractional components k and f. } { } { d z k + f k f } { e = 2 => 2 = 2 2 } { } { 2^k can be calculated very quickly with bit manipulation, then } { convert 2^f to e^x via x = f ln 2 and use a Padé approximant to } { calculate e^x for x between -0.5 ln 2 and 0.5 ln 2 } {*****************************************************************} asm movsd FPC_LOG2E(%rip), %xmm4 ucomisd FPC_MAXLOG_DOUBLE(%rip), %xmm0 jp .LNaN { Propagate NaN into result } jae .LOverflow movsd FPC_ONE_DOUBLE(%rip), %xmm5 xorpd %xmm3, %xmm3 comisd FPC_MINLOG_DOUBLE(%rip), %xmm0 ja .LNoUnderflow xorpd %xmm0, %xmm0 { Set result to zero } .LNaN: ret .LOverflow: movsd FPC_INFINITY_DOUBLE(%rip), %xmm0 { Set result to infinity } ret .balign 16 .LNoUnderflow: movsd FPC_HALF_DOUBLE(%rip), %xmm2 comisd %xmm3, %xmm0 jnz .LNotZero movsd %xmm5, %xmm0 { e^0 = 1 } ret .balign 16 .LNotZero: mulsd %xmm4, %xmm0 addsd %xmm2, %xmm0 { Add 0.5 to make sure the fractional part falls between -0.5 and 0.5 } { %xmm0 won't be out of range due to the earlier checks } cvttsd2si %xmm0, %rax subsd %xmm2, %xmm0 cvtsi2sd %rax, %xmm1 movapd FPC_M25_24_ARRAY(%rip), %xmm5 { Some slightly evil floating-point bit manipulation to calculate 2^k } addw $1023, %ax shlw $4, %ax subsd %xmm1, %xmm0 { %xmm0 now contains the fractional component } pinsrw $3,%eax, %xmm3 { Insert into the most-significant 16 bits } { %xmm3 now contains 2^k } { Calculate the Padé approximant that is: -5(x(x(x(x + 24) + 252) + 1344) + 3024) ---------------------------------------------- x(x(x(x(x - 25) + 300) - 2100) + 8400) - 15120 Aligned for easier calculation: -5(x(x(x(x + 24) + 252) + 1344) + 3024) ---------------------------------------------------- x(x(x(x(x - 25) + 300) - 2100) + 8400) - 15120 } movapd FPC_300_252_ARRAY(%rip), %xmm2 { 300, 252 } divsd %xmm4, %xmm0 { Reapply the factor of ln 2 to x } movapd FPC_M2100_1344_ARRAY(%rip), %xmm1 { -2100, 1344 } shufpd $0,%xmm0, %xmm0 { x, x } movapd %xmm0, %xmm4 { x, x } addpd %xmm5, %xmm0 { x - 25, x - 24 } mulpd %xmm4, %xmm0 { x(x - 25), x(x + 24) } addpd %xmm2, %xmm0 { x(x - 25) + 300, x(x + 24) + 252 } movapd FPC_8400_3024_ARRAY(%rip), %xmm2 { 8400, 3024 } mulpd %xmm4, %xmm0 { x(x(x - 25) + 300), x(x(x + 24) + 252) } addpd %xmm1, %xmm0 { x(x(x - 25) + 300) - 2100, x(x(x + 24) + 252) + 1344 } movsd FPC_M5_DOUBLE(%rip), %xmm1 { -5, 0 } mulpd %xmm4, %xmm0 { x(x(x(x - 25) + 300) - 2100), x(x(x(x + 24) + 252) + 1344) } shufpd $1,%xmm1, %xmm4 { x, -5 } movsd FPC_M15120_DOUBLE(%rip), %xmm1 { -15120, 0 } addpd %xmm2, %xmm0 { x(x(x(x - 25) + 300) - 2100) + 8400, x(x(x(x + 24) + 252) + 1344) + 3024 } movsd FPC_ONE_DOUBLE(%rip), %xmm2 { 1, 0 } mulpd %xmm4, %xmm0 { x(x(x(x(x - 25) + 300) - 2100) + 8400), -5(x(x(x(x + 24) + 252) + 1344) + 3024) } addpd %xmm1, %xmm0 { x(x(x(x(x - 25) + 300) - 2100) + 8400) - 15120, -5(x(x(x(x + 24) + 252) + 1344) + 3024) } movapd %xmm0, %xmm4 { x(x(x(x(x - 25) + 300) - 2100) + 8400) - 15120, -5(x(x(x(x + 24) + 252) + 1344) + 3024) } shufpd $1,%xmm0, %xmm0 { -5(x(x(x(x + 24) + 252) + 1344) + 3024), x(x(x(x(x - 25) + 300) - 2100) + 8400) - 15120 } divsd %xmm4, %xmm0 { Padé approximant for e^x computed } mulsd %xmm3, %xmm0 { Apply the integer component calculated earlier } end; {$endif FPC_SYSTEM_HAS_EXP} {$endif FPC_HAS_TYPE_EXTENDED}