|
@@ -322,6 +322,40 @@
|
|
|
|.macro fdup; fld st0; .endmacro
|
|
|
|.macro fpop1; fstp st1; .endmacro
|
|
|
|
|
|
|
+|// Synthesize SSE FP constants.
|
|
|
+|.macro sseconst_sign, reg, tmp // Synthesize sign mask.
|
|
|
+|.if X64
|
|
|
+| mov64 tmp, U64x(80000000,00000000); movd reg, tmp
|
|
|
+|.else
|
|
|
+| mov tmp, 0x80000000; movd reg, tmp; pshufd reg, reg, 0x51 // Move mask to high dword of double.
|
|
|
+|.endif
|
|
|
+|.endmacro
|
|
|
+|
|
|
|
+|.macro sseconst_abs, reg, tmp // Synthesize abs mask.
|
|
|
+|.if X64
|
|
|
+| mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp
|
|
|
+|.else
|
|
|
+| pxor reg, reg; pcmpeqd reg, reg; psrlq reg, 1
|
|
|
+|.endif
|
|
|
+|.endmacro
|
|
|
+|
|
|
|
+|.macro sseconst_1, reg, tmp // Synthesize 1.0.
|
|
|
+|.if X64
|
|
|
+| mov64 tmp, U64x(3ff00000,00000000)
|
|
|
+| movd reg, tmp
|
|
|
+|.else
|
|
|
+| mov tmp, 0x3ff00000; movd reg, tmp; pshufd reg, reg, 0x51
|
|
|
+|.endif
|
|
|
+|.endmacro
|
|
|
+|
|
|
|
+|.macro sseconst_2p52, reg, tmp // Synthesize 2^52.
|
|
|
+|.if X64
|
|
|
+| mov64 tmp, U64x(43300000,00000000); movd reg, tmp
|
|
|
+|.else
|
|
|
+| mov tmp, 0x43300000; movd reg, tmp; pshufd reg, reg, 0x51
|
|
|
+|.endif
|
|
|
+|.endmacro
|
|
|
+|
|
|
|
|// Move table write barrier back. Overwrites reg.
|
|
|
|.macro barrierback, tab, reg
|
|
|
| and byte tab->marked, cast_byte(~LJ_GC_BLACK) // black2gray(tab)
|
|
@@ -334,7 +368,7 @@
|
|
|
|
|
|
/* Generate subroutines used by opcodes and other parts of the VM. */
|
|
|
/* The .code_sub section should be last to help static branch prediction. */
|
|
|
-static void build_subroutines(BuildCtx *ctx, int cmov)
|
|
|
+static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
|
|
|
{
|
|
|
|.code_sub
|
|
|
|
|
|
@@ -2454,21 +2488,51 @@ static void build_subroutines(BuildCtx *ctx, int cmov)
|
|
|
| vm_round 0x0c00, 0xffff
|
|
|
|
|
|
|
|// FP modulo x%y. Called by BC_MOD* and vm_arith.
|
|
|
- |// Args/ret on x87 stack (y on top). No xmm registers modified.
|
|
|
- |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
|
|
|
|->vm_mod:
|
|
|
- | fld st1
|
|
|
- | fdiv st1
|
|
|
- | fnstcw word [esp+4]
|
|
|
- | mov ax, 0x0400
|
|
|
- | or ax, [esp+4]
|
|
|
- | and ax, 0xf7ff
|
|
|
- | mov [esp+6], ax
|
|
|
- | fldcw word [esp+6]
|
|
|
- | frndint
|
|
|
- | fldcw word [esp+4]
|
|
|
- | fmulp st1
|
|
|
- | fsubp st1
|
|
|
+ if (sse) {
|
|
|
+ |// Args in xmm0/xmm1, return value in xmm0.
|
|
|
+ |// Caveat: xmm0-xmm5 and RC (eax) modified!
|
|
|
+ | movaps xmm5, xmm0
|
|
|
+ | divsd xmm0, xmm1
|
|
|
+ | sseconst_abs xmm2, RDa
|
|
|
+ | sseconst_2p52 xmm3, RDa
|
|
|
+ | movaps xmm4, xmm0
|
|
|
+ | andpd xmm4, xmm2 // |x/y|
|
|
|
+ | ucomisd xmm3, xmm4 // No truncation if 2^52 <= |x/y|.
|
|
|
+ | jbe >1
|
|
|
+ | andnpd xmm2, xmm0 // Isolate sign bit.
|
|
|
+ | addsd xmm4, xmm3 // (|x/y| + 2^52) - 2^52
|
|
|
+ | subsd xmm4, xmm3
|
|
|
+ | orpd xmm4, xmm2 // Merge sign bit back in.
|
|
|
+ | sseconst_1 xmm2, RDa
|
|
|
+ | cmpsd xmm0, xmm4, 1 // x/y < result?
|
|
|
+ | andpd xmm0, xmm2
|
|
|
+ | subsd xmm4, xmm0 // If yes, subtract 1.0.
|
|
|
+ | movaps xmm0, xmm5
|
|
|
+ | mulsd xmm1, xmm4
|
|
|
+ | subsd xmm0, xmm1
|
|
|
+ | ret
|
|
|
+ |1:
|
|
|
+ | mulsd xmm1, xmm0
|
|
|
+ | movaps xmm0, xmm5
|
|
|
+ | subsd xmm0, xmm1
|
|
|
+ | ret
|
|
|
+ } else {
|
|
|
+ |// Args/ret on x87 stack (y on top). No xmm registers modified.
|
|
|
+ |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
|
|
|
+ | fld st1
|
|
|
+ | fdiv st1
|
|
|
+ | fnstcw word [esp+4]
|
|
|
+ | mov ax, 0x0400
|
|
|
+ | or ax, [esp+4]
|
|
|
+ | and ax, 0xf7ff
|
|
|
+ | mov [esp+6], ax
|
|
|
+ | fldcw word [esp+6]
|
|
|
+ | frndint
|
|
|
+ | fldcw word [esp+4]
|
|
|
+ | fmulp st1
|
|
|
+ | fsubp st1
|
|
|
+ }
|
|
|
| ret
|
|
|
|
|
|
|
|// FP exponentiation e^x and 2^x. Called by math.exp fast function and
|
|
@@ -2619,31 +2683,100 @@ static void build_subroutines(BuildCtx *ctx, int cmov)
|
|
|
|// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -)
|
|
|
|// and basic math functions. ORDER ARITH
|
|
|
|->vm_foldarith:
|
|
|
- | mov eax, [esp+20]
|
|
|
- | fld qword [esp+4]
|
|
|
- | fld qword [esp+12]
|
|
|
- | cmp eax, 1; je >1; ja >2
|
|
|
- | faddp st1; ret
|
|
|
- |1: ; fsubp st1; ret
|
|
|
- |2: ; cmp eax, 3; je >1; ja >2
|
|
|
- | fmulp st1; ret
|
|
|
- |1: ; fdivp st1; ret
|
|
|
- |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow
|
|
|
- | cmp eax, 7; je >1; ja >2
|
|
|
- | fpop; fchs; ret
|
|
|
- |1: ; fpop; fabs; ret
|
|
|
- |2: ; cmp eax, 9; je >1; ja >2
|
|
|
- | fpatan; ret
|
|
|
- |1: ; fxch; fscale; fpop1; ret
|
|
|
- |2: ; cmp eax, 11; je >1; ja >9
|
|
|
- ||if (cmov) {
|
|
|
- | fucomi st1; fcmovnbe st1; fpop1; ret
|
|
|
- |1: ; fucomi st1; fcmovbe st1; fpop1; ret
|
|
|
- ||} else {
|
|
|
- | fucom st1; fnstsw ax; test ah, 1; jz >2; fxch; 2: ; fpop; ret
|
|
|
- |1: ; fucom st1; fnstsw ax; test ah, 1; jnz >2; fxch; 2: ; fpop; ret
|
|
|
- ||}
|
|
|
- |9: ; int3 // Bad op.
|
|
|
+ if (sse) {
|
|
|
+ |.macro retxmm0; .if X64; ret; .else; jmp >7; .endif; .endmacro
|
|
|
+ |.macro retst0; .if X64; jmp >7; .else; ret; .endif; .endmacro
|
|
|
+ |
|
|
|
+ |.if X64WIN
|
|
|
+ | .define foldop, CARG3d
|
|
|
+ |.elif X64
|
|
|
+ | .define foldop, CARG1d
|
|
|
+ |.else
|
|
|
+ | .define foldop, eax
|
|
|
+ | mov foldop, [esp+20]
|
|
|
+ | movsd xmm0, qword [esp+4]
|
|
|
+ | movsd xmm1, qword [esp+12]
|
|
|
+ |.endif
|
|
|
+ | cmp foldop, 1; je >1; ja >2
|
|
|
+ | addsd xmm0, xmm1; retxmm0
|
|
|
+ |1: ; subsd xmm0, xmm1; retxmm0
|
|
|
+ |2: ; cmp foldop, 3; je >1; ja >2
|
|
|
+ | mulsd xmm0, xmm1; retxmm0
|
|
|
+ |1: ; divsd xmm0, xmm1; retxmm0
|
|
|
+ |2: ; cmp foldop, 5
|
|
|
+ |.if X64
|
|
|
+ | jb ->vm_mod; je ->vm_pow // NYI: broken without SSE vm_pow.
|
|
|
+ |.else
|
|
|
+ | je >1; ja >2
|
|
|
+ | call ->vm_mod; retxmm0
|
|
|
+ |1: ; fld qword [esp+4]; fld qword [esp+12]; jmp ->vm_pow // NYI
|
|
|
+ |2:
|
|
|
+ |.endif
|
|
|
+ | cmp foldop, 7; je >1; ja >2
|
|
|
+ | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; retxmm0
|
|
|
+ |1:
|
|
|
+ | sseconst_abs xmm1, RDa; andps xmm0, xmm1; retxmm0
|
|
|
+ |2: ; cmp foldop, 9; ja >2
|
|
|
+ |.if X64WIN
|
|
|
+ | movsd qword [esp+8], xmm0 // Use scratch area.
|
|
|
+ | movsd qword [esp+16], xmm1
|
|
|
+ | fld qword [esp+8]
|
|
|
+ | fld qword [esp+16]
|
|
|
+ |.elif X64
|
|
|
+ | movsd qword [esp-8], xmm0 // Use red zone.
|
|
|
+ | movsd qword [esp-16], xmm1
|
|
|
+ | fld qword [esp-8]
|
|
|
+ | fld qword [esp-16]
|
|
|
+ |.else
|
|
|
+| fld qword [esp+4] // Reload args from stack.
|
|
|
+ | fld qword [esp+12]
|
|
|
+ |.endif
|
|
|
+ | je >1
|
|
|
+ | fpatan; retst0
|
|
|
+ |1: ; fxch; fscale; fpop1; retst0
|
|
|
+ |2: ; cmp foldop, 11; je >1; ja >9
|
|
|
+ | minsd xmm0, xmm1; retxmm0
|
|
|
+ |1: ; maxsd xmm0, xmm1; retxmm0
|
|
|
+ |9: ; int3 // Bad op.
|
|
|
+ |7: // Move return value depending on calling convention.
|
|
|
+ |.if X64WIN
|
|
|
+ | fstp qword [esp+8] // Use scratch area.
|
|
|
+ | movsd xmm0, qword [esp+8]
|
|
|
+ |.elif X64
|
|
|
+ | fstp qword [esp-8] // Use red zone.
|
|
|
+ | movsd xmm0, qword [esp-8]
|
|
|
+ |.else
|
|
|
+ | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
|
|
|
+ | fld qword [esp+4]
|
|
|
+ |.endif
|
|
|
+ | ret
|
|
|
+ } else {
|
|
|
+ | mov eax, [esp+20]
|
|
|
+ | fld qword [esp+4]
|
|
|
+ | fld qword [esp+12]
|
|
|
+ | cmp eax, 1; je >1; ja >2
|
|
|
+ | faddp st1; ret
|
|
|
+ |1: ; fsubp st1; ret
|
|
|
+ |2: ; cmp eax, 3; je >1; ja >2
|
|
|
+ | fmulp st1; ret
|
|
|
+ |1: ; fdivp st1; ret
|
|
|
+ |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow
|
|
|
+ | cmp eax, 7; je >1; ja >2
|
|
|
+ | fpop; fchs; ret
|
|
|
+ |1: ; fpop; fabs; ret
|
|
|
+ |2: ; cmp eax, 9; je >1; ja >2
|
|
|
+ | fpatan; ret
|
|
|
+ |1: ; fxch; fscale; fpop1; ret
|
|
|
+ |2: ; cmp eax, 11; je >1; ja >9
|
|
|
+ ||if (cmov) {
|
|
|
+ | fucomi st1; fcmovnbe st1; fpop1; ret
|
|
|
+ |1: ; fucomi st1; fcmovbe st1; fpop1; ret
|
|
|
+ ||} else {
|
|
|
+ | fucom st1; fnstsw ax; test ah, 1; jz >2; fxch; 2: ; fpop; ret
|
|
|
+ |1: ; fucom st1; fnstsw ax; test ah, 1; jnz >2; fxch; 2: ; fpop; ret
|
|
|
+ ||}
|
|
|
+ |9: ; int3 // Bad op.
|
|
|
+ }
|
|
|
|
|
|
|
|//-----------------------------------------------------------------------
|
|
|
|//-- Miscellaneous functions --------------------------------------------
|
|
@@ -2694,7 +2827,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov)
|
|
|
}
|
|
|
|
|
|
/* Generate the code for a single instruction. */
|
|
|
-static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
|
|
|
+static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
|
|
|
{
|
|
|
int vk = 0;
|
|
|
|// Note: aligning all instructions does not pay off.
|
|
@@ -2711,10 +2844,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
|
|
|
| ins_AD
|
|
|
| checknum RA, ->vmeta_comp
|
|
|
| checknum RD, ->vmeta_comp
|
|
|
- | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A.
|
|
|
- | fld qword [BASE+RD*8]
|
|
|
- | add PC, 4
|
|
|
- | fcomparepp // eax (RD) modified!
|
|
|
+ if (sse) {
|
|
|
+ | movsd xmm0, qword [BASE+RD*8]
|
|
|
+ | add PC, 4
|
|
|
+ | ucomisd xmm0, qword [BASE+RA*8]
|
|
|
+ } else {
|
|
|
+ | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A.
|
|
|
+ | fld qword [BASE+RD*8]
|
|
|
+ | add PC, 4
|
|
|
+ | fcomparepp // eax (RD) modified!
|
|
|
+ }
|
|
|
| // Unordered: all of ZF CF PF set, ordered: PF clear.
|
|
|
| // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
|
|
|
switch (op) {
|
|
@@ -2746,9 +2885,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
|
|
|
| add PC, 4
|
|
|
| cmp RB, LJ_TISNUM; ja >5
|
|
|
| checknum RA, >5
|
|
|
- | fld qword [BASE+RA*8]
|
|
|
- | fld qword [BASE+RD*8]
|
|
|
- | fcomparepp // eax (RD) modified!
|
|
|
+ if (sse) {
|
|
|
+ | movsd xmm0, qword [BASE+RD*8]
|
|
|
+ | ucomisd xmm0, qword [BASE+RA*8]
|
|
|
+ } else {
|
|
|
+ | fld qword [BASE+RA*8]
|
|
|
+ | fld qword [BASE+RD*8]
|
|
|
+ | fcomparepp // eax (RD) modified!
|
|
|
+ }
|
|
|
iseqne_fp:
|
|
|
if (vk) {
|
|
|
| jp >2 // Unordered means not equal.
|
|
@@ -2820,9 +2964,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
|
|
|
| ins_AD // RA = src, RD = num const, JMP with RD = target
|
|
|
| add PC, 4
|
|
|
| checknum RA, >2
|
|
|
- | fld qword [BASE+RA*8]
|
|
|
- | fld qword [KBASE+RD*8]
|
|
|
- | fcomparepp // eax (RD) modified!
|
|
|
+ if (sse) {
|
|
|
+ | movsd xmm0, qword [KBASE+RD*8]
|
|
|
+ | ucomisd xmm0, qword [BASE+RA*8]
|
|
|
+ } else {
|
|
|
+ | fld qword [BASE+RA*8]
|
|
|
+ | fld qword [KBASE+RD*8]
|
|
|
+ | fcomparepp // eax (RD) modified!
|
|
|
+ }
|
|
|
goto iseqne_fp;
|
|
|
case BC_ISEQP: case BC_ISNEP:
|
|
|
vk = op == BC_ISEQP;
|
|
@@ -2875,18 +3024,32 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
|
|
|
case BC_UNM:
|
|
|
| ins_AD // RA = dst, RD = src
|
|
|
| checknum RD, ->vmeta_unm
|
|
|
- | fld qword [BASE+RD*8]
|
|
|
- | fchs
|
|
|
- | fstp qword [BASE+RA*8]
|
|
|
+ if (sse) {
|
|
|
+ | movsd xmm0, qword [BASE+RD*8]
|
|
|
+ | sseconst_sign xmm1, RDa
|
|
|
+ | xorps xmm0, xmm1
|
|
|
+ | movsd qword [BASE+RA*8], xmm0
|
|
|
+ } else {
|
|
|
+ | fld qword [BASE+RD*8]
|
|
|
+ | fchs
|
|
|
+ | fstp qword [BASE+RA*8]
|
|
|
+ }
|
|
|
| ins_next
|
|
|
break;
|
|
|
case BC_LEN:
|
|
|
| ins_AD // RA = dst, RD = src
|
|
|
| checkstr RD, >2
|
|
|
| mov STR:RD, [BASE+RD*8]
|
|
|
- | fild dword STR:RD->len
|
|
|
- |1:
|
|
|
- | fstp qword [BASE+RA*8]
|
|
|
+ if (sse) {
|
|
|
+ | xorps xmm0, xmm0
|
|
|
+ | cvtsi2sd xmm0, dword STR:RD->len
|
|
|
+ |1:
|
|
|
+ | movsd qword [BASE+RA*8], xmm0
|
|
|
+ } else {
|
|
|
+ | fild dword STR:RD->len
|
|
|
+ |1:
|
|
|
+ | fstp qword [BASE+RA*8]
|
|
|
+ }
|
|
|
| ins_next
|
|
|
|2:
|
|
|
| checktab RD, ->vmeta_len
|
|
@@ -2894,72 +3057,108 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
|
|
|
| mov RB, BASE // Save BASE.
|
|
|
| call extern lj_tab_len@4 // (GCtab *t)
|
|
|
| // Length of table returned in eax (RC).
|
|
|
- | mov ARG1, RC
|
|
|
- | mov BASE, RB // Restore BASE.
|
|
|
- | fild ARG1
|
|
|
+ if (sse) {
|
|
|
+ | cvtsi2sd xmm0, RC
|
|
|
+ | mov BASE, RB // Restore BASE.
|
|
|
+ } else {
|
|
|
+ | mov ARG1, RC
|
|
|
+ | mov BASE, RB // Restore BASE.
|
|
|
+ | fild ARG1
|
|
|
+ }
|
|
|
| movzx RA, PC_RA
|
|
|
| jmp <1
|
|
|
break;
|
|
|
|
|
|
/* -- Binary ops -------------------------------------------------------- */
|
|
|
|
|
|
- |.macro ins_arithpre, ins
|
|
|
+ |.macro ins_arithpre, ins, sseins, ssereg
|
|
|
| ins_ABC
|
|
|
||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
|
|
|
||switch (vk) {
|
|
|
||case 0:
|
|
|
| checknum RB, ->vmeta_arith_vn
|
|
|
+ ||if (sse) {
|
|
|
+ | movsd xmm0, qword [BASE+RB*8]
|
|
|
+ | sseins ssereg, qword [KBASE+RC*8]
|
|
|
+ ||} else {
|
|
|
| fld qword [BASE+RB*8]
|
|
|
| ins qword [KBASE+RC*8]
|
|
|
+ ||}
|
|
|
|| break;
|
|
|
||case 1:
|
|
|
| checknum RB, ->vmeta_arith_nv
|
|
|
+ ||if (sse) {
|
|
|
+ | movsd xmm0, qword [KBASE+RC*8]
|
|
|
+ | sseins ssereg, qword [BASE+RB*8]
|
|
|
+ ||} else {
|
|
|
| fld qword [KBASE+RC*8]
|
|
|
| ins qword [BASE+RB*8]
|
|
|
+ ||}
|
|
|
|| break;
|
|
|
||default:
|
|
|
| checknum RB, ->vmeta_arith_vv
|
|
|
| checknum RC, ->vmeta_arith_vv
|
|
|
+ ||if (sse) {
|
|
|
+ | movsd xmm0, qword [BASE+RB*8]
|
|
|
+ | sseins ssereg, qword [BASE+RC*8]
|
|
|
+ ||} else {
|
|
|
| fld qword [BASE+RB*8]
|
|
|
| ins qword [BASE+RC*8]
|
|
|
+ ||}
|
|
|
|| break;
|
|
|
||}
|
|
|
|.endmacro
|
|
|
|
|
|
|
- |.macro ins_arith, ins
|
|
|
- | ins_arithpre ins
|
|
|
+ |.macro ins_arithpost
|
|
|
+ ||if (sse) {
|
|
|
+ | movsd qword [BASE+RA*8], xmm0
|
|
|
+ ||} else {
|
|
|
| fstp qword [BASE+RA*8]
|
|
|
+ ||}
|
|
|
+ |.endmacro
|
|
|
+ |
|
|
|
+ |.macro ins_arith, ins, sseins
|
|
|
+ | ins_arithpre ins, sseins, xmm0
|
|
|
+ | ins_arithpost
|
|
|
| ins_next
|
|
|
|.endmacro
|
|
|
|
|
|
| // RA = dst, RB = src1 or num const, RC = src2 or num const
|
|
|
case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
|
|
|
- | ins_arith fadd
|
|
|
+ | ins_arith fadd, addsd
|
|
|
break;
|
|
|
case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
|
|
|
- | ins_arith fsub
|
|
|
+ | ins_arith fsub, subsd
|
|
|
break;
|
|
|
case BC_MULVN: case BC_MULNV: case BC_MULVV:
|
|
|
- | ins_arith fmul
|
|
|
+ | ins_arith fmul, mulsd
|
|
|
break;
|
|
|
case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
|
|
|
- | ins_arith fdiv
|
|
|
+ | ins_arith fdiv, divsd
|
|
|
break;
|
|
|
case BC_MODVN:
|
|
|
- | ins_arithpre fld
|
|
|
+ | ins_arithpre fld, movsd, xmm1
|
|
|
|->BC_MODVN_Z:
|
|
|
| call ->vm_mod
|
|
|
- | fstp qword [BASE+RA*8]
|
|
|
+ | ins_arithpost
|
|
|
| ins_next
|
|
|
break;
|
|
|
case BC_MODNV: case BC_MODVV:
|
|
|
- | ins_arithpre fld
|
|
|
+ | ins_arithpre fld, movsd, xmm1
|
|
|
| jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
|
|
|
break;
|
|
|
case BC_POW:
|
|
|
- | ins_arithpre fld
|
|
|
- | call ->vm_pow
|
|
|
- | fstp qword [BASE+RA*8]
|
|
|
+ if (sse) {
|
|
|
+ sse = 0; /* NYI: temporary workaround. */
|
|
|
+ | ins_arithpre fld, movsd, xmm1
|
|
|
+ | call ->vm_pow
|
|
|
+ | ins_arithpost
|
|
|
+ sse = 1;
|
|
|
+ } else {
|
|
|
+ | ins_arithpre fld, movsd, xmm1
|
|
|
+ | call ->vm_pow
|
|
|
+ | ins_arithpost
|
|
|
+ }
|
|
|
| ins_next
|
|
|
break;
|
|
|
|
|
@@ -3945,17 +4144,21 @@ static int build_backend(BuildCtx *ctx)
|
|
|
{
|
|
|
int op;
|
|
|
int cmov = 1;
|
|
|
+ int sse = 0;
|
|
|
#ifdef LUAJIT_CPU_NOCMOV
|
|
|
cmov = 0;
|
|
|
#endif
|
|
|
+#ifdef LUAJIT_CPU_SSE2
|
|
|
+ sse = 1;
|
|
|
+#endif
|
|
|
|
|
|
dasm_growpc(Dst, BC__MAX);
|
|
|
|
|
|
- build_subroutines(ctx, cmov);
|
|
|
+ build_subroutines(ctx, cmov, sse);
|
|
|
|
|
|
|.code_op
|
|
|
for (op = 0; op < BC__MAX; op++)
|
|
|
- build_ins(ctx, (BCOp)op, op, cmov);
|
|
|
+ build_ins(ctx, (BCOp)op, op, cmov, sse);
|
|
|
|
|
|
return BC__MAX;
|
|
|
}
|