
Add SSE variant of pow/powi to interpreter.

Use SSE pow/powi helper functions from compiled code.
Clean up the use of helper functions.
Related cleanups of folding functions in x64 interpreter.
Mike Pall, 15 years ago · commit 690760aa38
4 changed files with 851 additions and 642 deletions:

  src/buildvm_x86.dasc  +300 −118
  src/buildvm_x86.h     +477 −480
  src/lj_asm.c          +72 −39
  src/lj_vm.h           +2 −5

src/buildvm_x86.dasc (+300 −118)

@@ -96,10 +96,6 @@
 |.type TRACE,		Trace
 |.type EXITINFO,	ExitInfo
 |
-|// x86/x64 portability macros
-|.macro push_eax; .if X64; push rax; .else; push eax; .endif; .endmacro
-|.macro pop_eax; .if X64; pop rax; .else; pop eax; .endif; .endmacro
-|
 |// Stack layout while in interpreter. Must match with lj_frame.h.
 |//-----------------------------------------------------------------------
 |.if not X64		// x86 stack layout.
@@ -2072,10 +2068,10 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   |  fpop1
   |  jmp ->fff_resn
   |
-  if (0 && sse) {  // NYI
-    |.ffunc_nnsse math_pow;  call ->vm_pow;  jmp ->fff_resxmm0
+  if (sse) {
+    |.ffunc_nnsse math_pow;	call ->vm_pow;	jmp ->fff_resxmm0
   } else {
-    |.ffunc_nn math_pow;  call ->vm_pow;  jmp ->fff_resn
+    |.ffunc_nn math_pow;	call ->vm_pow;	jmp ->fff_resn
   }
   |
   |.macro math_minmax, name, cmovop, nocmovop, sseop
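
With SSE enabled, math.pow now enters the new ->vm_pow with both arguments in xmm0/xmm1 and returns through ->fff_resxmm0. A minimal C model of the dispatch that entry performs, assuming C99 libm (names are mine, not LuaJIT's):

    #include <math.h>

    /* Hedged model of ->vm_pow_sse: round-trip y through int32 as the
    ** cvtsd2si/cvtsi2sd/ucomisd sequence does; NaN or a non-integral y
    ** takes the FP-exponent path. */
    static double vm_pow_model(double x, double y)
    {
      if (!isnan(y) && fabs(y) <= 2147483647.0 && (double)(int)y == y)
        return pow(x, y);          /* integral y: falls through to powi */
      return exp2(y * log2(x));    /* FP y: fyl2x + the exp2 kernel */
    }
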
@@ -2091,6 +2087,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   |  add RB, 1
   |  jmp <1
   ||} else {
+  |.if not X64
   |.ffunc_n name
   |  mov RB, 2
   |1:
@@ -2101,12 +2098,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   ||if (cmov) {
   |  fucomi st1; cmovop st1; fpop1
   ||} else {
-  |  push_eax
+  |  push eax
   |  fucom st1; fnstsw ax; test ah, 1; nocmovop >2; fxch; 2: ; fpop
-  |  pop_eax
+  |  pop eax
   ||}
   |  add RB, 1
   |  jmp <1
+  |.endif
   ||}
   |.endmacro
   |
@@ -2842,19 +2840,29 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   |->vm_exp:
   |  fldl2e; fmulp st1				// e^x ==> 2^(x*log2(e))
   |->vm_exp2:
-  |  fst dword [esp+4]				// Caveat: overwrites ARG1.
-  |  cmp dword [esp+4], 0x7f800000; je >1	// Special case: e^+Inf = +Inf
-  |  cmp dword [esp+4], 0xff800000; je >2	// Special case: e^-Inf = 0
+  |  .if X64WIN
+  |    .define expscratch, dword [rsp+8]	// Use scratch area.
+  |  .elif X64
+  |    .define expscratch, dword [rsp-8]	// Use red zone.
+  |  .else
+  |    .define expscratch, dword [esp+4]	// Needs 4 byte scratch area.
+  |  .endif
+  |  fst expscratch				// Caveat: overwrites ARG1.
+  |  cmp expscratch, 0x7f800000; je >1		// Special case: e^+Inf = +Inf
+  |  cmp expscratch, 0xff800000; je >2		// Special case: e^-Inf = 0
   |->vm_exp2raw:  // Entry point for vm_pow. Without +-Inf check.
-  |  fdup; frndint; fsub st1, st0; fxch	// Split into frac/int part.
+  |  fdup; frndint; fsub st1, st0; fxch		// Split into frac/int part.
   |  f2xm1; fld1; faddp st1; fscale; fpop1	// ==> (2^frac-1 +1) << int
   |1:
   |  ret
   |2:
   |  fpop; fldz; ret
   |
-  |// Generic power function x^y. Called by BC_POW, math.pow fast function
-  |// and vm_arith. Args/ret on x87 stack (y on top). No int/xmm regs modified.
+  |// Generic power function x^y. Called by BC_POW, math.pow fast function,
+  |// and vm_arith.
+  if (!sse) {
+  |.if not X64
+  |// Args/ret on x87 stack (y on top). RC (eax) modified.
   |// Caveat: needs 3 slots on x87 stack!
   |->vm_pow:
   |  fist dword [esp+4]			// Store/reload int before comparison.
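
The vm_exp2raw kernel referenced above computes 2^x by splitting x into integer and fractional parts (fdup/frndint/fsub), evaluating 2^frac with f2xm1, and scaling by 2^int with fscale. A rough C equivalent, for illustration only:

    #include <math.h>

    /* Sketch of the vm_exp2raw frac/int split (assumes C99 libm). */
    static double exp2raw_model(double x)
    {
      double i = nearbyint(x);   /* frndint: round to nearest integer */
      double f = x - i;          /* fractional part, |f| <= 0.5 */
      /* f2xm1 gives 2^f - 1; fld1/faddp adds the 1, fscale shifts by i. */
      return ldexp(exp2(f), (int)i);
    }
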
@@ -2862,18 +2870,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   ||if (cmov) {
   |  fucomip st1
   ||} else {
-  |  push_eax; fucomp st1; fnstsw ax; sahf; pop_eax
+  |  fucomp st1; fnstsw ax; sahf
   ||}
   |  jnz >8				// Branch for FP exponents.
   |  jp >9				// Branch for NaN exponent.
   |  fpop				// Pop y and fallthrough to vm_powi.
   |
-  |// FP/int power function x^i. Called from JIT code. Arg1/ret on x87 stack.
-  |// Arg2 (int) on C stack. No int/xmm regs modified.
+  |// FP/int power function x^i. Arg1/ret on x87 stack.
+  |// Arg2 (int) on C stack. RC (eax) modified.
   |// Caveat: needs 2 slots on x87 stack!
-  |->vm_powi:
-  |  push_eax
-  |  mov eax, [esp+8]
+  |  mov eax, [esp+4]
   |  cmp eax, 1; jle >6			// i<=1?
   |  // Now 1 < (unsigned)i <= 0x80000000.
   |1:  // Handle leading zeros.
@@ -2893,7 +2899,6 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   |4:
   |  fmulp st1
   |5:
-  |  pop_eax
   |  ret
   |6:
   |  je <5				// x^1 ==> x
@@ -2904,19 +2909,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   |  jmp <1				// x^-i ==> (1/x)^i
   |7:
   |  fpop; fld1				// x^0 ==> 1
-  |  pop_eax
   |  ret
   |
   |8:  // FP/FP power function x^y.
-  |  push_eax
-  |  fst dword [esp+8]
+  |  fst dword [esp+4]
   |  fxch
-  |  fst dword [esp+12]
-  |  mov eax, [esp+8]; shl eax, 1
+  |  fst dword [esp+8]
+  |  mov eax, [esp+4]; shl eax, 1
   |  cmp eax, 0xff000000; je >2			// x^+-Inf?
-  |  mov eax, [esp+12]; shl eax, 1; je >4	// +-0^y?
+  |  mov eax, [esp+8]; shl eax, 1; je >4	// +-0^y?
   |  cmp eax, 0xff000000; je >4			// +-Inf^y?
-  |  pop_eax
   |  fyl2x
   |  jmp ->vm_exp2raw
   |
@@ -2925,7 +2927,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   ||if (cmov) {
   |  fucomip st2
   ||} else {
-  |  push_eax; fucomp st2; fnstsw ax; sahf; pop_eax
+  |  fucomp st2; fnstsw ax; sahf
   ||}
   |  je >1				// 1^NaN ==> 1
   |  fxch				// x^NaN ==> NaN
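
The special-case comments here (and in the SSE version below) agree with C99 pow() semantics; a small self-check program, my own, for illustration:

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
      assert(pow(1.0, NAN) == 1.0);             /* 1^NaN ==> 1 */
      assert(isnan(pow(2.0, NAN)));             /* x^NaN ==> NaN */
      assert(pow(0.5, -INFINITY) == INFINITY);  /* |x|<1: x^-Inf ==> +Inf */
      assert(pow(2.0, -INFINITY) == 0.0);       /* |x|>1: x^-Inf ==> 0 */
      return 0;
    }
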
@@ -2943,41 +2945,205 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   ||}
   |  je >3					// +-1^+-Inf ==> 1
   |  fpop; fabs; fldz; mov eax, 0; setc al
-  |  ror eax, 1; xor eax, [esp+8]; jns >3	// |x|<>1, x^+-Inf ==> +Inf/0
+  |  ror eax, 1; xor eax, [esp+4]; jns >3	// |x|<>1, x^+-Inf ==> +Inf/0
   |  fxch
   |3:
-  |  fpop1; fabs; pop_eax
+  |  fpop1; fabs
   |  ret
   |
   |4:  // Handle +-0^y or +-Inf^y.
-  |  cmp dword [esp+8], 0; jge <3		// y >= 0, x^y ==> |x|
+  |  cmp dword [esp+4], 0; jge <3		// y >= 0, x^y ==> |x|
   |  fpop; fpop
-  |  test eax, eax; pop_eax; jz >5		// y < 0, +-0^y ==> +Inf
+  |  test eax, eax; jz >5			// y < 0, +-0^y ==> +Inf
   |  fldz					// y < 0, +-Inf^y ==> 0
   |  ret
   |5:
-  |  mov dword [esp+8], 0x7f800000		// Return +Inf.
-  |  fld dword [esp+8]
+  |  mov dword [esp+4], 0x7f800000		// Return +Inf.
+  |  fld dword [esp+4]
+  |  ret
+  |.endif
+  } else {
+    |->vm_pow:
+  }
+  |
+  |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
+  |// Needs 16 byte scratch area for x86. Also called from JIT code.
+  |->vm_pow_sse:
+  |  cvtsd2si eax, xmm1
+  |  cvtsi2sd xmm2, eax
+  |  ucomisd xmm1, xmm2
+  |  jnz >8				// Branch for FP exponents.
+  |  jp >9				// Branch for NaN exponent.
+  |  // Fallthrough to vm_powi_sse.
+  |
+  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
+  |->vm_powi_sse:
+  |  cmp eax, 1; jle >6			// i<=1?
+  |  // Now 1 < (unsigned)i <= 0x80000000.
+  |1:  // Handle leading zeros.
+  |  test eax, 1; jnz >2
+  |  mulsd xmm0, xmm0
+  |  shr eax, 1
+  |  jmp <1
+  |2:
+  |  shr eax, 1; jz >5
+  |  movaps xmm1, xmm0
+  |3:  // Handle trailing bits.
+  |  mulsd xmm0, xmm0
+  |  shr eax, 1; jz >4
+  |  jnc <3
+  |  mulsd xmm1, xmm0
+  |  jmp <3
+  |4:
+  |  mulsd xmm0, xmm1
+  |5:
+  |  ret
+  |6:
+  |  je <5				// x^1 ==> x
+  |  jb >7
+  |  push RDa
+  |  sseconst_1 xmm1, RDa
+  |  divsd xmm1, xmm0
+  |  pop RDa
+  |  movaps xmm0, xmm1
+  |  neg eax
+  |  cmp eax, 1; je <5			// x^-1 ==> 1/x
+  |  jmp <1				// x^-i ==> (1/x)^i
+  |7:
+  |  sseconst_1 xmm0, RDa
+  |  ret
+  |
+  |8:  // FP/FP power function x^y.
+  |.if X64
+  |  movd rax, xmm1; shl rax, 1
+  |  ror rax, 32; cmp rax, 0xffe00000; je >2	// x^+-Inf?
+  |  movd rax, xmm0; shl rax, 1; je >4		// +-0^y?
+  |  ror rax, 32; cmp rax, 0xffe00000; je >5	// +-Inf^y?
+  |  .if X64WIN
+  |    movsd qword [rsp+16], xmm1		// Use scratch area.
+  |    movsd qword [rsp+8], xmm0
+  |    fld qword [rsp+16]
+  |    fld qword [rsp+8]
+  |  .else
+  |    movsd qword [rsp-16], xmm1		// Use red zone.
+  |    movsd qword [rsp-8], xmm0
+  |    fld qword [rsp-16]
+  |    fld qword [rsp-8]
+  |  .endif
+  |.else
+  |  movsd qword [esp+12], xmm1			// Needs 16 byte scratch area.
+  |  movsd qword [esp+4], xmm0
+  |  cmp dword [esp+12], 0; jne >1
+  |  mov eax, [esp+16]; shl eax, 1
+  |  cmp eax, 0xffe00000; je >2			// x^+-Inf?
+  |1:
+  |  cmp dword [esp+4], 0; jne >1
+  |  mov eax, [esp+8]; shl eax, 1; je >4	// +-0^y?
+  |  cmp eax, 0xffe00000; je >5			// +-Inf^y?
+  |1:
+  |  fld qword [esp+12]
+  |  fld qword [esp+4]
+  |.endif
+  |  fyl2x					// y*log2(x)
+  |  fdup; frndint; fsub st1, st0; fxch		// Split into frac/int part.
+  |  f2xm1; fld1; faddp st1; fscale; fpop1	// ==> (2^frac-1 +1) << int
+  |.if X64WIN
+  |  fstp qword [rsp+8]				// Use scratch area.
+  |  movsd xmm0, qword [rsp+8]
+  |.elif X64
+  |  fstp qword [rsp-8]				// Use red zone.
+  |  movsd xmm0, qword [rsp-8]
+  |.else
+  |  fstp qword [esp+4]				// Needs 8 byte scratch area.
+  |  movsd xmm0, qword [esp+4]
+  |.endif
+  |  ret
+  |
+  |9:  // Handle x^NaN.
+  |  sseconst_1 xmm2, RDa
+  |  ucomisd xmm0, xmm2; je >1			// 1^NaN ==> 1
+  |  movaps xmm0, xmm1				// x^NaN ==> NaN
+  |1:
+  |  ret
+  |
+  |2:  // Handle x^+-Inf.
+  |  sseconst_abs xmm2, RDa
+  |  andpd xmm0, xmm2				// |x|
+  |  sseconst_1 xmm2, RDa
+  |  ucomisd xmm0, xmm2; je <1			// +-1^+-Inf ==> 1
+  |  movmskpd eax, xmm1
+  |  xorps xmm0, xmm0
+  |  mov ah, al; setc al; xor al, ah; jne <1	// |x|<>1, x^+-Inf ==> +Inf/0
+  |3:
+  |  sseconst_hi xmm0, RDa, 7ff00000  // +Inf
+  |  ret
+  |
+  |4:  // Handle +-0^y.
+  |  movmskpd eax, xmm1; test eax, eax; jnz <3	// y < 0, +-0^y ==> +Inf
+  |  xorps xmm0, xmm0				// y >= 0, +-0^y ==> 0
+  |  ret
+  |
+  |5:  // Handle +-Inf^y.
+  |  movmskpd eax, xmm1; test eax, eax; jz <3	// y >= 0, +-Inf^y ==> +Inf
+  |  xorps xmm0, xmm0				// y < 0, +-Inf^y ==> 0
   |  ret
   |
   |// Callable from C: double lj_vm_foldfpm(double x, int fpm)
   |// Computes fpm(x) for extended math functions. ORDER FPM.
   |->vm_foldfpm:
   if (sse) {
-    |.if X64WIN
-    |  .define fpmop, CARG2d
-    |.elif X64
-    |  .define fpmop, CARG1d
-    |.else
-    |  .define fpmop, eax
-    |  mov fpmop, [esp+12]
-    |  movsd xmm0, qword [esp+4]
-    |.endif
     |.if X64
+    |
+    |  .if X64WIN
+    |    .define fpmop, CARG2d
+    |  .else
+    |    .define fpmop, CARG1d
+    |  .endif
     |  cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
     |  cmp fpmop, 3; jb ->vm_trunc; ja >2
     |  sqrtsd xmm0, xmm0; ret
-    |.else
+    |2:
+    |  .if X64WIN
+    |    movsd qword [rsp+8], xmm0	// Use scratch area.
+    |    fld qword [rsp+8]
+    |  .else
+    |    movsd qword [rsp-8], xmm0	// Use red zone.
+    |    fld qword [rsp-8]
+    |  .endif
+    |  cmp fpmop, 5; ja >2
+    |  .if X64WIN; pop rax; .endif
+    |  je >1
+    |  call ->vm_exp
+    |  .if X64WIN; push rax; .endif
+    |  jmp >7
+    |1:
+    |  call ->vm_exp2
+    |  .if X64WIN; push rax; .endif
+    |  jmp >7
+    |2: ; cmp fpmop, 7; je >1; ja >2
+    |  fldln2; fxch; fyl2x; jmp >7
+    |1: ; fld1; fxch; fyl2x; jmp >7
+    |2: ; cmp fpmop, 9; je >1; ja >2
+    |  fldlg2; fxch; fyl2x; jmp >7
+    |1: ; fsin; jmp >7
+    |2: ; cmp fpmop, 11; je >1; ja >9
+    |   fcos; jmp >7
+    |1: ; fptan; fpop
+    |7:
+    |  .if X64WIN
+    |    fstp qword [rsp+8]		// Use scratch area.
+    |    movsd xmm0, qword [rsp+8]
+    |  .else
+    |    fstp qword [rsp-8]		// Use red zone.
+    |    movsd xmm0, qword [rsp-8]
+    |  .endif
+    |  ret
+    |
+    |.else  // x86 calling convention.
+    |
+    |  .define fpmop, eax
+    |  mov fpmop, [esp+12]
+    |  movsd xmm0, qword [esp+4]
     |  cmp fpmop, 1; je >1; ja >2
     |  call ->vm_floor; jmp >7
     |1: ; call ->vm_ceil; jmp >7
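
The vm_powi_sse loop added in this hunk is classic square-and-multiply, with a negative exponent folded into a reciprocal up front. A hedged C transcription (my names, not a LuaJIT API):

    #include <stdint.h>

    static double powi_model(double x, int32_t i)
    {
      uint32_t k;
      if (i <= 1) {
        if (i == 1) return x;                /* x^1 ==> x */
        if (i == 0) return 1.0;              /* x^0 ==> 1 */
        x = 1.0 / x;                         /* x^-i ==> (1/x)^i */
        k = (uint32_t)-(int64_t)i;           /* now 1 < k <= 0x80000000 */
        if (k == 1) return x;                /* x^-1 ==> 1/x */
      } else {
        k = (uint32_t)i;
      }
      while (!(k & 1)) { x *= x; k >>= 1; }  /* squares for low zero bits */
      if (k == 1) return x;
      {
        double y = x;                        /* accumulator (xmm1) */
        k >>= 1;
        for (;;) {                           /* remaining bits, LSB first */
          x *= x;
          if (k == 1) return x * y;          /* top set bit reached */
          if (k & 1) y *= x;
          k >>= 1;
        }
      }
    }
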
@@ -2989,27 +3155,36 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
     |  movsd qword [esp+4], xmm0	// Overwrite callee-owned args.
     |  fld qword [esp+4]
     |  ret
+    |2: ; fld qword [esp+4]
+    |  cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
+    |2: ; cmp fpmop, 7; je >1; ja >2
+    |  fldln2; fxch; fyl2x; ret
+    |1: ; fld1; fxch; fyl2x; ret
+    |2: ; cmp fpmop, 9; je >1; ja >2
+    |  fldlg2; fxch; fyl2x; ret
+    |1: ; fsin; ret
+    |2: ; cmp fpmop, 11; je >1; ja >9
+    |   fcos; ret
+    |1: ; fptan; fpop; ret
+    |
     |.endif
-    |2:
-    |  fld qword [esp+4]
   } else {
     |  mov fpmop, [esp+12]
     |  fld qword [esp+4]
     |  cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
     |  cmp fpmop, 3; jb ->vm_trunc; ja >2
     |  fsqrt; ret
-    |2:
+    |2: ; cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
+    |  cmp fpmop, 7; je >1; ja >2
+    |  fldln2; fxch; fyl2x; ret
+    |1: ; fld1; fxch; fyl2x; ret
+    |2: ; cmp fpmop, 9; je >1; ja >2
+    |  fldlg2; fxch; fyl2x; ret
+    |1: ; fsin; ret
+    |2: ; cmp fpmop, 11; je >1; ja >9
+    |   fcos; ret
+    |1: ; fptan; fpop; ret
   }
-  |  cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
-  |  cmp fpmop, 7; je >1; ja >2
-  |  fldln2; fxch; fyl2x; ret
-  |1: ; fld1; fxch; fyl2x; ret
-  |2: ; cmp fpmop, 9; je >1; ja >2
-  |  fldlg2; fxch; fyl2x; ret
-  |1: ; fsin; ret
-  |2: ; cmp fpmop, 11; je >1; ja >9
-  |   fcos; ret
-  |1: ; fptan; fpop; ret
   |9: ; int3					// Bad fpm.
   |
   |// Callable from C: double lj_vm_foldarith(double x, double y, int op)
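
The fpm compare chain above implements the ORDER FPM dispatch; decoding it gives the following ordinal layout (a hedged reconstruction; the authoritative list is the IRFPM_* enum in lj_ir.h):

    /* ORDER FPM as dispatched by ->vm_foldfpm. */
    enum {
      FPM_FLOOR, FPM_CEIL, FPM_TRUNC, FPM_SQRT,  /* 0-3 */
      FPM_EXP, FPM_EXP2,                         /* 4-5 */
      FPM_LOG, FPM_LOG2, FPM_LOG10,              /* 6-8 */
      FPM_SIN, FPM_COS, FPM_TAN                  /* 9-11 */
    };
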
@@ -3017,72 +3192,87 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
   |// and basic math functions. ORDER ARITH
   |->vm_foldarith:
   if (sse) {
-    |.macro retxmm0; .if X64; ret; .else; jmp >7; .endif; .endmacro
-    |.macro retst0; .if X64; jmp >7; .else; ret; .endif; .endmacro
+    |.if X64
     |
+    |  .if X64WIN
+    |    .define foldop, CARG3d
+    |  .else
+    |    .define foldop, CARG1d
+    |  .endif
+    |  cmp foldop, 1; je >1; ja >2
+    |  addsd xmm0, xmm1; ret
+    |1: ; subsd xmm0, xmm1; ret
+    |2: ; cmp foldop, 3; je >1; ja >2
+    |  mulsd xmm0, xmm1; ret
+    |1: ; divsd xmm0, xmm1; ret
+    |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow
+    |  cmp foldop, 7; je >1; ja >2
+    |  sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret
+    |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret
+    |2: ; cmp foldop, 9; ja >2
     |.if X64WIN
-    |  .define foldop, CARG3d
-    |.elif X64
-    |  .define foldop, CARG1d
+    |  movsd qword [rsp+8], xmm0	// Use scratch area.
+    |  movsd qword [rsp+16], xmm1
+    |  fld qword [rsp+8]
+    |  fld qword [rsp+16]
     |.else
+    |  movsd qword [rsp-8], xmm0	// Use red zone.
+    |  movsd qword [rsp-16], xmm1
+    |  fld qword [rsp-8]
+    |  fld qword [rsp-16]
+    |.endif
+    |  je >1
+    |  fpatan
+    |7:
+    |.if X64WIN
+    |  fstp qword [rsp+8]		// Use scratch area.
+    |  movsd xmm0, qword [rsp+8]
+    |.else
+    |  fstp qword [rsp-8]		// Use red zone.
+    |  movsd xmm0, qword [rsp-8]
+    |.endif
+    |  ret
+    |1: ; fxch; fscale; fpop1; jmp <7
+    |2: ; cmp foldop, 11; je >1; ja >9
+    |  minsd xmm0, xmm1; ret
+    |1: ; maxsd xmm0, xmm1; ret
+    |9: ; int3				// Bad op.
+    |
+    |.else  // x86 calling convention.
+    |
     |  .define foldop, eax
     |  mov foldop, [esp+20]
     |  movsd xmm0, qword [esp+4]
     |  movsd xmm1, qword [esp+12]
-    |.endif
     |  cmp foldop, 1; je >1; ja >2
-    |  addsd xmm0, xmm1; retxmm0
-    |1: ; subsd xmm0, xmm1; retxmm0
+    |  addsd xmm0, xmm1
+    |7:
+    |  movsd qword [esp+4], xmm0	// Overwrite callee-owned args.
+    |  fld qword [esp+4]
+    |  ret
+    |1: ; subsd xmm0, xmm1; jmp <7
     |2: ; cmp foldop, 3; je >1; ja >2
-    |  mulsd xmm0, xmm1; retxmm0
-    |1: ; divsd xmm0, xmm1; retxmm0
+    |  mulsd xmm0, xmm1; jmp <7
+    |1: ; divsd xmm0, xmm1; jmp <7
     |2: ; cmp foldop, 5
-    |.if X64
-    |  jb ->vm_mod; je ->vm_pow		// NYI: broken without SSE vm_pow.
-    |.else
     |  je >1; ja >2
-    |  call ->vm_mod; retxmm0
-    |1: ; fld qword [esp+4]; fld qword [esp+12]; jmp ->vm_pow  // NYI
-    |2:
-    |.endif
-    |  cmp foldop, 7; je >1; ja >2
-    |  sseconst_sign xmm1, RDa; xorps xmm0, xmm1; retxmm0
-    |1:
-    |  sseconst_abs xmm1, RDa; andps xmm0, xmm1; retxmm0
+    |  call ->vm_mod; jmp <7
+    |1: ; pop edx; call ->vm_pow; push edx; jmp <7  // Writes to scratch area.
+    |2: ; cmp foldop, 7; je >1; ja >2
+    |  sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7
+    |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7
     |2: ; cmp foldop, 9; ja >2
-    |.if X64WIN
-    |  movsd qword [esp+8], xmm0	// Use scratch area.
-    |  movsd qword [esp+16], xmm1
-    |  fld qword [esp+8]
-    |  fld qword [esp+16]
-    |.elif X64
-    |  movsd qword [esp-8], xmm0	// Use red zone.
-    |  movsd qword [esp-16], xmm1
-    |  fld qword [esp-8]
-    |  fld qword [esp-16]
-    |.else
     |  fld qword [esp+4]		// Reload from stack
     |  fld qword [esp+12]
-    |.endif
     |  je >1
-    |  fpatan; retst0
-    |1: ; fxch; fscale; fpop1; retst0
+    |  fpatan; ret
+    |1: ; fxch; fscale; fpop1; ret
     |2: ; cmp foldop, 11; je >1; ja >9
-    |  minsd xmm0, xmm1; retxmm0
-    |1: ; maxsd xmm0, xmm1; retxmm0
+    |  minsd xmm0, xmm1; jmp <7
+    |1: ; maxsd xmm0, xmm1; jmp <7
     |9: ; int3				// Bad op.
-    |7:  // Move return value depending on calling convention.
-    |.if X64WIN
-    |  fstp qword [esp+8]		// Use scratch area.
-    |  movsd xmm0, qword [esp+8]
-    |.elif X64
-    |  fstp qword [esp-8]		// Use red zone.
-    |  movsd xmm0, qword [esp-8]
-    |.else
-    |  movsd qword [esp+4], xmm0	// Overwrite callee-owned args.
-    |  fld qword [esp+4]
+    |
     |.endif
-    |  ret
   } else {
     |  mov eax, [esp+20]
     |  fld qword [esp+4]
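
Likewise, the foldop chain in ->vm_foldarith encodes ORDER ARITH (again a hedged reconstruction from the compares):

    /* ORDER ARITH as dispatched by ->vm_foldarith. */
    enum {
      FOLD_ADD, FOLD_SUB, FOLD_MUL, FOLD_DIV,  /* 0-3 */
      FOLD_MOD, FOLD_POW,                      /* 4-5: tail-called helpers */
      FOLD_NEG, FOLD_ABS,                      /* 6-7: sign-mask ops */
      FOLD_ATAN2, FOLD_LDEXP,                  /* 8-9: x87 fpatan/fscale */
      FOLD_MIN, FOLD_MAX                       /* 10-11 */
    };
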
@@ -3483,17 +3673,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
     |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
     break;
   case BC_POW:
-    if (sse) {
-      sse = 0;  /* NYI: temporary workaround. */
-      |  ins_arithpre fld, movsd, xmm1
-      |  call ->vm_pow
-      |  ins_arithpost
-      sse = 1;
-    } else {
-      |  ins_arithpre fld, movsd, xmm1
-      |  call ->vm_pow
-      |  ins_arithpost
-    }
+    |  ins_arithpre fld, movsd, xmm1
+    |  call ->vm_pow
+    |  ins_arithpost
     |  ins_next
     break;
 

src/buildvm_x86.h (+477 −480)
File diff suppressed because it is too large.


src/lj_asm.c (+72 −39)

@@ -1991,9 +1991,19 @@ static int fpmjoin_pow(ASMState *as, IRIns *ir)
     IRIns *irpp = IR(irp->op1);
     if (irpp == ir-2 && irpp->o == IR_FPMATH &&
 	irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
-      emit_call(as, lj_vm_pow);  /* st0 = lj_vm_pow(st1, st0) */
-      asm_x87load(as, irp->op2);
-      asm_x87load(as, irpp->op1);
+      /* The modified regs must match with the *.dasc implementation. */
+      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
+      IRIns *irx;
+      if (ra_hasreg(ir->r))
+	rset_clear(drop, ir->r);  /* Dest reg handled below. */
+      ra_evictset(as, drop);
+      ra_destreg(as, ir, RID_XMM0);
+      emit_call(as, lj_vm_pow_sse);
+      irx = IR(irpp->op1);
+      if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
+	irx->r = RID_INIT;  /* Avoid allocating xmm1 for x. */
+      ra_left(as, RID_XMM0, irpp->op1);
+      ra_left(as, RID_XMM1, irp->op2);
       return 1;
     }
   }
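
For reference, the IR shape this join recognizes is exp2(log2(x)*y), the narrowed form of x^y. A sketch of the match (the MUL in the middle is inferred from the register assignments; hedged):

    /* Pattern matched by fpmjoin_pow:
    **   irpp = ir-2:  FPMATH IRFPM_LOG2  x     (not used elsewhere)
    **   irp  = ir-1:  MUL    irpp        y     (not used elsewhere)
    **   ir:           FPMATH IRFPM_EXP2  irp
    ** Rejoined emit: xmm0 = lj_vm_pow_sse(x in xmm0, y in xmm1),
    ** clobbering xmm0-xmm2 and eax, matching the drop set above.
    */
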
@@ -2007,30 +2017,35 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
     Reg dest = ra_dest(as, ir, RSET_FPR);
     Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
     emit_mrm(as, XO_SQRTSD, dest, left);
-  } else if ((as->flags & JIT_F_SSE4_1) && fpm <= IRFPM_TRUNC) {
-    Reg dest = ra_dest(as, ir, RSET_FPR);
-    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
-    /* Round down/up/trunc == 1001/1010/1011. */
-    emit_i8(as, 0x09 + fpm);
-    /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. */
-    emit_mrm(as, XO_ROUNDSD, dest, left);
-    /* Let's pretend it's a 3-byte opcode, and compensate afterwards. */
-    /* This is atrocious, but the alternatives are much worse. */
-    if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
-      as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
-    }
-    *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
   } else if (fpm <= IRFPM_TRUNC) {
-    /* The modified regs must match with the *.dasc implementation. */
-    RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
-    if (ra_hasreg(ir->r))
-      rset_clear(drop, ir->r);  /* Dest reg handled below. */
-    ra_evictset(as, drop);
-    ra_destreg(as, ir, RID_XMM0);
-    emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
-		  fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
-    ra_left(as, RID_XMM0, ir->op1);
-  } else {
+    if (as->flags & JIT_F_SSE4_1) {  /* SSE4.1 has a rounding instruction. */
+      Reg dest = ra_dest(as, ir, RSET_FPR);
+      Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
+      /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
+      ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
+      ** This is atrocious, but the alternatives are much worse.
+      */
+      /* Round down/up/trunc == 1001/1010/1011. */
+      emit_i8(as, 0x09 + fpm);
+      emit_mrm(as, XO_ROUNDSD, dest, left);
+      if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
+	as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
+      }
+      *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
+    } else {  /* Call helper functions for SSE2 variant. */
+      /* The modified regs must match with the *.dasc implementation. */
+      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
+      if (ra_hasreg(ir->r))
+	rset_clear(drop, ir->r);  /* Dest reg handled below. */
+      ra_evictset(as, drop);
+      ra_destreg(as, ir, RID_XMM0);
+      emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
+		    fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
+      ra_left(as, RID_XMM0, ir->op1);
+    }
+  } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
+    /* Rejoined to pow(). */
+  } else {  /* Handle x87 ops. */
     int32_t ofs = sps_scale(ir->s);  /* Use spill slot or slots SPS_TEMP1/2. */
     Reg dest = ir->r;
     if (ra_hasreg(dest)) {
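
Earlier in this hunk, the SSE4.1 branch builds the ROUNDSD imm8 as 0x09 + fpm: bits 1:0 select the rounding mode (01 down, 10 up, 11 toward zero) and bit 3 suppresses the inexact exception, so floor/ceil/trunc map to 0x09/0x0A/0x0B. The same encoding via intrinsics, as a minimal illustration (my helper, not from this tree):

    #include <smmintrin.h>  /* SSE4.1 */

    /* 0x09 == _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC, i.e. floor. */
    static double floor_sse41(double x)
    {
      __m128d v = _mm_set_sd(x);
      return _mm_cvtsd_f64(
          _mm_round_sd(v, v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
    }
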
@@ -2040,14 +2055,8 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
     }
     emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
     switch (fpm) {  /* st0 = lj_vm_*(st0) */
-    case IRFPM_FLOOR: emit_call(as, lj_vm_floor); break;
-    case IRFPM_CEIL: emit_call(as, lj_vm_ceil); break;
-    case IRFPM_TRUNC: emit_call(as, lj_vm_trunc); break;
     case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
-    case IRFPM_EXP2:
-      if (fpmjoin_pow(as, ir)) return;
-      emit_call(as, lj_vm_exp2);  /* st0 = lj_vm_exp2(st0) */
-      break;
+    case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break;
     case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
     case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
     case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
@@ -2063,10 +2072,6 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
 	emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
       case IR_LDEXP:
 	emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
-      case IR_POWI:
-	emit_call(as, lj_vm_powi);  /* st0 = lj_vm_powi(st0, [esp]) */
-	emit_rmro(as, XO_MOVto, ra_alloc1(as, ir->op2, RSET_GPR), RID_ESP, 0);
-	break;
       default: lua_assert(0); break;
       }
       break;
@@ -2085,6 +2090,19 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
   }
 }
 
+static void asm_powi(ASMState *as, IRIns *ir)
+{
+  /* The modified regs must match with the *.dasc implementation. */
+  RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
+  if (ra_hasreg(ir->r))
+    rset_clear(drop, ir->r);  /* Dest reg handled below. */
+  ra_evictset(as, drop);
+  ra_destreg(as, ir, RID_XMM0);
+  emit_call(as, lj_vm_powi_sse);
+  ra_left(as, RID_XMM0, ir->op1);
+  ra_left(as, RID_EAX, ir->op2);
+}
+
 /* Find out whether swapping operands might be beneficial. */
 static int swapops(ASMState *as, IRIns *ir)
 {
@@ -3132,9 +3150,10 @@ static void asm_ir(ASMState *as, IRIns *ir)
   case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
   case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;
 
-  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: case IR_POWI:
+  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
     asm_fpmath(as, ir);
     break;
+  case IR_POWI: asm_powi(as, ir); break;
 
   /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
   case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
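
IR_POWI now dispatches to the new asm_powi. Note that the backend emits machine code in reverse, so the emit_call in asm_powi appears textually before the ra_left argument setup even though the call executes after it. The helper contract asm_powi assumes, inferred from the drop set and the .dasc comments (hedged):

    /* lj_vm_powi_sse as used by asm_powi (register contract, not a C ABI):
    **   in:   x in xmm0, exponent i in eax
    **   out:  x^i in xmm0
    **   mods: xmm0-xmm1 and eax (hence the evict set in asm_powi)
    */
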
@@ -3285,8 +3304,22 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
       if (inloop)
 	as->modset = RSET_SCRATCH;
       break;
+    case IR_POWI:
+      ir->prev = REGSP_HINT(RID_XMM0);
+      if (inloop)
+	as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
+      continue;
     case IR_FPMATH:
-      if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
+      if (ir->op2 == IRFPM_EXP2) {  /* May be joined to lj_vm_pow_sse. */
+	ir->prev = REGSP_HINT(RID_XMM0);
+#if !LJ_64
+	if (as->evenspill < 4)  /* Leave room for 16 byte scratch area. */
+	  as->evenspill = 4;
+#endif
+	if (inloop)
+	  as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
+	continue;
+      } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
 	ir->prev = REGSP_HINT(RID_XMM0);
 	if (inloop)
 	  as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);

src/lj_vm.h (+2 −5)

@@ -34,16 +34,13 @@ LJ_ASMF void lj_vm_exit_handler(void);
 LJ_ASMF void lj_vm_exit_interp(void);
 
 /* Handlers callable from compiled code. */
-LJ_ASMF void lj_vm_floor(void);
-LJ_ASMF void lj_vm_ceil(void);
-LJ_ASMF void lj_vm_trunc(void);
 LJ_ASMF void lj_vm_floor_sse(void);
 LJ_ASMF void lj_vm_ceil_sse(void);
 LJ_ASMF void lj_vm_trunc_sse(void);
 LJ_ASMF void lj_vm_exp(void);
 LJ_ASMF void lj_vm_exp2(void);
-LJ_ASMF void lj_vm_pow(void);
-LJ_ASMF void lj_vm_powi(void);
+LJ_ASMF void lj_vm_pow_sse(void);
+LJ_ASMF void lj_vm_powi_sse(void);
 
 /* Call gates for functions. */
 LJ_ASMF void lj_gate_lf(void);
