
Add x64 call argument setup. More 32/64 bit cleanups in assembler.

Mike Pall, 15 years ago
commit 3c6cec0846

2 changed files with 79 additions and 47 deletions:
  1. src/lj_asm.c         +58 -42
  2. src/lj_target_x86.h  +21 -5
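
A note on the mechanism before the diff: asm_gencall now takes its integer argument registers from REGARG_GPRS, a packed list of 5-bit register IDs (the low bits hold the next register, & 31 reads it, >> 5 drops it), and walks the FPR argument registers with a plain counter. On Windows/x64 the registers are strictly positional, on POSIX/x64 they are handed out in order of appearance; a value of 0 means "no register left, pass on the stack". Below is a minimal standalone sketch of the GPR decode, using made-up register numbers rather than the real RID_* enum values from lj_target_x86.h:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical register IDs -- stand-ins for the real RID_* enum values. */
enum { R_EDI = 1, R_ESI = 2, R_EDX = 3, R_ECX = 4, R_R8D = 5, R_R9D = 6 };

/* POSIX/x64 order, packed like REGARG_GPRS: one 5-bit field per register,
** first argument register in the lowest bits. */
#define ARG_GPRS \
  (R_EDI|((R_ESI|((R_EDX|((R_ECX|((R_R8D|(R_R9D<<5))<<5))<<5))<<5))<<5))

int main(void)
{
  uint32_t gprs = ARG_GPRS;
  int n;
  for (n = 0; n < 8; n++) {      /* Two more args than there are registers. */
    int r = (int)(gprs & 31);    /* Next GPR argument register; 0 once exhausted. */
    gprs >>= 5;
    if (r)
      printf("int arg %d -> register %d\n", n, r);
    else
      printf("int arg %d -> stack\n", n);
  }
  return 0;
}

The real loop in asm_gencall below does the same decode, but additionally handles FP arguments, constants (emit_loadi) and IR references that may already live in a register.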

src/lj_asm.c  (+58 -42)

@@ -1292,21 +1292,52 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 {
   RegSet allow = RSET_ALL;
   uint32_t n, nargs = CCI_NARGS(ci);
-  int32_t ofs = 0;
+  int32_t ofs = STACKARG_OFS;
+  uint32_t gprs = REGARG_GPRS;
+#if LJ_64
+  Reg fpr = REGARG_FIRSTFPR;
+#endif
   lua_assert(!(nargs > 2 && (ci->flags&CCI_FASTCALL)));  /* Avoid stack adj. */
   emit_call(as, ci->func);
   for (n = 0; n < nargs; n++) {  /* Setup args. */
-#if LJ_64
-#error "NYI: 64 bit mode call argument setup"
-#endif
     IRIns *ir = IR(args[n]);
+    Reg r;
+#if LJ_64 && defined(_WIN64)
+    /* Windows/x64 argument registers are strictly positional. */
+    r = irt_isnum(ir->t) ? (fpr <= REGARG_LASTFPR ? fpr : 0) : (gprs & 31);
+    fpr++; gprs >>= 5;
+#elif LJ_64
+    /* POSIX/x64 argument registers are used in order of appearance. */
     if (irt_isnum(ir->t)) {
-      if ((ofs & 4) && irref_isk(args[n])) {
+      r = fpr <= REGARG_LASTFPR ? fpr : 0; fpr++;
+    } else {
+      r = gprs & 31; gprs >>= 5;
+    }
+#else
+    if (irt_isnum(ir->t) || !(ci->flags & CCI_FASTCALL)) {
+      r = 0;
+    } else {
+      r = gprs & 31; gprs >>= 5;
+    }
+#endif
+    if (r) {  /* Argument is in a register. */
+      if (args[n] < ASMREF_TMP1) {
+	emit_loadi(as, r, ir->i);
+      } else {
+	lua_assert(rset_test(as->freeset, r));  /* Must have been evicted. */
+	if (ra_hasreg(ir->r)) {
+	  ra_noweak(as, ir->r);
+	  ra_movrr(as, ir, r, ir->r);
+	} else {
+	  ra_allocref(as, args[n], RID2RSET(r));
+	}
+      }
+    } else if (irt_isnum(ir->t)) {  /* FP argument is on stack. */
+      if (!LJ_64 && (ofs & 4) && irref_isk(args[n])) {
 	/* Split stores for unaligned FP consts. */
 	emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
 	emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
       } else {
-	Reg r;
 	if ((allow & RSET_FPR) == RSET_EMPTY)
 	  lj_trace_err(as->J, LJ_TRERR_NYICOAL);
 	r = ra_alloc1(as, args[n], allow & RSET_FPR);
@@ -1314,34 +1345,18 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 	emit_rmro(as, XO_MOVSDto, r, RID_ESP, ofs);
       }
       ofs += 8;
-    } else {
-      if ((ci->flags & CCI_FASTCALL) && n < 2) {
-	Reg r = n == 0 ? RID_ECX : RID_EDX;
-	if (args[n] < ASMREF_TMP1) {
-	  emit_loadi(as, r, ir->i);
-	} else {
-	  lua_assert(rset_test(as->freeset, r));  /* Must have been evicted. */
-	  allow &= ~RID2RSET(r);
-	  if (ra_hasreg(ir->r)) {
-	    ra_noweak(as, ir->r);
-	    ra_movrr(as, ir, r, ir->r);
-	  } else {
-	    ra_allocref(as, args[n], RID2RSET(r));
-	  }
-	}
+    } else {  /* Non-FP argument is on stack. */
+      /* NYI: no widening for 64 bit parameters on x64. */
+      if (args[n] < ASMREF_TMP1) {
+	emit_movmroi(as, RID_ESP, ofs, ir->i);
       } else {
-	if (args[n] < ASMREF_TMP1) {
-	  emit_movmroi(as, RID_ESP, ofs, ir->i);
-	} else {
-	  Reg r;
-	  if ((allow & RSET_GPR) == RSET_EMPTY)
-	    lj_trace_err(as->J, LJ_TRERR_NYICOAL);
-	  r = ra_alloc1(as, args[n], allow & RSET_GPR);
-	  allow &= ~RID2RSET(r);
-	  emit_movtomro(as, r, RID_ESP, ofs);
-	}
-	ofs += 4;
+	if ((allow & RSET_GPR) == RSET_EMPTY)
+	  lj_trace_err(as->J, LJ_TRERR_NYICOAL);
+	r = ra_alloc1(as, args[n], allow & RSET_GPR);
+	allow &= ~RID2RSET(r);
+	emit_movtomro(as, REX_64LU(ir, r), RID_ESP, ofs);
       }
+      ofs += sizeof(intptr_t);
     }
   }
 }
@@ -2561,7 +2576,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
 	asm_guardcc(as, cc);
 	if (usetest && left != RID_MRM) {
 	  /* Use test r,r instead of cmp r,0. */
-	  emit_rr(as, XO_TEST, left, left);
+	  emit_rr(as, XO_TEST, REX_64LU(ir, left), left);
 	  if (irl+1 == ir)  /* Referencing previous ins? */
 	    as->testmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
 	} else {
@@ -2580,11 +2595,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
       Reg left = ra_alloc1(as, lref, RSET_GPR);
       Reg right = asm_fuseload(as, rref, rset_exclude(RSET_GPR, left));
       asm_guardcc(as, cc);
-#if LJ_64
-      if (irt_islightud(ir->t))
-	left |= REX_64;
-#endif
-      emit_mrm(as, XO_CMP, left, right);
+      emit_mrm(as, XO_CMP, REX_64LU(ir, left), right);
     }
   }
 }
@@ -2732,14 +2743,14 @@ static void asm_gc_check(ASMState *as, SnapShot *snap)
   /* We don't know spadj yet, so get the C frame from L->cframe. */
   emit_movmroi(as, tmp, CFRAME_OFS_PC,
 	       (int32_t)as->T->snapmap[snap->mapofs+snap->nent]);
-  emit_gri(as, XG_ARITHi(XOg_AND), tmp, CFRAME_RAWMASK);
+  emit_gri(as, XG_ARITHi(XOg_AND), tmp|REX_64, CFRAME_RAWMASK);
   lstate = IR(ASMREF_L)->r;
-  emit_rmro(as, XO_MOV, tmp, lstate, offsetof(lua_State, cframe));
+  emit_rmro(as, XO_MOV, tmp|REX_64, lstate, offsetof(lua_State, cframe));
   /* It's ok if lstate is already in a non-scratch reg. But all allocations
   ** in the non-fast path must use a scratch reg. See comment above.
   */
   base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_SCRATCH & RSET_GPR, lstate));
-  emit_movtomro(as, base, lstate, offsetof(lua_State, base));
+  emit_movtomro(as, base|REX_64, lstate, offsetof(lua_State, base));
   asm_gc_sync(as, snap, base);
   /* BASE/L get restored anyway, better do it inside the slow path. */
   if (as->parent || as->curins == as->loopref) ra_restore(as, REF_BASE);
@@ -3447,7 +3458,12 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
     case IR_CALLN: case IR_CALLL: case IR_CALLS: {
       const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
 #if LJ_64
-      /* NYI: add stack slots for calls with more than 4/6 args. */
+      /* NYI: add stack slots for x64 calls with many args. */
+#ifdef _WIN64
+      lua_assert(CCI_NARGS(ci) <= 4);
+#else
+      lua_assert(CCI_NARGS(ci) <= 6);  /* Safe lower bound. */
+#endif
       ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
 #else
       /* NYI: not fastcall-aware, but doesn't matter (yet). */

src/lj_target_x86.h  (+21 -5)

@@ -78,14 +78,27 @@ enum {
 /* Windows x64 ABI. */
 #define RSET_SCRATCH \
   (RSET_ACD|RSET_RANGE(RID_R8D, RID_R11D+1)|RSET_RANGE(RID_XMM0, RID_XMM5+1))
+#define REGARG_GPRS \
+  (RID_ECX|((RID_EDX|((RID_R8D|(RID_R9D<<5))<<5))<<5))
+#define REGARG_FIRSTFPR	RID_XMM0
+#define REGARG_LASTFPR	RID_XMM3
+#define STACKARG_OFS	(4*8)
 #else
 /* The rest of the civilized x64 world has a common ABI. */
 #define RSET_SCRATCH \
   (RSET_ACD|RSET_RANGE(RID_ESI, RID_R11D+1)|RSET_FPR)
+#define REGARG_GPRS \
+  (RID_EDI|((RID_ESI|((RID_EDX|((RID_ECX|((RID_R8D|(RID_R9D \
+   <<5))<<5))<<5))<<5))<<5))
+#define REGARG_FIRSTFPR	RID_XMM0
+#define REGARG_LASTFPR	RID_XMM7
+#define STACKARG_OFS	0
 #endif
 #else
 /* Common x86 ABI. */
 #define RSET_SCRATCH	(RSET_ACD|RSET_FPR)
+#define REGARG_GPRS	(RID_ECX|(RID_EDX<<5))  /* Fastcall only. */
+#define STACKARG_OFS	0
 #endif
 
 #if LJ_64
@@ -96,23 +109,26 @@ enum {
 
 /* -- Spill slots --------------------------------------------------------- */
 
-/* Available fixed spill slots in interpreter frame.
+/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs.
+**
+** SPS_FIXED: Available fixed spill slots in interpreter frame.
 ** This definition must match with the *.dasc file(s).
+**
+** SPS_FIRST: First spill slot for general use. Reserve min. two 32 bit slots.
 */
 #if LJ_64
 #ifdef _WIN64
 #define SPS_FIXED	(5*2)
+#define SPS_FIRST	(4*2)	/* Don't use callee register save area. */
 #else
 #define SPS_FIXED	2
+#define SPS_FIRST	2
 #endif
 #else
 #define SPS_FIXED	6
-#endif
-
-/* First spill slot for general use. Reserve one 64 bit slot. */
 #define SPS_FIRST	2
+#endif
 
-/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs. */
 #define sps_scale(slot)		(4 * (int32_t)(slot))
 
 /* -- Exit state ---------------------------------------------------------- */
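
A closing note on the spill slot comments in the last hunk: slots stay 32 bits wide, sps_scale turns a slot index into a byte offset, and a double occupies an even/odd pair of slots. A tiny sketch of that arithmetic follows; rounding the pair up to an even slot index is my assumption here, chosen so the 8 byte value stays 8-byte aligned within the slot area:

#include <stdio.h>
#include <stdint.h>

#define SPS_FIRST 2                            /* First slot for general use (non-Win64 value above). */
#define sps_scale(slot) (4 * (int32_t)(slot))  /* 32 bit slots -> byte offset. */

int main(void)
{
  int32_t next = SPS_FIRST;            /* Next free 32 bit spill slot. */
  int32_t gpr_slot = next++;           /* A GPR value needs a single slot. */
  int32_t fpr_slot = (next + 1) & ~1;  /* A double needs an even/odd pair (assumed even start). */
  printf("GPR spill -> bytes [%d..%d]\n", sps_scale(gpr_slot), sps_scale(gpr_slot) + 3);
  printf("FPR spill -> bytes [%d..%d]\n", sps_scale(fpr_slot), sps_scale(fpr_slot) + 7);
  return 0;
}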