Browse Source

FFI: Compile calls to stdcall, fastcall and vararg functions.

Mike Pall 13 years ago
parent
commit
2dc574d06b
14 changed files with 262 additions and 78 deletions
  1. 0 1
      doc/ext_ffi_semantics.html
  2. 21 3
      lib/dump.lua
  3. 4 4
      src/Makefile.dep
  4. 10 1
      src/lj_asm.c
  5. 7 3
      src/lj_asm_arm.h
  6. 9 3
      src/lj_asm_ppc.h
  7. 104 36
      src/lj_asm_x86.h
  8. 2 2
      src/lj_ccall.c
  9. 3 0
      src/lj_ccall.h
  10. 83 20
      src/lj_crecord.c
  11. 1 1
      src/lj_ctype.h
  12. 12 4
      src/lj_ircall.h
  13. 2 0
      src/lj_target_ppc.h
  14. 4 0
      src/lj_target_x86.h

+ 0 - 1
doc/ext_ffi_semantics.html

@@ -985,7 +985,6 @@ alignment &gt; 8&nbsp;bytes.</li>
 <li>Conversions from lightuserdata to <tt>void&nbsp;*</tt>.</li>
 <li>Pointer differences for element sizes that are not a power of
 two.</li>
-<li>Calls to non-cdecl or vararg C&nbsp;functions.</li>
 <li>Calls to C&nbsp;functions with aggregates passed or returned by
 value.</li>
 <li>Calls to ctype metamethods which are not plain functions.</li>

+ 21 - 3
lib/dump.lua

@@ -378,6 +378,24 @@ local function ridsp_name(ridsp)
   return ""
   return ""
 end
 end
 
 
+-- Dump CALL* function ref and return optional ctype.
+local function dumpcallfunc(tr, ins)
+  local ctype
+  if ins > 0 then
+    local m, ot, op1, op2 = traceir(tr, ins)
+    if band(ot, 31) == 0 then -- nil type means CARG(func, ctype).
+      ins = op1
+      ctype = formatk(tr, op2)
+    end
+  end
+  if ins < 0 then
+    out:write(format("[0x%x](", tonumber((tracek(tr, ins)))))
+  else
+    out:write(format("%04d (", ins))
+  end
+  return ctype
+end
+
 -- Recursively gather CALL* args and dump them.
 -- Recursively gather CALL* args and dump them.
 local function dumpcallargs(tr, ins)
 local function dumpcallargs(tr, ins)
   if ins < 0 then
   if ins < 0 then
@@ -447,15 +465,15 @@ local function dump_ir(tr, dumpsnap, dumpreg)
 		       irtype[t], op))
 		       irtype[t], op))
       local m1, m2 = band(m, 3), band(m, 3*4)
       local m1, m2 = band(m, 3), band(m, 3*4)
       if sub(op, 1, 4) == "CALL" then
       if sub(op, 1, 4) == "CALL" then
+	local ctype
 	if m2 == 1*4 then -- op2 == IRMlit
 	if m2 == 1*4 then -- op2 == IRMlit
 	  out:write(format("%-10s  (", vmdef.ircall[op2]))
 	  out:write(format("%-10s  (", vmdef.ircall[op2]))
-	elseif op2 < 0 then
-	  out:write(format("[0x%x](", tonumber((tracek(tr, op2)))))
 	else
 	else
-	  out:write(format("%04d (", op2))
+	  ctype = dumpcallfunc(tr, op2)
 	end
 	end
 	if op1 ~= -1 then dumpcallargs(tr, op1) end
 	if op1 ~= -1 then dumpcallargs(tr, op1) end
 	out:write(")")
 	out:write(")")
+	if ctype then out:write(" ctype ", ctype) end
       elseif op == "CNEW  " and op2 == -1 then
       elseif op == "CNEW  " and op2 == -1 then
 	out:write(formatk(tr, op1))
 	out:write(formatk(tr, op1))
       elseif m1 ~= 3 then -- op1 != IRMnone
       elseif m1 ~= 3 then -- op1 != IRMnone

+ 4 - 4
src/Makefile.dep

@@ -1,6 +1,6 @@
 buildvm.o: buildvm.c buildvm.h lj_def.h lua.h luaconf.h lj_arch.h \
 buildvm.o: buildvm.c buildvm.h lj_def.h lua.h luaconf.h lj_arch.h \
  lj_obj.h lj_gc.h lj_bc.h lj_ir.h lj_ircall.h lj_jit.h lj_frame.h \
  lj_obj.h lj_gc.h lj_bc.h lj_ir.h lj_ircall.h lj_jit.h lj_frame.h \
- lj_dispatch.h lj_ccall.h luajit.h \
+ lj_dispatch.h lj_ccall.h lj_ctype.h luajit.h \
  lj_traceerr.h
  lj_traceerr.h
 buildvm_asm.o: buildvm_asm.c buildvm.h lj_def.h lua.h luaconf.h lj_arch.h \
 buildvm_asm.o: buildvm_asm.c buildvm.h lj_def.h lua.h luaconf.h lj_arch.h \
  lj_bc.h
  lj_bc.h
@@ -86,9 +86,9 @@ lj_cparse.o: lj_cparse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
  lj_bc.h lj_vm.h lj_char.h
  lj_bc.h lj_vm.h lj_char.h
 lj_crecord.o: lj_crecord.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
 lj_crecord.o: lj_crecord.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
  lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_frame.h lj_bc.h lj_ctype.h \
  lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_frame.h lj_bc.h lj_ctype.h \
- lj_gc.h lj_cparse.h lj_cconv.h lj_clib.h lj_ir.h lj_jit.h lj_ircall.h \
- lj_iropt.h lj_trace.h lj_dispatch.h lj_traceerr.h lj_record.h \
- lj_ffrecord.h lj_crecord.h
+ lj_gc.h lj_cparse.h lj_cconv.h lj_clib.h lj_ccall.h lj_ir.h lj_jit.h \
+ lj_ircall.h lj_iropt.h lj_trace.h lj_dispatch.h lj_traceerr.h \
+ lj_record.h lj_ffrecord.h lj_crecord.h
 lj_ctype.o: lj_ctype.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
 lj_ctype.o: lj_ctype.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_ctype.h
  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_ctype.h
 lj_debug.o: lj_debug.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
 lj_debug.o: lj_debug.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \

+ 10 - 1
src/lj_asm.c

@@ -888,7 +888,16 @@ static uint32_t asm_callx_flags(ASMState *as, IRIns *ir)
     nargs++;
     nargs++;
     while (ira->o == IR_CARG) { nargs++; ira = IR(ira->op1); }
     while (ira->o == IR_CARG) { nargs++; ira = IR(ira->op1); }
   }
   }
-  /* NYI: fastcall etc. */
+#if LJ_HASFFI
+  if (IR(ir->op2)->o == IR_CARG) {  /* Copy calling convention info. */
+    CTypeID id = (CTypeID)IR(IR(ir->op2)->op2)->i;
+    CType *ct = ctype_get(ctype_ctsG(J2G(as->J)), id);
+    nargs |= ((ct->info & CTF_VARARG) ? CCI_VARARG : 0);
+#if LJ_TARGET_X86
+    nargs |= (ctype_cconv(ct->info) << CCI_CC_SHIFT);
+#endif
+  }
+#endif
   return (nargs | (ir->t.irt << CCI_OTSHIFT));
   return (nargs | (ir->t.irt << CCI_OTSHIFT));
 }
 }
 
 

+ 7 - 3
src/lj_asm_arm.h

@@ -331,13 +331,17 @@ static void asm_callx(ASMState *as, IRIns *ir)
 {
 {
   IRRef args[CCI_NARGS_MAX];
   IRRef args[CCI_NARGS_MAX];
   CCallInfo ci;
   CCallInfo ci;
+  IRRef func;
+  IRIns *irf;
   ci.flags = asm_callx_flags(as, ir);
   ci.flags = asm_callx_flags(as, ir);
   asm_collectargs(as, ir, &ci, args);
   asm_collectargs(as, ir, &ci, args);
   asm_setupresult(as, ir, &ci);
   asm_setupresult(as, ir, &ci);
-  if (irref_isk(ir->op2)) {  /* Call to constant address. */
-    ci.func = (ASMFunction)(void *)(IR(ir->op2)->i);
+  func = ir->op2; irf = IR(func);
+  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
+  if (irref_isk(func)) {  /* Call to constant address. */
+    ci.func = (ASMFunction)(void *)(irf->i);
   } else {  /* Need a non-argument register for indirect calls. */
   } else {  /* Need a non-argument register for indirect calls. */
-    Reg freg = ra_alloc1(as, ir->op2, RSET_RANGE(RID_R4, RID_R12+1));
+    Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_R4, RID_R12+1));
     emit_m(as, ARMI_BLXr, freg);
     emit_m(as, ARMI_BLXr, freg);
     ci.func = (ASMFunction)(void *)0;
     ci.func = (ASMFunction)(void *)0;
   }
   }

+ 9 - 3
src/lj_asm_ppc.h

@@ -284,6 +284,8 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 	ofs += 4;
 	ofs += 4;
     }
     }
   }
   }
+  if ((ci->flags & CCI_VARARG))  /* Vararg calls need to know about FPR use. */
+    emit_tab(as, fpr == REGARG_FIRSTFPR ? PPCI_CRXOR : PPCI_CREQV, 6, 6, 6);
 }
 }
 
 
 /* Setup result reg/sp for call. Evict scratch regs. */
 /* Setup result reg/sp for call. Evict scratch regs. */
@@ -336,14 +338,18 @@ static void asm_callx(ASMState *as, IRIns *ir)
 {
 {
   IRRef args[CCI_NARGS_MAX];
   IRRef args[CCI_NARGS_MAX];
   CCallInfo ci;
   CCallInfo ci;
+  IRRef func;
+  IRIns *irf;
   ci.flags = asm_callx_flags(as, ir);
   ci.flags = asm_callx_flags(as, ir);
   asm_collectargs(as, ir, &ci, args);
   asm_collectargs(as, ir, &ci, args);
   asm_setupresult(as, ir, &ci);
   asm_setupresult(as, ir, &ci);
-  if (irref_isk(ir->op2)) {  /* Call to constant address. */
-    ci.func = (ASMFunction)(void *)(IR(ir->op2)->i);
+  func = ir->op2; irf = IR(func);
+  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
+  if (irref_isk(func)) {  /* Call to constant address. */
+    ci.func = (ASMFunction)(void *)(irf->i);
   } else {  /* Need a non-argument register for indirect calls. */
   } else {  /* Need a non-argument register for indirect calls. */
     RegSet allow = RSET_GPR & ~RSET_RANGE(RID_R0, REGARG_LASTGPR+1);
     RegSet allow = RSET_GPR & ~RSET_RANGE(RID_R0, REGARG_LASTGPR+1);
-    Reg freg = ra_alloc1(as, ir->op2, allow);
+    Reg freg = ra_alloc1(as, func, allow);
     *--as->mcp = PPCI_BCTRL;
     *--as->mcp = PPCI_BCTRL;
     *--as->mcp = PPCI_MTCTR | PPCF_T(freg);
     *--as->mcp = PPCI_MTCTR | PPCF_T(freg);
     ci.func = (ASMFunction)(void *)0;
     ci.func = (ASMFunction)(void *)0;

+ 104 - 36
src/lj_asm_x86.h

@@ -369,18 +369,76 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
 
 
 /* -- Calls --------------------------------------------------------------- */
 /* -- Calls --------------------------------------------------------------- */
 
 
+/* Count the required number of stack slots for a call. */
+static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
+{
+  uint32_t i, nargs = CCI_NARGS(ci);
+  int nslots = 0;
+#if LJ_64
+  if (LJ_ABI_WIN) {
+    nslots = (int)(nargs*2);  /* Only matters for more than four args. */
+  } else {
+    int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
+    for (i = 0; i < nargs; i++)
+      if (args[i] && irt_isfp(IR(args[i])->t)) {
+	if (nfpr > 0) nfpr--; else nslots += 2;
+      } else {
+	if (ngpr > 0) ngpr--; else nslots += 2;
+      }
+  }
+#else
+  int ngpr = 0;
+  if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
+    ngpr = 2;
+  else if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
+    ngpr = 1;
+  for (i = 0; i < nargs; i++)
+    if (args[i] && irt_isfp(IR(args[i])->t)) {
+      nslots += irt_isnum(IR(args[i])->t) ? 2 : 1;
+    } else {
+      if (ngpr > 0) ngpr--; else nslots++;
+    }
+#endif
+  return nslots;
+}
+
 /* Generate a call to a C function. */
 /* Generate a call to a C function. */
 static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 {
 {
   uint32_t n, nargs = CCI_NARGS(ci);
   uint32_t n, nargs = CCI_NARGS(ci);
   int32_t ofs = STACKARG_OFS;
   int32_t ofs = STACKARG_OFS;
-  uint32_t gprs = REGARG_GPRS;
 #if LJ_64
 #if LJ_64
+  uint32_t gprs = REGARG_GPRS;
   Reg fpr = REGARG_FIRSTFPR;
   Reg fpr = REGARG_FIRSTFPR;
+#if !LJ_ABI_WIN
+  MCode *patchnfpr = NULL;
+#endif
+#else
+  uint32_t gprs = 0;
+  if ((ci->flags & CCI_CC_MASK) != CCI_CC_CDECL) {
+    if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
+      gprs = (REGARG_GPRS & 31);
+    else if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
+      gprs = REGARG_GPRS;
+  }
 #endif
 #endif
-  lua_assert(!(nargs > 2 && (ci->flags&CCI_FASTCALL)));  /* Avoid stack adj. */
   if ((void *)ci->func)
   if ((void *)ci->func)
     emit_call(as, ci->func);
     emit_call(as, ci->func);
+#if LJ_64
+  if ((ci->flags & CCI_VARARG)) {  /* Special handling for vararg calls. */
+#if LJ_ABI_WIN
+    for (n = 0; n < 4 && n < nargs; n++) {
+      IRIns *ir = IR(args[n]);
+      if (irt_isfp(ir->t))  /* Duplicate FPRs in GPRs. */
+	emit_rr(as, XO_MOVDto, (irt_isnum(ir->t) ? REX_64 : 0) | (fpr+n),
+		((gprs >> (n*5)) & 31));  /* Either MOVD or MOVQ. */
+    }
+#else
+    patchnfpr = --as->mcp;  /* Indicate number of used FPRs in register al. */
+    *--as->mcp = XI_MOVrib | RID_EAX;
+#endif
+  }
+#endif
   for (n = 0; n < nargs; n++) {  /* Setup args. */
   for (n = 0; n < nargs; n++) {  /* Setup args. */
     IRRef ref = args[n];
     IRRef ref = args[n];
     IRIns *ir = IR(ref);
     IRIns *ir = IR(ref);
@@ -392,15 +450,16 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 #elif LJ_64
 #elif LJ_64
     /* POSIX/x64 argument registers are used in order of appearance. */
     /* POSIX/x64 argument registers are used in order of appearance. */
     if (irt_isfp(ir->t)) {
     if (irt_isfp(ir->t)) {
-      r = fpr <= REGARG_LASTFPR ? fpr : 0; fpr++;
+      r = fpr <= REGARG_LASTFPR ? fpr++ : 0;
     } else {
     } else {
       r = gprs & 31; gprs >>= 5;
       r = gprs & 31; gprs >>= 5;
     }
     }
 #else
 #else
-    if (irt_isfp(ir->t) || !(ci->flags & CCI_FASTCALL)) {
+    if (ref && irt_isfp(ir->t)) {
       r = 0;
       r = 0;
     } else {
     } else {
       r = gprs & 31; gprs >>= 5;
       r = gprs & 31; gprs >>= 5;
+      if (!ref) continue;
     }
     }
 #endif
 #endif
     if (r) {  /* Argument is in a register. */
     if (r) {  /* Argument is in a register. */
@@ -442,6 +501,9 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
       ofs += sizeof(intptr_t);
       ofs += sizeof(intptr_t);
     }
     }
   }
   }
+#if LJ_64 && !LJ_ABI_WIN
+  if (patchnfpr) *patchnfpr = fpr - REGARG_FIRSTFPR;
+#endif
 }
 }
 
 
 /* Setup result reg/sp for call. Evict scratch regs. */
 /* Setup result reg/sp for call. Evict scratch regs. */
@@ -503,23 +565,50 @@ static void asm_call(ASMState *as, IRIns *ir)
   asm_gencall(as, ci, args);
   asm_gencall(as, ci, args);
 }
 }
 
 
+/* Return a constant function pointer or NULL for indirect calls. */
+static void *asm_callx_func(ASMState *as, IRIns *irf, IRRef func)
+{
+#if LJ_32
+  UNUSED(as);
+  if (irref_isk(func))
+    return (void *)irf->i;
+#else
+  if (irref_isk(func)) {
+    MCode *p;
+    if (irf->o == IR_KINT64)
+      p = (MCode *)(void *)ir_k64(irf)->u64;
+    else
+      p = (MCode *)(void *)(uintptr_t)(uint32_t)irf->i;
+    if (p - as->mcp == (int32_t)(p - as->mcp))
+      return p;  /* Call target is still in +-2GB range. */
+    /* Avoid the indirect case of emit_call(). Try to hoist func addr. */
+  }
+#endif
+  return NULL;
+}
+
 static void asm_callx(ASMState *as, IRIns *ir)
 static void asm_callx(ASMState *as, IRIns *ir)
 {
 {
   IRRef args[CCI_NARGS_MAX];
   IRRef args[CCI_NARGS_MAX];
   CCallInfo ci;
   CCallInfo ci;
+  IRRef func;
   IRIns *irf;
   IRIns *irf;
   ci.flags = asm_callx_flags(as, ir);
   ci.flags = asm_callx_flags(as, ir);
   asm_collectargs(as, ir, &ci, args);
   asm_collectargs(as, ir, &ci, args);
   asm_setupresult(as, ir, &ci);
   asm_setupresult(as, ir, &ci);
-  irf = IR(ir->op2);
-  if (LJ_32 && irref_isk(ir->op2)) {  /* Call to constant address on x86. */
-    ci.func = (ASMFunction)(void *)(uintptr_t)(uint32_t)irf->i;
-  } else {
-    /* Prefer a non-argument register or RID_RET for indirect calls. */
-    RegSet allow = (RSET_GPR & ~RSET_SCRATCH)|RID2RSET(RID_RET);
-    Reg r = ra_alloc1(as, ir->op2, allow);
+#if LJ_32
+  /* Have to readjust stack after non-cdecl calls due to callee cleanup. */
+  if ((ci.flags & CCI_CC_MASK) != CCI_CC_CDECL)
+    emit_spsub(as, 4 * asm_count_call_slots(as, &ci, args));
+#endif
+  func = ir->op2; irf = IR(func);
+  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
+  ci.func = (ASMFunction)asm_callx_func(as, irf, func);
+  if (!(void *)ci.func) {
+    /* Use a (hoistable) non-scratch register for indirect calls. */
+    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
+    Reg r = ra_alloc1(as, func, allow);
     emit_rr(as, XO_GROUP5, XOg_CALL, r);
     emit_rr(as, XO_GROUP5, XOg_CALL, r);
-    ci.func = (ASMFunction)(void *)0;
   }
   }
   asm_gencall(as, &ci, args);
   asm_gencall(as, &ci, args);
 }
 }
@@ -2608,35 +2697,14 @@ static void asm_ir(ASMState *as, IRIns *ir)
 static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
 static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
 {
 {
   IRRef args[CCI_NARGS_MAX];
   IRRef args[CCI_NARGS_MAX];
-  uint32_t nargs = (int)CCI_NARGS(ci);
-  int nslots = 0;
+  int nslots;
   asm_collectargs(as, ir, ci, args);
   asm_collectargs(as, ir, ci, args);
-#if LJ_64
-  if (LJ_ABI_WIN) {
-    nslots = (int)(nargs*2);  /* Only matters for more than four args. */
-  } else {
-    uint32_t i;
-    int ngpr = 6, nfpr = 8;
-    for (i = 0; i < nargs; i++)
-      if (args[i] && irt_isfp(IR(args[i])->t)) {
-	if (nfpr > 0) nfpr--; else nslots += 2;
-      } else {
-	if (ngpr > 0) ngpr--; else nslots += 2;
-      }
-  }
+  nslots = asm_count_call_slots(as, ci, args);
   if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
   if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
     as->evenspill = nslots;
     as->evenspill = nslots;
+#if LJ_64
   return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
   return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
 #else
 #else
-  if ((ci->flags & CCI_FASTCALL)) {
-    lua_assert(nargs <= 2);
-  } else {
-    uint32_t i;
-    for (i = 0; i < nargs; i++)
-      nslots += (args[i] && irt_isnum(IR(args[i])->t)) ? 2 : 1;
-    if (nslots > as->evenspill)  /* Leave room for args. */
-      as->evenspill = nslots;
-  }
   return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET);
   return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET);
 #endif
 #endif
 }
 }

+ 2 - 2
src/lj_ccall.c

@@ -402,7 +402,7 @@ static void ccall_struct_ret(CCallState *cc, int *rcl, uint8_t *dp, CTSize sz)
 /* -- Common C call handling ---------------------------------------------- */
 /* -- Common C call handling ---------------------------------------------- */
 
 
 /* Infer the destination CTypeID for a vararg argument. */
 /* Infer the destination CTypeID for a vararg argument. */
-static CTypeID ccall_ctid_vararg(CTState *cts, cTValue *o)
+CTypeID lj_ccall_ctid_vararg(CTState *cts, cTValue *o)
 {
 {
   if (tvisnumber(o)) {
   if (tvisnumber(o)) {
     return CTID_DOUBLE;
     return CTID_DOUBLE;
@@ -506,7 +506,7 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
     } else {
     } else {
       if (!(ct->info & CTF_VARARG))
       if (!(ct->info & CTF_VARARG))
 	lj_err_caller(L, LJ_ERR_FFI_NUMARG);  /* Too many arguments. */
 	lj_err_caller(L, LJ_ERR_FFI_NUMARG);  /* Too many arguments. */
-      did = ccall_ctid_vararg(cts, o);  /* Infer vararg type. */
+      did = lj_ccall_ctid_vararg(cts, o);  /* Infer vararg type. */
       isva = 1;
       isva = 1;
     }
     }
     d = ctype_raw(cts, did);
     d = ctype_raw(cts, did);

+ 3 - 0
src/lj_ccall.h

@@ -7,6 +7,7 @@
 #define _LJ_CCALL_H
 #define _LJ_CCALL_H
 
 
 #include "lj_obj.h"
 #include "lj_obj.h"
+#include "lj_ctype.h"
 
 
 #if LJ_HASFFI
 #if LJ_HASFFI
 
 
@@ -129,6 +130,8 @@ typedef struct CCallState {
 
 
 /* Really belongs to lj_vm.h. */
 /* Really belongs to lj_vm.h. */
 LJ_ASMF void LJ_FASTCALL lj_vm_ffi_call(CCallState *cc);
 LJ_ASMF void LJ_FASTCALL lj_vm_ffi_call(CCallState *cc);
+
+LJ_FUNC CTypeID lj_ccall_ctid_vararg(CTState *cts, cTValue *o);
 LJ_FUNC int lj_ccall_func(lua_State *L, GCcdata *cd);
 LJ_FUNC int lj_ccall_func(lua_State *L, GCcdata *cd);
 
 
 #endif
 #endif

+ 83 - 20
src/lj_crecord.c

@@ -18,6 +18,7 @@
 #include "lj_cparse.h"
 #include "lj_cparse.h"
 #include "lj_cconv.h"
 #include "lj_cconv.h"
 #include "lj_clib.h"
 #include "lj_clib.h"
+#include "lj_ccall.h"
 #include "lj_ir.h"
 #include "lj_ir.h"
 #include "lj_jit.h"
 #include "lj_jit.h"
 #include "lj_ircall.h"
 #include "lj_ircall.h"
@@ -364,7 +365,7 @@ static TRef crec_tv_ct(jit_State *J, CType *s, CTypeID sid, TRef sp)
 
 
 /* -- Convert TValue to C type (store) ------------------------------------ */
 /* -- Convert TValue to C type (store) ------------------------------------ */
 
 
-static TRef crec_ct_tv(jit_State *J, CType *d, TRef dp, TRef sp, TValue *sval)
+static TRef crec_ct_tv(jit_State *J, CType *d, TRef dp, TRef sp, cTValue *sval)
 {
 {
   CTState *cts = ctype_ctsG(J2G(J));
   CTState *cts = ctype_ctsG(J2G(J));
   CTypeID sid = CTID_P_VOID;
   CTypeID sid = CTID_P_VOID;
@@ -747,29 +748,88 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd,
 			   CTState *cts, CType *ct)
 			   CTState *cts, CType *ct)
 {
 {
   TRef args[CCI_NARGS_MAX];
   TRef args[CCI_NARGS_MAX];
+  CTypeID fid;
   MSize i, n;
   MSize i, n;
-  TRef tr;
+  TRef tr, *base;
+  cTValue *o;
+#if LJ_TARGET_X86
+#if LJ_ABI_WIN
+  TRef *arg0 = NULL, *arg1 = NULL;
+#endif
+  int ngpr = 0;
+  if (ctype_cconv(ct->info) == CTCC_THISCALL)
+    ngpr = 1;
+  else if (ctype_cconv(ct->info) == CTCC_FASTCALL)
+    ngpr = 2;
+#endif
+
+  /* Skip initial attributes. */
+  fid = ct->sib;
+  while (fid) {
+    CType *ctf = ctype_get(cts, fid);
+    if (!ctype_isattrib(ctf->info)) break;
+    fid = ctf->sib;
+  }
   args[0] = TREF_NIL;
   args[0] = TREF_NIL;
-  for (n = 0; J->base[n+1]; n++) {
+  for (n = 0, base = J->base+1, o = rd->argv+1; *base; n++, base++, o++) {
+    CTypeID did;
     CType *d;
     CType *d;
-    do {
-      if (!ct->sib || n >= CCI_NARGS_MAX)
-	lj_trace_err(J, LJ_TRERR_NYICALL);
-      ct = ctype_get(cts, ct->sib);
-    } while (ctype_isattrib(ct->info));
-    if (!ctype_isfield(ct->info))
+
+    if (n >= CCI_NARGS_MAX)
       lj_trace_err(J, LJ_TRERR_NYICALL);
       lj_trace_err(J, LJ_TRERR_NYICALL);
-    d = ctype_rawchild(cts, ct);
+
+    if (fid) {  /* Get argument type from field. */
+      CType *ctf = ctype_get(cts, fid);
+      fid = ctf->sib;
+      lua_assert(ctype_isfield(ctf->info));
+      did = ctype_cid(ctf->info);
+    } else {
+      if (!(ct->info & CTF_VARARG))
+        lj_trace_err(J, LJ_TRERR_NYICALL);  /* Too many arguments. */
+      did = lj_ccall_ctid_vararg(cts, o);  /* Infer vararg type. */
+    }
+    d = ctype_raw(cts, did);
     if (!(ctype_isnum(d->info) || ctype_isptr(d->info) ||
     if (!(ctype_isnum(d->info) || ctype_isptr(d->info) ||
 	  ctype_isenum(d->info)))
 	  ctype_isenum(d->info)))
       lj_trace_err(J, LJ_TRERR_NYICALL);
       lj_trace_err(J, LJ_TRERR_NYICALL);
-    tr = crec_ct_tv(J, d, 0, J->base[n+1], &rd->argv[n+1]);
-    if (ctype_isinteger_or_bool(d->info) && d->size < 4) {
-      if ((d->info & CTF_UNSIGNED))
-	tr = emitconv(tr, IRT_INT, d->size==1 ? IRT_U8 : IRT_U16, 0);
-      else
-	tr = emitconv(tr, IRT_INT, d->size==1 ? IRT_I8 : IRT_I16, IRCONV_SEXT);
+    tr = crec_ct_tv(J, d, 0, *base, o);
+    if (ctype_isinteger_or_bool(d->info)) {
+      if (d->size < 4) {
+	if ((d->info & CTF_UNSIGNED))
+	  tr = emitconv(tr, IRT_INT, d->size==1 ? IRT_U8 : IRT_U16, 0);
+	else
+	  tr = emitconv(tr, IRT_INT, d->size==1 ? IRT_I8 : IRT_I16,IRCONV_SEXT);
+      }
     }
     }
+#if LJ_TARGET_X86
+    /* 64 bit args must not end up in registers for fastcall/thiscall. */
+#if LJ_ABI_WIN
+    if (!ctype_isfp(d->info)) {
+      /* Sigh, the Windows/x86 ABI allows reordering across 64 bit args. */
+      if (tref_typerange(tr, IRT_I64, IRT_U64)) {
+	if (ngpr) {
+	  arg0 = &args[n]; args[n++] = TREF_NIL; ngpr--;
+	  if (ngpr) {
+	    arg1 = &args[n]; args[n++] = TREF_NIL; ngpr--;
+	  }
+	}
+      } else {
+	if (arg0) { *arg0 = tr; arg0 = NULL; n--; continue; }
+	if (arg1) { *arg1 = tr; arg1 = NULL; n--; continue; }
+	if (ngpr) ngpr--;
+      }
+    }
+#else
+    if (!ctype_isfp(d->info) && ngpr) {
+      if (tref_typerange(tr, IRT_I64, IRT_U64)) {
+	/* No reordering for other x86 ABIs. Simply add alignment args. */
+	do { args[n++] = TREF_NIL; } while (--ngpr);
+      } else {
+	ngpr--;
+      }
+    }
+#endif
+#endif
     args[n] = tr;
     args[n] = tr;
   }
   }
   tr = args[0];
   tr = args[0];
@@ -801,12 +861,15 @@ static int crec_call(jit_State *J, RecordFFData *rd, GCcdata *cd)
     }
     }
     if (!(ctype_isnum(ctr->info) || ctype_isptr(ctr->info) ||
     if (!(ctype_isnum(ctr->info) || ctype_isptr(ctr->info) ||
 	  ctype_isvoid(ctr->info)) ||
 	  ctype_isvoid(ctr->info)) ||
-	ctype_isbool(ctr->info) || (ct->info & CTF_VARARG) ||
+	ctype_isbool(ctr->info) || t == IRT_CDATA)
+      lj_trace_err(J, LJ_TRERR_NYICALL);
+    if ((ct->info & CTF_VARARG)
 #if LJ_TARGET_X86
 #if LJ_TARGET_X86
-	ctype_cconv(ct->info) != CTCC_CDECL ||
+	|| ctype_cconv(ct->info) != CTCC_CDECL
 #endif
 #endif
-	t == IRT_CDATA)
-      lj_trace_err(J, LJ_TRERR_NYICALL);
+	)
+      func = emitir(IRT(IR_CARG, IRT_NIL), func,
+		    lj_ir_kint(J, ctype_typeid(cts, ct)));
     tr = emitir(IRT(IR_CALLXS, t), crec_call_args(J, rd, cts, ct), func);
     tr = emitir(IRT(IR_CALLXS, t), crec_call_args(J, rd, cts, ct), func);
     if (t == IRT_FLOAT || t == IRT_U32) {
     if (t == IRT_FLOAT || t == IRT_U32) {
       tr = emitconv(tr, IRT_NUM, t, 0);
       tr = emitconv(tr, IRT_NUM, t, 0);

+ 1 - 1
src/lj_ctype.h

@@ -117,7 +117,7 @@ LJ_STATIC_ASSERT(((int)CT_STRUCT & (int)CT_ARRAY) == CT_STRUCT);
   info = (info & ~(CTMASK_##field<<CTSHIFT_##field)) | \
   info = (info & ~(CTMASK_##field<<CTSHIFT_##field)) | \
 	  (((CTSize)(val) & CTMASK_##field) << CTSHIFT_##field)
 	  (((CTSize)(val) & CTMASK_##field) << CTSHIFT_##field)
 
 
-/* Calling conventions. */
+/* Calling conventions. ORDER CC */
 enum { CTCC_CDECL, CTCC_THISCALL, CTCC_FASTCALL, CTCC_STDCALL };
 enum { CTCC_CDECL, CTCC_THISCALL, CTCC_FASTCALL, CTCC_STDCALL };
 
 
 /* Attribute numbers. */
 /* Attribute numbers. */

+ 12 - 4
src/lj_ircall.h

@@ -27,15 +27,23 @@ typedef struct CCallInfo {
 #define CCI_CALL_N		(IR_CALLN << CCI_OPSHIFT)
 #define CCI_CALL_N		(IR_CALLN << CCI_OPSHIFT)
 #define CCI_CALL_L		(IR_CALLL << CCI_OPSHIFT)
 #define CCI_CALL_L		(IR_CALLL << CCI_OPSHIFT)
 #define CCI_CALL_S		(IR_CALLS << CCI_OPSHIFT)
 #define CCI_CALL_S		(IR_CALLS << CCI_OPSHIFT)
-#define CCI_CALL_FN		(CCI_CALL_N|CCI_FASTCALL)
-#define CCI_CALL_FL		(CCI_CALL_L|CCI_FASTCALL)
-#define CCI_CALL_FS		(CCI_CALL_S|CCI_FASTCALL)
+#define CCI_CALL_FN		(CCI_CALL_N|CCI_CC_FASTCALL)
+#define CCI_CALL_FL		(CCI_CALL_L|CCI_CC_FASTCALL)
+#define CCI_CALL_FS		(CCI_CALL_S|CCI_CC_FASTCALL)
 
 
 /* C call info flags. */
 /* C call info flags. */
 #define CCI_L			0x0100	/* Implicit L arg. */
 #define CCI_L			0x0100	/* Implicit L arg. */
 #define CCI_CASTU64		0x0200	/* Cast u64 result to number. */
 #define CCI_CASTU64		0x0200	/* Cast u64 result to number. */
 #define CCI_NOFPRCLOBBER	0x0400	/* Does not clobber any FPRs. */
 #define CCI_NOFPRCLOBBER	0x0400	/* Does not clobber any FPRs. */
-#define CCI_FASTCALL		0x0800	/* Fastcall convention. */
+#define CCI_VARARG		0x0800	/* Vararg function. */
+
+#define CCI_CC_MASK		0x3000	/* Calling convention mask. */
+#define CCI_CC_SHIFT		12
+/* ORDER CC */
+#define CCI_CC_CDECL		0x0000	/* Default cdecl calling convention. */
+#define CCI_CC_THISCALL		0x1000	/* Thiscall calling convention. */
+#define CCI_CC_FASTCALL		0x2000	/* Fastcall calling convention. */
+#define CCI_CC_STDCALL		0x3000	/* Stdcall calling convention. */
 
 
 /* Helpers for conditional function definitions. */
 /* Helpers for conditional function definitions. */
 #define IRCALLCOND_ANY(x)		x
 #define IRCALLCOND_ANY(x)		x

+ 2 - 0
src/lj_target_ppc.h

@@ -207,7 +207,9 @@ typedef enum PPCIns {
   PPCI_BCTRL = 0x4e800421,
   PPCI_BCTRL = 0x4e800421,
 
 
   PPCI_CRANDC = 0x4c000102,
   PPCI_CRANDC = 0x4c000102,
+  PPCI_CRXOR = 0x4c000182,
   PPCI_CRAND = 0x4c000202,
   PPCI_CRAND = 0x4c000202,
+  PPCI_CREQV = 0x4c000242,
   PPCI_CRORC = 0x4c000342,
   PPCI_CRORC = 0x4c000342,
   PPCI_CROR = 0x4c000382,
   PPCI_CROR = 0x4c000382,
 
 

+ 4 - 0
src/lj_target_x86.h

@@ -85,6 +85,7 @@ enum {
 #define REGARG_GPRS \
 #define REGARG_GPRS \
   (RID_ECX|((RID_EDX|((RID_R8D|(RID_R9D<<5))<<5))<<5))
   (RID_ECX|((RID_EDX|((RID_R8D|(RID_R9D<<5))<<5))<<5))
 #define REGARG_NUMGPR	4
 #define REGARG_NUMGPR	4
+#define REGARG_NUMFPR	4
 #define REGARG_FIRSTFPR	RID_XMM0
 #define REGARG_FIRSTFPR	RID_XMM0
 #define REGARG_LASTFPR	RID_XMM3
 #define REGARG_LASTFPR	RID_XMM3
 #define STACKARG_OFS	(4*8)
 #define STACKARG_OFS	(4*8)
@@ -96,6 +97,7 @@ enum {
   (RID_EDI|((RID_ESI|((RID_EDX|((RID_ECX|((RID_R8D|(RID_R9D \
   (RID_EDI|((RID_ESI|((RID_EDX|((RID_ECX|((RID_R8D|(RID_R9D \
    <<5))<<5))<<5))<<5))<<5))
    <<5))<<5))<<5))<<5))<<5))
 #define REGARG_NUMGPR	6
 #define REGARG_NUMGPR	6
+#define REGARG_NUMFPR	8
 #define REGARG_FIRSTFPR	RID_XMM0
 #define REGARG_FIRSTFPR	RID_XMM0
 #define REGARG_LASTFPR	RID_XMM7
 #define REGARG_LASTFPR	RID_XMM7
 #define STACKARG_OFS	0
 #define STACKARG_OFS	0
@@ -105,6 +107,7 @@ enum {
 #define RSET_SCRATCH	(RSET_ACD|RSET_FPR)
 #define RSET_SCRATCH	(RSET_ACD|RSET_FPR)
 #define REGARG_GPRS	(RID_ECX|(RID_EDX<<5))  /* Fastcall only. */
 #define REGARG_GPRS	(RID_ECX|(RID_EDX<<5))  /* Fastcall only. */
 #define REGARG_NUMGPR	2  /* Fastcall only. */
 #define REGARG_NUMGPR	2  /* Fastcall only. */
+#define REGARG_NUMFPR	0
 #define STACKARG_OFS	0
 #define STACKARG_OFS	0
 #endif
 #endif
 
 
@@ -192,6 +195,7 @@ typedef enum {
   XI_JCCs =	0x70, /* Really 7x. */
   XI_JCCs =	0x70, /* Really 7x. */
   XI_JCCn =	0x80, /* Really 0f8x. */
   XI_JCCn =	0x80, /* Really 0f8x. */
   XI_LEA =	0x8d,
   XI_LEA =	0x8d,
+  XI_MOVrib =	0xb0, /* Really b0+r. */
   XI_MOVri =	0xb8, /* Really b8+r. */
   XI_MOVri =	0xb8, /* Really b8+r. */
   XI_ARITHib =	0x80,
   XI_ARITHib =	0x80,
   XI_ARITHi =	0x81,
   XI_ARITHi =	0x81,