Browse Source

ARM64: Add FFI support.

Author: Mike Pall (10 years ago)
Parent commit: 33f0c24f06
7 changed files with 412 additions and 20 deletions
  1. 0 1
      src/lj_arch.h
  2. 121 0
      src/lj_ccall.c
  3. 17 0
      src/lj_ccall.h
  4. 54 10
      src/lj_ccallback.c
  5. 2 0
      src/lj_target.h
  6. 97 0
      src/lj_target_arm64.h
  7. 121 9
      src/vm_arm64.dasc

+ 0 - 1
src/lj_arch.h

@@ -202,7 +202,6 @@
 #define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
 #define LJ_TARGET_GC64		1
 #define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
-#define LJ_ARCH_NOFFI		1	/* NYI */
 #define LJ_ARCH_NOJIT		1	/* NYI */
 
 #define LJ_ARCH_VERSION		80

+ 121 - 0
src/lj_ccall.c

@@ -290,6 +290,75 @@
 #define CCALL_HANDLE_RET \
   if ((ct->info & CTF_VARARG)) sp = (uint8_t *)&cc->gpr[0];
 
+#elif LJ_TARGET_ARM64
+/* -- ARM64 calling conventions ------------------------------------------- */
+
+#define CCALL_HANDLE_STRUCTRET \
+  cc->retref = !ccall_classify_struct(cts, ctr); \
+  if (cc->retref) cc->retp = dp;
+
+#define CCALL_HANDLE_STRUCTRET2 \
+  unsigned int cl = ccall_classify_struct(cts, ctr); \
+  if ((cl & 4)) { /* Combine float HFA from separate registers. */ \
+    CTSize i = (cl >> 8) - 1; \
+    do { ((uint32_t *)dp)[i] = cc->fpr[i].u32; } while (i--); \
+  } else { \
+    if (cl > 1) sp = (uint8_t *)&cc->fpr[0]; \
+    memcpy(dp, sp, ctr->size); \
+  }
+
+#define CCALL_HANDLE_COMPLEXRET \
+  /* Complex values are returned in one or two FPRs. */ \
+  cc->retref = 0;
+
+#define CCALL_HANDLE_COMPLEXRET2 \
+  if (ctr->size == 2*sizeof(float)) {  /* Copy complex float from FPRs. */ \
+    ((float *)dp)[0] = cc->fpr[0].f; \
+    ((float *)dp)[1] = cc->fpr[1].f; \
+  } else {  /* Copy complex double from FPRs. */ \
+    ((double *)dp)[0] = cc->fpr[0].d; \
+    ((double *)dp)[1] = cc->fpr[1].d; \
+  }
+
+#define CCALL_HANDLE_STRUCTARG \
+  unsigned int cl = ccall_classify_struct(cts, d); \
+  if (cl == 0) {  /* Pass struct by reference. */ \
+    rp = cdataptr(lj_cdata_new(cts, did, sz)); \
+    sz = CTSIZE_PTR; \
+  } else if (cl > 1) {  /* Pass struct in FPRs or on stack. */ \
+    isfp = (cl & 4) ? 2 : 1; \
+  }  /* else: Pass struct in GPRs or on stack. */
+
+#define CCALL_HANDLE_COMPLEXARG \
+  /* Pass complex by value in separate (!) FPRs or on stack. */ \
+  isfp = ctr->size == 2*sizeof(float) ? 2 : 1;
+
+#define CCALL_HANDLE_REGARG \
+  if (LJ_TARGET_IOS && isva) { \
+    /* IOS: All variadic arguments are on the stack. */ \
+  } else if (isfp) {  /* Try to pass argument in FPRs. */ \
+    int n2 = ctype_isvector(d->info) ? 1 : n*isfp; \
+    if (nfpr + n2 <= CCALL_NARG_FPR) { \
+      dp = &cc->fpr[nfpr]; \
+      nfpr += n2; \
+      goto done; \
+    } else { \
+      nfpr = CCALL_NARG_FPR;  /* Prevent reordering. */ \
+      if (LJ_TARGET_IOS && d->size < 8) goto err_nyi; \
+    } \
+  } else {  /* Try to pass argument in GPRs. */ \
+    if (!LJ_TARGET_IOS && (d->info & CTF_ALIGN) > CTALIGN_PTR) \
+      ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
+    if (ngpr + n <= maxgpr) { \
+      dp = &cc->gpr[ngpr]; \
+      ngpr += n; \
+      goto done; \
+    } else { \
+      ngpr = maxgpr;  /* Prevent reordering. */ \
+      if (LJ_TARGET_IOS && d->size < 8) goto err_nyi; \
+    } \
+  }
+
 #elif LJ_TARGET_PPC
 /* -- PPC calling conventions --------------------------------------------- */
 
@@ -584,6 +653,52 @@ noth:  /* Not a homogeneous float/double aggregate. */
 
 #endif
 
+/* -- ARM64 ABI struct classification ------------------------------------- */
+
+#if LJ_TARGET_ARM64
+
+/* Classify a struct based on its fields. */
+static unsigned int ccall_classify_struct(CTState *cts, CType *ct)
+{
+  CTSize sz = ct->size;
+  unsigned int r = 0, n = 0, isu = (ct->info & CTF_UNION);
+  while (ct->sib) {
+    CType *sct;
+    ct = ctype_get(cts, ct->sib);
+    if (ctype_isfield(ct->info)) {
+      sct = ctype_rawchild(cts, ct);
+      if (ctype_isfp(sct->info)) {
+	r |= sct->size;
+	if (!isu) n++; else if (n == 0) n = 1;
+      } else if (ctype_iscomplex(sct->info)) {
+	r |= (sct->size >> 1);
+	if (!isu) n += 2; else if (n < 2) n = 2;
+      } else if (ctype_isstruct(sct->info)) {
+	goto substruct;
+      } else {
+	goto noth;
+      }
+    } else if (ctype_isbitfield(ct->info)) {
+      goto noth;
+    } else if (ctype_isxattrib(ct->info, CTA_SUBTYPE)) {
+      sct = ctype_rawchild(cts, ct);
+    substruct:
+      if (sct->size > 0) {
+	unsigned int s = ccall_classify_struct(cts, sct);
+	if (s <= 1) goto noth;
+	r |= (s & 255);
+	if (!isu) n += (s >> 8); else if (n < (s >>8)) n = (s >> 8);
+      }
+    }
+  }
+  if ((r == 4 || r == 8) && n <= 4)
+    return r + (n << 8);
+noth:  /* Not a homogeneous float/double aggregate. */
+  return (sz <= 16);  /* Return structs of size <= 16 in GPRs. */
+}
+
+#endif
+
 /* -- Common C call handling ---------------------------------------------- */
 
 /* Infer the destination CTypeID for a vararg argument. */
@@ -766,6 +881,12 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
       cc->fpr[nfpr-1].d[0] = cc->fpr[nfpr-2].d[1];  /* Split complex double. */
       cc->fpr[nfpr-2].d[1] = 0;
     }
+#elif LJ_TARGET_ARM64
+    if (isfp == 2 && (uint8_t *)dp < (uint8_t *)cc->stack) {
+      /* Split float HFA or complex float into separate registers. */
+      CTSize i = (sz >> 2) - 1;
+      do { ((uint64_t *)dp)[i] = ((uint32_t *)dp)[i]; } while (i--);
+    }
 #else
     UNUSED(isfp);
 #endif

+ 17 - 0
src/lj_ccall.h

@@ -68,6 +68,21 @@ typedef union FPRArg {
   float f[2];
 } FPRArg;
 
+#elif LJ_TARGET_ARM64
+
+#define CCALL_NARG_GPR		8
+#define CCALL_NRET_GPR		2
+#define CCALL_NARG_FPR		8
+#define CCALL_NRET_FPR		4
+#define CCALL_SPS_FREE		0
+
+typedef intptr_t GPRArg;
+typedef union FPRArg {
+  double d;
+  float f;
+  uint32_t u32;
+} FPRArg;
+
 #elif LJ_TARGET_PPC
 
 #define CCALL_NARG_GPR		8
@@ -135,6 +150,8 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState {
   uint8_t nfpr;			/* Number of arguments in FPRs. */
 #elif LJ_TARGET_X86
   uint8_t resx87;		/* Result on x87 stack: 1:float, 2:double. */
+#elif LJ_TARGET_ARM64
+  void *retp;			/* Aggregate return pointer in x8. */
 #elif LJ_TARGET_PPC
   uint8_t nfpr;			/* Number of arguments in FPRs. */
 #endif

+ 54 - 10
src/lj_ccallback.c

@@ -27,7 +27,7 @@
 
 #if LJ_OS_NOJIT
 
-/* Disabled callback support. */
+/* Callbacks disabled. */
 #define CALLBACK_SLOT2OFS(slot)	(0*(slot))
 #define CALLBACK_OFS2SLOT(ofs)	(0*(ofs))
 #define CALLBACK_MAX_SLOT	0
@@ -54,23 +54,18 @@ static MSize CALLBACK_OFS2SLOT(MSize ofs)
 #elif LJ_TARGET_ARM
 
 #define CALLBACK_MCODE_HEAD		32
-#define CALLBACK_SLOT2OFS(slot)		(CALLBACK_MCODE_HEAD + 8*(slot))
-#define CALLBACK_OFS2SLOT(ofs)		(((ofs)-CALLBACK_MCODE_HEAD)/8)
-#define CALLBACK_MAX_SLOT		(CALLBACK_OFS2SLOT(CALLBACK_MCODE_SIZE))
+
+#elif LJ_TARGET_ARM64
+
+#define CALLBACK_MCODE_HEAD		32
 
 #elif LJ_TARGET_PPC
 
 #define CALLBACK_MCODE_HEAD		24
-#define CALLBACK_SLOT2OFS(slot)		(CALLBACK_MCODE_HEAD + 8*(slot))
-#define CALLBACK_OFS2SLOT(ofs)		(((ofs)-CALLBACK_MCODE_HEAD)/8)
-#define CALLBACK_MAX_SLOT		(CALLBACK_OFS2SLOT(CALLBACK_MCODE_SIZE))
 
 #elif LJ_TARGET_MIPS
 
 #define CALLBACK_MCODE_HEAD		24
-#define CALLBACK_SLOT2OFS(slot)		(CALLBACK_MCODE_HEAD + 8*(slot))
-#define CALLBACK_OFS2SLOT(ofs)		(((ofs)-CALLBACK_MCODE_HEAD)/8)
-#define CALLBACK_MAX_SLOT		(CALLBACK_OFS2SLOT(CALLBACK_MCODE_SIZE))
 
 #else
 
@@ -81,6 +76,12 @@ static MSize CALLBACK_OFS2SLOT(MSize ofs)
 
 #endif
 
+#ifndef CALLBACK_SLOT2OFS
+#define CALLBACK_SLOT2OFS(slot)		(CALLBACK_MCODE_HEAD + 8*(slot))
+#define CALLBACK_OFS2SLOT(ofs)		(((ofs)-CALLBACK_MCODE_HEAD)/8)
+#define CALLBACK_MAX_SLOT		(CALLBACK_OFS2SLOT(CALLBACK_MCODE_SIZE))
+#endif
+
 /* Convert callback slot number to callback function pointer. */
 static void *callback_slot2ptr(CTState *cts, MSize slot)
 {
@@ -157,6 +158,26 @@ static void callback_mcode_init(global_State *g, uint32_t *page)
   }
   lua_assert(p - page <= CALLBACK_MCODE_SIZE);
 }
+#elif LJ_TARGET_ARM64
+static void callback_mcode_init(global_State *g, uint32_t *page)
+{
+  uint32_t *p = page;
+  void *target = (void *)lj_vm_ffi_callback;
+  MSize slot;
+  *p++ = A64I_LDRLx | A64F_D(RID_X11) | A64F_S19(4);
+  *p++ = A64I_LDRLx | A64F_D(RID_X10) | A64F_S19(5);
+  *p++ = A64I_BR | A64F_N(RID_X11);
+  *p++ = A64I_NOP;
+  ((void **)p)[0] = target;
+  ((void **)p)[1] = g;
+  p += 4;
+  for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {
+    *p++ = A64I_MOVZw | A64F_D(RID_X9) | A64F_U16(slot);
+    *p = A64I_B | A64F_S26((page-p) & 0x03ffffffu);
+    p++;
+  }
+  lua_assert(p - page <= CALLBACK_MCODE_SIZE);
+}
 #elif LJ_TARGET_PPC
 static void callback_mcode_init(global_State *g, uint32_t *page)
 {
@@ -351,6 +372,29 @@ void lj_ccallback_mcode_free(CTState *cts)
     goto done; \
   } CALLBACK_HANDLE_REGARG_FP2
 
+#elif LJ_TARGET_ARM64
+
+#define CALLBACK_HANDLE_REGARG \
+  if (isfp) { \
+    if (nfpr + n <= CCALL_NARG_FPR) { \
+      sp = &cts->cb.fpr[nfpr]; \
+      nfpr += n; \
+      goto done; \
+    } else { \
+      nfpr = CCALL_NARG_FPR;  /* Prevent reordering. */ \
+    } \
+  } else { \
+    if (!LJ_TARGET_IOS && n > 1) \
+      ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
+    if (ngpr + n <= maxgpr) { \
+      sp = &cts->cb.gpr[ngpr]; \
+      ngpr += n; \
+      goto done; \
+    } else { \
+      ngpr = CCALL_NARG_GPR;  /* Prevent reordering. */ \
+    } \
+  }
+
 #elif LJ_TARGET_PPC
 
 #define CALLBACK_HANDLE_REGARG \

+ 2 - 0
src/lj_target.h

@@ -138,6 +138,8 @@ typedef uint32_t RegCost;
 #include "lj_target_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_target_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_target_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_target_ppc.h"
 #elif LJ_TARGET_MIPS

+ 97 - 0
src/lj_target_arm64.h

@@ -0,0 +1,97 @@
+/*
+** Definitions for ARM64 CPUs.
+** Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_TARGET_ARM64_H
+#define _LJ_TARGET_ARM64_H
+
+/* -- Registers IDs ------------------------------------------------------- */
+
+#define GPRDEF(_) \
+  _(X0) _(X1) _(X2) _(X3) _(X4) _(X5) _(X6) _(X7) \
+  _(X8) _(X9) _(X10) _(X11) _(X12) _(X13) _(X14) _(X15) \
+  _(X16) _(X17) _(X18) _(X19) _(X20) _(X21) _(X22) _(X23) \
+  _(X24) _(X25) _(X26) _(X27) _(X28) _(FP) _(LR) _(SP)
+#define FPRDEF(_) \
+  _(D0) _(D1) _(D2) _(D3) _(D4) _(D5) _(D6) _(D7) \
+  _(D8) _(D9) _(D10) _(D11) _(D12) _(D13) _(D14) _(D15) \
+  _(D16) _(D17) _(D18) _(D19) _(D20) _(D21) _(D22) _(D23) \
+  _(D24) _(D25) _(D26) _(D27) _(D28) _(D29) _(D30) _(D31)
+#define VRIDDEF(_)
+
+#define RIDENUM(name)	RID_##name,
+
+enum {
+  GPRDEF(RIDENUM)		/* General-purpose registers (GPRs). */
+  FPRDEF(RIDENUM)		/* Floating-point registers (FPRs). */
+  RID_MAX,
+  RID_TMP = RID_LR,
+  RID_ZERO = RID_SP,
+
+  /* Calling conventions. */
+  RID_RET = RID_X0,
+  RID_FPRET = RID_D0,
+
+  /* These definitions must match with the *.dasc file(s): */
+  RID_BASE = RID_X19,		/* Interpreter BASE. */
+  RID_LPC = RID_X21,		/* Interpreter PC. */
+  RID_GL = RID_X22,		/* Interpreter GL. */
+  RID_LREG = RID_X23,		/* Interpreter L. */
+
+  /* Register ranges [min, max) and number of registers. */
+  RID_MIN_GPR = RID_X0,
+  RID_MAX_GPR = RID_SP+1,
+  RID_MIN_FPR = RID_MAX_GPR,
+  RID_MAX_FPR = RID_D31+1,
+  RID_NUM_GPR = RID_MAX_GPR - RID_MIN_GPR,
+  RID_NUM_FPR = RID_MAX_FPR - RID_MIN_FPR
+};
+
+#define RID_NUM_KREF		RID_NUM_GPR
+#define RID_MIN_KREF		RID_X0
+
+/* -- Register sets ------------------------------------------------------- */
+
+/* Make use of all registers, except for x18, fp, lr and sp. */
+#define RSET_FIXED \
+  (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP))
+#define RSET_GPR	(RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED)
+#define RSET_FPR	RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)
+#define RSET_ALL	(RSET_GPR|RSET_FPR)
+#define RSET_INIT	RSET_ALL
+
+/* lr is an implicit scratch register. */
+#define RSET_SCRATCH_GPR	(RSET_RANGE(RID_X0, RID_X17+1))
+#define RSET_SCRATCH_FPR \
+  (RSET_RANGE(RID_D0, RID_D7+1)|RSET_RANGE(RID_D16, RID_D31+1))
+#define RSET_SCRATCH		(RSET_SCRATCH_GPR|RSET_SCRATCH_FPR)
+#define REGARG_FIRSTGPR		RID_X0
+#define REGARG_LASTGPR		RID_X7
+#define REGARG_NUMGPR		8
+#define REGARG_FIRSTFPR		RID_D0
+#define REGARG_LASTFPR		RID_D7
+#define REGARG_NUMFPR		8
+
+/* -- Instructions -------------------------------------------------------- */
+
+/* Instruction fields. */
+#define A64F_D(r)	(r)
+#define A64F_N(r)       ((r) << 5)
+#define A64F_A(r)       ((r) << 10)
+#define A64F_M(r)       ((r) << 16)
+#define A64F_U16(x)	((x) << 5)
+#define A64F_S26(x)	(x)
+#define A64F_S19(x)	((x) << 5)
+
+typedef enum A64Ins {
+  A64I_MOVZw = 0x52800000,
+  A64I_MOVZx = 0xd2800000,
+  A64I_LDRLw = 0x18000000,
+  A64I_LDRLx = 0x58000000,
+  A64I_NOP = 0xd503201f,
+  A64I_B = 0x14000000,
+  A64I_BR = 0xd61f0000,
+} A64Ins;
+
+#endif

+ 121 - 9
src/vm_arm64.dasc

@@ -853,7 +853,8 @@ static void build_subroutines(BuildCtx *ctx)
   |   str PC, SAVE_PC
   |  add CARG3, RA, NARGS8:RC
   |  bl extern lj_meta_call	// (lua_State *L, TValue *func, TValue *top)
-  |  ldp LFUNC:CARG3, PC, [RA, FRAME_FUNC]  // Guaranteed to be a function here.
+  |  ldr LFUNC:CARG3, [RA, FRAME_FUNC]	// Guaranteed to be a function here.
+  |   ldr PC, [BASE, FRAME_PC]
   |   add NARGS8:RC, NARGS8:RC, #8	// Got one more argument now.
   |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
   |  b ->BC_CALLT2_Z
@@ -1859,18 +1860,89 @@ static void build_subroutines(BuildCtx *ctx)
   |// Saveregs already performed. Callback slot number in [sp], g in r12.
   |->vm_ffi_callback:
   |.if FFI
-  |  NYI
+  |.type CTSTATE, CTState, PC
+  |  saveregs
+  |  ldr CTSTATE, GL:x10->ctype_state
+  |  mov GL, x10
+  |    add x10, sp, # CFRAME_SPACE
+  |  str w9, CTSTATE->cb.slot
+  |  stp x0, x1, CTSTATE->cb.gpr[0]
+  |   stp d0, d1, CTSTATE->cb.fpr[0]
+  |  stp x2, x3, CTSTATE->cb.gpr[2]
+  |   stp d2, d3, CTSTATE->cb.fpr[2]
+  |  stp x4, x5, CTSTATE->cb.gpr[4]
+  |   stp d4, d5, CTSTATE->cb.fpr[4]
+  |  stp x6, x7, CTSTATE->cb.gpr[6]
+  |   stp d6, d7, CTSTATE->cb.fpr[6]
+  |    str x10, CTSTATE->cb.stack
+  |  mov CARG1, CTSTATE
+  |   str CTSTATE, SAVE_PC		// Any value outside of bytecode is ok.
+  |  mov CARG2, sp
+  |  bl extern lj_ccallback_enter	// (CTState *cts, void *cf)
+  |  // Returns lua_State *.
+  |  ldp BASE, RC, L:CRET1->base
+  |   movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+  |   movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+  |   movn TISNIL, #0
+  |   mov L, CRET1
+  |  ldr LFUNC:CARG3, [BASE, FRAME_FUNC]
+  |  sub RC, RC, BASE
+  |   st_vmstate ST_INTERP
+  |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |  ins_callt
   |.endif
   |
   |->cont_ffi_callback:			// Return from FFI callback.
   |.if FFI
-  |  NYI
+  |  ldr CTSTATE, GL->ctype_state
+  |   stp BASE, CARG4, L->base
+  |  str L, CTSTATE->L
+  |  mov CARG1, CTSTATE
+  |  mov CARG2, RA
+  |  bl extern lj_ccallback_leave       // (CTState *cts, TValue *o)
+  |  ldp x0, x1, CTSTATE->cb.gpr[0]
+  |   ldp d0, d1, CTSTATE->cb.fpr[0]
+  |  b ->vm_leave_unw
   |.endif
   |
   |->vm_ffi_call:			// Call C function via FFI.
   |  // Caveat: needs special frame unwinding, see below.
   |.if FFI
-  |  NYI
+  |  .type CCSTATE, CCallState, x19
+  |  stp fp, lr, [sp, #-32]!
+  |  add fp, sp, #0
+  |  str CCSTATE, [sp, #16]
+  |  mov CCSTATE, x0
+  |  ldr TMP0w, CCSTATE:x0->spadj
+  |   ldrb TMP1w, CCSTATE->nsp
+  |    add TMP2, CCSTATE, #offsetof(CCallState, stack)
+  |   subs TMP1, TMP1, #1
+  |    ldr TMP3, CCSTATE->func
+  |  sub sp, fp, TMP0
+  |   bmi >2
+  |1:  // Copy stack slots
+  |  ldr TMP0, [TMP2, TMP1, lsl #3]
+  |  str TMP0, [sp, TMP1, lsl #3]
+  |  subs TMP1, TMP1, #1
+  |  bpl <1
+  |2:
+  |  ldp x0, x1, CCSTATE->gpr[0]
+  |   ldp d0, d1, CCSTATE->fpr[0]
+  |  ldp x2, x3, CCSTATE->gpr[2]
+  |   ldp d2, d3, CCSTATE->fpr[2]
+  |  ldp x4, x5, CCSTATE->gpr[4]
+  |   ldp d4, d5, CCSTATE->fpr[4]
+  |  ldp x6, x7, CCSTATE->gpr[6]
+  |   ldp d6, d7, CCSTATE->fpr[6]
+  |  ldr x8, CCSTATE->retp
+  |  blr TMP3
+  |  mov sp, fp
+  |  stp x0, x1, CCSTATE->gpr[0]
+  |   stp d0, d1, CCSTATE->fpr[0]
+  |   stp d2, d3, CCSTATE->fpr[2]
+  |  ldr CCSTATE, [sp, #16]
+  |  ldp fp, lr, [sp], #32
+  |  ret
   |.endif
   |// Note: vm_ffi_call must be the last function in this object file!
   |
@@ -2087,7 +2159,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |
     |.if FFI
     |7:
-    |  asr ITYPE, TMP0, #47
+    |  asr ITYPE, CARG1, #47
     |  cmn ITYPE, #-LJ_TCDATA
     |  bne <2
     |  b ->vmeta_equal_cd
@@ -3600,7 +3672,19 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.align 3\n"
 	".LEFDE0:\n\n");
 #if LJ_HASFFI
-#error "NYI"
+    fprintf(ctx->fp,
+	".LSFDE1:\n"
+	"\t.long .LEFDE1-.LASFDE1\n"
+	".LASFDE1:\n"
+	"\t.long .Lframe0\n"
+	"\t.quad lj_vm_ffi_call\n"
+	"\t.quad %d\n"
+	"\t.byte 0xe\n\t.uleb128 32\n"		/* def_cfa_offset */
+	"\t.byte 0x9d\n\t.uleb128 4\n"		/* offset fp */
+	"\t.byte 0x9e\n\t.uleb128 3\n"		/* offset lr */
+	"\t.byte 0x93\n\t.uleb128 2\n"		/* offset x19 */
+	"\t.align 3\n"
+	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
 #endif
     fprintf(ctx->fp, "\t.section .eh_frame,\"a\",%%progbits\n");
     fprintf(ctx->fp,
@@ -3615,7 +3699,7 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.byte 30\n"				/* Return address is in lr. */
 	"\t.uleb128 6\n"			/* augmentation length */
 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
-        "\t.long lj_err_unwind_dwarf-.\n"
+	"\t.long lj_err_unwind_dwarf-.\n"
 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
 	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
 	"\t.align 3\n"
@@ -3627,7 +3711,7 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.long .LASFDE2-.Lframe1\n"
 	"\t.long .Lbegin-.\n"
 	"\t.long %d\n"
-        "\t.uleb128 0\n"			/* augmentation length */
+	"\t.uleb128 0\n"			/* augmentation length */
 	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
 	"\t.byte 0x9d\n\t.uleb128 %d\n"		/* offset fp */
 	"\t.byte 0x9e\n\t.uleb128 %d\n",	/* offset lr */
@@ -3641,7 +3725,35 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.align 3\n"
 	".LEFDE2:\n\n");
 #if LJ_HASFFI
-#error "NYI"
+    fprintf(ctx->fp,
+	".Lframe2:\n"
+	"\t.long .LECIE2-.LSCIE2\n"
+	".LSCIE2:\n"
+	"\t.long 0\n"
+	"\t.byte 0x1\n"
+	"\t.string \"zR\"\n"
+	"\t.uleb128 0x1\n"
+	"\t.sleb128 -8\n"
+	"\t.byte 30\n"				/* Return address is in lr. */
+	"\t.uleb128 1\n"                        /* augmentation length */
+	"\t.byte 0x1b\n"                        /* pcrel|sdata4 */
+	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
+	"\t.align 3\n"
+	".LECIE2:\n\n");
+    fprintf(ctx->fp,
+	".LSFDE3:\n"
+	"\t.long .LEFDE3-.LASFDE3\n"
+	".LASFDE3:\n"
+	"\t.long .LASFDE3-.Lframe2\n"
+	"\t.long lj_vm_ffi_call-.\n"
+	"\t.long %d\n"
+	"\t.uleb128 0\n"                        /* augmentation length */
+	"\t.byte 0xe\n\t.uleb128 32\n"		/* def_cfa_offset */
+	"\t.byte 0x9d\n\t.uleb128 4\n"		/* offset fp */
+	"\t.byte 0x9e\n\t.uleb128 3\n"		/* offset lr */
+	"\t.byte 0x93\n\t.uleb128 2\n"		/* offset x19 */
+	"\t.align 3\n"
+	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
 #endif
     break;
   default: