
Add trace stitching.

Mike Pall committed 12 years ago
commit b5d741fa7e
17 changed files with 422 additions and 69 deletions
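For orientation before the per-file diffs: trace stitching lets the recorder end the current trace at a call it cannot compile (a classic C function or an unsupported fast-function variant), run that call in the interpreter, and start a new trace right behind it, linking the two with the new LJ_TRLINK_STITCH link type. Previously such calls aborted the whole trace with the NYICF/NYIFF errors this commit removes from lj_traceerr.h. A minimal Lua sketch of the user-visible effect (illustrative only, not part of the commit):

-- os.clock() is a plain C library function without a recorder, so the loop
-- below used to abort trace recording with "NYI: C function"; with stitching
-- the call runs interpreted and the surrounding loop still gets compiled.
-- The new "stitch" link type should then show up in -jv or -jdump output.
local t = 0
for i = 1, 1e5 do
  t = t + os.clock()
end
print(t)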
  1. src/lib_base.c	+2 -2
  2. src/lib_jit.c	+1 -1
  3. src/lj_dispatch.c	+23 -0
  4. src/lj_dispatch.h	+5 -1
  5. src/lj_ffrecord.c	+96 -34
  6. src/lj_jit.h	+3 -1
  7. src/lj_record.c	+39 -18
  8. src/lj_record.h	+1 -0
  9. src/lj_snap.c	+2 -1
  10. src/lj_trace.c	+27 -3
  11. src/lj_trace.h	+1 -0
  12. src/lj_traceerr.h	+1 -2
  13. src/lj_vm.h	+1 -0
  14. src/vm_arm.dasc	+50 -0
  15. src/vm_mips.dasc	+56 -2
  16. src/vm_ppc.dasc	+52 -3
  17. src/vm_x86.dasc	+62 -1

+ 2 - 2
src/lib_base.c

@@ -101,7 +101,7 @@ static int ffh_pairs(lua_State *L, MMS mm)
 #endif
 
 LJLIB_PUSH(lastcl)
-LJLIB_ASM(pairs)
+LJLIB_ASM(pairs)		LJLIB_REC(xpairs 0)
 {
   return ffh_pairs(L, MM_pairs);
 }
@@ -114,7 +114,7 @@ LJLIB_NOREGUV LJLIB_ASM(ipairs_aux)	LJLIB_REC(.)
 }
 
 LJLIB_PUSH(lastcl)
-LJLIB_ASM(ipairs)		LJLIB_REC(.)
+LJLIB_ASM(ipairs)		LJLIB_REC(xpairs 1)
 {
   return ffh_pairs(L, MM_ipairs);
 }

+ 1 - 1
src/lib_jit.c

@@ -284,7 +284,7 @@ static GCtrace *jit_checktrace(lua_State *L)
 /* Names of link types. ORDER LJ_TRLINK */
 static const char *const jit_trlinkname[] = {
   "none", "root", "loop", "tail-recursion", "up-recursion", "down-recursion",
-  "interpreter", "return"
+  "interpreter", "return", "stitch"
 };
 
 /* local info = jit.util.traceinfo(tr) */

+ 23 - 0
src/lj_dispatch.c

@@ -42,6 +42,12 @@ LJ_STATIC_ASSERT(GG_NUM_ASMFF == FF_NUM_ASMFUNC);
 #include <math.h>
 LJ_FUNCA_NORET void LJ_FASTCALL lj_ffh_coroutine_wrap_err(lua_State *L,
 							  lua_State *co);
+#if !LJ_HASJIT
+#define lj_dispatch_stitch	lj_dispatch_ins
+#endif
+#if !LJ_HASPROFILE
+#define lj_dispatch_profile	lj_dispatch_ins
+#endif
 
 #define GOTFUNC(name)	(ASMFunction)name,
 static const ASMFunction dispatch_got[] = {
@@ -511,6 +517,23 @@ out:
   return makeasmfunc(lj_bc_ofs[op]);  /* Return static dispatch target. */
 }
 
+#if LJ_HASJIT
+/* Stitch a new trace. */
+void LJ_FASTCALL lj_dispatch_stitch(jit_State *J, const BCIns *pc)
+{
+  ERRNO_SAVE
+  lua_State *L = J->L;
+  void *cf = cframe_raw(L->cframe);
+  const BCIns *oldpc = cframe_pc(cf);
+  setcframe_pc(cf, pc);
+  /* Before dispatch, have to bias PC by 1. */
+  L->top = L->base + cur_topslot(curr_proto(L), pc+1, cframe_multres_n(cf));
+  lj_trace_stitch(J, pc-1);  /* Point to the CALL instruction. */
+  setcframe_pc(cf, oldpc);
+  ERRNO_RESTORE
+}
+#endif
+
 #if LJ_HASPROFILE
 /* Profile dispatch. */
 void LJ_FASTCALL lj_dispatch_profile(lua_State *L, const BCIns *pc)

+ 5 - 1
src/lj_dispatch.h

@@ -29,7 +29,8 @@
   _(floor) _(ceil) _(trunc) _(log) _(log10) _(exp) _(sin) _(cos) _(tan) \
   _(asin) _(acos) _(atan) _(sinh) _(cosh) _(tanh) _(frexp) _(modf) _(atan2) \
   _(pow) _(fmod) _(ldexp) \
-  _(lj_dispatch_call) _(lj_dispatch_ins) _(lj_dispatch_profile) _(lj_err_throw)\
+  _(lj_dispatch_call) _(lj_dispatch_ins) _(lj_dispatch_stitch) \
+  _(lj_dispatch_profile) _(lj_err_throw) \
   _(lj_ffh_coroutine_wrap_err) _(lj_func_closeuv) _(lj_func_newL_gc) \
   _(lj_gc_barrieruv) _(lj_gc_step) _(lj_gc_step_fixtop) _(lj_meta_arith) \
   _(lj_meta_call) _(lj_meta_cat) _(lj_meta_comp) _(lj_meta_equal) \
@@ -110,6 +111,9 @@ LJ_FUNC void lj_dispatch_update(global_State *g);
 /* Instruction dispatch callback for hooks or when recording. */
 LJ_FUNCA void LJ_FASTCALL lj_dispatch_ins(lua_State *L, const BCIns *pc);
 LJ_FUNCA ASMFunction LJ_FASTCALL lj_dispatch_call(lua_State *L, const BCIns*pc);
+#if LJ_HASJIT
+LJ_FUNCA void LJ_FASTCALL lj_dispatch_stitch(jit_State *J, const BCIns *pc);
+#endif
 #if LJ_HASPROFILE
 LJ_FUNCA void LJ_FASTCALL lj_dispatch_profile(lua_State *L, const BCIns *pc);
 #endif

+ 96 - 34
src/lj_ffrecord.c

@@ -96,28 +96,81 @@ static ptrdiff_t results_wanted(jit_State *J)
     return -1;
 }
 
-/* Throw error for unsupported variant of fast function. */
-LJ_NORET static void recff_nyiu(jit_State *J)
+/* Trace stitching: add continuation below frame to start a new trace. */
+static void recff_stitch(jit_State *J)
 {
-  setfuncV(J->L, &J->errinfo, J->fn);
-  lj_trace_err_info(J, LJ_TRERR_NYIFFU);
+  ASMFunction cont = lj_cont_stitch;
+  TraceNo traceno = J->cur.traceno;
+  lua_State *L = J->L;
+  TValue *base = L->base;
+  const BCIns *pc = frame_pc(base-1);
+  TValue *pframe = frame_prevl(base-1);
+  TRef trcont;
+
+  /* Move func + args up in Lua stack and insert continuation. */
+  memmove(&base[1], &base[-1], sizeof(TValue)*(J->maxslot+1));
+  setframe_ftsz(base+1, (int)((char *)(base+1) - (char *)pframe) + FRAME_CONT);
+  setcont(base, cont);
+  setframe_pc(base, pc);
+  if (LJ_DUALNUM) setintV(base-1, traceno); else base[-1].u64 = traceno;
+  L->base += 2;
+  L->top += 2;
+
+  /* Ditto for the IR. */
+  memmove(&J->base[1], &J->base[-1], sizeof(TRef)*(J->maxslot+1));
+#if LJ_64
+  trcont = lj_ir_kptr(J, (void *)((int64_t)cont-(int64_t)lj_vm_asm_begin));
+#else
+  trcont = lj_ir_kptr(J, (void *)cont);
+#endif
+  J->base[0] = trcont | TREF_CONT;
+  J->base[-1] = LJ_DUALNUM ? lj_ir_kint(J,traceno) : lj_ir_knum_u64(J,traceno);
+  J->maxslot += 2;
+  J->framedepth++;
+
+  lj_record_stop(J, LJ_TRLINK_STITCH, 0);
+
+  /* Undo Lua stack changes. */
+  memmove(&base[-1], &base[1], sizeof(TValue)*(J->maxslot+1));
+  setframe_pc(base-1, pc);
+  L->base -= 2;
+  L->top -= 2;
 }
 
-/* Fallback handler for all fast functions that are not recorded (yet). */
+/* Fallback handler for fast functions that are not recorded (yet). */
 static void LJ_FASTCALL recff_nyi(jit_State *J, RecordFFData *rd)
 {
-  setfuncV(J->L, &J->errinfo, J->fn);
-  lj_trace_err_info(J, LJ_TRERR_NYIFF);
-  UNUSED(rd);
+  if (J->cur.nins < (IRRef)J->param[JIT_P_minstitch] + REF_BASE) {
+    lj_trace_err_info(J, LJ_TRERR_TRACEUV);
+  } else {
+    /* Can only stitch from Lua call. */
+    if (J->framedepth && frame_islua(J->L->base-1)) {
+      BCOp op = bc_op(*frame_pc(J->L->base-1));
+      /* Stitched trace cannot start with *M op with variable # of args. */
+      if (!(op == BC_CALLM || op == BC_RETM || op == BC_TSETM)) {
+	switch (J->fn->c.ffid) {
+	case FF_error:
+	case FF_debug_sethook:
+	case FF_jit_flush:
+	  break;  /* Don't stitch across special builtins. */
+	default:
+	  recff_stitch(J);  /* Use trace stitching. */
+	  rd->nres = -1;
+	  return;
+	}
+      }
+    }
+    /* Otherwise stop trace and return to interpreter. */
+    lj_record_stop(J, LJ_TRLINK_RETURN, 0);
+    rd->nres = -1;
+  }
 }
 
-/* C functions can have arbitrary side-effects and are not recorded (yet). */
-static void LJ_FASTCALL recff_c(jit_State *J, RecordFFData *rd)
-{
-  setfuncV(J->L, &J->errinfo, J->fn);
-  lj_trace_err_info(J, LJ_TRERR_NYICF);
-  UNUSED(rd);
-}
+/* Fallback handler for unsupported variants of fast functions. */
+#define recff_nyiu	recff_nyi
+
+/* Must stop the trace for classic C functions with arbitrary side-effects. */
+#define recff_c		recff_nyi
 
 /* Emit BUFHDR for the global temporary buffer. */
 static TRef recff_bufhdr(jit_State *J)
@@ -268,7 +321,8 @@ static void LJ_FASTCALL recff_select(jit_State *J, RecordFFData *rd)
 	  J->base[i] = J->base[start+i];
       }  /* else: Interpreter will throw. */
     } else {
-      recff_nyiu(J);
+      recff_nyiu(J, rd);
+      return;
     }
   }  /* else: Interpreter will throw. */
 }
@@ -279,14 +333,18 @@ static void LJ_FASTCALL recff_tonumber(jit_State *J, RecordFFData *rd)
   TRef base = J->base[1];
   if (tr && !tref_isnil(base)) {
     base = lj_opt_narrow_toint(J, base);
-    if (!tref_isk(base) || IR(tref_ref(base))->i != 10)
-      recff_nyiu(J);
+    if (!tref_isk(base) || IR(tref_ref(base))->i != 10) {
+      recff_nyiu(J, rd);
+      return;
+    }
   }
   if (tref_isnumber_str(tr)) {
     if (tref_isstr(tr)) {
       TValue tmp;
-      if (!lj_strscan_num(strV(&rd->argv[0]), &tmp))
-	recff_nyiu(J);  /* Would need an inverted STRTO for this case. */
+      if (!lj_strscan_num(strV(&rd->argv[0]), &tmp)) {
+	recff_nyiu(J, rd);  /* Would need an inverted STRTO for this case. */
+	return;
+      }
       tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);
     }
 #if LJ_HASFFI
@@ -348,7 +406,8 @@ static void LJ_FASTCALL recff_tostring(jit_State *J, RecordFFData *rd)
     } else if (tref_ispri(tr)) {
       J->base[0] = lj_ir_kstr(J, lj_strfmt_obj(J->L, &rd->argv[0]));
     } else {
-      recff_nyiu(J);
+      recff_nyiu(J, rd);
+      return;
     }
   }
 }
@@ -370,14 +429,14 @@ static void LJ_FASTCALL recff_ipairs_aux(jit_State *J, RecordFFData *rd)
   }  /* else: Interpreter will throw. */
 }
 
-static void LJ_FASTCALL recff_ipairs(jit_State *J, RecordFFData *rd)
+static void LJ_FASTCALL recff_xpairs(jit_State *J, RecordFFData *rd)
 {
   if (!(LJ_52 && recff_metacall(J, rd, MM_ipairs))) {
     TRef tab = J->base[0];
     if (tref_istab(tab)) {
       J->base[0] = lj_ir_kfunc(J, funcV(&J->fn->c.upvalue[0]));
       J->base[1] = tab;
-      J->base[2] = lj_ir_kint(J, 0);
+      J->base[2] = rd->data ? lj_ir_kint(J, 0) : TREF_NIL;
       rd->nres = 3;
     }  /* else: Interpreter will throw. */
   }
@@ -431,8 +490,7 @@ static void LJ_FASTCALL recff_getfenv(jit_State *J, RecordFFData *rd)
     J->base[0] = emitir(IRT(IR_FLOAD, IRT_TAB), trl, IRFL_THREAD_ENV);
     return;
   }
-  recff_nyiu(J);
-  UNUSED(rd);
+  recff_nyiu(J, rd);
 }
 
 /* -- Math library fast functions ----------------------------------------- */
@@ -672,8 +730,7 @@ static void LJ_FASTCALL recff_bit_tohex(jit_State *J, RecordFFData *rd)
   TRef tr = recff_bit64_tohex(J, rd, hdr);
   J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
 #else
-  UNUSED(rd);
-  recff_nyiu(J);  /* Don't bother working around this NYI. */
+  recff_nyiu(J, rd);  /* Don't bother working around this NYI. */
 #endif
 }
 
@@ -891,7 +948,8 @@ static void LJ_FASTCALL recff_string_find(jit_State *J, RecordFFData *rd)
       J->base[0] = TREF_NIL;
     }
   } else {  /* Search for pattern. */
-    recff_nyiu(J);
+    recff_nyiu(J, rd);
+    return;
   }
 }
 
@@ -931,7 +989,8 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd)
 	tr = lj_ir_call(J, IRCALL_lj_strfmt_putfxint, tr, trsf, tra);
 	lj_needsplit(J);
 #else
-	recff_nyiu(J);  /* Don't bother working around this NYI. */
+	recff_nyiu(J, rd);  /* Don't bother working around this NYI. */
+	return;
 #endif
       }
       break;
@@ -946,8 +1005,10 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd)
       if (LJ_SOFTFP) lj_needsplit(J);
       break;
     case STRFMT_STR:
-      if (!tref_isstr(tra))
-	recff_nyiu(J);  /* NYI: __tostring and non-string types for %s. */
+      if (!tref_isstr(tra)) {
+	recff_nyiu(J, rd);  /* NYI: __tostring and non-string types for %s. */
+	return;
+      }
       if (sf == STRFMT_STR)  /* Shortcut for plain %s. */
 	tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, tra);
       else if ((sf & STRFMT_T_QUOTED))
@@ -966,8 +1027,8 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd)
     case STRFMT_PTR:  /* NYI */
     case STRFMT_ERR:
     default:
-      recff_nyiu(J);
-      break;
+      recff_nyiu(J, rd);
+      return;
     }
   }
   J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr);
@@ -991,7 +1052,8 @@ static void LJ_FASTCALL recff_table_insert(jit_State *J, RecordFFData *rd)
       ix.idxchain = 0;
       lj_record_idx(J, &ix);  /* Set new value. */
     } else {  /* Complex case: insert in the middle. */
-      recff_nyiu(J);
+      recff_nyiu(J, rd);
+      return;
     }
   }  /* else: Interpreter will throw. */
 }
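A note on the recff_nyi fallback above: stitching only happens when the trace already contains at least minstitch IR instructions, when the builtin was reached from a plain Lua CALL (not a CALLM/RETM/TSETM with a variable number of arguments), and when it is not error, debug.sethook or jit.flush; otherwise the trace simply ends with a return-to-interpreter link. Since minstitch is declared in lj_jit.h like the other JIT parameters, it should be tunable through jit.opt.start — a hypothetical Lua sketch, not taken from the commit:

-- Raise the threshold so that very short traces are not stitched;
-- "minstitch" defaults to 0 per the new entry in lj_jit.h.
jit.opt.start("minstitch=16")

local hits = 0
for i = 1, 1e5 do
  -- string.find() with a pattern is an unsupported fast-function variant:
  -- with this change it is stitched around instead of aborting the trace.
  if ("line " .. i):find("%d+") then hits = hits + 1 end
end
print(hits)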

+ 3 - 1
src/lj_jit.h

@@ -97,6 +97,7 @@
   _(\012, maxirconst,	500)	/* Max. # of IR constants of a trace. */ \
   _(\007, maxside,	100)	/* Max. # of side traces of a root trace. */ \
   _(\007, maxsnap,	500)	/* Max. # of snapshots for a trace. */ \
+  _(\011, minstitch,	0)	/* Min. # of IR ins for a stitched trace. */ \
   \
   _(\007, hotloop,	56)	/* # of iter. to detect a hot loop/call. */ \
   _(\007, hotexit,	10)	/* # of taken exits to start a side trace. */ \
@@ -202,7 +203,8 @@ typedef enum {
   LJ_TRLINK_UPREC,		/* Up-recursion. */
   LJ_TRLINK_DOWNREC,		/* Down-recursion. */
   LJ_TRLINK_INTERP,		/* Fallback to interpreter. */
-  LJ_TRLINK_RETURN		/* Return to interpreter. */
+  LJ_TRLINK_RETURN,		/* Return to interpreter. */
+  LJ_TRLINK_STITCH		/* Trace stitching. */
 } TraceLink;
 
 /* Trace object. */

+ 39 - 18
src/lj_record.c

@@ -233,7 +233,7 @@ static void canonicalize_slots(jit_State *J)
 }
 
 /* Stop recording. */
-static void rec_stop(jit_State *J, TraceLink linktype, TraceNo lnk)
+void lj_record_stop(jit_State *J, TraceLink linktype, TraceNo lnk)
 {
   lj_trace_end(J);
   J->cur.linktype = (uint8_t)linktype;
@@ -501,8 +501,7 @@ static LoopEvent rec_for(jit_State *J, const BCIns *fori, int isforl)
 static LoopEvent rec_iterl(jit_State *J, const BCIns iterins)
 {
   BCReg ra = bc_a(iterins);
-  lua_assert(J->base[ra] != 0);
-  if (!tref_isnil(J->base[ra])) {  /* Looping back? */
+  if (!tref_isnil(getslot(J, ra))) {  /* Looping back? */
     J->base[ra-1] = J->base[ra];  /* Copy result of ITERC to control var. */
     J->maxslot = ra-1+bc_b(J->pc[-1]);
     J->pc += bc_j(iterins)+1;
@@ -540,12 +539,12 @@ static int innerloopleft(jit_State *J, const BCIns *pc)
 /* Handle the case when an interpreted loop op is hit. */
 static void rec_loop_interp(jit_State *J, const BCIns *pc, LoopEvent ev)
 {
-  if (J->parent == 0) {
+  if (J->parent == 0 && J->exitno == 0) {
     if (pc == J->startpc && J->framedepth + J->retdepth == 0) {
       /* Same loop? */
       if (ev == LOOPEV_LEAVE)  /* Must loop back to form a root trace. */
	lj_trace_err(J, LJ_TRERR_LLEAVE);
-      rec_stop(J, LJ_TRLINK_LOOP, J->cur.traceno);  /* Looping root trace. */
+      lj_record_stop(J, LJ_TRLINK_LOOP, J->cur.traceno);  /* Looping trace. */
     } else if (ev != LOOPEV_LEAVE) {  /* Entering inner loop? */
       /* It's usually better to abort here and wait until the inner loop
       ** is traced. But if the inner loop repeatedly didn't loop back,
@@ -570,15 +569,15 @@ static void rec_loop_interp(jit_State *J, const BCIns *pc, LoopEvent ev)
 /* Handle the case when an already compiled loop op is hit. */
 static void rec_loop_jit(jit_State *J, TraceNo lnk, LoopEvent ev)
 {
-  if (J->parent == 0) {  /* Root trace hit an inner loop. */
+  if (J->parent == 0 && J->exitno == 0) {  /* Root trace hit an inner loop. */
     /* Better let the inner loop spawn a side trace back here. */
     lj_trace_err(J, LJ_TRERR_LINNER);
   } else if (ev != LOOPEV_LEAVE) {  /* Side trace enters a compiled loop. */
     J->instunroll = 0;  /* Cannot continue across a compiled loop op. */
     if (J->pc == J->startpc && J->framedepth + J->retdepth == 0)
-      rec_stop(J, LJ_TRLINK_LOOP, J->cur.traceno);  /* Form an extra loop. */
+      lj_record_stop(J, LJ_TRLINK_LOOP, J->cur.traceno);  /* Form extra loop. */
     else
-      rec_stop(J, LJ_TRLINK_ROOT, lnk);  /* Link to the loop. */
+      lj_record_stop(J, LJ_TRLINK_ROOT, lnk);  /* Link to the loop. */
   }  /* Side trace continues across a loop that's left or not entered. */
 }
 
@@ -643,6 +642,18 @@ static TRef rec_call_specialize(jit_State *J, GCfunc *fn, TRef tr)
       (void)lj_ir_kgc(J, obj2gco(pt), IRT_PROTO);  /* Prevent GC of proto. */
       return tr;
     }
+  } else {
+    /* Don't specialize to non-monomorphic builtins. */
+    switch (fn->c.ffid) {
+    case FF_coroutine_wrap_aux:
+    case FF_string_gmatch_aux:
+      /* NYI: io_file_iter doesn't have an ffid, yet. */
+      /* NYI: specialize to ffid? Not strictly necessary, trace will stop. */
+      return tr;
+    default:
+      /* NYI: don't specialize to non-monomorphic C functions. */
+      break;
+    }
   }
   /* Otherwise specialize to the function (closure) value itself. */
   kfunc = lj_ir_kfunc(J, fn);
@@ -750,12 +761,13 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults)
   /* Return to lower frame via interpreter for unhandled cases. */
   if (J->framedepth == 0 && J->pt && bc_isret(bc_op(*J->pc)) &&
       (!frame_islua(frame) ||
-	(J->parent == 0 && !bc_isret(bc_op(J->cur.startins))))) {
+	(J->parent == 0 && J->exitno == 0 &&
+	 !bc_isret(bc_op(J->cur.startins))))) {
     /* NYI: specialize to frame type and return directly, not via RET*. */
     for (i = -1; i < (ptrdiff_t)rbase; i++)
       J->base[i] = 0;  /* Purge dead slots. */
     J->maxslot = rbase + (BCReg)gotresults;
-    rec_stop(J, LJ_TRLINK_RETURN, 0);  /* Return to interpreter. */
+    lj_record_stop(J, LJ_TRLINK_RETURN, 0);  /* Return to interpreter. */
     return;
   }
   if (frame_isvarg(frame)) {
@@ -779,7 +791,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults)
       if (check_downrec_unroll(J, pt)) {
	J->maxslot = (BCReg)(rbase + gotresults);
	lj_snap_purge(J);
-	rec_stop(J, LJ_TRLINK_DOWNREC, J->cur.traceno);  /* Down-recursion. */
+	lj_record_stop(J, LJ_TRLINK_DOWNREC, J->cur.traceno);  /* Down-rec. */
	return;
       }
       lj_snap_add(J);
@@ -792,7 +804,8 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults)
       lua_assert(J->baseslot > cbase+1);
       J->baseslot -= cbase+1;
       J->base -= cbase+1;
-    } else if (J->parent == 0 && !bc_isret(bc_op(J->cur.startins))) {
+    } else if (J->parent == 0 && J->exitno == 0 &&
+	       !bc_isret(bc_op(J->cur.startins))) {
       /* Return to lower frame would leave the loop in a root trace. */
       lj_trace_err(J, LJ_TRERR_LLEAVE);
     } else {  /* Return to lower frame. Guard for the target we return to. */
@@ -1480,9 +1493,9 @@ static void check_call_unroll(jit_State *J, TraceNo lnk)
     if (count + J->tailcalled > J->param[JIT_P_recunroll]) {
       J->pc++;
       if (J->framedepth + J->retdepth == 0)
-	rec_stop(J, LJ_TRLINK_TAILREC, J->cur.traceno);  /* Tail-recursion. */
+	lj_record_stop(J, LJ_TRLINK_TAILREC, J->cur.traceno);  /* Tail-rec. */
       else
-	rec_stop(J, LJ_TRLINK_UPREC, J->cur.traceno);  /* Up-recursion. */
+	lj_record_stop(J, LJ_TRLINK_UPREC, J->cur.traceno);  /* Up-recursion. */
     }
   } else {
     if (count > J->param[JIT_P_callunroll]) {
@@ -1556,9 +1569,9 @@ static void rec_func_jit(jit_State *J, TraceNo lnk)
   }
   J->instunroll = 0;  /* Cannot continue across a compiled function. */
   if (J->pc == J->startpc && J->framedepth + J->retdepth == 0)
-    rec_stop(J, LJ_TRLINK_TAILREC, J->cur.traceno);  /* Extra tail-recursion. */
+    lj_record_stop(J, LJ_TRLINK_TAILREC, J->cur.traceno);  /* Extra tail-rec. */
   else
-    rec_stop(J, LJ_TRLINK_ROOT, lnk);  /* Link to the function. */
+    lj_record_stop(J, LJ_TRLINK_ROOT, lnk);  /* Link to the function. */
 }
 
 /* -- Vararg handling ----------------------------------------------------- */
@@ -2165,7 +2178,7 @@ void lj_record_ins(jit_State *J)
   case BC_JFORI:
     lua_assert(bc_op(pc[(ptrdiff_t)rc-BCBIAS_J]) == BC_JFORL);
     if (rec_for(J, pc, 0) != LOOPEV_LEAVE)  /* Link to existing loop. */
-      rec_stop(J, LJ_TRLINK_ROOT, bc_d(pc[(ptrdiff_t)rc-BCBIAS_J]));
+      lj_record_stop(J, LJ_TRLINK_ROOT, bc_d(pc[(ptrdiff_t)rc-BCBIAS_J]));
     /* Continue tracing if the loop is not entered. */
     break;
 
@@ -2299,6 +2312,12 @@ static const BCIns *rec_setup_root(jit_State *J)
     J->maxslot = J->pt->numparams;
     pc++;
     break;
+  case BC_CALLM:
+  case BC_CALL:
+  case BC_ITERC:
+    /* No bytecode range check for stitched traces. */
+    pc++;
+    break;
   default:
     lua_assert(0);
     break;
@@ -2366,7 +2385,7 @@ void lj_record_setup(jit_State *J)
     if (traceref(J, J->cur.root)->nchild >= J->param[JIT_P_maxside] ||
	T->snap[J->exitno].count >= J->param[JIT_P_hotexit] +
				    J->param[JIT_P_tryside]) {
-      rec_stop(J, LJ_TRLINK_INTERP, 0);
+      lj_record_stop(J, LJ_TRLINK_INTERP, 0);
     }
   } else {  /* Root trace. */
     J->cur.root = 0;
@@ -2378,6 +2397,8 @@ void lj_record_setup(jit_State *J)
     lj_snap_add(J);
     if (bc_op(J->cur.startins) == BC_FORL)
       rec_for_loop(J, J->pc-1, &J->scev, 1);
+    else if (bc_op(J->cur.startins) == BC_ITERC)
+      J->startpc = NULL;
     if (1 + J->pt->framesize >= LJ_MAX_JSLOTS)
       lj_trace_err(J, LJ_TRERR_STACKOV);
   }

+ 1 - 0
src/lj_record.h

@@ -28,6 +28,7 @@ typedef struct RecordIndex {
 
 LJ_FUNC int lj_record_objcmp(jit_State *J, TRef a, TRef b,
			     cTValue *av, cTValue *bv);
+LJ_FUNC void lj_record_stop(jit_State *J, TraceLink linktype, TraceNo lnk);
 LJ_FUNC TRef lj_record_constify(jit_State *J, cTValue *o);
 
 LJ_FUNC void lj_record_call(jit_State *J, BCReg func, ptrdiff_t nargs);

+ 2 - 1
src/lj_snap.c

@@ -97,7 +97,8 @@ static BCReg snapshot_framelinks(jit_State *J, SnapEntry *map)
 {
   cTValue *frame = J->L->base - 1;
   cTValue *lim = J->L->base - J->baseslot;
-  cTValue *ftop = frame + funcproto(frame_func(frame))->framesize;
+  GCfunc *fn = frame_func(frame);
+  cTValue *ftop = isluafunc(fn) ? (frame+funcproto(fn)->framesize) : J->L->top;
   MSize f = 0;
   map[f++] = SNAP_MKPC(J->pc);  /* The current PC is always the first entry. */
   while (frame > lim) {  /* Backwards traversal of all frames above base. */

+ 27 - 3
src/lj_trace.c

@@ -360,7 +360,7 @@ static void trace_start(jit_State *J)
   TraceNo traceno;
 
   if ((J->pt->flags & PROTO_NOJIT)) {  /* JIT disabled for this proto? */
-    if (J->parent == 0) {
+    if (J->parent == 0 && J->exitno == 0) {
       /* Lazy bytecode patching to disable hotcount events. */
       lua_assert(bc_op(*J->pc) == BC_FORL || bc_op(*J->pc) == BC_ITERL ||
		 bc_op(*J->pc) == BC_LOOP || bc_op(*J->pc) == BC_FUNCF);
@@ -453,6 +453,12 @@ static void trace_stop(jit_State *J)
       root->nextside = (TraceNo1)traceno;
     }
     break;
+  case BC_CALLM:
+  case BC_CALL:
+  case BC_ITERC:
+    /* Trace stitching: patch link of previous trace. */
+    traceref(J, J->exitno)->link = traceno;
+    break;
   default:
     lua_assert(0);
     break;
@@ -502,8 +508,12 @@ static int trace_abort(jit_State *J)
     return 1;  /* Retry ASM with new MCode area. */
   }
   /* Penalize or blacklist starting bytecode instruction. */
-  if (J->parent == 0 && !bc_isret(bc_op(J->cur.startins)))
-    penalty_pc(J, &gcref(J->cur.startpt)->pt, mref(J->cur.startpc, BCIns), e);
+  if (J->parent == 0 && !bc_isret(bc_op(J->cur.startins))) {
+    if (J->exitno == 0)
+      penalty_pc(J, &gcref(J->cur.startpt)->pt, mref(J->cur.startpc, BCIns), e);
+    else
+      traceref(J, J->exitno)->link = J->exitno;  /* Self-link is blacklisted. */
+  }
 
   /* Is there anything to abort? */
   traceno = J->cur.traceno;
@@ -680,6 +690,20 @@ static void trace_hotside(jit_State *J, const BCIns *pc)
   }
 }
 
+/* Stitch a new trace to the previous trace. */
+void LJ_FASTCALL lj_trace_stitch(jit_State *J, const BCIns *pc)
+{
+  /* Only start a new trace if not recording or inside __gc call or vmevent. */
+  if (J->state == LJ_TRACE_IDLE &&
+      !(J2G(J)->hookmask & (HOOK_GC|HOOK_VMEVENT))) {
+    J->parent = 0;  /* Have to treat it like a root trace. */
+    /* J->exitno is set to the invoking trace. */
+    J->state = LJ_TRACE_START;
+    lj_trace_ins(J, pc);
+  }
+}
+
+
 /* Tiny struct to pass data to protected call. */
 typedef struct ExitDataCP {
   jit_State *J;

+ 1 - 0
src/lj_trace.h

@@ -34,6 +34,7 @@ LJ_FUNC void lj_trace_freestate(global_State *g);
 /* Event handling. */
 LJ_FUNC void lj_trace_ins(jit_State *J, const BCIns *pc);
 LJ_FUNCA void LJ_FASTCALL lj_trace_hot(jit_State *J, const BCIns *pc);
+LJ_FUNCA void LJ_FASTCALL lj_trace_stitch(jit_State *J, const BCIns *pc);
 LJ_FUNCA int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr);
 
 /* Signal asynchronous abort of trace or end of trace. */

+ 1 - 2
src/lj_traceerr.h

@@ -7,6 +7,7 @@
 
 /* Recording. */
 TREDEF(RECERR,	"error thrown or hook called during recording")
+TREDEF(TRACEUV,	"trace too short")
 TREDEF(TRACEOV,	"trace too long")
 TREDEF(STACKOV,	"trace too deep")
 TREDEF(SNAPOV,	"too many snapshots")
@@ -23,8 +24,6 @@ TREDEF(BADTYPE,	"bad argument type")
 TREDEF(CJITOFF,	"JIT compilation disabled for function")
 TREDEF(CUNROLL,	"call unroll limit reached")
 TREDEF(DOWNREC,	"down-recursion, restarting")
-TREDEF(NYICF,	"NYI: C function %p")
-TREDEF(NYIFF,	"NYI: FastFunc %s")
 TREDEF(NYIFFU,	"NYI: unsupported variant of FastFunc %s")
 TREDEF(NYIRETL,	"NYI: return to lower frame")
 

+ 1 - 0
src/lj_vm.h

@@ -107,6 +107,7 @@ LJ_ASMF void lj_cont_nop(void);  /* Do nothing, just continue execution. */
 LJ_ASMF void lj_cont_condt(void);  /* Branch if result is true. */
 LJ_ASMF void lj_cont_condf(void);  /* Branch if result is false. */
 LJ_ASMF void lj_cont_hook(void);  /* Continue from hook yield. */
+LJ_ASMF void lj_cont_stitch(void);  /* Trace stitching. */
 
 enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK };  /* Special continuations. */
 

+ 50 - 0
src/vm_arm.dasc

@@ -2082,6 +2082,55 @@ static void build_subroutines(BuildCtx *ctx)
   |   ldr INS, [PC, #-4]
   |  bx CRET1
   |
+  |->cont_stitch:			// Trace stitching.
+  |.if JIT
+  |  // RA = resultptr, CARG4 = meta base
+  |   ldr RB, SAVE_MULTRES
+  |  ldr INS, [PC, #-4]
+  |    ldr CARG3, [CARG4, #-24]		// Save previous trace number.
+  |   subs RB, RB, #8
+  |  decode_RA8 RC, INS			// Call base.
+  |   beq >2
+  |1:  // Move results down.
+  |  ldrd CARG12, [RA]
+  |    add RA, RA, #8
+  |   subs RB, RB, #8
+  |  strd CARG12, [BASE, RC]
+  |    add RC, RC, #8
+  |   bne <1
+  |2:
+  |   decode_RA8 RA, INS
+  |   decode_RB8 RB, INS
+  |   add RA, RA, RB
+  |  ldr CARG1, [DISPATCH, #DISPATCH_J(trace)]
+  |3:
+  |   cmp RA, RC
+  |  mvn CARG2, #~LJ_TNIL
+  |   bhi >9				// More results wanted?
+  |
+  |  ldr TRACE:RA, [CARG1, CARG3, lsl #2]
+  |  ldrh RC, TRACE:RA->link
+  |  cmp RC, CARG3
+  |  beq ->cont_nop			// Blacklisted.
+  |  cmp RC, #0
+  |  bne =>BC_JLOOP			// Jump to stitched trace.
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  str CARG3, [DISPATCH, #DISPATCH_J(exitno)]
+  |  str L, [DISPATCH, #DISPATCH_J(L)]
+  |  str BASE, L->base
+  |  sub CARG1, DISPATCH, #-GG_DISP2J
+  |  mov CARG2, PC
+  |  bl extern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
+  |  ldr BASE, L->base
+  |  b ->cont_nop
+  |
+  |9:  // Fill up results with nil.
+  |  strd CARG12, [BASE, RC]
+  |  add RC, RC, #8
+  |  b <3
+  |.endif
+  |
   |->vm_profhook:			// Dispatch target for profiler hook.
 #if LJ_HASPROFILE
   |  mov CARG1, L
@@ -2166,6 +2215,7 @@ static void build_subroutines(BuildCtx *ctx)
   |   lsrlo RC, INS, #16	// No: Decode operands A*8 and D.
   |   subhs RC, RC, #8
   |   addhs RA, RA, BASE	// Yes: RA = BASE+framesize*8, RC = nargs*8
+  |   ldrhs CARG3, [BASE, FRAME_FUNC]
   |  bx OP
   |
   |3:  // Rethrow error from the right C frame.

+ 56 - 2
src/vm_mips.dasc

@@ -2011,6 +2011,60 @@ static void build_subroutines(BuildCtx *ctx)
   |  jr CRET1
   |.  lw INS, -4(PC)
   |
+  |->cont_stitch:			// Trace stitching.
+  |.if JIT
+  |  // RA = resultptr, RB = meta base
+  |  lw INS, -4(PC)
+  |    lw TMP3, -24+LO(RB)		// Save previous trace number.
+  |  decode_RA8a RC, INS
+  |   addiu AT, MULTRES, -8
+  |  decode_RA8b RC
+  |   beqz AT, >2
+  |. addu RC, BASE, RC			// Call base.
+  |1:  // Move results down.
+  |  ldc1 f0, 0(RA)
+  |   addiu AT, AT, -8
+  |    addiu RA, RA, 8
+  |  sdc1 f0, 0(RC)
+  |   bnez AT, <1
+  |.   addiu RC, RC, 8
+  |2:
+  |   decode_RA8a RA, INS
+  |    decode_RB8a RB, INS
+  |   decode_RA8b RA
+  |    decode_RB8b RB
+  |   addu RA, RA, RB
+  |  lw TMP1, DISPATCH_J(trace)(DISPATCH)
+  |   addu RA, BASE, RA
+  |3:
+  |   sltu AT, RC, RA
+  |   bnez AT, >9			// More results wanted?
+  |. sll TMP2, TMP3, 2
+  |
+  |  addu TMP2, TMP1, TMP2
+  |  lw TRACE:TMP2, 0(TMP2)
+  |  lhu RD, TRACE:TMP2->link
+  |  beq RD, TMP3, ->cont_nop		// Blacklisted.
+  |.  load_got lj_dispatch_stitch
+  |  bnez RD, =>BC_JLOOP		// Jump to stitched trace.
+  |.  sll RD, RD, 3
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  sw TMP3, DISPATCH_J(exitno)(DISPATCH)
+  |  sw L, DISPATCH_J(L)(DISPATCH)
+  |  sw BASE, L->base
+  |  addiu CARG1, DISPATCH, GG_DISP2J
+  |  call_intern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
+  |.  move CARG2, PC
+  |  b ->cont_nop
+  |.  lw BASE, L->base
+  |
+  |9:
+  |  sw TISNIL, HI(RC)
+  |  b <3
+  |.  addiu RC, RC, 8
+  |.endif
+  |
   |->vm_profhook:			// Dispatch target for profiler hook.
 #if LJ_HASPROFILE
   |  load_got lj_dispatch_profile
@@ -2091,13 +2145,13 @@ static void build_subroutines(BuildCtx *ctx)
   |  sw BASE, L->base
   |1:
   |  bltz CRET1, >3			// Check for error from exit.
-  |.  lw LFUNC:TMP1, FRAME_FUNC(BASE)
+  |.  lw LFUNC:RB, FRAME_FUNC(BASE)
   |    lui TMP3, 0x59c0			// TOBIT = 2^52 + 2^51 (float).
   |  sll MULTRES, CRET1, 3
   |    li TISNIL, LJ_TNIL
   |  sw MULTRES, SAVE_MULTRES
   |    mtc1 TMP3, TOBIT
-  |  lw TMP1, LFUNC:TMP1->pc
+  |  lw TMP1, LFUNC:RB->pc
   |   sw r0, DISPATCH_GL(jit_base)(DISPATCH)
   |  lw KBASE, PC2PROTO(k)(TMP1)
   |    cvt.d.s TOBIT, TOBIT

+ 52 - 3
src/vm_ppc.dasc

@@ -2505,6 +2505,55 @@ static void build_subroutines(BuildCtx *ctx)
   |  mtctr CRET1
   |  bctr
   |
+  |->cont_stitch:			// Trace stitching.
+  |.if JIT
+  |  // RA = resultptr, RB = meta base
+  |  lwz INS, -4(PC)
+  |    lwz TMP3, -20(RB)		// Save previous trace number.
+  |   addic. TMP1, MULTRES, -8
+  |  decode_RA8 RC, INS			// Call base.
+  |   beq >2
+  |1:  // Move results down.
+  |  lfd f0, 0(RA)
+  |   addic. TMP1, TMP1, -8
+  |    addi RA, RA, 8
+  |  stfdx f0, BASE, RC
+  |    addi RC, RC, 8
+  |   bne <1
+  |2:
+  |   decode_RA8 RA, INS
+  |   decode_RB8 RB, INS
+  |   add RA, RA, RB
+  |  lwz TMP1, DISPATCH_J(trace)(DISPATCH)
+  |3:
+  |   cmplw RA, RC
+  |   bgt >9				// More results wanted?
+  |
+  |  slwi TMP2, TMP3, 2
+  |  lwzx TRACE:TMP2, TMP1, TMP2
+  |  lhz RD, TRACE:TMP2->link
+  |  cmpw RD, TMP3
+  |   cmpwi cr1, RD, 0
+  |  beq ->cont_nop			// Blacklisted.
+  |    slwi RD, RD, 3
+  |   bne cr1, =>BC_JLOOP		// Jump to stitched trace.
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  stw TMP3, DISPATCH_J(exitno)(DISPATCH)
+  |  stp L, DISPATCH_J(L)(DISPATCH)
+  |  stp BASE, L->base
+  |  addi CARG1, DISPATCH, GG_DISP2J
+  |  mr CARG2, PC
+  |  bl extern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
+  |  lp BASE, L->base
+  |  b ->cont_nop
+  |
+  |9:
+  |  stwx TISNIL, BASE, RC
+  |  addi RC, RC, 8
+  |  b <3
+  |.endif
+  |
   |->vm_profhook:			// Dispatch target for profiler hook.
 #if LJ_HASPROFILE
   |  mr CARG1, L
@@ -2557,7 +2606,7 @@ static void build_subroutines(BuildCtx *ctx)
   |   sub CARG3, TMP0, CARG3		// Compute exit number.
   |  lp BASE, DISPATCH_GL(jit_base)(DISPATCH)
   |   srwi CARG3, CARG3, 2
-  |  stw L, DISPATCH_J(L)(DISPATCH)
+  |  stp L, DISPATCH_J(L)(DISPATCH)
   |   subi CARG3, CARG3, 2
   |  stp BASE, L->base
   |   stw CARG4, DISPATCH_J(parent)(DISPATCH)
@@ -2589,11 +2638,11 @@ static void build_subroutines(BuildCtx *ctx)
   |1:
   |  cmpwi CARG1, 0
   |  blt >3				// Check for error from exit.
-  |  lwz LFUNC:TMP1, FRAME_FUNC(BASE)
+  |  lwz LFUNC:RB, FRAME_FUNC(BASE)
   |   slwi MULTRES, CARG1, 3
   |    li TMP2, 0
   |   stw MULTRES, SAVE_MULTRES
-  |  lwz TMP1, LFUNC:TMP1->pc
+  |  lwz TMP1, LFUNC:RB->pc
   |    stw TMP2, DISPATCH_GL(jit_base)(DISPATCH)
   |  lwz KBASE, PC2PROTO(k)(TMP1)
   |  // Setup type comparison constants.

+ 62 - 1
src/vm_x86.dasc

@@ -2659,6 +2659,67 @@ static void build_subroutines(BuildCtx *ctx)
   |  add NARGS:RD, 1
   |  jmp RBa
   |
+  |->cont_stitch:			// Trace stitching.
+  |.if JIT
+  |  // BASE = base, RC = result, RB = mbase
+  |  mov RA, [RB-24]			// Save previous trace number.
+  |  mov TMP1, RA
+  |  mov TMP3, DISPATCH			// Need one more register.
+  |  mov DISPATCH, MULTRES
+  |  movzx RA, PC_RA
+  |  lea RA, [BASE+RA*8]		// Call base.
+  |  sub DISPATCH, 1
+  |  jz >2
+  |1:  // Move results down.
+  |.if X64
+  |  mov RBa, [RC]
+  |  mov [RA], RBa
+  |.else
+  |  mov RB, [RC]
+  |  mov [RA], RB
+  |  mov RB, [RC+4]
+  |  mov [RA+4], RB
+  |.endif
+  |  add RC, 8
+  |  add RA, 8
+  |  sub DISPATCH, 1
+  |  jnz <1
+  |2:
+  |  movzx RC, PC_RA
+  |  movzx RB, PC_RB
+  |  add RC, RB
+  |  lea RC, [BASE+RC*8-8]
+  |3:
+  |  cmp RC, RA
+  |  ja >9				// More results wanted?
+  |
+  |  mov DISPATCH, TMP3
+  |  mov RB, TMP1			// Get previous trace number.
+  |  mov RA, [DISPATCH+DISPATCH_J(trace)]
+  |  mov TRACE:RD, [RA+RB*4]
+  |  movzx RD, word TRACE:RD->link
+  |  cmp RD, RB
+  |  je ->cont_nop			// Blacklisted.
+  |  test RD, RD
+  |  jne =>BC_JLOOP			// Jump to stitched trace.
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  mov [DISPATCH+DISPATCH_J(exitno)], RB
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov FCARG2, PC
+  |  lea FCARG1, [DISPATCH+GG_DISP2J]
+  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
+  |  call extern lj_dispatch_stitch@8	// (jit_State *J, const BCIns *pc)
+  |  mov BASE, L:RB->base
+  |  jmp ->cont_nop
+  |
+  |9:  // Fill up results with nil.
+  |  mov dword [RA+4], LJ_TNIL
+  |  add RA, 8
+  |  jmp <3
+  |.endif
+  |
   |->vm_profhook:			// Dispatch target for profiler hook.
 #if LJ_HASPROFILE
   |  mov L:RB, SAVE_L
@@ -5382,7 +5443,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_A	// RA = base, RD = target (loop extent)
     |  // Note: RA/RD is only used by trace recorder to determine scope/extent
     |  // This opcode does NOT jump, it's only purpose is to detect a hot loop.
-  |.if JIT
+    |.if JIT
     |  hotloop RB
     |.endif
     | // Fall through. Assumes BC_ILOOP follows and ins_A is a no-op.