Explorar o código

Compress snapshots using a simple, extensible 1D-compression.

Typically reduces storage overhead for snapshot maps by 60%.
The extensible format is a prerequisite for the next redesign steps:
Eliminate IR_FRAME and implement return-to-lower-frame.
Mike Pall hai 16 anos
pai
achega
67ca399a30
Modificáronse 11 ficheiros con 365 adicións e 319 borrados
  1. 3 3
      src/Makefile.dep
  2. 14 7
      src/lib_jit.c
  3. 48 53
      src/lj_asm.c
  4. 2 2
      src/lj_gdbjit.c
  5. 13 3
      src/lj_jit.h
  6. 3 3
      src/lj_opt_dce.c
  7. 87 81
      src/lj_opt_loop.c
  8. 49 48
      src/lj_record.c
  9. 131 116
      src/lj_snap.c
  10. 13 0
      src/lj_snap.h
  11. 2 3
      src/lj_trace.c

+ 3 - 3
src/Makefile.dep

@@ -11,7 +11,7 @@ buildvm_lib.o: buildvm_lib.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
 buildvm_peobj.o: buildvm_peobj.c buildvm.h lj_def.h lua.h luaconf.h \
   lj_arch.h lj_bc.h
 lib_aux.o: lib_aux.c lua.h luaconf.h lauxlib.h lj_obj.h lj_def.h \
-  lj_arch.h lj_err.h lj_errmsg.h lj_lib.h lj_alloc.h
+  lj_arch.h lj_err.h lj_errmsg.h lj_state.h lj_lib.h lj_alloc.h
 lib_base.o: lib_base.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
   lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h \
   lj_meta.h lj_state.h lj_ff.h lj_ffdef.h lj_ctype.h lj_lib.h lj_libdef.h
@@ -87,8 +87,8 @@ lj_opt_fold.o: lj_opt_fold.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
   lj_str.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h lj_dispatch.h lj_bc.h \
   lj_traceerr.h lj_vm.h lj_folddef.h
 lj_opt_loop.o: lj_opt_loop.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
-  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_ir.h lj_jit.h lj_iropt.h \
-  lj_trace.h lj_dispatch.h lj_bc.h lj_traceerr.h lj_snap.h lj_vm.h
+  lj_err.h lj_errmsg.h lj_str.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
+  lj_dispatch.h lj_bc.h lj_traceerr.h lj_snap.h lj_vm.h
 lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
   lj_tab.h lj_ir.h lj_jit.h lj_iropt.h
 lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \

+ 14 - 7
src/lib_jit.c

@@ -332,18 +332,25 @@ LJLIB_CF(jit_util_tracesnap)
   if (T && sn < T->nsnap) {
     SnapShot *snap = &T->snap[sn];
     SnapEntry *map = &T->snapmap[snap->mapofs];
-    BCReg s, nslots = snap->nslots;
+    MSize n, nent = snap->nent;
+    BCReg nslots = snap->nslots;
     GCtab *t;
     lua_createtable(L, nslots ? (int)nslots : 1, 0);
     t = tabV(L->top-1);
     setintV(lj_tab_setint(L, t, 0), (int32_t)snap->ref - REF_BIAS);
-    for (s = 0; s < nslots; s++) {
-      TValue *o = lj_tab_setint(L, t, (int32_t)(s+1));
-      IRRef ref = snap_ref(map[s]);
-      if (ref)
-	setintV(o, (int32_t)ref - REF_BIAS);
-      else
+    /* NYI: get rid of this and expose the compressed slot map. */
+    {
+      BCReg s;
+      for (s = 0; s < nslots; s++) {
+	TValue *o = lj_tab_setint(L, t, (int32_t)(s+1));
 	setboolV(o, 0);
+      }
+    }
+    for (n = 0; n < nent; n++) {
+      BCReg s = snap_slot(map[n]);
+      IRRef ref = snap_ref(map[n]);
+      TValue *o = lj_tab_setint(L, t, (int32_t)(s+1));
+      setintV(o, (int32_t)ref - REF_BIAS);
     }
     return 1;
   }

+ 48 - 53
src/lj_asm.c

@@ -926,9 +926,9 @@ static void asm_snap_alloc(ASMState *as)
 {
   SnapShot *snap = &as->T->snap[as->snapno];
   SnapEntry *map = &as->T->snapmap[snap->mapofs];
-  BCReg s, nslots = snap->nslots;
-  for (s = 0; s < nslots; s++) {
-    IRRef ref = snap_ref(map[s]);
+  MSize n, nent = snap->nent;
+  for (n = 0; n < nent; n++) {
+    IRRef ref = snap_ref(map[n]);
     if (!irref_isk(ref)) {
       IRIns *ir = IR(ref);
       if (!ra_used(ir) && ir->o != IR_FRAME) {
@@ -960,9 +960,9 @@ static int asm_snap_checkrename(ASMState *as, IRRef ren)
 {
   SnapShot *snap = &as->T->snap[as->snapno];
   SnapEntry *map = &as->T->snapmap[snap->mapofs];
-  BCReg s, nslots = snap->nslots;
-  for (s = 0; s < nslots; s++) {
-    IRRef ref = snap_ref(map[s]);
+  MSize n, nent = snap->nent;
+  for (n = 0; n < nent; n++) {
+    IRRef ref = snap_ref(map[n]);
     if (ref == ren) {
       IRIns *ir = IR(ref);
       ra_spill(as, ir);  /* Register renamed, so force a spill slot. */
@@ -2465,18 +2465,17 @@ static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base)
   */
   RegSet allow = rset_exclude(RSET_SCRATCH & RSET_GPR, base);
   SnapEntry *map = &as->T->snapmap[snap->mapofs];
-  BCReg s, nslots = snap->nslots;
-  for (s = 0; s < nslots; s++) {
-    IRRef ref = snap_ref(map[s]);
+  MSize n, nent = snap->nent;
+  for (n = 0; n < nent; n++) {
+    IRRef ref = snap_ref(map[n]);
     if (!irref_isk(ref)) {
+      int32_t ofs = 8*(int32_t)(snap_slot(map[n])-1);
       IRIns *ir = IR(ref);
       if (ir->o == IR_FRAME) {
 	/* NYI: sync the frame, bump base, set topslot, clear new slots. */
 	lj_trace_err(as->J, LJ_TRERR_NYIGCF);
-      } else if (irt_isgcv(ir->t) &&
-	       !(ir->o == IR_SLOAD && ir->op1 < nslots && map[ir->op1] == 0)) {
+      } else if (irt_isgcv(ir->t)) {
 	Reg src = ra_alloc1(as, ref, allow);
-	int32_t ofs = 8*(int32_t)(s-1);
 	emit_movtomro(as, src, base, ofs);
 	emit_movmroi(as, base, ofs+4, irt_toitype(ir->t));
 	checkmclim(as);
@@ -2504,7 +2503,7 @@ static void asm_gc_check(ASMState *as, SnapShot *snap)
   emit_loadi(as, tmp, (int32_t)as->gcsteps);
   /* We don't know spadj yet, so get the C frame from L->cframe. */
   emit_movmroi(as, tmp, CFRAME_OFS_PC,
-	       (int32_t)as->T->snapmap[snap->mapofs+snap->nslots]);
+	       (int32_t)as->T->snapmap[snap->mapofs+snap->nent]);
   emit_gri(as, XG_ARITHi(XOg_AND), tmp, CFRAME_RAWMASK);
   lstate = IR(ASMREF_L)->r;
   emit_movrmro(as, tmp, lstate, offsetof(lua_State, cframe));
@@ -2965,19 +2964,19 @@ static void asm_head_side(ASMState *as)
 static void asm_tail_sync(ASMState *as)
 {
   SnapShot *snap = &as->T->snap[as->T->nsnap-1];  /* Last snapshot. */
-  BCReg s, nslots = snap->nslots;
+  MSize n, nent = snap->nent;
   SnapEntry *map = &as->T->snapmap[snap->mapofs];
-  SnapEntry *flinks = map + nslots + snap->nframelinks;
+  SnapEntry *flinks = map + nent + snap->nframelinks;
   BCReg newbase = 0;
-  BCReg secondbase = ~(BCReg)0;
-  BCReg topslot = 0;
+  BCReg nslots, topslot = 0;
 
   checkmclim(as);
   ra_allocref(as, REF_BASE, RID2RSET(RID_BASE));
 
   /* Must check all frames to find topslot (outer can be larger than inner). */
-  for (s = 0; s < nslots; s++) {
-    IRRef ref = snap_ref(map[s]);
+  for (n = 0; n < nent; n++) {
+    IRRef ref = snap_ref(map[n]);
+    BCReg s = snap_slot(map[n]);
     if (!irref_isk(ref)) {
       IRIns *ir = IR(ref);
       if (ir->o == IR_FRAME && irt_isfunc(ir->t)) {
@@ -2985,10 +2984,7 @@ static void asm_tail_sync(ASMState *as)
 	if (isluafunc(fn)) {
 	  BCReg fs = s + funcproto(fn)->framesize;
 	  if (fs > topslot) topslot = fs;
-	  if (s != 0) {
-	    newbase = s;
-	    if (secondbase == ~(BCReg)0) secondbase = s;
-	  }
+	  newbase = s;
 	}
       }
     }
@@ -2998,7 +2994,7 @@ static void asm_tail_sync(ASMState *as)
   if (as->T->link == TRACE_INTERP) {
     /* Setup fixed registers for exit to interpreter. */
     emit_loada(as, RID_DISPATCH, J2GG(as->J)->dispatch);
-    emit_loadi(as, RID_PC, (int32_t)map[nslots]);
+    emit_loadi(as, RID_PC, (int32_t)map[nent]);
   } else if (newbase) {
     /* Save modified BASE for linking to trace with higher start frame. */
     emit_setgl(as, RID_BASE, jit_base);
@@ -3007,51 +3003,50 @@ static void asm_tail_sync(ASMState *as)
   emit_addptr(as, RID_BASE, 8*(int32_t)newbase);
 
   /* Clear stack slots of newly added frames. */
+  nslots = snap->nslots;
   if (nslots <= topslot) {
     if (nslots < topslot) {
+      BCReg s;
       for (s = nslots; s <= topslot; s++) {
-	emit_movtomro(as, RID_EAX, RID_BASE, 8*(int32_t)s-4);
+	emit_movtomro(as, RID_EAX, RID_BASE, 8*((int32_t)s-1)+4);
 	checkmclim(as);
       }
       emit_loadi(as, RID_EAX, LJ_TNIL);
     } else {
-      emit_movmroi(as, RID_BASE, 8*(int32_t)nslots-4, LJ_TNIL);
+      emit_movmroi(as, RID_BASE, 8*((int32_t)nslots-1)+4, LJ_TNIL);
     }
   }
 
   /* Store the value of all modified slots to the Lua stack. */
-  for (s = 0; s < nslots; s++) {
+  for (n = 0; n < nent; n++) {
+    BCReg s = snap_slot(map[n]);
     int32_t ofs = 8*((int32_t)s-1);
-    IRRef ref = snap_ref(map[s]);
-    if (ref) {
-      IRIns *ir = IR(ref);
-      /* No need to restore readonly slots and unmodified non-parent slots. */
-      if (ir->o == IR_SLOAD && ir->op1 == s &&
-	  (ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT)
-	continue;
-      if (irt_isnum(ir->t)) {
-	Reg src = ra_alloc1(as, ref, RSET_FPR);
-	emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
-      } else if (ir->o == IR_FRAME) {
-	emit_movmroi(as, RID_BASE, ofs, ptr2addr(ir_kgc(IR(ir->op2))));
-	if (s != 0)  /* Do not overwrite link to previous frame. */
-	  emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*--flinks));
-      } else {
-	lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t));
-	if (!irref_isk(ref)) {
-	  Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
-	  emit_movtomro(as, src, RID_BASE, ofs);
-	} else if (!irt_ispri(ir->t)) {
-	  emit_movmroi(as, RID_BASE, ofs, ir->i);
-	}
-	emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
-      }
+    IRRef ref = snap_ref(map[n]);
+    IRIns *ir = IR(ref);
+    /* No need to restore readonly slots and unmodified non-parent slots. */
+    if (ir->o == IR_SLOAD && ir->op1 == s &&
+	(ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT)
+      continue;
+    if (irt_isnum(ir->t)) {
+      Reg src = ra_alloc1(as, ref, RSET_FPR);
+      emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
+    } else if (ir->o == IR_FRAME) {
+      emit_movmroi(as, RID_BASE, ofs, ptr2addr(ir_kgc(IR(ir->op2))));
+      if (s != 0)  /* Do not overwrite link to previous frame. */
+	emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*--flinks));
     } else {
-      lua_assert(!(s > secondbase));
+      lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t));
+      if (!irref_isk(ref)) {
+	Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
+	emit_movtomro(as, src, RID_BASE, ofs);
+      } else if (!irt_ispri(ir->t)) {
+	emit_movmroi(as, RID_BASE, ofs, ir->i);
+      }
+      emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
     }
     checkmclim(as);
   }
-  lua_assert(map + nslots == flinks-1);
+  lua_assert(map + nent == flinks-1);
 }
 
 /* Fixup the tail code. */

+ 2 - 2
src/lj_gdbjit.c

@@ -698,8 +698,8 @@ void lj_gdbjit_addtrace(jit_State *J, Trace *T, TraceNo traceno)
   lua_State *L = J->L;
   GCproto *pt = &gcref(T->startpt)->pt;
   TraceNo parent = T->ir[REF_BASE].op1;
-  uintptr_t pcofs = (uintptr_t)(T->snap[0].mapofs+T->snap[0].nslots);
-  const BCIns *startpc = (const BCIns *)(uintptr_t)T->snapmap[pcofs];
+  uintptr_t pcofs = (uintptr_t)(T->snap[0].mapofs+T->snap[0].nent);
+  const BCIns *startpc = snap_pc(T->snapmap[pcofs]);
   ctx.T = T;
   ctx.mcaddr = (uintptr_t)T->mcode;
   ctx.szmcode = T->szmcode;

+ 13 - 3
src/lj_jit.h

@@ -112,17 +112,27 @@ typedef uint8_t MCode;
 typedef struct SnapShot {
   uint16_t mapofs;	/* Offset into snapshot map. */
   IRRef1 ref;		/* First IR ref for this snapshot. */
-  uint8_t nslots;	/* Number of stack slots. */
+  uint8_t nslots;	/* Number of valid slots. */
+  uint8_t nent;		/* Number of compressed entries. */
   uint8_t nframelinks;	/* Number of frame links. */
   uint8_t count;	/* Count of taken exits for this snapshot. */
-  uint8_t unused1;
 } SnapShot;
 
 #define SNAPCOUNT_DONE	255	/* Already compiled and linked a side trace. */
 
-/* Snapshot entry. */
+/* Compressed snapshot entry. */
 typedef uint32_t SnapEntry;
+
+#define SNAP_FRAME		0x010000	/* Slot has frame link. */
+
+#define SNAP(slot, flags, ref)	((SnapEntry)((slot) << 24) + (flags) + (ref))
+#define SNAP_MKPC(pc)		((SnapEntry)u32ptr(pc))
+#define SNAP_MKFTSZ(ftsz)	((SnapEntry)(ftsz))
 #define snap_ref(sn)		((sn) & 0xffff)
+#define snap_slot(sn)		((BCReg)((sn) >> 24))
+#define snap_isframe(sn)	((sn) & SNAP_FRAME)
+#define snap_pc(sn)		((const BCIns *)(uintptr_t)(sn))
+#define snap_setref(sn, ref)	(((sn) & 0xffff0000) | (ref))
 
 /* Snapshot and exit numbers. */
 typedef uint32_t SnapNo;

+ 3 - 3
src/lj_opt_dce.c

@@ -24,9 +24,9 @@ static void dce_marksnap(jit_State *J)
   for (i = 0; i < nsnap; i++) {
     SnapShot *snap = &J->cur.snap[i];
     SnapEntry *map = &J->cur.snapmap[snap->mapofs];
-    BCReg s, nslots = snap->nslots;
-    for (s = 0; s < nslots; s++) {
-      IRRef ref = snap_ref(map[s]);
+    MSize n, nent = snap->nent;
+    for (n = 0; n < nent; n++) {
+      IRRef ref = snap_ref(map[n]);
       if (!irref_isk(ref))
 	irt_setmark(IR(ref)->t);
     }

+ 87 - 81
src/lj_opt_loop.c

@@ -10,7 +10,6 @@
 
 #if LJ_HASJIT
 
-#include "lj_gc.h"
 #include "lj_err.h"
 #include "lj_str.h"
 #include "lj_ir.h"
@@ -163,21 +162,69 @@ static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi)
 
 /* -- Loop unrolling using copy-substitution ------------------------------ */
 
+/* Copy-substitute snapshot. */
+static void loop_subst_snap(jit_State *J, SnapShot *osnap,
+			    SnapEntry *loopmap, IRRef1 *subst)
+{
+  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
+  MSize nmapofs, nframelinks;
+  MSize on, ln, nn, onent = osnap->nent;
+  BCReg nslots = osnap->nslots;
+  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
+  if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
+    nmapofs = J->cur.nsnapmap;
+    J->cur.nsnap++;  /* Add new snapshot. */
+  } else {  /* Otherwise overwrite previous snapshot. */
+    snap--;
+    nmapofs = snap->mapofs;
+  }
+  J->guardemit.irt = 0;
+  nframelinks = osnap->nframelinks;
+  /* Setup new snapshot. */
+  snap->mapofs = (uint16_t)nmapofs;
+  snap->ref = (IRRef1)J->cur.nins;
+  snap->nframelinks = (uint8_t)nframelinks;
+  snap->nslots = nslots;
+  snap->count = 0;
+  nmap = &J->cur.snapmap[nmapofs];
+  /* Substitute snapshot slots. */
+  on = ln = nn = 0;
+  while (on < onent) {
+    SnapEntry osn = omap[on], lsn = loopmap[ln];
+    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
+      nmap[nn++] = lsn;
+      ln++;
+    } else {  /* Copy substituted slot from snapshot map. */
+      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
+      if (!irref_isk(snap_ref(osn)))
+	osn = snap_setref(osn, subst[snap_ref(osn)]);
+      nmap[nn++] = osn;
+      on++;
+    }
+  }
+  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
+    nmap[nn++] = loopmap[ln++];
+  snap->nent = (uint8_t)nn;
+  J->cur.nsnapmap = (uint16_t)(nmapofs + nn + nframelinks);
+  omap += onent;
+  nmap += nn;
+  for (nn = 0; nn < nframelinks; nn++)  /* Copy frame links. */
+    nmap[nn] = omap[nn];
+}
+
 /* Unroll loop. */
 static void loop_unroll(jit_State *J)
 {
   IRRef1 phi[LJ_MAX_PHI];
   uint32_t nphi = 0;
   IRRef1 *subst;
-  SnapShot *osnap, *snap;
-  SnapEntry *loopmap;
-  BCReg loopslots;
-  MSize nsnap, nsnapmap;
-  IRRef ins, invar, osnapref;
+  SnapShot *osnap;
+  SnapEntry *loopmap, *psentinel;
+  IRRef ins, invar;
 
   /* Use temp buffer for substitution table.
   ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
-  ** Note: don't call into the VM or run the GC or the buffer may be gone.
+  ** Caveat: don't call into the VM or run the GC or the buffer may be gone.
   */
   invar = J->cur.nins;
   subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
@@ -187,80 +234,37 @@ static void loop_unroll(jit_State *J)
   /* LOOP separates the pre-roll from the loop body. */
   emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);
 
-  /* Ensure size for copy-substituted snapshots (minus #0 and loop snapshot). */
-  nsnap = J->cur.nsnap;
-  if (LJ_UNLIKELY(2*nsnap-2 > J->sizesnap)) {
-    MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
-    if (2*nsnap-2 > maxsnap)
-      lj_trace_err(J, LJ_TRERR_SNAPOV);
-    lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
-    J->cur.snap = J->snapbuf;
-  }
-  nsnapmap = J->cur.nsnapmap;  /* Use temp. copy to avoid undo. */
-  if (LJ_UNLIKELY(nsnapmap*2 > J->sizesnapmap)) {
-    J->snapmapbuf = (SnapEntry *)lj_mem_realloc(J->L, J->snapmapbuf,
-		      J->sizesnapmap*sizeof(SnapEntry),
-		      2*J->sizesnapmap*sizeof(SnapEntry));
-    J->cur.snapmap = J->snapmapbuf;
-    J->sizesnapmap *= 2;
-  }
+  /* Grow snapshot buffer and map for copy-substituted snapshots.
+  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
+  ** Need up to twice the number of entries plus fallback substitutions
+  ** from the loop snapshot entries for each new snapshot.
+  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
+  */
+  {
+    MSize nsnap = J->cur.nsnap;
+    SnapShot *loopsnap;
+    lj_snap_grow_buf(J, 2*nsnap-2);
+    lj_snap_grow_map(J, J->cur.nsnapmap*2+(nsnap-2)*J->cur.snap[nsnap-1].nent);
 
-  /* The loop snapshot is used for fallback substitutions. */
-  snap = &J->cur.snap[nsnap-1];
-  loopmap = &J->cur.snapmap[snap->mapofs];
-  loopslots = snap->nslots;
-  /* The PC of snapshot #0 and the loop snapshot must match. */
-  lua_assert(loopmap[loopslots] == J->cur.snapmap[J->cur.snap[0].nslots]);
+    /* The loop snapshot is used for fallback substitutions. */
+    loopsnap = &J->cur.snap[nsnap-1];
+    loopmap = &J->cur.snapmap[loopsnap->mapofs];
+    /* The PC of snapshot #0 and the loop snapshot must match. */
+    psentinel = &loopmap[loopsnap->nent];
+    lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
+    *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */
+  }
 
   /* Start substitution with snapshot #1 (#0 is empty for root traces). */
   osnap = &J->cur.snap[1];
-  osnapref = osnap->ref;
 
   /* Copy and substitute all recorded instructions and snapshots. */
   for (ins = REF_FIRST; ins < invar; ins++) {
     IRIns *ir;
     IRRef op1, op2;
 
-    /* Copy-substitute snapshot. */
-    if (ins >= osnapref) {
-      SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
-      BCReg s, nslots;
-      uint32_t nmapofs, nframelinks;
-      if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
-	nmapofs = nsnapmap;
-	snap++;  /* Add new snapshot. */
-      } else {
-	nmapofs = snap->mapofs;  /* Overwrite previous snapshot. */
-      }
-      J->guardemit.irt = 0;
-      nslots = osnap->nslots;
-      nframelinks = osnap->nframelinks;
-      snap->mapofs = (uint16_t)nmapofs;
-      snap->ref = (IRRef1)J->cur.nins;
-      snap->nslots = (uint8_t)nslots;
-      snap->nframelinks = (uint8_t)nframelinks;
-      snap->count = 0;
-      osnap++;
-      osnapref = osnap->ref;
-      nsnapmap = nmapofs + nslots + nframelinks;
-      nmap = &J->cur.snapmap[nmapofs];
-      /* Substitute snapshot slots. */
-      for (s = 0; s < nslots; s++) {
-	IRRef ref = snap_ref(omap[s]);
-	if (ref) {
-	  if (!irref_isk(ref))
-	    ref = subst[ref];
-	} else if (s < loopslots) {
-	  ref = loopmap[s];
-	}
-	nmap[s] = ref;
-      }
-      /* Copy frame links. */
-      nmap += nslots;
-      omap += nslots;
-      for (s = 0; s < nframelinks; s++)
-	nmap[s] = omap[s];
-    }
+    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
+      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */
 
     /* Substitute instruction operands. */
     ir = IR(ins);
@@ -295,22 +299,24 @@ static void loop_unroll(jit_State *J)
       }
     }
   }
-  if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
-    J->cur.nsnapmap = (uint16_t)nsnapmap;
-    snap++;
-  } else {
-    J->cur.nsnapmap = (uint16_t)snap->mapofs;  /* Last snapshot is redundant. */
-  }
-  J->cur.nsnap = (uint16_t)(snap - J->cur.snap);
+  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
+    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
   lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
+  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */
 
   loop_emit_phi(J, subst, phi, nphi);
 }
 
 /* Undo any partial changes made by the loop optimization. */
-static void loop_undo(jit_State *J, IRRef ins)
+static void loop_undo(jit_State *J, IRRef ins, MSize nsnap)
 {
   ptrdiff_t i;
+  SnapShot *snap = &J->cur.snap[nsnap-1];
+  SnapEntry *map = J->cur.snapmap;
+  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
+  J->cur.nsnapmap = (uint16_t)(snap->mapofs + snap->nent + snap->nframelinks);
+  J->cur.nsnap = nsnap;
+  J->guardemit.irt = 0;
   lj_ir_rollback(J, ins);
   for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
     BPropEntry *bp = &J->bpropcache[i];
@@ -336,6 +342,7 @@ static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
 int lj_opt_loop(jit_State *J)
 {
   IRRef nins = J->cur.nins;
+  MSize nsnap = J->cur.nsnap;
   int errcode = lj_vm_cpcall(J->L, NULL, J, cploop_opt);
   if (LJ_UNLIKELY(errcode)) {
     lua_State *L = J->L;
@@ -348,8 +355,7 @@ int lj_opt_loop(jit_State *J)
 	if (--J->instunroll < 0)  /* But do not unroll forever. */
 	  break;
 	L->top--;  /* Remove error object. */
-	J->guardemit.irt = 0;
-	loop_undo(J, nins);
+	loop_undo(J, nins, nsnap);
 	return 1;  /* Loop optimization failed, continue recording. */
       default:
 	break;

+ 49 - 48
src/lj_record.c

@@ -1696,7 +1696,7 @@ static void optstate_comp(jit_State *J, int cond)
   const BCIns *npc = J->pc + 2 + (cond ? bc_j(jmpins) : 0);
   SnapShot *snap = &J->cur.snap[J->cur.nsnap-1];
   /* Avoid re-recording the comparison in side traces. */
-  J->cur.snapmap[snap->mapofs + snap->nslots] = u32ptr(npc);
+  J->cur.snapmap[snap->mapofs + snap->nent] = SNAP_MKPC(npc);
   J->needsnap = 1;
   /* Shrink last snapshot if possible. */
   if (bc_a(jmpins) < J->maxslot) {
@@ -2159,61 +2159,62 @@ static void rec_setup_side(jit_State *J, Trace *T)
 {
   SnapShot *snap = &T->snap[J->exitno];
   SnapEntry *map = &T->snapmap[snap->mapofs];
-  BCReg s, nslots = snap->nslots;
+  MSize n, nent = snap->nent;
   BloomFilter seen = 0;
-  for (s = 0; s < nslots; s++) {
-    IRRef ref = snap_ref(map[s]);
-    if (ref) {
-      IRIns *ir = &T->ir[ref];
-      TRef tr = 0;
-      /* The bloom filter avoids O(nslots^2) overhead for de-duping slots. */
-      if (bloomtest(seen, ref)) {
-	BCReg j;
-	for (j = 0; j < s; j++)
-	  if (snap_ref(map[j]) == ref) {
-	    if (ir->o == IR_FRAME && irt_isfunc(ir->t)) {
-	      lua_assert(s != 0);
-	      J->baseslot = s+1;
-	      J->framedepth++;
-	    }
-	    tr = J->slot[j];
-	    goto dupslot;
-	  }
-      }
-      bloomset(seen, ref);
-      switch ((IROp)ir->o) {
-      case IR_KPRI: tr = TREF_PRI(irt_type(ir->t)); break;
-      case IR_KINT: tr = lj_ir_kint(J, ir->i); break;
-      case IR_KGC:  tr = lj_ir_kgc(J, ir_kgc(ir), irt_t(ir->t)); break;
-      case IR_KNUM: tr = lj_ir_knum_addr(J, ir_knum(ir)); break;
-      case IR_FRAME:  /* Placeholder FRAMEs don't need a guard. */
-	if (irt_isfunc(ir->t)) {
-	  if (s != 0) {
+  /* Emit IR for slots inherited from parent snapshot. */
+  for (n = 0; n < nent; n++) {
+    IRRef ref = snap_ref(map[n]);
+    BCReg s = snap_slot(map[n]);
+    IRIns *ir = &T->ir[ref];
+    TRef tr;
+    /* The bloom filter avoids O(nent^2) overhead for de-duping slots. */
+    if (bloomtest(seen, ref)) {
+      MSize j;
+      for (j = 0; j < n; j++)
+	if (snap_ref(map[j]) == ref) {
+	  tr = J->slot[snap_slot(map[j])];
+	  if (ir->o == IR_FRAME && irt_isfunc(ir->t)) {
+	    lua_assert(s != 0);
 	    J->baseslot = s+1;
 	    J->framedepth++;
 	  }
-	  tr = lj_ir_kfunc(J, ir_kfunc(&T->ir[ir->op2]));
-	  tr = emitir_raw(IRT(IR_FRAME, IRT_FUNC), tr, tr);
-	} else {
-	  tr = lj_ir_kptr(J, mref(T->ir[ir->op2].ptr, void));
-	  tr = emitir_raw(IRT(IR_FRAME, IRT_PTR), tr, tr);
+	  goto dupslot;
 	}
-	break;
-      case IR_SLOAD:  /* Inherited SLOADs don't need a guard or type check. */
-	tr = emitir_raw(ir->ot & ~IRT_GUARD, s,
-	       (ir->op2&IRSLOAD_READONLY) | IRSLOAD_INHERIT|IRSLOAD_PARENT);
-	break;
-      default:  /* Parent refs are already typed and don't need a guard. */
-	tr = emitir_raw(IRT(IR_SLOAD, irt_type(ir->t)), s,
-			IRSLOAD_INHERIT|IRSLOAD_PARENT);
-	break;
+    }
+    bloomset(seen, ref);
+    switch ((IROp)ir->o) {
+    /* Only have to deal with constants that can occur in stack slots. */
+    case IR_KPRI: tr = TREF_PRI(irt_type(ir->t)); break;
+    case IR_KINT: tr = lj_ir_kint(J, ir->i); break;
+    case IR_KGC:  tr = lj_ir_kgc(J, ir_kgc(ir), irt_t(ir->t)); break;
+    case IR_KNUM: tr = lj_ir_knum_addr(J, ir_knum(ir)); break;
+    case IR_FRAME:  /* Placeholder FRAMEs don't need a guard. */
+      if (irt_isfunc(ir->t)) {
+	if (s != 0) {
+	  J->baseslot = s+1;
+	  J->framedepth++;
+	}
+	tr = lj_ir_kfunc(J, ir_kfunc(&T->ir[ir->op2]));
+	tr = emitir_raw(IRT(IR_FRAME, IRT_FUNC), tr, tr);
+      } else {
+	tr = lj_ir_kptr(J, mref(T->ir[ir->op2].ptr, void));
+	tr = emitir_raw(IRT(IR_FRAME, IRT_PTR), tr, tr);
       }
-    dupslot:
-      J->slot[s] = tr;
+      break;
+    case IR_SLOAD:  /* Inherited SLOADs don't need a guard or type check. */
+      tr = emitir_raw(ir->ot & ~IRT_GUARD, s,
+	     (ir->op2&IRSLOAD_READONLY) | IRSLOAD_INHERIT|IRSLOAD_PARENT);
+      break;
+    default:  /* Parent refs are already typed and don't need a guard. */
+      tr = emitir_raw(IRT(IR_SLOAD, irt_type(ir->t)), s,
+		      IRSLOAD_INHERIT|IRSLOAD_PARENT);
+      break;
     }
+  dupslot:
+    J->slot[s] = tr;
   }
   J->base = J->slot + J->baseslot;
-  J->maxslot = nslots - J->baseslot;
+  J->maxslot = snap->nslots - J->baseslot;
   lj_snap_add(J);
 }
 
@@ -2259,7 +2260,7 @@ void lj_record_setup(jit_State *J)
     J->cur.root = (uint16_t)root;
     J->cur.startins = BCINS_AD(BC_JMP, 0, 0);
     /* Check whether we could at least potentially form an extra loop. */
-    if (J->exitno == 0 && T->snap[0].nslots == 1 && T->snapmap[0] == 0) {
+    if (J->exitno == 0 && T->snap[0].nent == 0) {
       /* We can narrow a FORL for some side traces, too. */
       if (J->pc > J->pt->bc && bc_op(J->pc[-1]) == BC_JFORI &&
 	  bc_d(J->pc[bc_j(J->pc[-1])-1]) == root) {

+ 131 - 116
src/lj_snap.c

@@ -23,28 +23,50 @@
 /* Some local macros to save typing. Undef'd at the end. */
 #define IR(ref)		(&J->cur.ir[(ref)])
 
+/* -- Snapshot buffer allocation ------------------------------------------ */
+
+/* Grow snapshot buffer. */
+void lj_snap_grow_buf_(jit_State *J, MSize need)
+{
+  MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
+  if (need > maxsnap)
+    lj_trace_err(J, LJ_TRERR_SNAPOV);
+  lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
+  J->cur.snap = J->snapbuf;
+}
+
+/* Grow snapshot map buffer. */
+void lj_snap_grow_map_(jit_State *J, MSize need)
+{
+  if (need < 2*J->sizesnapmap)
+    need = 2*J->sizesnapmap;
+  else if (need < 64)
+    need = 64;
+  J->snapmapbuf = (SnapEntry *)lj_mem_realloc(J->L, J->snapmapbuf,
+		    J->sizesnapmap*sizeof(SnapEntry), need*sizeof(SnapEntry));
+  J->cur.snapmap = J->snapmapbuf;
+  J->sizesnapmap = need;
+}
+
 /* -- Snapshot generation ------------------------------------------------- */
 
-/* NYI: Snapshots are in need of a redesign. The current storage model for
-** snapshot maps is too wasteful. They could be compressed (1D or 2D) and
-** made more flexible at the same time. Iterators should no longer need to
-** skip unmodified slots. IR_FRAME should be eliminated, too.
-*/
+/* NYI: IR_FRAME should be eliminated, too. */
 
 /* Add all modified slots to the snapshot. */
 static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots)
 {
   BCReg s;
+  MSize n = 0;
   for (s = 0; s < nslots; s++) {
     IRRef ref = tref_ref(J->slot[s]);
     if (ref) {
       IRIns *ir = IR(ref);
-      if (ir->o == IR_SLOAD && ir->op1 == s && !(ir->op2 & IRSLOAD_INHERIT))
-	ref = 0;
+      if (!(ir->o == IR_SLOAD && ir->op1 == s &&
+	    !(ir->op2 & IRSLOAD_INHERIT)))
+	map[n++] = SNAP(s, ir->o == IR_FRAME ? SNAP_FRAME : 0, ref);
     }
-    map[s] = (SnapEntry)ref;
   }
-  return nslots;
+  return n;
 }
 
 /* Add frame links at the end of the snapshot. */
@@ -53,17 +75,17 @@ static MSize snapshot_framelinks(jit_State *J, SnapEntry *map)
   cTValue *frame = J->L->base - 1;
   cTValue *lim = J->L->base - J->baseslot;
   MSize f = 0;
-  map[f++] = u32ptr(J->pc);
-  while (frame > lim) {
+  map[f++] = SNAP_MKPC(J->pc);  /* The current PC is always the first entry. */
+  while (frame > lim) {  /* Backwards traversal of all frames above base. */
     if (frame_islua(frame)) {
-      map[f++] = u32ptr(frame_pc(frame));
+      map[f++] = SNAP_MKPC(frame_pc(frame));
       frame = frame_prevl(frame);
     } else if (frame_ispcall(frame)) {
-      map[f++] = (uint32_t)frame_ftsz(frame);
+      map[f++] = SNAP_MKFTSZ(frame_ftsz(frame));
       frame = frame_prevd(frame);
     } else if (frame_iscont(frame)) {
-      map[f++] = (uint32_t)frame_ftsz(frame);
-      map[f++] = u32ptr(frame_contpc(frame));
+      map[f++] = SNAP_MKFTSZ(frame_ftsz(frame));
+      map[f++] = SNAP_MKPC(frame_contpc(frame));
       frame = frame_prevd(frame);
     } else {
       lua_assert(0);
@@ -76,28 +98,19 @@ static MSize snapshot_framelinks(jit_State *J, SnapEntry *map)
 static void snapshot_stack(jit_State *J, SnapShot *snap, MSize nsnapmap)
 {
   BCReg nslots = J->baseslot + J->maxslot;
-  MSize nsm, nframelinks;
+  MSize nent, nframelinks;
   SnapEntry *p;
   /* Conservative estimate. Continuation frames need 2 slots. */
-  nsm = nsnapmap + nslots + (uint32_t)J->framedepth*2+1;
-  if (LJ_UNLIKELY(nsm > J->sizesnapmap)) {  /* Need to grow snapshot map? */
-    if (nsm < 2*J->sizesnapmap)
-      nsm = 2*J->sizesnapmap;
-    else if (nsm < 64)
-      nsm = 64;
-    J->snapmapbuf = (SnapEntry *)lj_mem_realloc(J->L, J->snapmapbuf,
-		      J->sizesnapmap*sizeof(SnapEntry), nsm*sizeof(SnapEntry));
-    J->cur.snapmap = J->snapmapbuf;
-    J->sizesnapmap = nsm;
-  }
+  lj_snap_grow_map(J, nsnapmap + nslots + (MSize)J->framedepth*2+1);
   p = &J->cur.snapmap[nsnapmap];
-  nslots = snapshot_slots(J, p, nslots);
-  nframelinks = snapshot_framelinks(J, p + nslots);
-  J->cur.nsnapmap = (uint16_t)(nsnapmap + nslots + nframelinks);
+  nent = snapshot_slots(J, p, nslots);
+  nframelinks = snapshot_framelinks(J, p + nent);
+  J->cur.nsnapmap = (uint16_t)(nsnapmap + nent + nframelinks);
   snap->mapofs = (uint16_t)nsnapmap;
   snap->ref = (IRRef1)J->cur.nins;
-  snap->nslots = (uint8_t)nslots;
+  snap->nent = (uint8_t)nent;
   snap->nframelinks = (uint8_t)nframelinks;
+  snap->nslots = (uint8_t)nslots;
   snap->count = 0;
 }
 
@@ -111,14 +124,7 @@ void lj_snap_add(jit_State *J)
       (nsnap > 0 && J->cur.snap[nsnap-1].ref == J->cur.nins)) {
     nsnapmap = J->cur.snap[--nsnap].mapofs;
   } else {
-    /* Need to grow snapshot buffer? */
-    if (LJ_UNLIKELY(nsnap >= J->sizesnap)) {
-      MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
-      if (nsnap >= maxsnap)
-	lj_trace_err(J, LJ_TRERR_SNAPOV);
-      lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
-      J->cur.snap = J->snapbuf;
-    }
+    lj_snap_grow_buf(J, nsnap+1);
     J->cur.nsnap = (uint16_t)(nsnap+1);
   }
   J->mergesnap = 0;
@@ -131,14 +137,21 @@ void lj_snap_shrink(jit_State *J)
 {
   BCReg nslots = J->baseslot + J->maxslot;
   SnapShot *snap = &J->cur.snap[J->cur.nsnap-1];
-  SnapEntry *oflinks = &J->cur.snapmap[snap->mapofs + snap->nslots];
-  SnapEntry *nflinks = &J->cur.snapmap[snap->mapofs + nslots];
-  uint32_t s, nframelinks = snap->nframelinks;
+  SnapEntry *map = &J->cur.snapmap[snap->mapofs];
+  MSize nent = snap->nent;
   lua_assert(nslots < snap->nslots);
   snap->nslots = (uint8_t)nslots;
-  J->cur.nsnapmap = (uint16_t)(snap->mapofs + nslots + nframelinks);
-  for (s = 0; s < nframelinks; s++)  /* Move frame links down. */
-    nflinks[s] = oflinks[s];
+  if (nent > 0 && snap_slot(map[nent-1]) >= nslots) {
+    MSize s, delta, nframelinks = snap->nframelinks;
+    for (nent--; nent > 0 && snap_slot(map[nent-1]) >= nslots; nent--)
+      ;
+    delta = snap->nent - nent;
+    snap->nent = (uint8_t)nent;
+    J->cur.nsnapmap = (uint16_t)(snap->mapofs + nent + nframelinks);
+    map += nent;
+    for (s = 0; s < nframelinks; s++)  /* Move frame links down. */
+      map[s] = map[s+delta];
+  }
 }
 
 /* -- Snapshot access ----------------------------------------------------- */
@@ -167,21 +180,24 @@ static RegSP snap_renameref(Trace *T, SnapNo lim, IRRef ref, RegSP rs)
   return rs;
 }
 
-/* Convert a snapshot into a linear slot -> RegSP map. */
+/* Convert a snapshot into a linear slot -> RegSP map.
+** Note: slots without a non-constant snapshot entry are not initialized!
+*/
 void lj_snap_regspmap(uint16_t *rsmap, Trace *T, SnapNo snapno)
 {
   SnapShot *snap = &T->snap[snapno];
-  BCReg s, nslots = snap->nslots;
+  MSize n, nent = snap->nent;
   SnapEntry *map = &T->snapmap[snap->mapofs];
   BloomFilter rfilt = snap_renamefilter(T, snapno);
-  for (s = 0; s < nslots; s++) {
-    IRRef ref = snap_ref(map[s]);
+  for (n = 0; n < nent; n++) {  /* Iterate over snapshot entries, not slots. */
+    SnapEntry sn = map[n];
+    IRRef ref = snap_ref(sn);
     if (!irref_isk(ref)) {
       IRIns *ir = &T->ir[ref];
       uint32_t rs = ir->prev;
       if (bloomtest(rfilt, ref))
 	rs = snap_renameref(T, snapno, ref, rs);
-      rsmap[s] = (uint16_t)rs;
+      rsmap[snap_slot(sn)] = (uint16_t)rs;  /* Index by the entry's slot number. */
     }
   }
 }
@@ -193,89 +209,88 @@ void lj_snap_restore(jit_State *J, void *exptr)
   SnapNo snapno = J->exitno;  /* For now, snapno == exitno. */
   Trace *T = J->trace[J->parent];
   SnapShot *snap = &T->snap[snapno];
-  BCReg s, nslots = snap->nslots;
+  MSize n, nent = snap->nent;
   SnapEntry *map = &T->snapmap[snap->mapofs];
-  SnapEntry *flinks = map + nslots + snap->nframelinks;
-  TValue *o, *newbase, *ntop;
+  SnapEntry *flinks = map + nent + snap->nframelinks;
+  BCReg nslots = snap->nslots;
+  TValue *frame;
   BloomFilter rfilt = snap_renamefilter(T, snapno);
   lua_State *L = J->L;
 
   /* Make sure the stack is big enough for the slots from the snapshot. */
-  if (L->base + nslots >= L->maxstack) {
+  if (LJ_UNLIKELY(L->base + nslots > L->maxstack)) {
     L->top = curr_topL(L);
     lj_state_growstack(L, nslots - curr_proto(L)->framesize);
   }
 
   /* Fill stack slots with data from the registers and spill slots. */
-  newbase = NULL;
-  ntop = L->base;
-  for (s = 0, o = L->base-1; s < nslots; s++, o++) {
-    IRRef ref = snap_ref(map[s]);
-    if (ref) {
-      IRIns *ir = &T->ir[ref];
-      if (irref_isk(ref)) {  /* Restore constant slot. */
-	lj_ir_kvalue(L, o, ir);
-      } else {
-	IRType1 t = ir->t;
-	RegSP rs = ir->prev;
-	if (LJ_UNLIKELY(bloomtest(rfilt, ref)))
-	  rs = snap_renameref(T, snapno, ref, rs);
-	if (ra_hasspill(regsp_spill(rs))) {  /* Restore from spill slot. */
-	  int32_t *sps = &ex->spill[regsp_spill(rs)];
-	  if (irt_isinteger(t)) {
-	    setintV(o, *sps);
-	  } else if (irt_isnum(t)) {
-	    o->u64 = *(uint64_t *)sps;
-	  } else {
-	    lua_assert(!irt_ispri(t));  /* PRI refs never have a spill slot. */
-	    setgcrefi(o->gcr, *sps);
-	    setitype(o, irt_toitype(t));
-	  }
-	} else if (ra_hasreg(regsp_reg(rs))) {  /* Restore from register. */
-	  Reg r = regsp_reg(rs);
-	  if (irt_isinteger(t)) {
-	    setintV(o, ex->gpr[r-RID_MIN_GPR]);
-	  } else if (irt_isnum(t)) {
-	    setnumV(o, ex->fpr[r-RID_MIN_FPR]);
-	  } else {
-	    if (!irt_ispri(t))
-	      setgcrefi(o->gcr, ex->gpr[r-RID_MIN_GPR]);
-	    setitype(o, irt_toitype(t));
-	  }
-	} else {  /* Restore frame slot. */
-	  lua_assert(ir->o == IR_FRAME);
-	  /* This works for both PTR and FUNC IR_FRAME. */
-	  setgcrefp(o->fr.func, mref(T->ir[ir->op2].ptr, void));
-	  if (s != 0)  /* Do not overwrite link to previous frame. */
-	    o->fr.tp.ftsz = (int32_t)*--flinks;
-	  if (irt_isfunc(ir->t)) {
-	    GCfunc *fn = gco2func(gcref(T->ir[ir->op2].gcr));
-	    if (isluafunc(fn)) {
-	      TValue *fs;
-	      fs = o+1 + funcproto(fn)->framesize;
-	      if (fs > ntop) ntop = fs; /* Update top for newly added frames. */
-	      if (s != 0) newbase = o+1;
+  frame = L->base-1;
+  for (n = 0; n < nent; n++) {
+    IRRef ref = snap_ref(map[n]);
+    BCReg s = snap_slot(map[n]);
+    TValue *o = &frame[s];  /* Stack slots are relative to start frame. */
+    IRIns *ir = &T->ir[ref];
+    if (irref_isk(ref)) {  /* Restore constant slot. */
+      lj_ir_kvalue(L, o, ir);
+    } else {
+      IRType1 t = ir->t;
+      RegSP rs = ir->prev;
+      if (LJ_UNLIKELY(bloomtest(rfilt, ref)))
+	rs = snap_renameref(T, snapno, ref, rs);
+      if (ra_hasspill(regsp_spill(rs))) {  /* Restore from spill slot. */
+	int32_t *sps = &ex->spill[regsp_spill(rs)];
+	if (irt_isinteger(t)) {
+	  setintV(o, *sps);
+	} else if (irt_isnum(t)) {
+	  o->u64 = *(uint64_t *)sps;
+	} else {
+	  lua_assert(!irt_ispri(t));  /* PRI refs never have a spill slot. */
+	  setgcrefi(o->gcr, *sps);
+	  setitype(o, irt_toitype(t));
+	}
+      } else if (ra_hasreg(regsp_reg(rs))) {  /* Restore from register. */
+	Reg r = regsp_reg(rs);
+	if (irt_isinteger(t)) {
+	  setintV(o, ex->gpr[r-RID_MIN_GPR]);
+	} else if (irt_isnum(t)) {
+	  setnumV(o, ex->fpr[r-RID_MIN_FPR]);
+	} else {
+	  if (!irt_ispri(t))
+	    setgcrefi(o->gcr, ex->gpr[r-RID_MIN_GPR]);
+	  setitype(o, irt_toitype(t));
+	}
+      } else {  /* Restore frame slot. */
+	lua_assert(ir->o == IR_FRAME);
+	/* This works for both PTR and FUNC IR_FRAME. */
+	setgcrefp(o->fr.func, mref(T->ir[ir->op2].ptr, void));
+	if (s != 0)  /* Do not overwrite link to previous frame. */
+	  o->fr.tp.ftsz = (int32_t)*--flinks;
+	if (irt_isfunc(ir->t)) {
+	  GCfunc *fn = gco2func(gcref(T->ir[ir->op2].gcr));
+	  if (isluafunc(fn)) {
+	    MSize framesize = funcproto(fn)->framesize;
+	    TValue *fs;
+	    L->base = ++o;
+	    if (LJ_UNLIKELY(o + framesize > L->maxstack)) {  /* Grow again? */
+	      ptrdiff_t fsave = savestack(L, frame);
+	      L->top = o;
+	      lj_state_growstack(L, framesize);
+	      frame = restorestack(L, fsave);
+	      o = L->top;
 	    }
+	    fs = o + framesize;
+	    if (s == 0)  /* Only partially clear tail call frame at #0. */
+	      o = &frame[nslots];
+	    while (o < fs)  /* Clear slots of newly added frames. */
+	      setnilV(o++);
 	  }
 	}
       }
-    } else {
-      lua_assert(!newbase);
     }
   }
-  if (newbase) L->base = newbase;
-  if (ntop >= L->maxstack) {  /* Need to grow the stack again. */
-    MSize need = (MSize)(ntop - o);
-    L->top = o;
-    lj_state_growstack(L, need);
-    o = L->top;
-    ntop = o + need;
-  }
   L->top = curr_topL(L);
-  for (; o < ntop; o++)  /* Clear remainder of newly added frames. */
-    setnilV(o);
-  lua_assert(map + nslots == flinks-1);
-  J->pc = (const BCIns *)(uintptr_t)(*--flinks);
+  J->pc = snap_pc(*--flinks);
+  lua_assert(map + nent == flinks);
 }
 
 #undef IR

+ 13 - 0
src/lj_snap.h

@@ -14,6 +14,19 @@ LJ_FUNC void lj_snap_add(jit_State *J);
 LJ_FUNC void lj_snap_shrink(jit_State *J);
 LJ_FUNC void lj_snap_regspmap(uint16_t *rsmap, Trace *T, SnapNo snapno);
 LJ_FUNC void lj_snap_restore(jit_State *J, void *exptr);
+LJ_FUNC void lj_snap_grow_buf_(jit_State *J, MSize need);
+LJ_FUNC void lj_snap_grow_map_(jit_State *J, MSize need);
+
+static LJ_AINLINE void lj_snap_grow_buf(jit_State *J, MSize need)
+{  /* 'need' is the required total size (compared to J->sizesnap), not a delta. */
+  if (LJ_UNLIKELY(need > J->sizesnap)) lj_snap_grow_buf_(J, need);
+}
+
+static LJ_AINLINE void lj_snap_grow_map(jit_State *J, MSize need)
+{  /* 'need' is the required total size (compared to J->sizesnapmap), not a delta. */
+  if (LJ_UNLIKELY(need > J->sizesnapmap)) lj_snap_grow_map_(J, need);
+}
+
 #endif
 
 #endif

+ 2 - 3
src/lj_trace.c

@@ -161,8 +161,8 @@ void lj_trace_reenableproto(GCproto *pt)
 static void trace_unpatch(jit_State *J, Trace *T)
 {
   BCOp op = bc_op(T->startins);
-  uint32_t pcofs = T->snap[0].mapofs + T->snap[0].nslots;
-  BCIns *pc = ((BCIns *)(uintptr_t)T->snapmap[pcofs]) - 1;
+  MSize pcofs = T->snap[0].mapofs + T->snap[0].nent;
+  BCIns *pc = ((BCIns *)snap_pc(T->snapmap[pcofs])) - 1;
   switch (op) {
   case BC_FORL:
     lua_assert(bc_op(*pc) == BC_JFORI);
@@ -352,7 +352,6 @@ static void trace_start(jit_State *J)
   J->cur.ir = J->irbuf;
   J->cur.snap = J->snapbuf;
   J->cur.snapmap = J->snapmapbuf;
-  /* J->cur.nsnapmap = 0; */
   J->mergesnap = 0;
   J->needsnap = 0;
   J->guardemit.irt = 0;