Add allocation sinking and store sinking optimization.
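
The pass marks allocations (TNEW, TDUP, CNEW, CNEWI) whose results never escape the trace with RID_SINK, drops them and the stores into them from the generated code, and rebuilds the objects from snapshot data on side exits. As a minimal, hypothetical Lua example of the kind of code this targets (the table and its stores then exist only on exits):

  -- Hypothetical example: the temporary table never escapes the loop
  -- body, so its allocation and both array stores can be sunk; the
  -- fast path of the compiled loop allocates nothing.
  local sum = 0
  for i = 1, 100 do
    local p = {i, 2*i}       -- sinkable allocation
    sum = sum + p[1] + p[2]  -- loads are forwarded, stores are sunk
  end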

Mike Pall, 13 years ago
Parent commit: 0af3f47ba0
15 changed files with 749 additions and 88 deletions
  1. src/Makefile (+1 -1)
  2. src/Makefile.dep (+12 -8)
  3. src/jit/dump.lua (+10 -5)
  4. src/lj_asm.c (+52 -15)
  5. src/lj_asm_arm.h (+31 -18)
  6. src/lj_asm_mips.h (+30 -13)
  7. src/lj_asm_ppc.h (+26 -8)
  8. src/lj_asm_x86.h (+14 -1)
  9. src/lj_iropt.h (+1 -0)
  10. src/lj_jit.h (+5 -4)
  11. src/lj_opt_sink.c (+244 -0)
  12. src/lj_snap.c (+317 -13)
  13. src/lj_target.h (+4 -2)
  14. src/lj_trace.c (+1 -0)
  15. src/ljamalg.c (+1 -0)

+ 1 - 1
src/Makefile

@@ -443,7 +443,7 @@ LJCORE_O= lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o \
 	  lj_state.o lj_dispatch.o lj_vmevent.o lj_vmmath.o lj_api.o \
 	  lj_lex.o lj_parse.o lj_bcread.o lj_bcwrite.o \
 	  lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \
-	  lj_opt_dce.o lj_opt_loop.o lj_opt_split.o \
+	  lj_opt_dce.o lj_opt_loop.o lj_opt_split.o lj_opt_sink.o \
 	  lj_mcode.o lj_snap.o lj_record.o lj_crecord.o lj_ffrecord.o \
 	  lj_asm.o lj_trace.o lj_gdbjit.o \
 	  lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_ccallback.o \

+ 12 - 8
src/Makefile.dep

@@ -142,6 +142,8 @@ lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
 lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \
  lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
  lj_dispatch.h lj_traceerr.h lj_vm.h
+lj_opt_sink.o: lj_opt_sink.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+ lj_ir.h lj_jit.h lj_iropt.h lj_target.h lj_target_*.h
 lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \
  lj_arch.h lj_err.h lj_errmsg.h lj_str.h lj_ir.h lj_jit.h lj_ircall.h \
  lj_iropt.h lj_vm.h
@@ -153,8 +155,9 @@ lj_record.o: lj_record.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
  lj_ff.h lj_ffdef.h lj_ir.h lj_jit.h lj_ircall.h lj_iropt.h lj_trace.h \
  lj_dispatch.h lj_traceerr.h lj_record.h lj_ffrecord.h lj_snap.h lj_vm.h
 lj_snap.o: lj_snap.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
- lj_state.h lj_frame.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
- lj_dispatch.h lj_traceerr.h lj_snap.h lj_target.h lj_target_*.h
+ lj_tab.h lj_state.h lj_frame.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h \
+ lj_trace.h lj_dispatch.h lj_traceerr.h lj_snap.h lj_target.h \
+ lj_target_*.h lj_ctype.h lj_cdata.h
 lj_state.o: lj_state.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_meta.h \
  lj_state.h lj_frame.h lj_bc.h lj_ctype.h lj_trace.h lj_jit.h lj_ir.h \
@@ -188,12 +191,13 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \
  lj_target.h lj_target_*.h lj_mcode.h lj_carith.c lj_carith.h lj_clib.c \
  lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_lib.h lj_ir.c lj_ircall.h \
  lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \
- lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_mcode.c lj_snap.c \
- lj_record.c lj_record.h lj_ffrecord.h lj_crecord.c lj_crecord.h \
- lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h lj_emit_*.h lj_asm_*.h \
- lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c lib_base.c \
- lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c lib_os.c \
- lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c lib_init.c
+ lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_opt_sink.c \
+ lj_mcode.c lj_snap.c lj_record.c lj_record.h lj_ffrecord.h lj_crecord.c \
+ lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h lj_emit_*.h \
+ lj_asm_*.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c \
+ lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c \
+ lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c \
+ lib_init.c
 luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h
 host/buildvm.o: host/buildvm.c host/buildvm.h lj_def.h lua.h luaconf.h \
  lj_arch.h lj_obj.h lj_def.h lj_arch.h lj_gc.h lj_obj.h lj_bc.h lj_ir.h \

+ 10 - 5
src/jit/dump.lua

@@ -374,10 +374,13 @@ local function dump_snap(tr)
 end
 
 -- Return a register name or stack slot for a rid/sp location.
-local function ridsp_name(ridsp)
+local function ridsp_name(ridsp, ins)
   if not disass then disass = require("jit.dis_"..jit.arch) end
-  local rid = band(ridsp, 0xff)
-  if ridsp > 255 then return format("[%x]", shr(ridsp, 8)*4) end
+  local rid, slot = band(ridsp, 0xff), shr(ridsp, 8)
+  if rid == 253 or rid == 254 then
+    return slot == 0 and " {sink" or format(" {%04d", ins-slot)
+  end
+  if ridsp > 255 then return format("[%x]", slot*4) end
   if rid < 128 then return disass.regname(rid) end
   return ""
 end
@@ -458,13 +461,15 @@ local function dump_ir(tr, dumpsnap, dumpreg)
       end
     elseif op ~= "NOP   " and op ~= "CARG  " and
 	   (dumpreg or op ~= "RENAME") then
+      local rid = band(ridsp, 255)
       if dumpreg then
-	out:write(format("%04d %-5s ", ins, ridsp_name(ridsp)))
+	out:write(format("%04d %-6s", ins, ridsp_name(ridsp, ins)))
       else
 	out:write(format("%04d ", ins))
       end
       out:write(format("%s%s %s %s ",
-		       band(ot, 128) == 0 and " " or ">",
+		       (rid == 254 or rid == 253) and "}" or
+		       (band(ot, 128) == 0 and " " or ">"),
 		       band(ot, 64) == 0 and " " or "+",
 		       irtype[t], op))
       local m1, m2 = band(m, 3), band(m, 3*4)
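
Note: rid values 254 and 253 are the new RID_SINK and RID_SUNK markers; for them the slot field holds the distance from a sunk store back to its allocation. A minimal sketch of the decode path above, assuming LuaJIT's bit library (decode() is a hypothetical stand-alone name):

  local bit = require("bit")
  local band, shr, format = bit.band, bit.rshift, string.format
  local function decode(ridsp, ins)
    local rid, slot = band(ridsp, 0xff), shr(ridsp, 8)
    if rid == 253 or rid == 254 then      -- RID_SUNK or RID_SINK
      -- slot == 0 marks the sunk allocation itself; otherwise it is
      -- the distance back to the allocation, printed as "{NNNN".
      return slot == 0 and " {sink" or format(" {%04d", ins - slot)
    end
    return nil  -- ordinary register or stack slot, handled as before
  end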

+ 52 - 15
src/lj_asm.c

@@ -782,19 +782,44 @@ static int asm_snap_canremat(ASMState *as)
 static void asm_snap_alloc1(ASMState *as, IRRef ref)
 {
   IRIns *ir = IR(ref);
-  if (!ra_used(ir)) {
-    RegSet allow = (!LJ_SOFTFP && irt_isnum(ir->t)) ? RSET_FPR : RSET_GPR;
-    /* Get a weak register if we have a free one or can rematerialize. */
-    if ((as->freeset & allow) ||
-	(allow == RSET_FPR && asm_snap_canremat(as))) {
-      Reg r = ra_allocref(as, ref, allow);  /* Allocate a register. */
-      if (!irt_isphi(ir->t))
-	ra_weak(as, r);  /* But mark it as weakly referenced. */
-      checkmclim(as);
-      RA_DBGX((as, "snapreg   $f $r", ref, ir->r));
+  if (!(ra_used(ir) || ir->r == RID_SUNK)) {
+    if (ir->r == RID_SINK) {
+      ir->r = RID_SUNK;
+#if LJ_HASFFI
+      if (ir->o == IR_CNEWI) {  /* Allocate CNEWI value. */
+	asm_snap_alloc1(as, ir->op2);
+	if (LJ_32 && (ir+1)->o == IR_HIOP)
+	  asm_snap_alloc1(as, (ir+1)->op2);
+      }
+#endif
+      else {  /* Allocate stored values for TNEW, TDUP and CNEW. */
+	IRIns *irs;
+	lua_assert(ir->o == IR_TNEW || ir->o == IR_TDUP || ir->o == IR_CNEW);
+	for (irs = IR(as->curins); irs > ir; irs--)
+	  if (irs->r == RID_SINK && ir + irs->s == irs) {
+	    lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
+		       irs->o == IR_FSTORE || irs->o == IR_XSTORE);
+	    asm_snap_alloc1(as, irs->op2);
+	    if (LJ_32 && (irs+1)->o == IR_HIOP)
+	      asm_snap_alloc1(as, (irs+1)->op2);
+	  }
+      }
+    } else if (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT) {
+      asm_snap_alloc1(as, ir->op1);
     } else {
-      ra_spill(as, ir);  /* Otherwise force a spill slot. */
-      RA_DBGX((as, "snapspill $f $s", ref, ir->s));
+      RegSet allow = (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR;
+      if ((as->freeset & allow) ||
+	       (allow == RSET_FPR && asm_snap_canremat(as))) {
+	/* Get a weak register if we have a free one or can rematerialize. */
+	Reg r = ra_allocref(as, ref, allow);  /* Allocate a register. */
+	if (!irt_isphi(ir->t))
+	  ra_weak(as, r);  /* But mark it as weakly referenced. */
+	checkmclim(as);
+	RA_DBGX((as, "snapreg   $f $r", ref, ir->r));
+      } else {
+	ra_spill(as, ir);  /* Otherwise force a spill slot. */
+	RA_DBGX((as, "snapspill $f $s", ref, ir->s));
+      }
     }
   }
 }
@@ -848,7 +873,7 @@ static void asm_snap_prep(ASMState *as)
 {
   if (as->curins < as->snapref) {
     do {
-      lua_assert(as->snapno != 0);
+      if (as->snapno == 0) return;  /* Called by sunk stores before snap #0. */
       as->snapno--;
       as->snapref = as->T->snap[as->snapno].ref;
     } while (as->curins < as->snapref);
@@ -1180,6 +1205,8 @@ static void asm_phi(ASMState *as, IRIns *ir)
   RegSet afree = (as->freeset & allow);
   IRIns *irl = IR(ir->op1);
   IRIns *irr = IR(ir->op2);
+  if (ir->r == RID_SINK)  /* Sink PHI. */
+    return;
   /* Spill slot shuffling is not implemented yet (but rarely needed). */
   if (ra_hasspill(irl->s) || ra_hasspill(irr->s))
     lj_trace_err(as->J, LJ_TRERR_NYIPHI);
@@ -1494,7 +1521,7 @@ static void asm_tail_link(ASMState *as)
 /* -- Trace setup --------------------------------------------------------- */
 
 /* Clear reg/sp for all instructions and add register hints. */
-static void asm_setup_regsp(ASMState *as)
+static void asm_setup_regsp(ASMState *as, int sink)
 {
   GCtrace *T = as->T;
   IRRef nins = T->nins;
@@ -1545,6 +1572,14 @@ static void asm_setup_regsp(ASMState *as)
   inloop = 0;
   as->evenspill = SPS_FIRST;
   for (lastir = IR(nins); ir < lastir; ir++) {
+    if (sink) {
+      if (ir->r == RID_SINK)
+	continue;
+      if (ir->r == RID_SUNK) {  /* Revert after ASM restart. */
+	ir->r = RID_SINK;
+	continue;
+      }
+    }
     switch (ir->o) {
     case IR_LOOP:
       inloop = 1;
@@ -1716,6 +1751,7 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
   ASMState as_;
   ASMState *as = &as_;
   MCode *origtop;
+  int sink;
 
   /* Ensure an initialized instruction beyond the last one for HIOP checks. */
   J->cur.nins = lj_ir_nextins(J);
@@ -1736,6 +1772,7 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
   as->mcp = as->mctop;
   as->mclim = as->mcbot + MCLIM_REDZONE;
   asm_setup_target(as);
+  sink = (IR(REF_BASE)->prev == 1);
 
   do {
     as->mcp = as->mctop;
@@ -1751,7 +1788,7 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
     as->gcsteps = 0;
     as->sectref = as->loopref;
     as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED;
-    asm_setup_regsp(as);
+    asm_setup_regsp(as, sink);
     if (!as->loopref)
       asm_tail_link(as);
 

+ 31 - 18
src/lj_asm_arm.h

@@ -693,6 +693,8 @@ static void asm_newref(ASMState *as, IRIns *ir)
 {
   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
   IRRef args[3];
+  if (ir->r == RID_SINK)  /* Sink newref. */
+    return;
   args[0] = ASMREF_L;     /* lua_State *L */
   args[1] = ir->op1;      /* GCtab *t     */
   args[2] = ASMREF_TMP1;  /* cTValue *key */
@@ -836,9 +838,13 @@ static void asm_xload(ASMState *as, IRIns *ir)
 
 static void asm_xstore(ASMState *as, IRIns *ir, int32_t ofs)
 {
-  Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
-  asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
-	       rset_exclude(RSET_GPR, src), ofs);
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+  } else {
+    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
+    asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
+		 rset_exclude(RSET_GPR, src), ofs);
+  }
 }
 
 static void asm_ahuvload(ASMState *as, IRIns *ir)
@@ -876,21 +882,25 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
 
 static void asm_ahustore(ASMState *as, IRIns *ir)
 {
-  RegSet allow = RSET_GPR;
-  Reg idx, src = RID_NONE, type = RID_NONE;
-  int32_t ofs = 0;
-  int hiop = ((ir+1)->o == IR_HIOP);
-  if (!irt_ispri(ir->t)) {
-    src = ra_alloc1(as, ir->op2, allow);
-    rset_clear(allow, src);
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+  } else {
+    RegSet allow = RSET_GPR;
+    Reg idx, src = RID_NONE, type = RID_NONE;
+    int32_t ofs = 0;
+    int hiop = ((ir+1)->o == IR_HIOP);
+    if (!irt_ispri(ir->t)) {
+      src = ra_alloc1(as, ir->op2, allow);
+      rset_clear(allow, src);
+    }
+    if (hiop)
+      type = ra_alloc1(as, (ir+1)->op2, allow);
+    else
+      type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+    idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type));
+    if (ra_hasreg(src)) emit_lso(as, ARMI_STR, src, idx, ofs);
+    emit_lso(as, ARMI_STR, type, idx, ofs+4);
   }
-  if (hiop)
-    type = ra_alloc1(as, (ir+1)->op2, allow);
-  else
-    type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
-  idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type));
-  if (ra_hasreg(src)) emit_lso(as, ARMI_STR, src, idx, ofs);
-  emit_lso(as, ARMI_STR, type, idx, ofs+4);
 }
 
 static void asm_sload(ASMState *as, IRIns *ir)
@@ -1382,7 +1392,10 @@ static void asm_hiop(ASMState *as, IRIns *ir)
       asm_fpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_HI : CC_LO);
     return;
   } else if ((ir-1)->o == IR_XSTORE) {
-    asm_xstore(as, ir, 4);
+    if ((ir-1)->r == RID_SINK)
+      asm_snap_prep(as);
+    else
+      asm_xstore(as, ir, 4);
     return;
   }
   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */

+ 30 - 13
src/lj_asm_mips.h

@@ -769,14 +769,18 @@ nolo:
 
 static void asm_newref(ASMState *as, IRIns *ir)
 {
-  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
-  IRRef args[3];
-  args[0] = ASMREF_L;     /* lua_State *L */
-  args[1] = ir->op1;      /* GCtab *t     */
-  args[2] = ASMREF_TMP1;  /* cTValue *key */
-  asm_setupresult(as, ir, ci);  /* TValue * */
-  asm_gencall(as, ci, args);
-  asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op2);
+  if (ir->r == RID_SINK) {  /* Sink newref. */
+    return;
+  } else {
+    const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
+    IRRef args[3];
+    args[0] = ASMREF_L;     /* lua_State *L */
+    args[1] = ir->op1;      /* GCtab *t     */
+    args[2] = ASMREF_TMP1;  /* cTValue *key */
+    asm_setupresult(as, ir, ci);  /* TValue * */
+    asm_gencall(as, ci, args);
+    asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op2);
+  }
 }
 
 static void asm_uref(ASMState *as, IRIns *ir)
@@ -912,9 +916,14 @@ static void asm_xload(ASMState *as, IRIns *ir)
 
 static void asm_xstore(ASMState *as, IRIns *ir, int32_t ofs)
 {
-  Reg src = ra_alloc1z(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
-  asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
-	       rset_exclude(RSET_GPR, src), ofs);
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+    return;
+  } else {
+    Reg src = ra_alloc1z(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
+    asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
+		 rset_exclude(RSET_GPR, src), ofs);
+  }
 }
 
 static void asm_ahuvload(ASMState *as, IRIns *ir)
@@ -947,6 +956,10 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
   RegSet allow = RSET_GPR;
   Reg idx, src = RID_NONE, type = RID_NONE;
   int32_t ofs = 0;
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+    return;
+  }
   if (irt_isnum(ir->t)) {
     src = ra_alloc1(as, ir->op2, RSET_FPR);
   } else {
@@ -1561,8 +1574,12 @@ static void asm_hiop(ASMState *as, IRIns *ir)
     return;
   } else if ((ir-1)->o == IR_XSTORE) {
     as->curins--;  /* Handle both stores here. */
-    asm_xstore(as, ir, LJ_LE ? 4 : 0);
-    asm_xstore(as, ir-1, LJ_LE ? 0 : 4);
+    if ((ir-1)->r == RID_SINK) {
+      asm_snap_prep(as);
+    } else {
+      asm_xstore(as, ir, LJ_LE ? 4 : 0);
+      asm_xstore(as, ir-1, LJ_LE ? 0 : 4);
+    }
     return;
   }
   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */

+ 26 - 8
src/lj_asm_ppc.h

@@ -773,6 +773,8 @@ static void asm_newref(ASMState *as, IRIns *ir)
 {
   const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
   IRRef args[3];
+  if (ir->r == RID_SINK)  /* Sink newref. */
+    return;
   args[0] = ASMREF_L;     /* lua_State *L */
   args[1] = ir->op1;      /* GCtab *t     */
   args[2] = ASMREF_TMP1;  /* cTValue *key */
@@ -892,12 +894,16 @@ static void asm_fload(ASMState *as, IRIns *ir)
 
 static void asm_fstore(ASMState *as, IRIns *ir)
 {
-  Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
-  IRIns *irf = IR(ir->op1);
-  Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
-  int32_t ofs = field_ofs[irf->op2];
-  PPCIns pi = asm_fxstoreins(ir);
-  emit_tai(as, pi, src, idx, ofs);
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+  } else {
+    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
+    IRIns *irf = IR(ir->op1);
+    Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
+    int32_t ofs = field_ofs[irf->op2];
+    PPCIns pi = asm_fxstoreins(ir);
+    emit_tai(as, pi, src, idx, ofs);
+  }
 }
 
 static void asm_xload(ASMState *as, IRIns *ir)
@@ -912,6 +918,10 @@ static void asm_xload(ASMState *as, IRIns *ir)
 static void asm_xstore(ASMState *as, IRIns *ir, int32_t ofs)
 {
   IRIns *irb;
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+    return;
+  }
   if (ofs == 0 && mayfuse(as, ir->op2) && (irb = IR(ir->op2))->o == IR_BSWAP &&
       ra_noreg(irb->r) && (irt_isint(ir->t) || irt_isu32(ir->t))) {
     /* Fuse BSWAP with XSTORE to stwbrx. */
@@ -968,6 +978,10 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
   RegSet allow = RSET_GPR;
   Reg idx, src = RID_NONE, type = RID_NONE;
   int32_t ofs = AHUREF_LSX;
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+    return;
+  }
   if (irt_isnum(ir->t)) {
     src = ra_alloc1(as, ir->op2, RSET_FPR);
   } else {
@@ -1747,8 +1761,12 @@ static void asm_hiop(ASMState *as, IRIns *ir)
     return;
   } else if ((ir-1)->o == IR_XSTORE) {
     as->curins--;  /* Handle both stores here. */
-    asm_xstore(as, ir, 0);
-    asm_xstore(as, ir-1, 4);
+    if ((ir-1)->r == RID_SINK) {
+      asm_snap_prep(as);
+    } else {
+      asm_xstore(as, ir, 0);
+      asm_xstore(as, ir-1, 4);
+    }
     return;
   }
   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */

+ 14 - 1
src/lj_asm_x86.h

@@ -1155,6 +1155,8 @@ static void asm_newref(ASMState *as, IRIns *ir)
   IRRef args[3];
   IRIns *irkey;
   Reg tmp;
+  if (ir->r == RID_SINK)  /* Sink newref. */
+    return;
   args[0] = ASMREF_L;     /* lua_State *L */
   args[1] = ir->op1;      /* GCtab *t     */
   args[2] = ASMREF_TMP1;  /* cTValue *key */
@@ -1259,6 +1261,10 @@ static void asm_fxstore(ASMState *as, IRIns *ir)
   RegSet allow = RSET_GPR;
   Reg src = RID_NONE, osrc = RID_NONE;
   int32_t k = 0;
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+    return;
+  }
   /* The IRT_I16/IRT_U16 stores should never be simplified for constant
   ** values since mov word [mem], imm16 has a length-changing prefix.
   */
@@ -1372,6 +1378,10 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
 
 static void asm_ahustore(ASMState *as, IRIns *ir)
 {
+  if (ir->r == RID_SINK) {  /* Sink store. */
+    asm_snap_prep(as);
+    return;
+  }
   if (irt_isnum(ir->t)) {
     Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
     asm_fuseahuref(as, ir->op1, RSET_GPR);
@@ -2251,7 +2261,10 @@ static void asm_hiop(ASMState *as, IRIns *ir)
     asm_comp_int64(as, ir);
     return;
   } else if ((ir-1)->o == IR_XSTORE) {
-    asm_fxstore(as, ir);
+    if ((ir-1)->r == RID_SINK)
+      asm_snap_prep(as);
+    else
+      asm_fxstore(as, ir);
     return;
   }
   if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */

+ 1 - 0
src/lj_iropt.h

@@ -154,6 +154,7 @@ LJ_FUNC void lj_opt_split(jit_State *J);
 #else
 #define lj_opt_split(J)		UNUSED(J)
 #endif
+LJ_FUNC void lj_opt_sink(jit_State *J);
 
 #endif
 

+ 5 - 4
src/lj_jit.h

@@ -63,19 +63,20 @@
 #define JIT_F_OPT_NARROW	0x00200000
 #define JIT_F_OPT_LOOP		0x00400000
 #define JIT_F_OPT_ABC		0x00800000
-#define JIT_F_OPT_FUSE		0x01000000
+#define JIT_F_OPT_SINK		0x01000000
+#define JIT_F_OPT_FUSE		0x02000000
 
 /* Optimizations names for -O. Must match the order above. */
 #define JIT_F_OPT_FIRST		JIT_F_OPT_FOLD
 #define JIT_F_OPTSTRING	\
-  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4fuse"
+  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse"
 
 /* Optimization levels set a fixed combination of flags. */
 #define JIT_F_OPT_0	0
 #define JIT_F_OPT_1	(JIT_F_OPT_FOLD|JIT_F_OPT_CSE|JIT_F_OPT_DCE)
 #define JIT_F_OPT_2	(JIT_F_OPT_1|JIT_F_OPT_NARROW|JIT_F_OPT_LOOP)
-#define JIT_F_OPT_3 \
-  (JIT_F_OPT_2|JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_FUSE)
+#define JIT_F_OPT_3	(JIT_F_OPT_2|\
+  JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
 #define JIT_F_OPT_DEFAULT	JIT_F_OPT_3
 
 #if LJ_TARGET_WINDOWS || LJ_64
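
Note: JIT_F_OPTSTRING packs the -O option names as length-prefixed strings, so the "\4sink" entry must sit in the same position as the new JIT_F_OPT_SINK bit (toggled with -O+sink / -O-sink). A small sketch of how the string expands:

  -- Sketch: expanding the length-prefixed option string.
  local s = "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse"
  local pos, names = 1, {}
  while pos <= #s do
    local len = s:byte(pos)
    names[#names+1] = s:sub(pos + 1, pos + len)
    pos = pos + 1 + len
  end
  -- names: fold cse dce fwd dse narrow loop abc sink fuse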

+ 244 - 0
src/lj_opt_sink.c

@@ -0,0 +1,244 @@
+/*
+** SINK: Allocation Sinking and Store Sinking.
+** Copyright (C) 2005-2012 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_sink_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_target.h"
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)		(&J->cur.ir[(ref)])
+
+/* Check whether the store ref points to an eligible allocation. */
+static IRIns *sink_checkalloc(jit_State *J, IRIns *irs)
+{
+  IRIns *ir = IR(irs->op1);
+  if (!irref_isk(ir->op2))
+    return NULL;  /* Non-constant key. */
+  if (ir->o == IR_HREFK || ir->o == IR_AREF)
+    ir = IR(ir->op1);
+  else if (!(ir->o == IR_HREF || ir->o == IR_NEWREF ||
+	     ir->o == IR_FREF || ir->o == IR_ADD))
+    return NULL;  /* Unhandled reference type (for XSTORE). */
+  ir = IR(ir->op1);
+  if (!(ir->o == IR_TNEW || ir->o == IR_TDUP || ir->o == IR_CNEW))
+    return NULL;  /* Not an allocation. */
+  if (ir + 255 < irs)
+    return NULL;  /* Out of range. */
+  return ir;  /* Return allocation. */
+}
+
+/* Recursively check whether a value depends on a PHI. */
+static int sink_phidep(jit_State *J, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (irt_isphi(ir->t)) return 1;
+  if (ir->op1 >= REF_FIRST && sink_phidep(J, ir->op1)) return 1;
+  if (ir->op2 >= REF_FIRST && sink_phidep(J, ir->op2)) return 1;
+  return 0;
+}
+
+/* Check whether a value is a sinkable PHI or a non-PHI. */
+static int sink_checkphi(jit_State *J, IRIns *ira, IRRef ref)
+{
+  if (ref >= REF_FIRST) {
+    IRIns *ir = IR(ref);
+    if (irt_isphi(ir->t) || (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT &&
+			     irt_isphi(IR(ir->op1)->t))) {
+      ira->prev++;
+      return 1;  /* Sinkable PHI. */
+    }
+    return !sink_phidep(J, ref);  /* Must be a non-PHI then. */
+  }
+  return 1;  /* Constant (non-PHI). */
+}
+
+/* Mark non-sinkable allocations using single-pass backward propagation.
+**
+** Roots for the marking process are:
+** - Some PHIs or snapshots (see below).
+** - Non-PHI, non-constant values stored to PHI allocations.
+** - All guards.
+** - Any remaining loads not eliminated by store-to-load forwarding.
+** - Stores with non-constant keys.
+** - All stored values.
+*/
+static void sink_mark_ins(jit_State *J)
+{
+  IRIns *ir, *irlast = IR(J->cur.nins-1);
+  for (ir = irlast ; ; ir--) {
+    switch (ir->o) {
+    case IR_BASE:
+      return;  /* Finished. */
+    case IR_CALLL:  /* IRCALL_lj_tab_len */
+    case IR_ALOAD: case IR_HLOAD: case IR_XLOAD:
+      irt_setmark(IR(ir->op1)->t);  /* Mark ref for remaining loads. */
+      break;
+    case IR_FLOAD:
+      if (irt_ismarked(ir->t) || ir->op2 == IRFL_TAB_META)
+	irt_setmark(IR(ir->op1)->t);  /* Mark table for remaining loads. */
+      break;
+    case IR_ASTORE: case IR_HSTORE: case IR_FSTORE: case IR_XSTORE: {
+      IRIns *ira = sink_checkalloc(J, ir);
+      if (!ira || (irt_isphi(ira->t) && !sink_checkphi(J, ira, ir->op2)))
+	irt_setmark(IR(ir->op1)->t);  /* Mark ineligible ref. */
+      irt_setmark(IR(ir->op2)->t);  /* Mark stored value. */
+      break;
+      }
+#if LJ_HASFFI
+    case IR_CNEWI:
+      if (irt_isphi(ir->t) &&
+	  (!sink_checkphi(J, ir, ir->op2) ||
+	   (LJ_32 && ir+1 < irlast && (ir+1)->o == IR_HIOP &&
+	    !sink_checkphi(J, ir, (ir+1)->op2))))
+	irt_setmark(ir->t);  /* Mark ineligible allocation. */
+      /* fallthrough */
+#endif
+    case IR_USTORE:
+      irt_setmark(IR(ir->op2)->t);  /* Mark stored value. */
+      break;
+#if LJ_HASFFI
+    case IR_CALLXS:
+#endif
+    case IR_CALLS:
+      irt_setmark(IR(ir->op1)->t);  /* Mark (potentially) stored values. */
+      break;
+    case IR_PHI: {
+      IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
+      irl->prev = irr->prev = 0;  /* Clear PHI value counts. */
+      if (irl->o == irr->o &&
+	  (irl->o == IR_TNEW || irl->o == IR_TDUP ||
+	   (LJ_HASFFI && (irl->o == IR_CNEW || irl->o == IR_CNEWI))))
+	break;
+      irt_setmark(irl->t);
+      irt_setmark(irr->t);
+      break;
+      }
+    default:
+      if (irt_ismarked(ir->t) || irt_isguard(ir->t)) {  /* Propagate mark. */
+	if (ir->op1 >= REF_FIRST) irt_setmark(IR(ir->op1)->t);
+	if (ir->op2 >= REF_FIRST) irt_setmark(IR(ir->op2)->t);
+      }
+      break;
+    }
+  }
+}
+
+/* Mark all instructions referenced by a snapshot. */
+static void sink_mark_snap(jit_State *J, SnapShot *snap)
+{
+  SnapEntry *map = &J->cur.snapmap[snap->mapofs];
+  MSize n, nent = snap->nent;
+  for (n = 0; n < nent; n++) {
+    IRRef ref = snap_ref(map[n]);
+    if (!irref_isk(ref))
+      irt_setmark(IR(ref)->t);
+  }
+}
+
+/* Iteratively remark PHI refs with differing marks or PHI value counts. */
+static void sink_remark_phi(jit_State *J)
+{
+  IRIns *ir;
+  int remark;
+  do {
+    remark = 0;
+    for (ir = IR(J->cur.nins-1); ir->o == IR_PHI; ir--) {
+      IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
+      if (((irl->t.irt ^ irr->t.irt) & IRT_MARK))
+	remark = 1;
+      else if (irl->prev == irr->prev)
+	continue;
+      irt_setmark(IR(ir->op1)->t);
+      irt_setmark(IR(ir->op2)->t);
+    }
+  } while (remark);
+}
+
+/* Sweep instructions and mark sunken allocations and stores. */
+static void sink_sweep_ins(jit_State *J)
+{
+  IRIns *ir, *irfirst = IR(J->cur.nk);
+  for (ir = IR(J->cur.nins-1) ; ir >= irfirst; ir--) {
+    switch (ir->o) {
+    case IR_ASTORE: case IR_HSTORE: case IR_FSTORE: case IR_XSTORE: {
+      IRIns *ira = sink_checkalloc(J, ir);
+      if (ira && !irt_ismarked(ira->t))
+	ir->prev = REGSP(RID_SINK, (int)(ir - ira));
+      else
+	ir->prev = REGSP_INIT;
+      break;
+      }
+    case IR_NEWREF:
+      if (!irt_ismarked(ir->t)) {
+	ir->prev = REGSP(RID_SINK, 0);
+      } else {
+	irt_clearmark(ir->t);
+	ir->prev = REGSP_INIT;
+      }
+      break;
+#if LJ_HASFFI
+    case IR_CNEW: case IR_CNEWI:
+#endif
+    case IR_TNEW: case IR_TDUP:
+      if (!irt_ismarked(ir->t)) {
+	ir->t.irt &= ~IRT_GUARD;
+	ir->prev = REGSP(RID_SINK, 0);
+      } else {
+	irt_clearmark(ir->t);
+	ir->prev = REGSP_INIT;
+      }
+      break;
+    case IR_PHI: {
+      IRIns *ira = IR(ir->op2);
+      if (!irt_ismarked(ira->t) &&
+	  (ira->o == IR_TNEW || ira->o == IR_TDUP ||
+	   (LJ_HASFFI && (ira->o == IR_CNEW || ira->o == IR_CNEWI)))) {
+	ir->prev = REGSP(RID_SINK, 0);
+      } else {
+	ir->prev = REGSP_INIT;
+      }
+      break;
+      }
+    default:
+      irt_clearmark(ir->t);
+      ir->prev = REGSP_INIT;
+      break;
+    }
+  }
+  IR(REF_BASE)->prev = 1;  /* Signal SINK flags to assembler. */
+}
+
+/* Allocation sinking and store sinking.
+**
+** 1. Mark all non-sinkable allocations.
+** 2. Then sink all remaining allocations and the related stores.
+*/
+void lj_opt_sink(jit_State *J)
+{
+  const uint32_t need = (JIT_F_OPT_SINK|JIT_F_OPT_FWD|
+			 JIT_F_OPT_DCE|JIT_F_OPT_CSE|JIT_F_OPT_FOLD);
+  if ((J->flags & need) == need &&
+      (J->chain[IR_TNEW] || J->chain[IR_TDUP] ||
+       (LJ_HASFFI && (J->chain[IR_CNEW] || J->chain[IR_CNEWI])))) {
+    if (!J->loopref)
+      sink_mark_snap(J, &J->cur.snap[J->cur.nsnap-1]);
+    sink_mark_ins(J);
+    if (J->loopref)
+      sink_remark_phi(J);
+    sink_sweep_ins(J);
+  }
+}
+
+#undef IR
+
+#endif
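
Note: sink_sweep_ins() ties each sunk store to its allocation with REGSP(RID_SINK, distance), reusing the 8-bit spill field for the distance; this is why sink_checkalloc() rejects allocations more than 255 instructions before the store, and why the assembler can test "irs->r == RID_SINK && ir + irs->s == irs". A minimal sketch of the encoding, assuming LuaJIT's bit library:

  -- Sketch: REGSP packs the register in the low byte and the spill
  -- slot (here: store-to-allocation distance) in the bits above it.
  local bit = require("bit")
  local RID_SINK = 0xfe
  local function REGSP(r, s) return r + s * 256 end
  local rs = REGSP(RID_SINK, 42)          -- store 42 ins after its alloc
  assert(bit.band(rs, 0xff) == RID_SINK)  -- regsp_reg(rs)
  assert(bit.rshift(rs, 8) == 42)         -- regsp_spill(rs): the distance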

+ 317 - 13
src/lj_snap.c

@@ -11,6 +11,7 @@
 #if LJ_HASJIT
 
 #include "lj_gc.h"
+#include "lj_tab.h"
 #include "lj_state.h"
 #include "lj_frame.h"
 #include "lj_bc.h"
@@ -20,10 +21,17 @@
 #include "lj_trace.h"
 #include "lj_snap.h"
 #include "lj_target.h"
+#if LJ_HASFFI
+#include "lj_ctype.h"
+#include "lj_cdata.h"
+#endif
 
 /* Some local macros to save typing. Undef'd at the end. */
 #define IR(ref)		(&J->cur.ir[(ref)])
 
+/* Pass IR on to next optimization in chain (FOLD). */
+#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
+
 /* Emit raw IR without passing through optimizations. */
 #define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
 
@@ -370,6 +378,31 @@ static TRef snap_replay_const(jit_State *J, IRIns *ir)
   }
 }
 
+/* De-duplicate parent reference. */
+static TRef snap_dedup(jit_State *J, SnapEntry *map, MSize nmax, IRRef ref)
+{
+  MSize j;
+  for (j = 0; j < nmax; j++)
+    if (snap_ref(map[j]) == ref)
+      return J->slot[snap_slot(map[j])];
+  return 0;
+}
+
+/* Emit parent reference with de-duplication. */
+static TRef snap_pref(jit_State *J, GCtrace *T, SnapEntry *map, MSize nmax,
+		      BloomFilter seen, IRRef ref)
+{
+  IRIns *ir = &T->ir[ref];
+  TRef tr;
+  if (irref_isk(ref))
+    tr = snap_replay_const(J, ir);
+  else if (!regsp_used(ir->prev))
+    tr = 0;
+  else if (!bloomtest(seen, ref) || (tr = snap_dedup(J, map, nmax, ref)) == 0)
+    tr = emitir(IRT(IR_PVAL, irt_type(ir->t)), ref - REF_BIAS, 0);
+  return tr;
+}
+
 /* Replay snapshot state to setup side trace. */
 void lj_snap_replay(jit_State *J, GCtrace *T)
 {
@@ -377,6 +410,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
   SnapEntry *map = &T->snapmap[snap->mapofs];
   MSize n, nent = snap->nent;
   BloomFilter seen = 0;
+  int pass23 = 0;
   J->framedepth = 0;
   /* Emit IR for slots inherited from parent snapshot. */
   for (n = 0; n < nent; n++) {
@@ -386,21 +420,18 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
     IRIns *ir = &T->ir[ref];
     TRef tr;
     /* The bloom filter avoids O(nent^2) overhead for de-duping slots. */
-    if (bloomtest(seen, ref)) {
-      MSize j;
-      for (j = 0; j < n; j++)
-	if (snap_ref(map[j]) == ref) {
-	  tr = J->slot[snap_slot(map[j])];
-	  goto setslot;
-	}
-    }
+    if (bloomtest(seen, ref) && (tr = snap_dedup(J, map, n, ref)) != 0)
+      goto setslot;
     bloomset(seen, ref);
     if (irref_isk(ref)) {
       tr = snap_replay_const(J, ir);
+    } else if (!regsp_used(ir->prev)) {
+      pass23 = 1;
+      lua_assert(s != 0);
+      tr = s;
     } else {
       IRType t = irt_type(ir->t);
       uint32_t mode = IRSLOAD_INHERIT|IRSLOAD_PARENT;
-      lua_assert(regsp_used(ir->prev));
       if (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM)) t = IRT_NUM;
       if (ir->o == IR_SLOAD) mode |= (ir->op2 & IRSLOAD_READONLY);
       tr = emitir_raw(IRT(IR_SLOAD, t), s, mode);
@@ -411,13 +442,126 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
     if ((sn & SNAP_FRAME))
       J->baseslot = s+1;
   }
+  if (pass23) {
+    IRIns *irlast = &T->ir[(snap+1)->ref];
+    lua_assert(J->exitno+1 < T->nsnap);
+    pass23 = 0;
+    /* Emit dependent PVALs. */
+    for (n = 0; n < nent; n++) {
+      SnapEntry sn = map[n];
+      IRRef refp = snap_ref(sn);
+      IRIns *ir = &T->ir[refp];
+      if (regsp_reg(ir->r) == RID_SUNK) {
+	if (J->slot[snap_slot(sn)] != snap_slot(sn)) continue;
+	pass23 = 1;
+	lua_assert(ir->o == IR_TNEW || ir->o == IR_TDUP ||
+		   ir->o == IR_CNEW || ir->o == IR_CNEWI);
+	if (ir->op1 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op1);
+	if (ir->op2 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op2);
+	if (LJ_HASFFI && ir->o == IR_CNEWI) {
+	  if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP)
+	    snap_pref(J, T, map, nent, seen, (ir+1)->op2);
+	} else {
+	  IRIns *irs;
+	  for (irs = ir+1; irs < irlast; irs++)
+	    if (irs->r == RID_SINK && ir + irs->s == irs) {
+	      if (snap_pref(J, T, map, nent, seen, irs->op2) == 0)
+		snap_pref(J, T, map, nent, seen, T->ir[irs->op2].op1);
+	      else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) &&
+		       irs+1 < irlast && (irs+1)->o == IR_HIOP)
+		snap_pref(J, T, map, nent, seen, (irs+1)->op2);
+	    }
+	}
+      } else if (!irref_isk(refp) && !regsp_used(ir->prev)) {
+	lua_assert(ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT);
+	J->slot[snap_slot(sn)] = snap_pref(J, T, map, nent, seen, ir->op1);
+      }
+    }
+    /* Replay sunk instructions. */
+    for (n = 0; pass23 && n < nent; n++) {
+      SnapEntry sn = map[n];
+      IRRef refp = snap_ref(sn);
+      IRIns *ir = &T->ir[refp];
+      if (regsp_reg(ir->r) == RID_SUNK) {
+	TRef op1, op2;
+	if (J->slot[snap_slot(sn)] != snap_slot(sn)) {  /* De-dup allocs. */
+	  J->slot[snap_slot(sn)] = J->slot[J->slot[snap_slot(sn)]];
+	  continue;
+	}
+	op1 = ir->op1;
+	if (op1 >= T->nk) op1 = snap_pref(J, T, map, nent, seen, op1);
+	op2 = ir->op2;
+	if (op2 >= T->nk) op2 = snap_pref(J, T, map, nent, seen, op2);
+	if (LJ_HASFFI && ir->o == IR_CNEWI) {
+	  if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP) {
+	    lj_needsplit(J);  /* Emit joining HIOP. */
+	    op2 = emitir_raw(IRT(IR_HIOP, IRT_I64), op2,
+			     snap_pref(J, T, map, nent, seen, (ir+1)->op2));
+	  }
+	  J->slot[snap_slot(sn)] = emitir(ir->ot, op1, op2);
+	} else {
+	  IRIns *irs;
+	  TRef tr = emitir(ir->ot, op1, op2);
+	  J->slot[snap_slot(sn)] = tr;
+	  for (irs = ir+1; irs < irlast; irs++)
+	    if (irs->r == RID_SINK && ir + irs->s == irs) {
+	      IRIns *irr = &T->ir[irs->op1];
+	      TRef val, key = irr->op2, tmp = tr;
+	      if (irr->o != IR_FREF) {
+		IRIns *irk = &T->ir[key];
+		if (irr->o == IR_HREFK)
+		  key = lj_ir_kslot(J, snap_replay_const(J, &T->ir[irk->op1]),
+				    irk->op2);
+		else
+		  key = snap_replay_const(J, irk);
+		if (irr->o == IR_HREFK || irr->o == IR_AREF) {
+		  IRIns *irf = &T->ir[irr->op1];
+		  tmp = emitir(irf->ot, tmp, irf->op2);
+		}
+	      }
+	      tmp = emitir(irr->ot, tmp, key);
+	      val = snap_pref(J, T, map, nent, seen, irs->op2);
+	      if (val == 0) {
+		IRIns *irc = &T->ir[irs->op2];
+		lua_assert(irc->o == IR_CONV && irc->op2 == IRCONV_NUM_INT);
+		val = snap_pref(J, T, map, nent, seen, irc->op1);
+		val = emitir(IRTN(IR_CONV), val, IRCONV_NUM_INT);
+	      } else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) &&
+			 irs+1 < irlast && (irs+1)->o == IR_HIOP) {
+		IRType t = IRT_I64;
+		if (LJ_SOFTFP && irt_type((irs+1)->t) == IRT_SOFTFP)
+		  t = IRT_NUM;
+		if (irref_isk(irs->op2) && irref_isk((irs+1)->op2)) {
+		  uint64_t k = (uint32_t)T->ir[irs->op2].i +
+			       ((uint64_t)T->ir[(irs+1)->op2].i << 32);
+		  val = lj_ir_k64(J, t == IRT_I64 ? IR_KINT64 : IR_KNUM,
+				  lj_ir_k64_find(J, k));
+		} else {
+		  val = emitir_raw(IRT(IR_HIOP, t), val,
+			  snap_pref(J, T, map, nent, seen, (irs+1)->op2));
+		}
+		tmp = emitir(IRT(irs->o, t), tmp, val);
+		continue;
+	      }
+	      tmp = emitir(irs->ot, tmp, val);
+	    }
+	}
+      }
+    }
+  }
   J->base = J->slot + J->baseslot;
   J->maxslot = snap->nslots - J->baseslot;
   lj_snap_add(J);
+  if (pass23)  /* Need explicit GC step _after_ initial snapshot. */
+    emitir_raw(IRTG(IR_GCSTEP, IRT_NIL), 0, 0);
 }
 
 /* -- Snapshot restore ---------------------------------------------------- */
 
+static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
+			SnapNo snapno, BloomFilter rfilt,
+			IRIns *ir, TValue *o);
+
 /* Restore a value from the trace exit state. */
 static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex,
 			    SnapNo snapno, BloomFilter rfilt,
@@ -450,8 +594,12 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex,
     }
   } else {  /* Restore from register. */
     Reg r = regsp_reg(rs);
-    lua_assert(ra_hasreg(r));
-    if (irt_isinteger(t)) {
+    if (ra_noreg(r)) {
+      lua_assert(ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT);
+      snap_restoreval(J, T, ex, snapno, rfilt, ir->op1, o);
+      if (LJ_DUALNUM) setnumV(o, (lua_Number)intV(o));
+      return;
+    } else if (irt_isinteger(t)) {
       setintV(o, (int32_t)ex->gpr[r-RID_MIN_GPR]);
 #if !LJ_SOFTFP
     } else if (irt_isnum(t)) {
@@ -468,6 +616,148 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex,
   }
 }
 
+#if LJ_HASFFI
+/* Restore raw data from the trace exit state. */
+static void snap_restoredata(GCtrace *T, ExitState *ex,
+			     SnapNo snapno, BloomFilter rfilt,
+			     IRRef ref, void *dst, CTSize sz)
+{
+  IRIns *ir = &T->ir[ref];
+  RegSP rs = ir->prev;
+  int32_t *src;
+  union { uint64_t u64; float f; } tmp;
+  if (irref_isk(ref)) {
+    if (ir->o == IR_KNUM || ir->o == IR_KINT64) {
+      src = mref(ir->ptr, int32_t);
+    } else if (sz == 8) {
+      tmp.u64 = (uint64_t)(uint32_t)ir->i;
+      src = (int32_t *)&tmp.u64;
+    } else {
+      src = &ir->i;
+    }
+  } else {
+    if (LJ_UNLIKELY(bloomtest(rfilt, ref)))
+      rs = snap_renameref(T, snapno, ref, rs);
+    if (ra_hasspill(regsp_spill(rs))) {
+      src = &ex->spill[regsp_spill(rs)];
+    } else {
+      Reg r = regsp_reg(rs);
+      if (ra_noreg(r)) {
+	/* Note: this assumes CNEWI is never used for SOFTFP split numbers. */
+	lua_assert(sz == 8 && ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT);
+	snap_restoredata(T, ex, snapno, rfilt, ir->op1, dst, 4);
+	*(lua_Number *)dst = (lua_Number)*(int32_t *)dst;
+	return;
+      }
+      src = (int32_t *)&ex->gpr[r-RID_MIN_GPR];
+#if !LJ_SOFTFP
+      if (r >= RID_MAX_GPR) {
+	src = (int32_t *)&ex->fpr[r-RID_MIN_FPR];
+#if LJ_TARGET_PPC
+	if (sz == 4) {  /* PPC FPRs are always doubles. */
+	  tmp.f = (float)*(double *)src;
+	  src = (int32_t *)&tmp.f;
+	}
+#else
+	if (LJ_BE && sz == 4) src++;
+#endif
+      }
+#endif
+    }
+  }
+  lua_assert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
+  if (sz == 4) *(int32_t *)dst = *src;
+  else if (sz == 8) *(int64_t *)dst = *(int64_t *)src;
+  else if (sz == 1) *(int8_t *)dst = (int8_t)*src;
+  else *(int16_t *)dst = (int16_t)*src;
+}
+#endif
+
+/* Unsink allocation from the trace exit state. Unsink sunk stores. */
+static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
+			SnapNo snapno, BloomFilter rfilt,
+			IRIns *ir, TValue *o)
+{
+  lua_assert(ir->o == IR_TNEW || ir->o == IR_TDUP ||
+	     ir->o == IR_CNEW || ir->o == IR_CNEWI);
+#if LJ_HASFFI
+  if (ir->o == IR_CNEW || ir->o == IR_CNEWI) {
+    CTState *cts = ctype_ctsG(J2G(J));
+    CTypeID id = (CTypeID)T->ir[ir->op1].i;
+    CTSize sz = lj_ctype_size(cts, id);
+    GCcdata *cd = lj_cdata_new(cts, id, sz);
+    setcdataV(J->L, o, cd);
+    if (ir->o == IR_CNEWI) {
+      uint8_t *p = (uint8_t *)cdataptr(cd);
+      lua_assert(sz == 4 || sz == 8);
+      if (LJ_32 && sz == 8 && ir+1 < T->ir + T->nins && (ir+1)->o == IR_HIOP) {
+	snap_restoredata(T, ex, snapno, rfilt, (ir+1)->op2, LJ_LE?p+4:p, 4);
+	if (LJ_BE) p += 4;
+	sz = 4;
+      }
+      snap_restoredata(T, ex, snapno, rfilt, ir->op2, p, sz);
+    } else {
+      IRIns *irs, *irlast = &T->ir[T->snap[snapno].ref];
+      for (irs = ir+1; irs < irlast; irs++)
+	if (irs->r == RID_SINK && ir + irs->s == irs) {
+	  IRIns *iro = &T->ir[T->ir[irs->op1].op2];
+	  uint8_t *p = (uint8_t *)cd;
+	  CTSize szs;
+	  lua_assert(irs->o == IR_XSTORE && T->ir[irs->op1].o == IR_ADD);
+	  lua_assert(iro->o == IR_KINT || iro->o == IR_KINT64);
+	  if (irt_is64(irs->t)) szs = 8;
+	  else if (irt_isi8(irs->t) || irt_isu8(irs->t)) szs = 1;
+	  else if (irt_isi16(irs->t) || irt_isu16(irs->t)) szs = 2;
+	  else szs = 4;
+	  if (LJ_64 && iro->o == IR_KINT64)
+	    p += (int64_t)ir_k64(iro)->u64;
+	  else
+	    p += iro->i;
+	  lua_assert(p >= (uint8_t *)cdataptr(cd) &&
+		     p + szs <= (uint8_t *)cdataptr(cd) + sz);
+	  if (LJ_32 && irs+1 < T->ir + T->nins && (irs+1)->o == IR_HIOP) {
+	    lua_assert(szs == 4);
+	    snap_restoredata(T, ex, snapno, rfilt, (irs+1)->op2, LJ_LE?p+4:p,4);
+	    if (LJ_BE) p += 4;
+	  }
+	  snap_restoredata(T, ex, snapno, rfilt, irs->op2, p, szs);
+	}
+    }
+  } else
+#endif
+  {
+    IRIns *irs, *irlast;
+    GCtab *t = ir->o == IR_TNEW ? lj_tab_new(J->L, ir->op1, ir->op2) :
+				  lj_tab_dup(J->L, ir_ktab(&T->ir[ir->op1]));
+    settabV(J->L, o, t);
+    irlast = &T->ir[T->snap[snapno].ref];
+    for (irs = ir+1; irs < irlast; irs++)
+      if (irs->r == RID_SINK && ir + irs->s == irs) {
+	IRIns *irk = &T->ir[irs->op1];
+	TValue tmp, *val;
+	lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
+		   irs->o == IR_FSTORE);
+	if (irk->o == IR_FREF) {
+	  lua_assert(irk->op2 == IRFL_TAB_META);
+	  snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp);
+	  /* NOBARRIER: The table is new (marked white). */
+	  setgcref(t->metatable, obj2gco(tabV(&tmp)));
+	} else {
+	  irk = &T->ir[irk->op2];
+	  if (irk->o == IR_KSLOT) irk = &T->ir[irk->op1];
+	  lj_ir_kvalue(J->L, &tmp, irk);
+	  val = lj_tab_set(J->L, t, &tmp);
+	  /* NOBARRIER: The table is new (marked white). */
+	  snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, val);
+	  if (LJ_SOFTFP && irs+1 < T->ir + T->nins && (irs+1)->o == IR_HIOP) {
+	    snap_restoreval(J, T, ex, snapno, rfilt, (irs+1)->op2, &tmp);
+	    val->u32.hi = tmp.u32.lo;
+	  }
+	}
+      }
+  }
+}
+
 /* Restore interpreter state from exit state with the help of a snapshot. */
 const BCIns *lj_snap_restore(jit_State *J, void *exptr)
 {
@@ -500,10 +790,23 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
     SnapEntry sn = map[n];
     if (!(sn & SNAP_NORESTORE)) {
       TValue *o = &frame[snap_slot(sn)];
-      snap_restoreval(J, T, ex, snapno, rfilt, snap_ref(sn), o);
+      IRRef ref = snap_ref(sn);
+      IRIns *ir = &T->ir[ref];
+      if (ir->r == RID_SUNK) {
+	MSize j;
+	for (j = 0; j < n; j++)
+	  if (snap_ref(map[j]) == ref) {  /* De-duplicate sunk allocations. */
+	    copyTV(L, o, &frame[snap_slot(map[j])]);
+	    goto dupslot;
+	  }
+	snap_unsink(J, T, ex, snapno, rfilt, ir, o);
+      dupslot:
+	continue;
+      }
+      snap_restoreval(J, T, ex, snapno, rfilt, ref, o);
       if (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM) && tvisint(o)) {
 	TValue tmp;
-	snap_restoreval(J, T, ex, snapno, rfilt, snap_ref(sn)+1, &tmp);
+	snap_restoreval(J, T, ex, snapno, rfilt, ref+1, &tmp);
 	o->u32.hi = tmp.u32.lo;
       } else if ((sn & (SNAP_CONT|SNAP_FRAME))) {
 	/* Overwrite tag with frame link. */
@@ -528,5 +831,6 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
 
 #undef IR
 #undef emitir_raw
+#undef emitir
 
 #endif
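
Note: to make the restore path concrete, a hypothetical Lua fragment where a snapshot references a sunk allocation; on the rarely taken exit, snap_unsink() would materialize the table and replay its sunk stores before the interpreter resumes:

  -- Hypothetical: p can be sunk on the hot path; when the i == 150
  -- branch is taken, the trace exits and the exit handler rebuilds p
  -- from the snapshot, replaying the sunk store of i.
  local escaped
  for i = 1, 200 do
    local p = {i}
    if i == 150 then
      escaped = p  -- rare escape via a side exit
    end
  end
  assert(escaped[1] == 150)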

+ 4 - 2
src/lj_target.h

@@ -16,17 +16,19 @@ typedef uint32_t Reg;
 
 /* The hi-bit is NOT set for an allocated register. This means the value
 ** can be directly used without masking. The hi-bit is set for a register
-** allocation hint or for RID_INIT.
+** allocation hint or for RID_INIT, RID_SINK or RID_SUNK.
 */
 #define RID_NONE		0x80
 #define RID_MASK		0x7f
 #define RID_INIT		(RID_NONE|RID_MASK)
+#define RID_SINK		(RID_INIT-1)
+#define RID_SUNK		(RID_INIT-2)
 
 #define ra_noreg(r)		((r) & RID_NONE)
 #define ra_hasreg(r)		(!((r) & RID_NONE))
 
 /* The ra_hashint() macro assumes a previous test for ra_noreg(). */
-#define ra_hashint(r)		((r) != RID_INIT)
+#define ra_hashint(r)		((r) < RID_SUNK)
 #define ra_gethint(r)		((Reg)((r) & RID_MASK))
 #define ra_sethint(rr, r)	rr = (uint8_t)((r)|RID_NONE)
 #define ra_samehint(r1, r2)	(ra_gethint((r1)^(r2)) == 0)
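
Note: all three sentinels keep the hi-bit set, so ra_noreg() still treats them as "no register"; the ra_hashint() test changes from (r != RID_INIT) to (r < RID_SUNK) so that neither sink marker is mistaken for an allocation hint. A small consistency check, assuming LuaJIT's bit library:

  -- Sketch: sentinel layout and the new hint test.
  local bit = require("bit")
  local RID_NONE, RID_MASK = 0x80, 0x7f
  local RID_INIT = RID_NONE + RID_MASK             -- 0xff
  local RID_SINK, RID_SUNK = RID_INIT - 1, RID_INIT - 2
  for _, r in ipairs{RID_INIT, RID_SINK, RID_SUNK} do
    assert(bit.band(r, RID_NONE) ~= 0)             -- ra_noreg(r) is true
    assert(not (r < RID_SUNK))                     -- ra_hashint(r) is false
  end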

+ 1 - 0
src/lj_trace.c

@@ -606,6 +606,7 @@ static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
 	J->loopref = J->chain[IR_LOOP];  /* Needed by assembler. */
       }
       lj_opt_split(J);
+      lj_opt_sink(J);
       J->state = LJ_TRACE_ASM;
       break;
 

+ 1 - 0
src/ljamalg.c

@@ -64,6 +64,7 @@
 #include "lj_opt_dce.c"
 #include "lj_opt_loop.c"
 #include "lj_opt_split.c"
+#include "lj_opt_sink.c"
 #include "lj_mcode.c"
 #include "lj_snap.c"
 #include "lj_record.c"