瀏覽代碼

Compile table traversals: next(), pairs(), BC_ISNEXT/BC_ITERN.

Sponsored by OpenResty Inc.
Mike Pall 4 年之前
父節點
當前提交
bb0f241015
共有 27 個文件被更改,包括 781 次插入和 47 次删除
  1. 11 6
      src/jit/dump.lua
  2. 1 1
      src/lib_base.c
  3. 11 1
      src/lj_asm.c
  4. 2 0
      src/lj_asm_arm.h
  5. 8 1
      src/lj_asm_arm64.h
  6. 2 0
      src/lj_asm_mips.h
  7. 4 1
      src/lj_asm_ppc.h
  8. 11 2
      src/lj_asm_x86.h
  9. 7 1
      src/lj_dispatch.c
  10. 34 0
      src/lj_ffrecord.c
  11. 2 0
      src/lj_ir.h
  12. 2 0
      src/lj_ircall.h
  13. 5 1
      src/lj_jit.h
  14. 9 0
      src/lj_opt_fold.c
  15. 4 1
      src/lj_opt_mem.c
  16. 125 5
      src/lj_record.c
  17. 1 0
      src/lj_record.h
  18. 8 2
      src/lj_snap.c
  19. 15 7
      src/lj_trace.c
  20. 2 0
      src/lj_vm.h
  21. 77 2
      src/vm_arm.dasc
  22. 77 2
      src/vm_arm64.dasc
  23. 92 5
      src/vm_mips.dasc
  24. 88 4
      src/vm_mips64.dasc
  25. 8 1
      src/vm_ppc.dasc
  26. 78 2
      src/vm_x64.dasc
  27. 97 2
      src/vm_x86.dasc

+ 11 - 6
src/jit/dump.lua

@@ -219,8 +219,10 @@ local function colorize_text(s)
   return s
   return s
 end
 end
 
 
-local function colorize_ansi(s, t)
-  return format(colortype_ansi[t], s)
+local function colorize_ansi(s, t, extra)
+  local out = format(colortype_ansi[t], s)
+  if extra then out = "\027[3m"..out end
+  return out
 end
 end
 
 
 local irtype_ansi = setmetatable({},
 local irtype_ansi = setmetatable({},
@@ -229,9 +231,10 @@ local irtype_ansi = setmetatable({},
 
 
 local html_escape = { ["<"] = "&lt;", [">"] = "&gt;", ["&"] = "&amp;", }
 local html_escape = { ["<"] = "&lt;", [">"] = "&gt;", ["&"] = "&amp;", }
 
 
-local function colorize_html(s, t)
+local function colorize_html(s, t, extra)
   s = gsub(s, "[<>&]", html_escape)
   s = gsub(s, "[<>&]", html_escape)
-  return format('<span class="irt_%s">%s</span>', irtype_text[t], s)
+  return format('<span class="irt_%s%s">%s</span>',
+		irtype_text[t], extra and " irt_extra" or "", s)
 end
 end
 
 
 local irtype_html = setmetatable({},
 local irtype_html = setmetatable({},
@@ -256,6 +259,7 @@ span.irt_tab { color: #c00000; }
 span.irt_udt, span.irt_lud { color: #00c0c0; }
 span.irt_udt, span.irt_lud { color: #00c0c0; }
 span.irt_num { color: #4040c0; }
 span.irt_num { color: #4040c0; }
 span.irt_int, span.irt_i8, span.irt_u8, span.irt_i16, span.irt_u16 { color: #b040b0; }
 span.irt_int, span.irt_i8, span.irt_u8, span.irt_i16, span.irt_u16 { color: #b040b0; }
+span.irt_extra { font-style: italic; }
 </style>
 </style>
 ]]
 ]]
 
 
@@ -271,6 +275,7 @@ local litname = {
     if band(mode, 8) ~= 0 then s = s.."C" end
     if band(mode, 8) ~= 0 then s = s.."C" end
     if band(mode, 16) ~= 0 then s = s.."R" end
     if band(mode, 16) ~= 0 then s = s.."R" end
     if band(mode, 32) ~= 0 then s = s.."I" end
     if band(mode, 32) ~= 0 then s = s.."I" end
+    if band(mode, 64) ~= 0 then s = s.."K" end
     t[mode] = s
     t[mode] = s
     return s
     return s
   end}),
   end}),
@@ -350,7 +355,7 @@ local function formatk(tr, idx, sn)
   else
   else
     s = tostring(k) -- For primitives.
     s = tostring(k) -- For primitives.
   end
   end
-  s = colorize(format("%-4s", s), t)
+  s = colorize(format("%-4s", s), t, band(sn or 0, 0x100000) ~= 0)
   if slot then
   if slot then
     s = format("%s @%d", s, slot)
     s = format("%s @%d", s, slot)
   end
   end
@@ -370,7 +375,7 @@ local function printsnap(tr, snap)
 	out:write(colorize(format("%04d/%04d", ref, ref+1), 14))
 	out:write(colorize(format("%04d/%04d", ref, ref+1), 14))
       else
       else
 	local m, ot, op1, op2 = traceir(tr, ref)
 	local m, ot, op1, op2 = traceir(tr, ref)
-	out:write(colorize(format("%04d", ref), band(ot, 31)))
+	out:write(colorize(format("%04d", ref), band(ot, 31), band(sn, 0x100000) ~= 0))
       end
       end
       out:write(band(sn, 0x10000) == 0 and " " or "|") -- SNAP_FRAME
       out:write(band(sn, 0x10000) == 0 and " " or "|") -- SNAP_FRAME
     else
     else

+ 1 - 1
src/lib_base.c

@@ -76,7 +76,7 @@ LJLIB_ASM_(type)		LJLIB_REC(.)
 /* This solves a circular dependency problem -- change FF_next_N as needed. */
 /* This solves a circular dependency problem -- change FF_next_N as needed. */
 LJ_STATIC_ASSERT((int)FF_next == FF_next_N);
 LJ_STATIC_ASSERT((int)FF_next == FF_next_N);
 
 
-LJLIB_ASM(next)
+LJLIB_ASM(next)			LJLIB_REC(.)
 {
 {
   lj_lib_checktab(L, 1);
   lj_lib_checktab(L, 1);
   lj_err_msg(L, LJ_ERR_NEXTIDX);
   lj_err_msg(L, LJ_ERR_NEXTIDX);

+ 11 - 1
src/lj_asm.c

@@ -2225,7 +2225,17 @@ static void asm_setup_regsp(ASMState *as)
 	as->modset |= RSET_SCRATCH;
 	as->modset |= RSET_SCRATCH;
       continue;
       continue;
       }
       }
-    case IR_CALLN: case IR_CALLA: case IR_CALLL: case IR_CALLS: {
+    case IR_CALLL:
+      /* lj_vm_next needs two TValues on the stack. */
+#if LJ_TARGET_X64 && LJ_ABI_WIN
+      if (ir->op2 == IRCALL_lj_vm_next && as->evenspill < SPS_FIRST + 4)
+	as->evenspill = SPS_FIRST + 4;
+#else
+      if (SPS_FIRST < 4 && ir->op2 == IRCALL_lj_vm_next && as->evenspill < 4)
+	as->evenspill = 4;
+#endif
+      /* fallthrough */
+    case IR_CALLN: case IR_CALLA: case IR_CALLS: {
       const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
       const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
       ir->prev = asm_setup_call_slots(as, ir, ci);
       ir->prev = asm_setup_call_slots(as, ir, ci);
       if (inloop)
       if (inloop)

+ 2 - 0
src/lj_asm_arm.h

@@ -2064,6 +2064,8 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
       } else if ((sn & SNAP_SOFTFPNUM)) {
       } else if ((sn & SNAP_SOFTFPNUM)) {
 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPRODD, RID_BASE));
 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPRODD, RID_BASE));
 #endif
 #endif
+      } else if ((sn & SNAP_KEYINDEX)) {
+	type = ra_allock(as, (int32_t)LJ_KEYINDEX, odd);
       } else {
       } else {
 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), odd);
 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), odd);
       }
       }

+ 8 - 1
src/lj_asm_arm64.h

@@ -1814,7 +1814,14 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
     IRIns *ir = IR(ref);
     IRIns *ir = IR(ref);
     if ((sn & SNAP_NORESTORE))
     if ((sn & SNAP_NORESTORE))
       continue;
       continue;
-    if (irt_isnum(ir->t)) {
+    if ((sn & SNAP_KEYINDEX)) {
+      RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
+      Reg r = irref_isk(ref) ? ra_allock(as, ir->i, allow) :
+			       ra_alloc1(as, ref, allow);
+      rset_clear(allow, r);
+      emit_lso(as, A64I_STRw, r, RID_BASE, ofs);
+      emit_lso(as, A64I_STRw, ra_allock(as, LJ_KEYINDEX, allow), RID_BASE, ofs+4);
+    } else if (irt_isnum(ir->t)) {
       Reg src = ra_alloc1(as, ref, RSET_FPR);
       Reg src = ra_alloc1(as, ref, RSET_FPR);
       emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs);
       emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs);
     } else {
     } else {

+ 2 - 0
src/lj_asm_mips.h

@@ -2568,6 +2568,8 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
       } else if ((sn & SNAP_SOFTFPNUM)) {
       } else if ((sn & SNAP_SOFTFPNUM)) {
 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE));
 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE));
 #endif
 #endif
+      } else if ((sn & SNAP_KEYINDEX)) {
+	type = ra_allock(as, (int32_t)LJ_KEYINDEX, allow);
       } else {
       } else {
 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
       }
       }

+ 4 - 1
src/lj_asm_ppc.h

@@ -1103,7 +1103,8 @@ static void asm_sload(ASMState *as, IRIns *ir)
   lj_assertA(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK),
   lj_assertA(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK),
 	     "inconsistent SLOAD variant");
 	     "inconsistent SLOAD variant");
   lj_assertA(LJ_DUALNUM ||
   lj_assertA(LJ_DUALNUM ||
-	     !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)),
+	     !irt_isint(t) ||
+	     (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME|IRSLOAD_KEYINDEX)),
 	     "bad SLOAD type");
 	     "bad SLOAD type");
 #if LJ_SOFTFP
 #if LJ_SOFTFP
   lj_assertA(!(ir->op2 & IRSLOAD_CONVERT),
   lj_assertA(!(ir->op2 & IRSLOAD_CONVERT),
@@ -2096,6 +2097,8 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
       } else if ((sn & SNAP_SOFTFPNUM)) {
       } else if ((sn & SNAP_SOFTFPNUM)) {
 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE));
 	type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE));
 #endif
 #endif
+      } else if ((sn & SNAP_KEYINDEX)) {
+	type = ra_allock(as, (int32_t)LJ_KEYINDEX, allow);
       } else {
       } else {
 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
 	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
       }
       }

+ 11 - 2
src/lj_asm_x86.h

@@ -1700,7 +1700,8 @@ static void asm_sload(ASMState *as, IRIns *ir)
   lj_assertA(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK),
   lj_assertA(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK),
 	     "inconsistent SLOAD variant");
 	     "inconsistent SLOAD variant");
   lj_assertA(LJ_DUALNUM ||
   lj_assertA(LJ_DUALNUM ||
-	     !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)),
+	     !irt_isint(t) ||
+	     (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME|IRSLOAD_KEYINDEX)),
 	     "bad SLOAD type");
 	     "bad SLOAD type");
   if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
   if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
     Reg left = ra_scratch(as, RSET_FPR);
     Reg left = ra_scratch(as, RSET_FPR);
@@ -2727,7 +2728,15 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
     IRIns *ir = IR(ref);
     IRIns *ir = IR(ref);
     if ((sn & SNAP_NORESTORE))
     if ((sn & SNAP_NORESTORE))
       continue;
       continue;
-    if (irt_isnum(ir->t)) {
+    if ((sn & SNAP_KEYINDEX)) {
+      emit_movmroi(as, RID_BASE, ofs+4, LJ_KEYINDEX);
+      if (irref_isk(ref)) {
+	emit_movmroi(as, RID_BASE, ofs, ir->i);
+      } else {
+	Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
+	emit_movtomro(as, src, RID_BASE, ofs);
+      }
+    } else if (irt_isnum(ir->t)) {
       Reg src = ra_alloc1(as, ref, RSET_FPR);
       Reg src = ra_alloc1(as, ref, RSET_FPR);
       emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
       emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
     } else {
     } else {

+ 7 - 1
src/lj_dispatch.c

@@ -68,6 +68,8 @@ void lj_dispatch_init(GG_State *GG)
   /* The JIT engine is off by default. luaopen_jit() turns it on. */
   /* The JIT engine is off by default. luaopen_jit() turns it on. */
   disp[BC_FORL] = disp[BC_IFORL];
   disp[BC_FORL] = disp[BC_IFORL];
   disp[BC_ITERL] = disp[BC_IITERL];
   disp[BC_ITERL] = disp[BC_IITERL];
+  /* Workaround for stable v2.1 bytecode. TODO: Replace with BC_IITERN. */
+  disp[BC_ITERN] = &lj_vm_IITERN;
   disp[BC_LOOP] = disp[BC_ILOOP];
   disp[BC_LOOP] = disp[BC_ILOOP];
   disp[BC_FUNCF] = disp[BC_IFUNCF];
   disp[BC_FUNCF] = disp[BC_IFUNCF];
   disp[BC_FUNCV] = disp[BC_IFUNCV];
   disp[BC_FUNCV] = disp[BC_IFUNCV];
@@ -118,19 +120,21 @@ void lj_dispatch_update(global_State *g)
   mode |= (g->hookmask & LUA_MASKRET) ? DISPMODE_RET : 0;
   mode |= (g->hookmask & LUA_MASKRET) ? DISPMODE_RET : 0;
   if (oldmode != mode) {  /* Mode changed? */
   if (oldmode != mode) {  /* Mode changed? */
     ASMFunction *disp = G2GG(g)->dispatch;
     ASMFunction *disp = G2GG(g)->dispatch;
-    ASMFunction f_forl, f_iterl, f_loop, f_funcf, f_funcv;
+    ASMFunction f_forl, f_iterl, f_itern, f_loop, f_funcf, f_funcv;
     g->dispatchmode = mode;
     g->dispatchmode = mode;
 
 
     /* Hotcount if JIT is on, but not while recording. */
     /* Hotcount if JIT is on, but not while recording. */
     if ((mode & (DISPMODE_JIT|DISPMODE_REC)) == DISPMODE_JIT) {
     if ((mode & (DISPMODE_JIT|DISPMODE_REC)) == DISPMODE_JIT) {
       f_forl = makeasmfunc(lj_bc_ofs[BC_FORL]);
       f_forl = makeasmfunc(lj_bc_ofs[BC_FORL]);
       f_iterl = makeasmfunc(lj_bc_ofs[BC_ITERL]);
       f_iterl = makeasmfunc(lj_bc_ofs[BC_ITERL]);
+      f_itern = makeasmfunc(lj_bc_ofs[BC_ITERN]);
       f_loop = makeasmfunc(lj_bc_ofs[BC_LOOP]);
       f_loop = makeasmfunc(lj_bc_ofs[BC_LOOP]);
       f_funcf = makeasmfunc(lj_bc_ofs[BC_FUNCF]);
       f_funcf = makeasmfunc(lj_bc_ofs[BC_FUNCF]);
       f_funcv = makeasmfunc(lj_bc_ofs[BC_FUNCV]);
       f_funcv = makeasmfunc(lj_bc_ofs[BC_FUNCV]);
     } else {  /* Otherwise use the non-hotcounting instructions. */
     } else {  /* Otherwise use the non-hotcounting instructions. */
       f_forl = disp[GG_LEN_DDISP+BC_IFORL];
       f_forl = disp[GG_LEN_DDISP+BC_IFORL];
       f_iterl = disp[GG_LEN_DDISP+BC_IITERL];
       f_iterl = disp[GG_LEN_DDISP+BC_IITERL];
+      f_itern = &lj_vm_IITERN;
       f_loop = disp[GG_LEN_DDISP+BC_ILOOP];
       f_loop = disp[GG_LEN_DDISP+BC_ILOOP];
       f_funcf = makeasmfunc(lj_bc_ofs[BC_IFUNCF]);
       f_funcf = makeasmfunc(lj_bc_ofs[BC_IFUNCF]);
       f_funcv = makeasmfunc(lj_bc_ofs[BC_IFUNCV]);
       f_funcv = makeasmfunc(lj_bc_ofs[BC_IFUNCV]);
@@ -138,6 +142,7 @@ void lj_dispatch_update(global_State *g)
     /* Init static counting instruction dispatch first (may be copied below). */
     /* Init static counting instruction dispatch first (may be copied below). */
     disp[GG_LEN_DDISP+BC_FORL] = f_forl;
     disp[GG_LEN_DDISP+BC_FORL] = f_forl;
     disp[GG_LEN_DDISP+BC_ITERL] = f_iterl;
     disp[GG_LEN_DDISP+BC_ITERL] = f_iterl;
+    disp[GG_LEN_DDISP+BC_ITERN] = f_itern;
     disp[GG_LEN_DDISP+BC_LOOP] = f_loop;
     disp[GG_LEN_DDISP+BC_LOOP] = f_loop;
 
 
     /* Set dynamic instruction dispatch. */
     /* Set dynamic instruction dispatch. */
@@ -165,6 +170,7 @@ void lj_dispatch_update(global_State *g)
       /* Otherwise set dynamic counting ins. */
       /* Otherwise set dynamic counting ins. */
       disp[BC_FORL] = f_forl;
       disp[BC_FORL] = f_forl;
       disp[BC_ITERL] = f_iterl;
       disp[BC_ITERL] = f_iterl;
+      disp[BC_ITERN] = f_itern;
       disp[BC_LOOP] = f_loop;
       disp[BC_LOOP] = f_loop;
       /* Set dynamic return dispatch. */
       /* Set dynamic return dispatch. */
       if ((mode & DISPMODE_RET)) {
       if ((mode & DISPMODE_RET)) {

+ 34 - 0
src/lj_ffrecord.c

@@ -521,6 +521,40 @@ static void LJ_FASTCALL recff_getfenv(jit_State *J, RecordFFData *rd)
   recff_nyiu(J, rd);
   recff_nyiu(J, rd);
 }
 }
 
 
+static void LJ_FASTCALL recff_next(jit_State *J, RecordFFData *rd)
+{
+#if LJ_BE
+  /* YAGNI: Disabled on big-endian due to issues with lj_vm_next,
+  ** IR_HIOP, RID_RETLO/RID_RETHI and ra_destpair.
+  */
+  recff_nyi(J, rd);
+#else
+  TRef tab = J->base[0];
+  if (tref_istab(tab)) {
+    RecordIndex ix;
+    cTValue *keyv;
+    ix.tab = tab;
+    if (tref_isnil(J->base[1])) {  /* Shortcut for start of traversal. */
+      ix.key = lj_ir_kint(J, 0);
+      keyv = niltvg(J2G(J));
+    } else {
+      TRef tmp = recff_tmpref(J, J->base[1], IRTMPREF_IN1);
+      ix.key = lj_ir_call(J, IRCALL_lj_tab_keyindex, tab, tmp);
+      keyv = &rd->argv[1];
+    }
+    copyTV(J->L, &ix.tabv, &rd->argv[0]);
+    ix.keyv.u32.lo = lj_tab_keyindex(tabV(&ix.tabv), keyv);
+    /* Omit the value, if not used by the caller. */
+    ix.idxchain = (J->framedepth && frame_islua(J->L->base-1) &&
+		   bc_b(frame_pc(J->L->base-1)[-1]) <= 2);
+    ix.mobj = 0;  /* We don't need the next index. */
+    rd->nres = lj_record_next(J, &ix);
+    J->base[0] = ix.key;
+    J->base[1] = ix.val;
+  }  /* else: Interpreter will throw. */
+#endif
+}
+
 /* -- Math library fast functions ----------------------------------------- */
 /* -- Math library fast functions ----------------------------------------- */
 
 
 static void LJ_FASTCALL recff_math_abs(jit_State *J, RecordFFData *rd)
 static void LJ_FASTCALL recff_math_abs(jit_State *J, RecordFFData *rd)

+ 2 - 0
src/lj_ir.h

@@ -236,6 +236,7 @@ IRFLDEF(FLENUM)
 #define IRSLOAD_CONVERT		0x08	/* Number to integer conversion. */
 #define IRSLOAD_CONVERT		0x08	/* Number to integer conversion. */
 #define IRSLOAD_READONLY	0x10	/* Read-only, omit slot store. */
 #define IRSLOAD_READONLY	0x10	/* Read-only, omit slot store. */
 #define IRSLOAD_INHERIT		0x20	/* Inherited by exits/side traces. */
 #define IRSLOAD_INHERIT		0x20	/* Inherited by exits/side traces. */
+#define IRSLOAD_KEYINDEX	0x40	/* Table traversal key index. */
 
 
 /* XLOAD mode bits, stored in op2. */
 /* XLOAD mode bits, stored in op2. */
 #define IRXLOAD_READONLY	0x01	/* Load from read-only data. */
 #define IRXLOAD_READONLY	0x01	/* Load from read-only data. */
@@ -495,6 +496,7 @@ typedef uint32_t TRef;
 #define TREF_REFMASK		0x0000ffff
 #define TREF_REFMASK		0x0000ffff
 #define TREF_FRAME		0x00010000
 #define TREF_FRAME		0x00010000
 #define TREF_CONT		0x00020000
 #define TREF_CONT		0x00020000
+#define TREF_KEYINDEX		0x00100000
 
 
 #define TREF(ref, t)		((TRef)((ref) + ((t)<<24)))
 #define TREF(ref, t)		((TRef)((ref) + ((t)<<24)))
 
 

+ 2 - 0
src/lj_ircall.h

@@ -187,6 +187,8 @@ typedef struct CCallInfo {
   _(ANY,	lj_tab_dup,		2,  FA, TAB, CCI_L|CCI_T) \
   _(ANY,	lj_tab_dup,		2,  FA, TAB, CCI_L|CCI_T) \
   _(ANY,	lj_tab_clear,		1,  FS, NIL, 0) \
   _(ANY,	lj_tab_clear,		1,  FS, NIL, 0) \
   _(ANY,	lj_tab_newkey,		3,   S, PGC, CCI_L|CCI_T) \
   _(ANY,	lj_tab_newkey,		3,   S, PGC, CCI_L|CCI_T) \
+  _(ANY,	lj_tab_keyindex,	2,  FL, INT, 0) \
+  _(ANY,	lj_vm_next,		2,  FL, PTR, 0) \
   _(ANY,	lj_tab_len,		1,  FL, INT, 0) \
   _(ANY,	lj_tab_len,		1,  FL, INT, 0) \
   _(ANY,	lj_tab_len_hint,	2,  FL, INT, 0) \
   _(ANY,	lj_tab_len_hint,	2,  FL, INT, 0) \
   _(ANY,	lj_gc_step_jit,		2,  FS, NIL, CCI_L) \
   _(ANY,	lj_gc_step_jit,		2,  FS, NIL, CCI_L) \

+ 5 - 1
src/lj_jit.h

@@ -150,6 +150,7 @@ typedef enum {
   LJ_TRACE_IDLE,	/* Trace compiler idle. */
   LJ_TRACE_IDLE,	/* Trace compiler idle. */
   LJ_TRACE_ACTIVE = 0x10,
   LJ_TRACE_ACTIVE = 0x10,
   LJ_TRACE_RECORD,	/* Bytecode recording active. */
   LJ_TRACE_RECORD,	/* Bytecode recording active. */
+  LJ_TRACE_RECORD_1ST,	/* Record 1st instruction, too. */
   LJ_TRACE_START,	/* New trace started. */
   LJ_TRACE_START,	/* New trace started. */
   LJ_TRACE_END,		/* End of trace. */
   LJ_TRACE_END,		/* End of trace. */
   LJ_TRACE_ASM,		/* Assemble trace. */
   LJ_TRACE_ASM,		/* Assemble trace. */
@@ -200,12 +201,15 @@ typedef uint32_t SnapEntry;
 #define SNAP_CONT		0x020000	/* Continuation slot. */
 #define SNAP_CONT		0x020000	/* Continuation slot. */
 #define SNAP_NORESTORE		0x040000	/* No need to restore slot. */
 #define SNAP_NORESTORE		0x040000	/* No need to restore slot. */
 #define SNAP_SOFTFPNUM		0x080000	/* Soft-float number. */
 #define SNAP_SOFTFPNUM		0x080000	/* Soft-float number. */
+#define SNAP_KEYINDEX		0x100000	/* Traversal key index. */
 LJ_STATIC_ASSERT(SNAP_FRAME == TREF_FRAME);
 LJ_STATIC_ASSERT(SNAP_FRAME == TREF_FRAME);
 LJ_STATIC_ASSERT(SNAP_CONT == TREF_CONT);
 LJ_STATIC_ASSERT(SNAP_CONT == TREF_CONT);
+LJ_STATIC_ASSERT(SNAP_KEYINDEX == TREF_KEYINDEX);
 
 
 #define SNAP(slot, flags, ref)	(((SnapEntry)(slot) << 24) + (flags) + (ref))
 #define SNAP(slot, flags, ref)	(((SnapEntry)(slot) << 24) + (flags) + (ref))
 #define SNAP_TR(slot, tr) \
 #define SNAP_TR(slot, tr) \
-  (((SnapEntry)(slot) << 24) + ((tr) & (TREF_CONT|TREF_FRAME|TREF_REFMASK)))
+  (((SnapEntry)(slot) << 24) + \
+   ((tr) & (TREF_KEYINDEX|TREF_CONT|TREF_FRAME|TREF_REFMASK)))
 #if !LJ_FR2
 #if !LJ_FR2
 #define SNAP_MKPC(pc)		((SnapEntry)u32ptr(pc))
 #define SNAP_MKPC(pc)		((SnapEntry)u32ptr(pc))
 #endif
 #endif

+ 9 - 0
src/lj_opt_fold.c

@@ -2320,6 +2320,15 @@ LJFOLDF(fload_sbuf)
   return lj_opt_fwd_sbuf(J, tref_ref(tr)) ? tr : EMITFOLD;
   return lj_opt_fwd_sbuf(J, tref_ref(tr)) ? tr : EMITFOLD;
 }
 }
 
 
+/* The fast function ID of function objects is immutable. */
+LJFOLD(FLOAD KGC IRFL_FUNC_FFID)
+LJFOLDF(fload_func_ffid_kgc)
+{
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
+    return INTFOLD((int32_t)ir_kfunc(fleft)->c.ffid);
+  return NEXTFOLD;
+}
+
 /* The C type ID of cdata objects is immutable. */
 /* The C type ID of cdata objects is immutable. */
 LJFOLD(FLOAD KGC IRFL_CDATA_CTYPEID)
 LJFOLD(FLOAD KGC IRFL_CDATA_CTYPEID)
 LJFOLDF(fload_cdata_typeid_kgc)
 LJFOLDF(fload_cdata_typeid_kgc)

+ 4 - 1
src/lj_opt_mem.c

@@ -364,7 +364,10 @@ TRef LJ_FASTCALL lj_opt_dse_ahstore(jit_State *J)
       /* Different value: try to eliminate the redundant store. */
       /* Different value: try to eliminate the redundant store. */
       if (ref > J->chain[IR_LOOP]) {  /* Quick check to avoid crossing LOOP. */
       if (ref > J->chain[IR_LOOP]) {  /* Quick check to avoid crossing LOOP. */
 	IRIns *ir;
 	IRIns *ir;
-	/* Check for any intervening guards (includes conflicting loads). */
+	/* Check for any intervening guards (includes conflicting loads).
+	** Note that lj_tab_keyindex and lj_vm_next don't need guards,
+	** since they are followed by at least one guarded VLOAD.
+	*/
 	for (ir = IR(J->cur.nins-1); ir > store; ir--)
 	for (ir = IR(J->cur.nins-1); ir > store; ir--)
 	  if (irt_isguard(ir->t) || ir->o == IR_ALEN)
 	  if (irt_isguard(ir->t) || ir->o == IR_ALEN)
 	    goto doemit;  /* No elimination possible. */
 	    goto doemit;  /* No elimination possible. */

+ 125 - 5
src/lj_record.c

@@ -156,6 +156,9 @@ static void rec_check_slots(jit_State *J)
 	lj_assertJ((J->slot[s+1+LJ_FR2] & TREF_FRAME),
 	lj_assertJ((J->slot[s+1+LJ_FR2] & TREF_FRAME),
 		   "cont slot %d not followed by frame", s);
 		   "cont slot %d not followed by frame", s);
 	depth++;
 	depth++;
+      } else if ((tr & TREF_KEYINDEX)) {
+	lj_assertJ(tref_isint(tr), "keyindex slot %d bad type %d",
+				   s, tref_type(tr));
       } else {
       } else {
 	/* Number repr. may differ, but other types must be the same. */
 	/* Number repr. may differ, but other types must be the same. */
 	lj_assertJ(tvisnumber(tv) ? tref_isnumber(tr) :
 	lj_assertJ(tvisnumber(tv) ? tref_isnumber(tr) :
@@ -283,9 +286,9 @@ static void canonicalize_slots(jit_State *J)
   if (LJ_DUALNUM) return;
   if (LJ_DUALNUM) return;
   for (s = J->baseslot+J->maxslot-1; s >= 1; s--) {
   for (s = J->baseslot+J->maxslot-1; s >= 1; s--) {
     TRef tr = J->slot[s];
     TRef tr = J->slot[s];
-    if (tref_isinteger(tr)) {
+    if (tref_isinteger(tr) && !(tr & TREF_KEYINDEX)) {
       IRIns *ir = IR(tref_ref(tr));
       IRIns *ir = IR(tref_ref(tr));
-      if (!(ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_READONLY)))
+      if (!(ir->o == IR_SLOAD && (ir->op2 & (IRSLOAD_READONLY))))
 	J->slot[s] = emitir(IRTN(IR_CONV), tr, IRCONV_NUM_INT);
 	J->slot[s] = emitir(IRTN(IR_CONV), tr, IRCONV_NUM_INT);
     }
     }
   }
   }
@@ -606,6 +609,7 @@ static void rec_loop_interp(jit_State *J, const BCIns *pc, LoopEvent ev)
 {
 {
   if (J->parent == 0 && J->exitno == 0) {
   if (J->parent == 0 && J->exitno == 0) {
     if (pc == J->startpc && J->framedepth + J->retdepth == 0) {
     if (pc == J->startpc && J->framedepth + J->retdepth == 0) {
+      if (bc_op(J->cur.startins) == BC_ITERN) return;  /* See rec_itern(). */
       /* Same loop? */
       /* Same loop? */
       if (ev == LOOPEV_LEAVE)  /* Must loop back to form a root trace. */
       if (ev == LOOPEV_LEAVE)  /* Must loop back to form a root trace. */
 	lj_trace_err(J, LJ_TRERR_LLEAVE);
 	lj_trace_err(J, LJ_TRERR_LLEAVE);
@@ -646,6 +650,68 @@ static void rec_loop_jit(jit_State *J, TraceNo lnk, LoopEvent ev)
   }  /* Side trace continues across a loop that's left or not entered. */
   }  /* Side trace continues across a loop that's left or not entered. */
 }
 }
 
 
+/* Record ITERN. */
+static LoopEvent rec_itern(jit_State *J, BCReg ra, BCReg rb)
+{
+#if LJ_BE
+  /* YAGNI: Disabled on big-endian due to issues with lj_vm_next,
+  ** IR_HIOP, RID_RETLO/RID_RETHI and ra_destpair.
+  */
+  UNUSED(ra); UNUSED(rb);
+  setintV(&J->errinfo, (int32_t)BC_ITERN);
+  lj_trace_err_info(J, LJ_TRERR_NYIBC);
+#else
+  RecordIndex ix;
+  /* Since ITERN is recorded at the start, we need our own loop detection. */
+  if (J->pc == J->startpc && J->cur.nins > REF_FIRST &&
+      J->framedepth + J->retdepth == 0 && J->parent == 0 && J->exitno == 0) {
+    lj_record_stop(J, LJ_TRLINK_LOOP, J->cur.traceno);  /* Looping trace. */
+    return LOOPEV_ENTER;
+  }
+  J->maxslot = ra;
+  lj_snap_add(J);  /* Required to make JLOOP the first ins in a side-trace. */
+  ix.tab = getslot(J, ra-2);
+  ix.key = J->base[ra-1] ? J->base[ra-1] :
+	   sloadt(J, (int32_t)(ra-1), IRT_INT, IRSLOAD_KEYINDEX);
+  copyTV(J->L, &ix.tabv, &J->L->base[ra-2]);
+  copyTV(J->L, &ix.keyv, &J->L->base[ra-1]);
+  ix.idxchain = (rb < 3);  /* Omit value type check, if unused. */
+  ix.mobj = 1;  /* We need the next index, too. */
+  J->maxslot = ra + lj_record_next(J, &ix);
+  J->needsnap = 1;
+  if (!tref_isnil(ix.key)) {  /* Looping back? */
+    J->base[ra-1] = ix.mobj | TREF_KEYINDEX;  /* Control var has next index. */
+    J->base[ra] = ix.key;
+    J->base[ra+1] = ix.val;
+    J->pc += bc_j(J->pc[1])+2;
+    return LOOPEV_ENTER;
+  } else {
+    J->maxslot = ra-3;
+    J->pc += 2;
+    return LOOPEV_LEAVE;
+  }
+#endif
+}
+
+/* Record ISNEXT. */
+static void rec_isnext(jit_State *J, BCReg ra)
+{
+  cTValue *b = &J->L->base[ra-3];
+  if (tvisfunc(b) && funcV(b)->c.ffid == FF_next &&
+      tvistab(b+1) && tvisnil(b+2)) {
+    /* These checks are folded away for a compiled pairs(). */
+    TRef func = getslot(J, ra-3);
+    TRef trid = emitir(IRT(IR_FLOAD, IRT_U8), func, IRFL_FUNC_FFID);
+    emitir(IRTGI(IR_EQ), trid, lj_ir_kint(J, FF_next));
+    (void)getslot(J, ra-2); /* Type check for table. */
+    (void)getslot(J, ra-1); /* Type check for nil key. */
+    J->base[ra-1] = lj_ir_kint(J, 0) | TREF_KEYINDEX;
+    J->maxslot = ra;
+  } else {  /* Abort trace. Interpreter will despecialize bytecode. */
+    lj_trace_err(J, LJ_TRERR_RECERR);
+  }
+}
+
 /* -- Record profiler hook checks ----------------------------------------- */
 /* -- Record profiler hook checks ----------------------------------------- */
 
 
 #if LJ_HASPROFILE
 #if LJ_HASPROFILE
@@ -716,7 +782,7 @@ static TRef rec_call_specialize(jit_State *J, GCfunc *fn, TRef tr)
       /* NYI: io_file_iter doesn't have an ffid, yet. */
       /* NYI: io_file_iter doesn't have an ffid, yet. */
       {  /* Specialize to the ffid. */
       {  /* Specialize to the ffid. */
 	TRef trid = emitir(IRT(IR_FLOAD, IRT_U8), tr, IRFL_FUNC_FFID);
 	TRef trid = emitir(IRT(IR_FLOAD, IRT_U8), tr, IRFL_FUNC_FFID);
-	emitir(IRTG(IR_EQ, IRT_INT), trid, lj_ir_kint(J, fn->c.ffid));
+	emitir(IRTGI(IR_EQ), trid, lj_ir_kint(J, fn->c.ffid));
       }
       }
       return tr;
       return tr;
     default:
     default:
@@ -1565,6 +1631,47 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix)
   }
   }
 }
 }
 
 
+/* Determine result type of table traversal. */
+static IRType rec_next_types(GCtab *t, uint32_t idx)
+{
+  for (; idx < t->asize; idx++) {
+    cTValue *a = arrayslot(t, idx);
+    if (LJ_LIKELY(!tvisnil(a)))
+      return (LJ_DUALNUM ? IRT_INT : IRT_NUM) + (itype2irt(a) << 8);
+  }
+  idx -= t->asize;
+  for (; idx <= t->hmask; idx++) {
+    Node *n = &noderef(t->node)[idx];
+    if (!tvisnil(&n->val))
+      return itype2irt(&n->key) + (itype2irt(&n->val) << 8);
+  }
+  return IRT_NIL + (IRT_NIL << 8);
+}
+
+/* Record a table traversal step aka next(). */
+int lj_record_next(jit_State *J, RecordIndex *ix)
+{
+  IRType t, tkey, tval;
+  TRef trvk;
+  t = rec_next_types(tabV(&ix->tabv), ix->keyv.u32.lo);
+  tkey = (t & 0xff); tval = (t >> 8);
+  trvk = lj_ir_call(J, IRCALL_lj_vm_next, ix->tab, ix->key);
+  if (ix->mobj || tkey == IRT_NIL) {
+    TRef idx = emitir(IRTI(IR_HIOP), trvk, trvk);
+    /* Always check for invalid key from next() for nil result. */
+    if (!ix->mobj) emitir(IRTGI(IR_NE), idx, lj_ir_kint(J, -1));
+    ix->mobj = idx;
+  }
+  ix->key = lj_record_vload(J, trvk, 1, tkey);
+  if (tkey == IRT_NIL || ix->idxchain) {  /* Omit value type check. */
+    ix->val = TREF_NIL;
+    return 1;
+  } else {  /* Need value. */
+    ix->val = lj_record_vload(J, trvk, 0, tval);
+    return 2;
+  }
+}
+
 static void rec_tsetm(jit_State *J, BCReg ra, BCReg rn, int32_t i)
 static void rec_tsetm(jit_State *J, BCReg ra, BCReg rn, int32_t i)
 {
 {
   RecordIndex ix;
   RecordIndex ix;
@@ -2440,6 +2547,9 @@ void lj_record_ins(jit_State *J)
   case BC_ITERL:
   case BC_ITERL:
     rec_loop_interp(J, pc, rec_iterl(J, *pc));
     rec_loop_interp(J, pc, rec_iterl(J, *pc));
     break;
     break;
+  case BC_ITERN:
+    rec_loop_interp(J, pc, rec_itern(J, ra, rb));
+    break;
   case BC_LOOP:
   case BC_LOOP:
     rec_loop_interp(J, pc, rec_loop(J, ra, 1));
     rec_loop_interp(J, pc, rec_loop(J, ra, 1));
     break;
     break;
@@ -2468,6 +2578,10 @@ void lj_record_ins(jit_State *J)
       J->maxslot = ra;  /* Shrink used slots. */
       J->maxslot = ra;  /* Shrink used slots. */
     break;
     break;
 
 
+  case BC_ISNEXT:
+    rec_isnext(J, ra);
+    break;
+
   /* -- Function headers -------------------------------------------------- */
   /* -- Function headers -------------------------------------------------- */
 
 
   case BC_FUNCF:
   case BC_FUNCF:
@@ -2497,8 +2611,6 @@ void lj_record_ins(jit_State *J)
       break;
       break;
     }
     }
     /* fallthrough */
     /* fallthrough */
-  case BC_ITERN:
-  case BC_ISNEXT:
   case BC_UCLO:
   case BC_UCLO:
   case BC_FNEW:
   case BC_FNEW:
     setintV(&J->errinfo, (int32_t)op);
     setintV(&J->errinfo, (int32_t)op);
@@ -2550,6 +2662,13 @@ static const BCIns *rec_setup_root(jit_State *J)
     lj_assertJ(bc_op(pc[-1]) == BC_JMP, "ITERL does not point to JMP+1");
     lj_assertJ(bc_op(pc[-1]) == BC_JMP, "ITERL does not point to JMP+1");
     J->bc_min = pc;
     J->bc_min = pc;
     break;
     break;
+  case BC_ITERN:
+    lj_assertJ(bc_op(pc[1]) == BC_ITERL, "no ITERL after ITERN");
+    J->maxslot = ra;
+    J->bc_extent = (MSize)(-bc_j(pc[1]))*sizeof(BCIns);
+    J->bc_min = pc+2 + bc_j(pc[1]);
+    J->state = LJ_TRACE_RECORD_1ST;  /* Record the first ITERN, too. */
+    break;
   case BC_LOOP:
   case BC_LOOP:
     /* Only check BC range for real loops, but not for "repeat until true". */
     /* Only check BC range for real loops, but not for "repeat until true". */
     pcj = pc + bc_j(ins);
     pcj = pc + bc_j(ins);
@@ -2657,6 +2776,7 @@ void lj_record_setup(jit_State *J)
     J->pc = rec_setup_root(J);
     J->pc = rec_setup_root(J);
     /* Note: the loop instruction itself is recorded at the end and not
     /* Note: the loop instruction itself is recorded at the end and not
     ** at the start! So snapshot #0 needs to point to the *next* instruction.
     ** at the start! So snapshot #0 needs to point to the *next* instruction.
+    ** The one exception is BC_ITERN, which sets LJ_TRACE_RECORD_1ST.
     */
     */
     lj_snap_add(J);
     lj_snap_add(J);
     if (bc_op(J->cur.startins) == BC_FORL)
     if (bc_op(J->cur.startins) == BC_FORL)

+ 1 - 0
src/lj_record.h

@@ -38,6 +38,7 @@ LJ_FUNC void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults);
 
 
 LJ_FUNC int lj_record_mm_lookup(jit_State *J, RecordIndex *ix, MMS mm);
 LJ_FUNC int lj_record_mm_lookup(jit_State *J, RecordIndex *ix, MMS mm);
 LJ_FUNC TRef lj_record_idx(jit_State *J, RecordIndex *ix);
 LJ_FUNC TRef lj_record_idx(jit_State *J, RecordIndex *ix);
+LJ_FUNC int lj_record_next(jit_State *J, RecordIndex *ix);
 
 
 LJ_FUNC void lj_record_ins(jit_State *J);
 LJ_FUNC void lj_record_ins(jit_State *J);
 LJ_FUNC void lj_record_setup(jit_State *J);
 LJ_FUNC void lj_record_setup(jit_State *J);

+ 8 - 2
src/lj_snap.c

@@ -463,7 +463,7 @@ static TRef snap_dedup(jit_State *J, SnapEntry *map, MSize nmax, IRRef ref)
   MSize j;
   MSize j;
   for (j = 0; j < nmax; j++)
   for (j = 0; j < nmax; j++)
     if (snap_ref(map[j]) == ref)
     if (snap_ref(map[j]) == ref)
-      return J->slot[snap_slot(map[j])] & ~(SNAP_CONT|SNAP_FRAME);
+      return J->slot[snap_slot(map[j])] & ~(SNAP_KEYINDEX|SNAP_CONT|SNAP_FRAME);
   return 0;
   return 0;
 }
 }
 
 
@@ -538,10 +538,12 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
       uint32_t mode = IRSLOAD_INHERIT|IRSLOAD_PARENT;
       uint32_t mode = IRSLOAD_INHERIT|IRSLOAD_PARENT;
       if (LJ_SOFTFP32 && (sn & SNAP_SOFTFPNUM)) t = IRT_NUM;
       if (LJ_SOFTFP32 && (sn & SNAP_SOFTFPNUM)) t = IRT_NUM;
       if (ir->o == IR_SLOAD) mode |= (ir->op2 & IRSLOAD_READONLY);
       if (ir->o == IR_SLOAD) mode |= (ir->op2 & IRSLOAD_READONLY);
+      if ((sn & SNAP_KEYINDEX)) mode |= IRSLOAD_KEYINDEX;
       tr = emitir_raw(IRT(IR_SLOAD, t), s, mode);
       tr = emitir_raw(IRT(IR_SLOAD, t), s, mode);
     }
     }
   setslot:
   setslot:
-    J->slot[s] = tr | (sn&(SNAP_CONT|SNAP_FRAME));  /* Same as TREF_* flags. */
+    /* Same as TREF_* flags. */
+    J->slot[s] = tr | (sn&(SNAP_KEYINDEX|SNAP_CONT|SNAP_FRAME));
     J->framedepth += ((sn & (SNAP_CONT|SNAP_FRAME)) && (s != LJ_FR2));
     J->framedepth += ((sn & (SNAP_CONT|SNAP_FRAME)) && (s != LJ_FR2));
     if ((sn & SNAP_FRAME))
     if ((sn & SNAP_FRAME))
       J->baseslot = s+1;
       J->baseslot = s+1;
@@ -961,6 +963,10 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
 	setframe_ftsz(o, snap_slot(sn) != 0 ? (int32_t)*flinks-- : ftsz0);
 	setframe_ftsz(o, snap_slot(sn) != 0 ? (int32_t)*flinks-- : ftsz0);
 	L->base = o+1;
 	L->base = o+1;
 #endif
 #endif
+      } else if ((sn & SNAP_KEYINDEX)) {
+	/* An IRT_INT key index slot is restored as a number. Undo this. */
+	o->u32.lo = (uint32_t)(LJ_DUALNUM ? intV(o) : lj_num2int(numV(o)));
+	o->u32.hi = LJ_KEYINDEX;
       }
       }
     }
     }
   }
   }

+ 15 - 7
src/lj_trace.c

@@ -215,8 +215,8 @@ static void trace_unpatch(jit_State *J, GCtrace *T)
     break;
     break;
   case BC_JITERL:
   case BC_JITERL:
   case BC_JLOOP:
   case BC_JLOOP:
-    lj_assertJ(op == BC_ITERL || op == BC_LOOP || bc_isret(op),
-	       "bad original bytecode %d", op);
+    lj_assertJ(op == BC_ITERL || op == BC_ITERN || op == BC_LOOP ||
+	       bc_isret(op), "bad original bytecode %d", op);
     *pc = T->startins;
     *pc = T->startins;
     break;
     break;
   case BC_JMP:
   case BC_JMP:
@@ -411,7 +411,7 @@ static void trace_start(jit_State *J)
   TraceNo traceno;
   TraceNo traceno;
 
 
   if ((J->pt->flags & PROTO_NOJIT)) {  /* JIT disabled for this proto? */
   if ((J->pt->flags & PROTO_NOJIT)) {  /* JIT disabled for this proto? */
-    if (J->parent == 0 && J->exitno == 0) {
+    if (J->parent == 0 && J->exitno == 0 && bc_op(*J->pc) != BC_ITERN) {
       /* Lazy bytecode patching to disable hotcount events. */
       /* Lazy bytecode patching to disable hotcount events. */
       lj_assertJ(bc_op(*J->pc) == BC_FORL || bc_op(*J->pc) == BC_ITERL ||
       lj_assertJ(bc_op(*J->pc) == BC_FORL || bc_op(*J->pc) == BC_ITERL ||
 		 bc_op(*J->pc) == BC_LOOP || bc_op(*J->pc) == BC_FUNCF,
 		 bc_op(*J->pc) == BC_LOOP || bc_op(*J->pc) == BC_FUNCF,
@@ -496,6 +496,7 @@ static void trace_stop(jit_State *J)
     J->cur.nextroot = pt->trace;
     J->cur.nextroot = pt->trace;
     pt->trace = (TraceNo1)traceno;
     pt->trace = (TraceNo1)traceno;
     break;
     break;
+  case BC_ITERN:
   case BC_RET:
   case BC_RET:
   case BC_RET0:
   case BC_RET0:
   case BC_RET1:
   case BC_RET1:
@@ -575,7 +576,8 @@ static int trace_abort(jit_State *J)
     return 1;  /* Retry ASM with new MCode area. */
     return 1;  /* Retry ASM with new MCode area. */
   }
   }
   /* Penalize or blacklist starting bytecode instruction. */
   /* Penalize or blacklist starting bytecode instruction. */
-  if (J->parent == 0 && !bc_isret(bc_op(J->cur.startins))) {
+  if (J->parent == 0 && !bc_isret(bc_op(J->cur.startins)) &&
+      bc_op(J->cur.startins) != BC_ITERN) {
     if (J->exitno == 0) {
     if (J->exitno == 0) {
       BCIns *startpc = mref(J->cur.startpc, BCIns);
       BCIns *startpc = mref(J->cur.startpc, BCIns);
       if (e == LJ_TRERR_RETRY)
       if (e == LJ_TRERR_RETRY)
@@ -651,8 +653,13 @@ static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
       J->state = LJ_TRACE_RECORD;  /* trace_start() may change state. */
       J->state = LJ_TRACE_RECORD;  /* trace_start() may change state. */
       trace_start(J);
       trace_start(J);
       lj_dispatch_update(J2G(J));
       lj_dispatch_update(J2G(J));
-      break;
+      if (J->state != LJ_TRACE_RECORD_1ST)
+	break;
+      /* fallthrough */
 
 
+    case LJ_TRACE_RECORD_1ST:
+      J->state = LJ_TRACE_RECORD;
+      /* fallthrough */
     case LJ_TRACE_RECORD:
     case LJ_TRACE_RECORD:
       trace_pendpatch(J, 0);
       trace_pendpatch(J, 0);
       setvmstate(J2G(J), RECORD);
       setvmstate(J2G(J), RECORD);
@@ -899,13 +906,14 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr)
   }
   }
   if (bc_op(*pc) == BC_JLOOP) {
   if (bc_op(*pc) == BC_JLOOP) {
     BCIns *retpc = &traceref(J, bc_d(*pc))->startins;
     BCIns *retpc = &traceref(J, bc_d(*pc))->startins;
-    if (bc_isret(bc_op(*retpc))) {
+    int isret = bc_isret(bc_op(*retpc));
+    if (isret || bc_op(*retpc) == BC_ITERN) {
       if (J->state == LJ_TRACE_RECORD) {
       if (J->state == LJ_TRACE_RECORD) {
 	J->patchins = *pc;
 	J->patchins = *pc;
 	J->patchpc = (BCIns *)pc;
 	J->patchpc = (BCIns *)pc;
 	*J->patchpc = *retpc;
 	*J->patchpc = *retpc;
 	J->bcskip = 1;
 	J->bcskip = 1;
-      } else {
+      } else if (isret) {
 	pc = retpc;
 	pc = retpc;
 	setcframe_pc(cf, pc);
 	setcframe_pc(cf, pc);
       }
       }

+ 2 - 0
src/lj_vm.h

@@ -51,6 +51,7 @@ LJ_ASMF void lj_vm_inshook(void);
 LJ_ASMF void lj_vm_rethook(void);
 LJ_ASMF void lj_vm_rethook(void);
 LJ_ASMF void lj_vm_callhook(void);
 LJ_ASMF void lj_vm_callhook(void);
 LJ_ASMF void lj_vm_profhook(void);
 LJ_ASMF void lj_vm_profhook(void);
+LJ_ASMF void lj_vm_IITERN(void);
 
 
 /* Trace exit handling. */
 /* Trace exit handling. */
 LJ_ASMF void lj_vm_exit_handler(void);
 LJ_ASMF void lj_vm_exit_handler(void);
@@ -98,6 +99,7 @@ LJ_ASMF double lj_vm_trunc_sf(double);
 #if LJ_HASFFI
 #if LJ_HASFFI
 LJ_ASMF int lj_vm_errno(void);
 LJ_ASMF int lj_vm_errno(void);
 #endif
 #endif
+LJ_ASMF TValue *lj_vm_next(GCtab *t, uint32_t idx);
 #endif
 #endif
 
 
 /* Continuations for metamethods. */
 /* Continuations for metamethods. */

+ 77 - 2
src/vm_arm.dasc

@@ -2424,6 +2424,64 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Miscellaneous functions --------------------------------------------
   |//-- Miscellaneous functions --------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |
   |
+  |.define NEXT_TAB,		TAB:CARG1
+  |.define NEXT_RES,		CARG1
+  |.define NEXT_IDX,		CARG2
+  |.define NEXT_TMP0,		CARG3
+  |.define NEXT_TMP1,		CARG4
+  |.define NEXT_LIM,		r12
+  |.define NEXT_RES_PTR,	sp
+  |.define NEXT_RES_VAL,	[sp]
+  |.define NEXT_RES_KEY_I,	[sp, #8]
+  |.define NEXT_RES_KEY_IT,	[sp, #12]
+  |
+  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)  -- find the next non-nil slot at or after idx.
+  |// Next idx returned in CRET2. Result is a Node pointer (hash hit) or sp, with value at [sp] and key at [sp, #8].
+  |->vm_next:
+  |.if JIT
+  |  ldr NEXT_TMP0, NEXT_TAB->array
+  |   ldr NEXT_LIM, NEXT_TAB->asize
+  |  add NEXT_TMP0, NEXT_TMP0, NEXT_IDX, lsl #3  // TMP0 = array + idx*8.
+  |1:  // Traverse array part.
+  |   subs NEXT_TMP1, NEXT_IDX, NEXT_LIM  // TMP1 = idx - asize (reused at 5:).
+  |   bhs >5  // idx >= asize (unsigned): continue in the hash part.
+  |  ldr NEXT_TMP1, [NEXT_TMP0, #4]  // Word at offset 4 of the slot; tested against LJ_TNIL below.
+  |   str NEXT_IDX, NEXT_RES_KEY_I  // Pre-store idx (before increment) as the key payload.
+  |   add NEXT_TMP0, NEXT_TMP0, #8  // Advance to the next array slot.
+  |   add NEXT_IDX, NEXT_IDX, #1
+  |  checktp NEXT_TMP1, LJ_TNIL
+  |  beq <1				// Skip holes in array part.
+  |  ldr NEXT_TMP0, [NEXT_TMP0, #-8]  // Word at offset 0 of the slot just tested (pointer was advanced).
+  |   mov NEXT_RES, NEXT_RES_PTR  // Return sp as the result pointer.
+  |  strd NEXT_TMP0, NEXT_RES_VAL	// Stores NEXT_TMP1, too.
+  |  mvn NEXT_TMP0, #~LJ_TISNUM  // TMP0 = LJ_TISNUM (NOT of the complemented constant).
+  |  str NEXT_TMP0, NEXT_RES_KEY_IT
+  |  bx lr
+  |
+  |5:  // Traverse hash part.
+  |  ldr NEXT_TMP0, NEXT_TAB->hmask
+  |   ldr NODE:NEXT_RES, NEXT_TAB->node  // NEXT_RES aliases NEXT_TAB (both CARG1); hmask was loaded first.
+  |   add NEXT_TMP1, NEXT_TMP1, NEXT_TMP1, lsl #1  // TMP1 = (idx - asize) * 3.
+  |  add NEXT_LIM, NEXT_LIM, NEXT_TMP0  // LIM = asize + hmask.
+  |   add NODE:NEXT_RES, NODE:NEXT_RES, NEXT_TMP1, lsl #3  // RES = node + (idx - asize) * 24.
+  |6:  // Scan nodes while idx <= asize + hmask.
+  |  cmp NEXT_IDX, NEXT_LIM
+  |  bhi >9
+  |  ldr NEXT_TMP1, NODE:NEXT_RES->val.it
+  |  checktp NEXT_TMP1, LJ_TNIL
+  |   add NEXT_IDX, NEXT_IDX, #1  // No 's' suffix, so the flags from checktp are kept.
+  |  bxne lr  // Non-nil value: return the node pointer with idx already advanced.
+  |  // Skip holes in hash part.
+  |  add NEXT_RES, NEXT_RES, #sizeof(Node)
+  |  b <6
+  |
+  |9:  // End of iteration. Set the key to nil (not the value).
+  |  mvn NEXT_TMP0, #0  // TMP0 = ~0, stored as the key's tag word.
+  |   mov NEXT_RES, NEXT_RES_PTR
+  |  str NEXT_TMP0, NEXT_RES_KEY_IT
+  |  bx lr
+  |.endif
+  |
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
@@ -3914,10 +3972,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     break;
     break;
 
 
   case BC_ITERN:
   case BC_ITERN:
-    |  // RA = base*8, (RB = nresults+1, RC = nargs+1 (2+1))
     |.if JIT
     |.if JIT
-    |  // NYI: add hotloop, record BC_ITERN.
+    |  hotloop
     |.endif
     |.endif
+    |->vm_IITERN:
+    |  // RA = base*8, (RB = nresults+1, RC = nargs+1 (2+1))
     |  add RA, BASE, RA
     |  add RA, BASE, RA
     |  ldr TAB:RB, [RA, #-16]
     |  ldr TAB:RB, [RA, #-16]
     |  ldr CARG1, [RA, #-8]		// Get index from control var.
     |  ldr CARG1, [RA, #-8]		// Get index from control var.
@@ -3992,9 +4051,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   mov OP, #BC_ITERC
     |   mov OP, #BC_ITERC
     |  strb CARG1, [PC, #-4]
     |  strb CARG1, [PC, #-4]
     |   sub PC, RC, #0x20000
     |   sub PC, RC, #0x20000
+    |.if JIT
+    |   ldrb CARG1, [PC]
+    |   cmp CARG1, #BC_ITERN
+    |   bne >6
+    |.endif
     |   strb OP, [PC]			// Subsumes ins_next1.
     |   strb OP, [PC]			// Subsumes ins_next1.
     |   ins_next2
     |   ins_next2
     |  b <1
     |  b <1
+    |.if JIT
+    |6:  // Unpatch JLOOP.
+    |  ldr CARG1, [DISPATCH, #DISPATCH_J(trace)]
+    |  ldrh CARG2, [PC, #2]
+    |  ldr TRACE:CARG1, [CARG1, CARG2, lsl #2]
+    |  // Subsumes ins_next1 and ins_next2.
+    |  ldr INS, TRACE:CARG1->startins
+    |  bfi INS, OP, #0, #8
+    |  str INS, [PC], #4
+    |  b <1
+    |.endif
     break;
     break;
 
 
   case BC_VARG:
   case BC_VARG:

+ 77 - 2
src/vm_arm64.dasc

@@ -2064,6 +2064,63 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Miscellaneous functions --------------------------------------------
   |//-- Miscellaneous functions --------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |
   |
+  |.define NEXT_TAB,		TAB:CARG1
+  |.define NEXT_RES,		CARG1
+  |.define NEXT_IDX,		CARG2w
+  |.define NEXT_LIM,		CARG3w
+  |.define NEXT_TMP0,		TMP0
+  |.define NEXT_TMP0w,		TMP0w
+  |.define NEXT_TMP1,		TMP1
+  |.define NEXT_TMP1w,		TMP1w
+  |.define NEXT_RES_PTR,	sp
+  |.define NEXT_RES_VAL,	[sp]
+  |.define NEXT_RES_KEY,	[sp, #8]
+  |
+  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)  -- find the next non-nil slot at or after idx.
+  |// Next idx returned in CRET2w. Result is a Node pointer (hash hit) or sp, with value at [sp] and key at [sp, #8].
+  |->vm_next:
+  |.if JIT
+  |  ldr NEXT_LIM, NEXT_TAB->asize
+  |   ldr NEXT_TMP1, NEXT_TAB->array
+  |1:  // Traverse array part.
+  |  subs NEXT_TMP0w, NEXT_IDX, NEXT_LIM  // TMP0w = idx - asize (reused at 5:).
+  |  bhs >5				// Index points after array part?
+  |  ldr NEXT_TMP0, [NEXT_TMP1, NEXT_IDX, uxtw #3]  // TMP0 = 8-byte slot at array + idx*8.
+  |  cmn NEXT_TMP0, #-LJ_TNIL  // Sets eq iff TMP0 == LJ_TNIL.
+  |   cinc NEXT_IDX, NEXT_IDX, eq  // idx++ only when the slot is nil.
+  |  beq <1				// Skip holes in array part.
+  |  str NEXT_TMP0, NEXT_RES_VAL
+  |   movz NEXT_TMP0w, #(LJ_TISNUM>>1)&0xffff, lsl #16  // Upper 32-bit word of the key, built from LJ_TISNUM.
+  |   stp NEXT_IDX, NEXT_TMP0w, NEXT_RES_KEY  // Two 32-bit stores: idx at [sp, #8], tag word at [sp, #12].
+  |  add NEXT_IDX, NEXT_IDX, #1  // Returned idx points past the slot just produced.
+  |  mov NEXT_RES, NEXT_RES_PTR
+  |4:  // Shared return, also used by the hash-part hit below.
+  |  ret
+  |
+  |5:  // Traverse hash part.
+  |  ldr NEXT_TMP1w, NEXT_TAB->hmask
+  |   ldr NODE:NEXT_RES, NEXT_TAB->node  // NEXT_RES aliases NEXT_TAB (both CARG1); hmask was loaded first.
+  |   add NEXT_TMP0w, NEXT_TMP0w, NEXT_TMP0w, lsl #1  // TMP0w = (idx - asize) * 3.
+  |  add NEXT_LIM, NEXT_LIM, NEXT_TMP1w  // LIM = asize + hmask.
+  |   add NODE:NEXT_RES, NODE:NEXT_RES, NEXT_TMP0w, uxtw #3  // RES = node + (idx - asize) * 24.
+  |6:  // Scan nodes while idx <= asize + hmask.
+  |  cmp NEXT_IDX, NEXT_LIM
+  |  bhi >9
+  |  ldr NEXT_TMP0, NODE:NEXT_RES->val
+  |  cmn NEXT_TMP0, #-LJ_TNIL
+  |   add NEXT_IDX, NEXT_IDX, #1  // Plain add: the flags from cmn are kept.
+  |  bne <4  // Non-nil value: return the node pointer.
+  |  // Skip holes in hash part.
+  |  add NODE:NEXT_RES, NODE:NEXT_RES, #sizeof(Node)
+  |  b <6
+  |
+  |9:  // End of iteration. Set the key to nil (not the value).
+  |  movn NEXT_TMP0, #0  // TMP0 = ~0 (all bits set).
+  |  str NEXT_TMP0, NEXT_RES_KEY
+  |  mov NEXT_RES, NEXT_RES_PTR
+  |  ret
+  |.endif
+  |
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
@@ -3320,10 +3377,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     break;
     break;
 
 
   case BC_ITERN:
   case BC_ITERN:
-    |  // RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
     |.if JIT
     |.if JIT
-    |  // NYI: add hotloop, record BC_ITERN.
+    |  hotloop
     |.endif
     |.endif
+    |->vm_IITERN:
+    |  // RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
     |  add RA, BASE, RA, lsl #3
     |  add RA, BASE, RA, lsl #3
     |  ldr TAB:RB, [RA, #-16]
     |  ldr TAB:RB, [RA, #-16]
     |    ldrh TMP3w, [PC, # OFS_RD]
     |    ldrh TMP3w, [PC, # OFS_RD]
@@ -3390,11 +3448,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_next
     |  ins_next
     |
     |
     |5:  // Despecialize bytecode if any of the checks fail.
     |5:  // Despecialize bytecode if any of the checks fail.
+    |.if JIT
+    |  ldrb TMP2w, [RC, # OFS_OP]
+    |.endif
     |  mov TMP0, #BC_JMP
     |  mov TMP0, #BC_JMP
     |   mov TMP1, #BC_ITERC
     |   mov TMP1, #BC_ITERC
     |  strb TMP0w, [PC, #-4+OFS_OP]
     |  strb TMP0w, [PC, #-4+OFS_OP]
+    |.if JIT
+    |  cmp TMP2w, #BC_ITERN
+    |  bne >6
+    |.endif
     |   strb TMP1w, [RC, # OFS_OP]
     |   strb TMP1w, [RC, # OFS_OP]
     |  b <1
     |  b <1
+    |.if JIT
+    |6:  // Unpatch JLOOP.
+    |  ldr RA, [GL, #GL_J(trace)]
+    |  ldrh TMP2w, [RC, # OFS_RD]
+    |  ldr TRACE:RA, [RA, TMP2, lsl #3]
+    |  ldr TMP2w, TRACE:RA->startins
+    |  bfxil TMP2w, TMP1w, #0, #8
+    |  str TMP2w, [RC]
+    |  b <1
+    |.endif
     break;
     break;
 
 
   case BC_VARG:
   case BC_VARG:

+ 92 - 5
src/vm_mips.dasc

@@ -190,7 +190,7 @@
 |//-----------------------------------------------------------------------
 |//-----------------------------------------------------------------------
 |
 |
 |// Trap for not-yet-implemented parts.
 |// Trap for not-yet-implemented parts.
-|.macro NYI; .long 0xf0f0f0f0; .endmacro
+|.macro NYI; .long 0xec1cf0f0; .endmacro
 |
 |
 |// Macros to mark delay slots.
 |// Macros to mark delay slots.
 |.macro ., a; a; .endmacro
 |.macro ., a; a; .endmacro
@@ -2798,6 +2798,73 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Miscellaneous functions --------------------------------------------
   |//-- Miscellaneous functions --------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |
   |
+  |.define NEXT_TAB,		TAB:CARG1
+  |.define NEXT_IDX,		CARG2
+  |.define NEXT_ASIZE,		CARG3
+  |.define NEXT_NIL,		CARG4
+  |.define NEXT_TMP0,		r12
+  |.define NEXT_TMP1,		r13
+  |.define NEXT_TMP2,		r14
+  |.define NEXT_RES_VK,		CRET1
+  |.define NEXT_RES_IDX,	CRET2
+  |.define NEXT_RES_PTR,	sp
+  |.define NEXT_RES_VAL_I,	0(sp)
+  |.define NEXT_RES_VAL_IT,	4(sp)
+  |.define NEXT_RES_KEY_I,	8(sp)
+  |.define NEXT_RES_KEY_IT,	12(sp)
+  |
+  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)  -- find the next non-nil slot at or after idx.
+  |// Next idx returned in CRET2. CRET1 is a Node pointer (hash hit) or sp, with value at 0(sp) and key at 8(sp).
+  |->vm_next:
+  |.if JIT and ENDIAN_LE
+  |   lw NEXT_ASIZE, NEXT_TAB->asize
+  |  lw NEXT_TMP0, NEXT_TAB->array
+  |    li NEXT_NIL, LJ_TNIL
+  |1:  // Traverse array part.
+  |   sltu AT, NEXT_IDX, NEXT_ASIZE  // AT = (idx < asize), unsigned.
+  |    sll NEXT_TMP1, NEXT_IDX, 3  // TMP1 = idx * 8.
+  |   beqz AT, >5  // idx >= asize: continue in the hash part.
+  |.   addu NEXT_TMP1, NEXT_TMP0, NEXT_TMP1  // Delay slot: TMP1 = array + idx*8.
+  |  lw NEXT_TMP2, 4(NEXT_TMP1)  // Word at offset 4 of the slot; compared with LJ_TNIL below.
+  |   sw NEXT_IDX, NEXT_RES_KEY_I  // Pre-store idx (before increment) as the key payload.
+  |  beq NEXT_TMP2, NEXT_NIL, <1  // Skip holes in array part.
+  |.  addiu NEXT_IDX, NEXT_IDX, 1  // Delay slot: idx++ on both paths.
+  |    lw NEXT_TMP0, 0(NEXT_TMP1)  // Word at offset 0 of the slot (overwrites the array pointer).
+  |   li AT, LJ_TISNUM
+  |  sw NEXT_TMP2, NEXT_RES_VAL_IT
+  |   sw AT, NEXT_RES_KEY_IT
+  |    sw NEXT_TMP0, NEXT_RES_VAL_I
+  |  move NEXT_RES_VK, NEXT_RES_PTR  // Return sp as the result pointer.
+  |  jr ra
+  |.  move NEXT_RES_IDX, NEXT_IDX  // Delay slot: return the advanced idx.
+  |
+  |5:  // Traverse hash part.
+  |  subu NEXT_RES_IDX, NEXT_IDX, NEXT_ASIZE  // RES_IDX = idx - asize (hash-relative index).
+  |   lw NODE:NEXT_RES_VK, NEXT_TAB->node
+  |    sll NEXT_TMP2, NEXT_RES_IDX, 5  // TMP2 = RES_IDX * 32.
+  |  lw NEXT_TMP0, NEXT_TAB->hmask
+  |    sll AT, NEXT_RES_IDX, 3  // AT = RES_IDX * 8.
+  |    subu AT, NEXT_TMP2, AT  // AT = RES_IDX * 24.
+  |   addu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, AT  // RES_VK = node + RES_IDX * 24.
+  |6:  // Scan nodes while RES_IDX <= hmask.
+  |  sltu AT, NEXT_TMP0, NEXT_RES_IDX  // AT = (hmask < RES_IDX).
+  |  bnez AT, >8
+  |.  nop
+  |  lw NEXT_TMP2, NODE:NEXT_RES_VK->val.it
+  |  bne NEXT_TMP2, NEXT_NIL, >9  // Non-nil value: return the node pointer.
+  |.  addiu NEXT_RES_IDX, NEXT_RES_IDX, 1  // Delay slot: RES_IDX++ on both paths.
+  |  // Skip holes in hash part.
+  |  b <6
+  |.  addiu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, sizeof(Node)
+  |
+  |8:  // End of iteration. Set the key to nil (not the value).
+  |  sw NEXT_NIL, NEXT_RES_KEY_IT
+  |  move NEXT_RES_VK, NEXT_RES_PTR
+  |9:  // Common exit for hash hit and end of iteration.
+  |  jr ra
+  |.  addu NEXT_RES_IDX, NEXT_RES_IDX, NEXT_ASIZE  // Delay slot: convert back to a combined array+hash index.
+  |.endif
+  |
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
@@ -4521,10 +4588,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     break;
     break;
 
 
   case BC_ITERN:
   case BC_ITERN:
-    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
-    |.if JIT
-    |  // NYI: add hotloop, record BC_ITERN.
+    |.if JIT and ENDIAN_LE
+    |  hotloop
     |.endif
     |.endif
+    |->vm_IITERN:
+    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
     |  addu RA, BASE, RA
     |  addu RA, BASE, RA
     |  lw TAB:RB, -16+LO(RA)
     |  lw TAB:RB, -16+LO(RA)
     |  lw RC, -8+LO(RA)			// Get index from control var.
     |  lw RC, -8+LO(RA)			// Get index from control var.
@@ -4614,9 +4682,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  li TMP3, BC_JMP
     |  li TMP3, BC_JMP
     |   li TMP1, BC_ITERC
     |   li TMP1, BC_ITERC
     |  sb TMP3, -4+OFS_OP(PC)
     |  sb TMP3, -4+OFS_OP(PC)
-    |    addu PC, TMP0, TMP2
+    |  addu PC, TMP0, TMP2
+    |.if JIT
+    |  lb TMP0, OFS_OP(PC)
+    |  li AT, BC_ITERN
+    |  bne TMP0, AT, >6
+    |.  lhu TMP2, OFS_RD(PC)
+    |.endif
     |  b <1
     |  b <1
     |.  sb TMP1, OFS_OP(PC)
     |.  sb TMP1, OFS_OP(PC)
+    |.if JIT
+    |6:  // Unpatch JLOOP.
+    |  lw TMP0, DISPATCH_J(trace)(DISPATCH)
+    |   sll TMP2, TMP2, 2
+    |  addu TMP0, TMP0, TMP2
+    |  lw TRACE:TMP2, 0(TMP0)
+    |  lw TMP0, TRACE:TMP2->startins
+    |   li AT, -256
+    |  and TMP0, TMP0, AT
+    |  or TMP0, TMP0, TMP1
+    |  b <1
+    |.  sw TMP0, 0(PC)
+    |.endif
     break;
     break;
 
 
   case BC_VARG:
   case BC_VARG:

+ 88 - 4
src/vm_mips64.dasc

@@ -193,7 +193,7 @@
 |//-----------------------------------------------------------------------
 |//-----------------------------------------------------------------------
 |
 |
 |// Trap for not-yet-implemented parts.
 |// Trap for not-yet-implemented parts.
-|.macro NYI; .long 0xf0f0f0f0; .endmacro
+|.macro NYI; .long 0xec1cf0f0; .endmacro
 |
 |
 |// Macros to mark delay slots.
 |// Macros to mark delay slots.
 |.macro ., a; a; .endmacro
 |.macro ., a; a; .endmacro
@@ -2904,6 +2904,70 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Miscellaneous functions --------------------------------------------
   |//-- Miscellaneous functions --------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |
   |
+  |.define NEXT_TAB,		TAB:CARG1
+  |.define NEXT_IDX,		CARG2
+  |.define NEXT_ASIZE,		CARG3
+  |.define NEXT_NIL,		CARG4
+  |.define NEXT_TMP0,		r12
+  |.define NEXT_TMP1,		r13
+  |.define NEXT_TMP2,		r14
+  |.define NEXT_RES_VK,		CRET1
+  |.define NEXT_RES_IDX,	CRET2
+  |.define NEXT_RES_PTR,	sp
+  |.define NEXT_RES_VAL,	0(sp)
+  |.define NEXT_RES_KEY,	8(sp)
+  |
+  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)  -- find the next non-nil slot at or after idx.
+  |// Next idx returned in CRET2. CRET1 is a Node pointer (hash hit) or sp, with value at 0(sp) and key at 8(sp).
+  |->vm_next:
+  |.if JIT and ENDIAN_LE
+  |   lw NEXT_ASIZE, NEXT_TAB->asize
+  |  ld NEXT_TMP0, NEXT_TAB->array
+  |    li NEXT_NIL, LJ_TNIL
+  |1:  // Traverse array part.
+  |   sltu AT, NEXT_IDX, NEXT_ASIZE  // AT = (idx < asize), unsigned.
+  |    sll NEXT_TMP1, NEXT_IDX, 3  // TMP1 = idx * 8.
+  |   beqz AT, >5  // idx >= asize: continue in the hash part.
+  |.   daddu NEXT_TMP1, NEXT_TMP0, NEXT_TMP1  // Delay slot: TMP1 = array + idx*8.
+  |   li AT, LJ_TISNUM
+  |  ld NEXT_TMP2, 0(NEXT_TMP1)  // TMP2 = 8-byte array slot.
+  |   dsll AT, AT, 47  // Shift the tag into the upper bits of the 64-bit value.
+  |   or NEXT_TMP1, NEXT_IDX, AT  // TMP1 = key built from idx (before increment) and the LJ_TISNUM tag.
+  |  beq NEXT_TMP2, NEXT_NIL, <1  // Skip holes in array part.
+  |.  addiu NEXT_IDX, NEXT_IDX, 1  // Delay slot: idx++ on both paths.
+  |  sd NEXT_TMP2, NEXT_RES_VAL
+  |   sd NEXT_TMP1, NEXT_RES_KEY
+  |  move NEXT_RES_VK, NEXT_RES_PTR  // Return sp as the result pointer.
+  |  jr ra
+  |.  move NEXT_RES_IDX, NEXT_IDX  // Delay slot: return the advanced idx.
+  |
+  |5:  // Traverse hash part.
+  |  subu NEXT_RES_IDX, NEXT_IDX, NEXT_ASIZE  // RES_IDX = idx - asize (hash-relative index).
+  |   ld NODE:NEXT_RES_VK, NEXT_TAB->node
+  |    sll NEXT_TMP2, NEXT_RES_IDX, 5  // TMP2 = RES_IDX * 32.
+  |  lw NEXT_TMP0, NEXT_TAB->hmask
+  |    sll AT, NEXT_RES_IDX, 3  // AT = RES_IDX * 8.
+  |    subu AT, NEXT_TMP2, AT  // AT = RES_IDX * 24.
+  |   daddu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, AT  // RES_VK = node + RES_IDX * 24.
+  |6:  // Scan nodes while RES_IDX <= hmask.
+  |  sltu AT, NEXT_TMP0, NEXT_RES_IDX  // AT = (hmask < RES_IDX).
+  |  bnez AT, >8
+  |.  nop
+  |  ld NEXT_TMP2, NODE:NEXT_RES_VK->val
+  |  bne NEXT_TMP2, NEXT_NIL, >9  // Non-nil value: return the node pointer.
+  |.  addiu NEXT_RES_IDX, NEXT_RES_IDX, 1  // Delay slot: RES_IDX++ on both paths.
+  |  // Skip holes in hash part.
+  |  b <6
+  |.  daddiu NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, sizeof(Node)
+  |
+  |8:  // End of iteration. Set the key to nil (not the value).
+  |  sd NEXT_NIL, NEXT_RES_KEY
+  |  move NEXT_RES_VK, NEXT_RES_PTR
+  |9:  // Common exit for hash hit and end of iteration.
+  |  jr ra
+  |.  addu NEXT_RES_IDX, NEXT_RES_IDX, NEXT_ASIZE  // Delay slot: convert back to a combined array+hash index.
+  |.endif
+  |
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
@@ -4700,10 +4764,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     break;
     break;
 
 
   case BC_ITERN:
   case BC_ITERN:
-    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
-    |.if JIT
-    |  // NYI: add hotloop, record BC_ITERN.
+    |.if JIT and ENDIAN_LE
+    |  hotloop
     |.endif
     |.endif
+    |->vm_IITERN:
+    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
     |  daddu RA, BASE, RA
     |  daddu RA, BASE, RA
     |  ld TAB:RB, -16(RA)
     |  ld TAB:RB, -16(RA)
     |   lw RC, -8+LO(RA)		// Get index from control var.
     |   lw RC, -8+LO(RA)		// Get index from control var.
@@ -4789,8 +4854,27 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   li TMP1, BC_ITERC
     |   li TMP1, BC_ITERC
     |  sb TMP3, -4+OFS_OP(PC)
     |  sb TMP3, -4+OFS_OP(PC)
     |   daddu PC, TMP0, TMP2
     |   daddu PC, TMP0, TMP2
+    |.if JIT
+    |  lb TMP0, OFS_OP(PC)
+    |  li AT, BC_ITERN
+    |  bne TMP0, AT, >6
+    |.  lhu TMP2, OFS_RD(PC)
+    |.endif
     |  b <1
     |  b <1
     |.  sb TMP1, OFS_OP(PC)
     |.  sb TMP1, OFS_OP(PC)
+    |.if JIT
+    |6:  // Unpatch JLOOP.
+    |  ld TMP0, DISPATCH_J(trace)(DISPATCH)
+    |   sll TMP2, TMP2, 3
+    |  daddu TMP0, TMP0, TMP2
+    |  ld TRACE:TMP2, 0(TMP0)
+    |  lw TMP0, TRACE:TMP2->startins
+    |   li AT, -256
+    |  and TMP0, TMP0, AT
+    |  or TMP0, TMP0, TMP1
+    |  b <1
+    |.  sw TMP0, 0(PC)
+    |.endif
     break;
     break;
 
 
   case BC_VARG:
   case BC_VARG:

+ 8 - 1
src/vm_ppc.dasc

@@ -3163,6 +3163,11 @@ static void build_subroutines(BuildCtx *ctx)
   |  blr
   |  blr
   |.endif
   |.endif
   |
   |
+  |->vm_next:  // Table traversal helper; only a placeholder in this file.
+  |.if JIT
+  |  NYI  // On big-endian. Not yet implemented: only the NYI marker is emitted here.
+  |.endif
+  |
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-- FFI helper functions -----------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
@@ -5112,8 +5117,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   case BC_ITERN:
   case BC_ITERN:
     |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
     |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
     |.if JIT
     |.if JIT
-    |  // NYI: add hotloop, record BC_ITERN.
+    |  // NYI on big-endian
     |.endif
     |.endif
+    |->vm_IITERN:
     |  add RA, BASE, RA
     |  add RA, BASE, RA
     |  lwz TAB:RB, -12(RA)
     |  lwz TAB:RB, -12(RA)
     |  lwz RC, -4(RA)			// Get index from control var.
     |  lwz RC, -4(RA)			// Get index from control var.
@@ -5244,6 +5250,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   li TMP1, BC_ITERC
     |   li TMP1, BC_ITERC
     |  stb TMP0, -1(PC)
     |  stb TMP0, -1(PC)
     |    addis PC, TMP3, -(BCBIAS_J*4 >> 16)
     |    addis PC, TMP3, -(BCBIAS_J*4 >> 16)
+    |  // NYI on big-endian: unpatch JLOOP.
     |   stb TMP1, 3(PC)
     |   stb TMP1, 3(PC)
     |  b <1
     |  b <1
     break;
     break;

+ 78 - 2
src/vm_x64.dasc

@@ -2633,6 +2633,67 @@ static void build_subroutines(BuildCtx *ctx)
   |  .if X64WIN; pop rsi; .endif
   |  .if X64WIN; pop rsi; .endif
   |  ret
   |  ret
   |
   |
+  |.define NEXT_TAB,		TAB:CARG1
+  |.define NEXT_IDX,		CARG2d
+  |.define NEXT_IDXa,		CARG2
+  |.define NEXT_PTR,		RC
+  |.define NEXT_PTRd,		RCd
+  |.define NEXT_TMP,		CARG3
+  |.define NEXT_ASIZE,		CARG4d
+  |.macro NEXT_RES_IDXL, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
+  |.if X64WIN
+  |.define NEXT_RES_PTR,	[rsp+aword*5]
+  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
+  |.else
+  |.define NEXT_RES_PTR,	[rsp+aword*1]
+  |.macro NEXT_RES_IDX, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
+  |.endif
+  |
+  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)  -- find the next non-nil slot at or after idx.
+  |// Next idx returned in edx. NEXT_PTR ends up as a Node pointer (hash hit) or the NEXT_RES_PTR buffer (value, then key).
+  |->vm_next:
+  |.if JIT
+  |  mov NEXT_ASIZE, NEXT_TAB->asize
+  |1:  // Traverse array part.
+  |  cmp NEXT_IDX, NEXT_ASIZE;  jae >5  // idx >= asize (unsigned): continue in the hash part.
+  |  mov NEXT_TMP, NEXT_TAB->array
+  |  mov NEXT_TMP, qword [NEXT_TMP+NEXT_IDX*8]  // TMP = 8-byte array slot at idx.
+  |  cmp NEXT_TMP, LJ_TNIL;  je >2
+  |  lea NEXT_PTR, NEXT_RES_PTR  // PTR = address of the result buffer on the stack.
+  |  mov qword [NEXT_PTR], NEXT_TMP  // Value goes into the first qword.
+  |.if DUALNUM
+  |  setint NEXT_TMP, NEXT_IDXa  // Key as a tagged integer built from idx.
+  |  mov qword [NEXT_PTR+qword*1], NEXT_TMP
+  |.else
+  |  cvtsi2sd xmm0, NEXT_IDX  // Key as a double converted from idx.
+  |  movsd qword [NEXT_PTR+qword*1], xmm0
+  |.endif
+  |  NEXT_RES_IDX 1  // Returned idx = idx + 1.
+  |  ret
+  |2:  // Skip holes in array part.
+  |  add NEXT_IDX, 1
+  |  jmp <1
+  |
+  |5:  // Traverse hash part.
+  |  sub NEXT_IDX, NEXT_ASIZE  // idx becomes hash-relative.
+  |6:  // Scan nodes while idx <= hmask.
+  |  cmp NEXT_IDX, NEXT_TAB->hmask; ja >9
+  |  imul NEXT_PTRd, NEXT_IDX, #NODE  // PTR = idx * node size.
+  |  add NODE:NEXT_PTR, NEXT_TAB->node
+  |  cmp qword NODE:NEXT_PTR->val, LJ_TNIL; je >7
+  |  NEXT_RES_IDXL NEXT_ASIZE+1  // edx = hash-relative idx + asize + 1 (combined index of the next slot).
+  |  ret
+  |7:  // Skip holes in hash part.
+  |  add NEXT_IDX, 1
+  |  jmp <6
+  |
+  |9:  // End of iteration. Set the key to nil (not the value).
+  |  NEXT_RES_IDX NEXT_ASIZE  // Convert idx back to a combined array+hash index.
+  |  lea NEXT_PTR, NEXT_RES_PTR
+  |  mov qword [NEXT_PTR+qword*1], LJ_TNIL
+  |  ret
+  |.endif
+  |
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-- Assertions ---------------------------------------------------------
   |//-- Assertions ---------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
@@ -4044,10 +4105,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     break;
     break;
 
 
   case BC_ITERN:
   case BC_ITERN:
-    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
     |.if JIT
     |.if JIT
-    |  // NYI: add hotloop, record BC_ITERN.
+    |  hotloop RBd
     |.endif
     |.endif
+    |->vm_IITERN:
+    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
     |  mov TAB:RB, [BASE+RA*8-16]
     |  mov TAB:RB, [BASE+RA*8-16]
     |  cleartp TAB:RB
     |  cleartp TAB:RB
     |  mov RCd, [BASE+RA*8-8]		// Get index from control var.
     |  mov RCd, [BASE+RA*8-8]		// Get index from control var.
@@ -4118,8 +4180,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |5:  // Despecialize bytecode if any of the checks fail.
     |5:  // Despecialize bytecode if any of the checks fail.
     |  mov PC_OP, BC_JMP
     |  mov PC_OP, BC_JMP
     |  branchPC RD
     |  branchPC RD
+    |.if JIT
+    |  cmp byte [PC], BC_ITERN
+    |  jne >6
+    |.endif
     |  mov byte [PC], BC_ITERC
     |  mov byte [PC], BC_ITERC
     |  jmp <1
     |  jmp <1
+    |.if JIT
+    |6:  // Unpatch JLOOP.
+    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
+    |  movzx RCd, word [PC+2]
+    |  mov TRACE:RA, [RA+RC*8]
+    |  mov eax, TRACE:RA->startins
+    |  mov al, BC_ITERC
+    |  mov dword [PC], eax
+    |  jmp <1
+    |.endif
     break;
     break;
 
 
   case BC_VARG:
   case BC_VARG:

+ 97 - 2
src/vm_x86.dasc

@@ -3120,6 +3120,86 @@ static void build_subroutines(BuildCtx *ctx)
   |  ret
   |  ret
   |.endif
   |.endif
   |
   |
+  |.define NEXT_TAB,		TAB:FCARG1
+  |.define NEXT_IDX,		FCARG2
+  |.define NEXT_PTR,		RCa
+  |.define NEXT_PTRd,		RC
+  |.macro NEXT_RES_IDXL, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
+  |.if X64
+  |.define NEXT_TMP,		CARG3d
+  |.define NEXT_TMPq,		CARG3
+  |.define NEXT_ASIZE,		CARG4d
+  |.macro NEXT_ENTER;		.endmacro
+  |.macro NEXT_LEAVE;		ret; .endmacro
+  |.if X64WIN
+  |.define NEXT_RES_PTR,	[rsp+aword*5]
+  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
+  |.else
+  |.define NEXT_RES_PTR,	[rsp+aword*1]
+  |.macro NEXT_RES_IDX, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
+  |.endif
+  |.else
+  |.define NEXT_ASIZE,		esi
+  |.define NEXT_TMP,		edi
+  |.macro NEXT_ENTER;		push esi; push edi; .endmacro
+  |.macro NEXT_LEAVE;		pop edi; pop esi; ret; .endmacro
+  |.define NEXT_RES_PTR,	[esp+dword*3]
+  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
+  |.endif
+  |
+  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)  -- find the next non-nil slot at or after idx.
+  |// Next idx returned in edx. NEXT_PTR ends up as a Node pointer (hash hit) or the NEXT_RES_PTR buffer (value, then key).
+  |->vm_next:
+  |.if JIT
+  |  NEXT_ENTER  // Saves esi/edi on x86; expands to nothing on x64.
+  |  mov NEXT_ASIZE, NEXT_TAB->asize
+  |1:  // Traverse array part.
+  |  cmp NEXT_IDX, NEXT_ASIZE;  jae >5  // idx >= asize (unsigned): continue in the hash part.
+  |  mov NEXT_TMP, NEXT_TAB->array
+  |  cmp dword [NEXT_TMP+NEXT_IDX*8+4], LJ_TNIL;  je >2  // Test the dword at offset 4 of the slot.
+  |  lea NEXT_PTR, NEXT_RES_PTR  // PTR = address of the result buffer on the stack.
+  |.if X64
+  |  mov NEXT_TMPq, qword [NEXT_TMP+NEXT_IDX*8]
+  |  mov qword [NEXT_PTR], NEXT_TMPq
+  |.else
+  |  mov NEXT_ASIZE, dword [NEXT_TMP+NEXT_IDX*8+4]  // Reuses NEXT_ASIZE as scratch; this path returns below.
+  |  mov NEXT_TMP, dword [NEXT_TMP+NEXT_IDX*8]
+  |  mov dword [NEXT_PTR+4], NEXT_ASIZE
+  |  mov dword [NEXT_PTR], NEXT_TMP
+  |.endif
+  |.if DUALNUM
+  |  mov dword [NEXT_PTR+dword*3], LJ_TISNUM  // Key as an integer: tag word, then idx payload.
+  |  mov dword [NEXT_PTR+dword*2], NEXT_IDX
+  |.else
+  |  cvtsi2sd xmm0, NEXT_IDX  // Key as a double converted from idx.
+  |  movsd qword [NEXT_PTR+dword*2], xmm0
+  |.endif
+  |  NEXT_RES_IDX 1  // Returned idx = idx + 1.
+  |  NEXT_LEAVE
+  |2:  // Skip holes in array part.
+  |  add NEXT_IDX, 1
+  |  jmp <1
+  |
+  |5:  // Traverse hash part.
+  |  sub NEXT_IDX, NEXT_ASIZE  // idx becomes hash-relative.
+  |6:  // Scan nodes while idx <= hmask.
+  |  cmp NEXT_IDX, NEXT_TAB->hmask; ja >9
+  |  imul NEXT_PTRd, NEXT_IDX, #NODE  // PTR = idx * node size.
+  |  add NODE:NEXT_PTRd, dword NEXT_TAB->node
+  |  cmp dword NODE:NEXT_PTR->val.it, LJ_TNIL; je >7
+  |  NEXT_RES_IDXL NEXT_ASIZE+1  // edx = hash-relative idx + asize + 1 (combined index of the next slot).
+  |  NEXT_LEAVE
+  |7:  // Skip holes in hash part.
+  |  add NEXT_IDX, 1
+  |  jmp <6
+  |
+  |9:  // End of iteration. Set the key to nil (not the value).
+  |  NEXT_RES_IDX NEXT_ASIZE  // Convert idx back to a combined array+hash index.
+  |  lea NEXT_PTR, NEXT_RES_PTR
+  |  mov dword [NEXT_PTR+dword*3], LJ_TNIL
+  |  NEXT_LEAVE
+  |.endif
+  |
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-- Assertions ---------------------------------------------------------
   |//-- Assertions ---------------------------------------------------------
   |//-----------------------------------------------------------------------
   |//-----------------------------------------------------------------------
@@ -4771,10 +4851,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     break;
     break;
 
 
   case BC_ITERN:
   case BC_ITERN:
-    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
     |.if JIT
     |.if JIT
-    |  // NYI: add hotloop, record BC_ITERN.
+    |  hotloop RB
     |.endif
     |.endif
+    |->vm_IITERN:
+    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
     |  mov TMP1, KBASE			// Need two more free registers.
     |  mov TMP1, KBASE			// Need two more free registers.
     |  mov TMP2, DISPATCH
     |  mov TMP2, DISPATCH
     |  mov TAB:RB, [BASE+RA*8-16]
     |  mov TAB:RB, [BASE+RA*8-16]
@@ -4868,8 +4949,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |5:  // Despecialize bytecode if any of the checks fail.
     |5:  // Despecialize bytecode if any of the checks fail.
     |  mov PC_OP, BC_JMP
     |  mov PC_OP, BC_JMP
     |  branchPC RD
     |  branchPC RD
+    |.if JIT
+    |  cmp byte [PC], BC_ITERN
+    |  jne >6
+    |.endif
     |  mov byte [PC], BC_ITERC
     |  mov byte [PC], BC_ITERC
     |  jmp <1
     |  jmp <1
+    |.if JIT
+    |6:  // Unpatch JLOOP.
+    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
+    |  movzx RC, word [PC+2]
+    |  mov TRACE:RA, [RA+RC*4]
+    |  mov eax, TRACE:RA->startins
+    |  mov al, BC_ITERC
+    |  mov dword [PC], eax
+    |  jmp <1
+    |.endif
     break;
     break;
 
 
   case BC_VARG:
   case BC_VARG: