lj_opt_loop.c

/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
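**
** E.g. (a made-up IR fragment, only to show the dependency chain): for
** 'x = t.f + 1' inside a loop, the type guard on 't' comes first and
** everything after it is control-dependent on it:
**
**   0010 >  tab GUARD t         (not hoistable if the type of t varies)
**   0011    num HLOAD t.f       (depends on the guard)
**   0012  + num ADD   0011 +1   (and so does everything below)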
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** On first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
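
/* Illustrative shape of the result (made-up refs and a simplified dump
** format, for a hypothetical 'for i=a,b do s=s+1 end'):
**
**   0001  int SLOAD #1          pre-roll: one full iteration, including
**   0002  int ADD   0001 +1     the invariant guards hoisted by CSE
**   ....      LOOP              separates pre-roll from loop body
**   0003  int ADD   0002 +1     variant instructions only
**   0004  int PHI   0002 0003   loop-carried value, emitted below the
**                               body: left = pre-roll ref, right = body ref
*/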

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)		(&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
			  SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
	/* Quick check for simple recurrences failed, need pass2. */
	irt_setmark(IR(lref)->t);
	passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
	irt_clearmark(IR(ir->op1)->t);
	if (ir->op1 < invar &&
	    ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
	  ir = IR(ir->op1);
	  while (ir->o == IR_CARG) {
	    if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
	    if (irref_isk(ir->op1)) break;
	    ir = IR(ir->op1);
	    irt_clearmark(ir->t);
	  }
	}
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
	IRRef ref = snap_ref(map[n]);
	if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
	break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
	lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
	break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
	IRIns *irr = IR(subst[lref]);
	if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
	  irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
	  passx = 1;  /* Retry. */
	}
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
	irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
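
/* Summary of the mark bit used by the passes above (a reading of the code,
** not part of the original comments): after pass #1 a marked PHI is one
** with no use seen yet outside of other PHIs. Pass #2 clears the mark of
** every ref used by the variant part or referenced from a snapshot.
** Pass #4 propagates liveness through the right refs of live PHIs until a
** fixpoint. Any PHI still marked in pass #5 is unused and is eliminated.
*/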

/* -- Loop unrolling using copy-substitution ------------------------------ */

/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
			    SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint32_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->mcofs = 0;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
	osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint32_t)(nmap - J->cur.snapmap);
}
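
/* The slot substitution above is an ordered merge of two entry lists that
** are both sorted by slot number (example with made-up slots): if the loop
** snapshot covers slots {1,3,4} and the old snapshot covers {3,5}, the new
** map takes 1 and 4 from the loop map as fallbacks, the substituted 3 and
** 5 from the old map, and drops the shadowed loop entry for slot 3.
*/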

typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;
  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;
  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);
  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);
  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lj_assertJ(*psentinel == J->cur.snapmap[J->cur.snap[0].nent],
	     "mismatched PC for loop snapshot");
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */
  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];
  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;
    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */
    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
	op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
	IRIns *irr = IR(ref);
	if (ref < invar) {  /* Loop-carried dependency? */
	  /* Potential PHI? */
	  if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
	    irt_setphi(irr->t);
	    if (nphi >= LJ_MAX_PHI)
	      lj_trace_err(J, LJ_TRERR_PHIOV);
	    phi[nphi++] = (IRRef1)ref;
	  }
	  /* Check all loop-carried dependencies for type instability. */
	  if (!irt_sametype(t, irr->t)) {
	    if (irt_isinteger(t) && irt_isinteger(irr->t))
	      continue;
	    else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
	      ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
	    else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
	      ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
				    IRCONV_INT_NUM|IRCONV_CHECK));
	    else
	      lj_trace_err(J, LJ_TRERR_TYPEINS);
	    subst[ins] = (IRRef1)ref;
	    irr = IR(ref);
	    goto phiconv;
	  }
	} else if (ref != REF_DROP && ref > invar &&
		   ((irr->o == IR_CONV && irr->op1 < invar) ||
		    (irr->o == IR_ALEN && irr->op2 < invar &&
		     irr->op2 != REF_NIL))) {
	  /* May need an extra PHI for a CONV or ALEN hint. */
	  ref = irr->o == IR_CONV ? irr->op1 : irr->op2;
	  irr = IR(ref);
	phiconv:
	  if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
	    irt_setphi(irr->t);
	    if (nphi >= LJ_MAX_PHI)
	      lj_trace_err(J, LJ_TRERR_PHIOV);
	    phi[nphi++] = (IRRef1)ref;
	  }
	}
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint32_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lj_assertJ(J->cur.nsnapmap <= J->sizesnapmap, "bad snapshot map index");
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */
  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
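
/* Sketch of the substitution table at work (made-up refs): suppose the
** pre-roll contains 0009 ADD 0005 +1 and subst[0005] is 0016. Re-emitting
** the ADD with operand 0016 may yield a new ref 0017, so subst[0009]
** becomes 0017 and later copied instructions that used 0009 now use 0017.
** If the operands are unchanged, or FOLD/CSE returns 0009 itself, the
** instruction is invariant and subst[0009] stays 0009.
*/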

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint32_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}

/* Loop optimization. Returns 0 on success, or 1 if the optimization failed
** and the recorder should continue by unrolling the loop instead. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
	/* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
	if (--J->instunroll < 0)  /* But do not unroll forever. */
	  break;
	L->top--;  /* Remove error object. */
	loop_undo(J, nins, nsnap, nsnapmap);
	return 1;  /* Loop optimization failed, continue recording. */
      default:
	break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}
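
/* Call-site note (an assumption, not visible in this file): the trace
** compiler is expected to call lj_opt_loop() once recording has closed a
** looping root trace. The protected call via lj_vm_cpcall above lets a
** failed attempt unwind cleanly, loop_undo() restores the pre-optimization
** IR and snapshots, and the recorder may keep unrolling instead.
*/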

#undef IR
#undef emitir
#undef emitir_raw

#endif