Explorar o código

ARM64: Make use of tbz/tbnz and cbz/cbnz.

Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
Mike Pall hai 8 anos
pai
achega
3ad2bbf586
Modificáronse 3 ficheiros con 91 adicións e 17 borrados
  1. 66 17
      src/lj_asm_arm64.h
  2. 19 0
      src/lj_emit_arm64.h
  3. 6 0
      src/lj_target_arm64.h

+ 66 - 17
src/lj_asm_arm64.h

@@ -84,6 +84,34 @@ static void asm_guardcc(ASMState *as, A64CC cc)
   emit_cond_branch(as, cc, target);
 }
 
+/* Emit a test-bit-and-branch guard (ai is A64I_TBZ or A64I_TBNZ) that
+** branches to the exit stub of the current snapshot, depending on the
+** given bit of register r.
+*/
+static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
+{
+  MCode *exitaddr = asm_exitstub_addr(as, as->snapno);
+  MCode *mc = as->mcp;
+  if (!LJ_UNLIKELY(mc == as->invmcp)) {
+    /* Common case: branch straight to the exit stub. */
+    emit_tnb(as, ai, r, bit, exitaddr);
+  } else {
+    /* Inverted loop branch: the slot at mc becomes an unconditional branch
+    ** to the exit and the test branch in front of it gets the opposite
+    ** sense (tbz <-> tbnz differ only in bit 24). Its target is mc-1, i.e.
+    ** a zero displacement, which asm_loop_fixup fills in later.
+    */
+    as->loopinv = 1;
+    *mc = A64I_B | ((exitaddr-mc) & 0x03ffffffu);
+    emit_tnb(as, ai^0x01000000u, r, bit, mc-1);
+  }
+}
+
+/* Emit a compare-with-zero-and-branch guard (ai is A64I_CBZ or A64I_CBNZ,
+** optionally combined with A64I_X by the caller) that branches to the exit
+** stub of the current snapshot, depending on register r.
+*/
+static void asm_guardcnb(ASMState *as, A64Ins ai, Reg r)
+{
+  MCode *exitaddr = asm_exitstub_addr(as, as->snapno);
+  MCode *mc = as->mcp;
+  if (!LJ_UNLIKELY(mc == as->invmcp)) {
+    /* Common case: branch straight to the exit stub. */
+    emit_cnb(as, ai, r, exitaddr);
+  } else {
+    /* Inverted loop branch: the slot at mc becomes an unconditional branch
+    ** to the exit and the compare branch in front of it gets the opposite
+    ** sense (cbz <-> cbnz differ only in bit 24). Its target is mc-1, i.e.
+    ** a zero displacement, which asm_loop_fixup fills in later.
+    */
+    as->loopinv = 1;
+    *mc = A64I_B | ((exitaddr-mc) & 0x03ffffffu);
+    emit_cnb(as, ai^0x01000000u, r, mc-1);
+  }
+}
+
 /* -- Operand fusion ------------------------------------------------------ */
 
 /* Limit linear search to this distance. Avoids O(n^2) behavior. */
@@ -482,10 +510,9 @@ static void asm_strto(ASMState *as, IRIns *ir)
       dest = ra_dest(as, ir, RSET_FPR);
     }
   }
-  asm_guardcc(as, CC_EQ);
   if (destused)
     emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
-  emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET);
+  asm_guardcnb(as, A64I_CBZ, RID_RET);
   args[0] = ir->op1; /* GCstr *str */
   args[1] = ASMREF_TMP1; /* TValue *n  */
   asm_gencall(as, ci, args);
@@ -1465,13 +1492,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir)
     else if (cc > CC_NE) cc ^= 11;  /* LO <-> HI, LS <-> HS */
   }
   oldcc = cc;
-  if (irref_isk(rref) && IR(rref)->i == 0) {
+  if (irref_isk(rref) && get_k64val(IR(rref)) == 0) {
     IRIns *irl = IR(lref);
     if (cc == CC_GE) cc = CC_PL;
     else if (cc == CC_LT) cc = CC_MI;
-    else if (cc > CC_NE) goto notst;  /* Other conds don't work with tst. */
+    else if (cc > CC_NE) goto nocombine;  /* Other conds don't work with tst. */
     cmpprev0 = (irl+1 == ir);
-    /* Combine comp(BAND(left, right), 0) into tst left, right. */
+    /* Combine and-cmp-bcc into tbz/tbnz or and-cmp into tst. */
     if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) {
       IRRef blref = irl->op1, brref = irl->op2;
       uint32_t m2 = 0;
@@ -1480,10 +1507,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir)
 	Reg tmp = blref; blref = brref; brref = tmp;
       }
       if (irref_isk(brref)) {
-	/* NYI: use tbz/tbnz, if applicable. */
-	m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t));
-	if (!m2)
-	  goto notst;  /* Not beneficial if we miss a constant operand. */
+	uint64_t k = get_k64val(IR(brref));
+	if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) {
+	  asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ,
+		       ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k));
+	  return;
+	}
+	m2 = emit_isk13(k, irt_is64(irl->t));
       }
       bleft = ra_alloc1(as, blref, RSET_GPR);
       ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
@@ -1493,9 +1523,15 @@ static void asm_intcomp(ASMState *as, IRIns *ir)
       emit_n(as, ai^m2, bleft);
       return;
     }
-    /* NYI: use cbz/cbnz for EQ/NE 0. */
+    if (cc == CC_EQ || cc == CC_NE) {
+      /* Combine cmp-bcc into cbz/cbnz. */
+      ai = cc == CC_EQ ? A64I_CBZ : A64I_CBNZ;
+      if (irt_is64(ir->t)) ai |= A64I_X;
+      asm_guardcnb(as, ai, ra_alloc1(as, lref, RSET_GPR));
+      return;
+    }
   }
-notst:
+nocombine:
   left = ra_alloc1(as, lref, RSET_GPR);
   m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
   asm_guardcc(as, cc);
@@ -1638,8 +1674,7 @@ static void asm_gc_check(ASMState *as)
   ra_evictset(as, RSET_SCRATCH);
   l_end = emit_label(as);
   /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
-  asm_guardcc(as, CC_NE);  /* Assumes asm_snap_prep() already done. */
-  emit_n(as, A64I_CMPx^A64I_K12, RID_RET);
+  asm_guardcnb(as, A64I_CBNZ, RID_RET); /* Assumes asm_snap_prep() is done. */
   args[0] = ASMREF_TMP1;  /* global_State *g */
   args[1] = ASMREF_TMP2;  /* MSize steps     */
   asm_gencall(as, ci, args);
@@ -1666,10 +1701,10 @@ static void asm_loop_fixup(ASMState *as)
   MCode *p = as->mctop;
   MCode *target = as->mcp;
   if (as->loopinv) {  /* Inverted loop branch? */
+    uint32_t mask = (p[-2] & 0x7e000000) == 0x36000000 ? 0x3fffu : 0x7ffffu;
     ptrdiff_t delta = target - (p - 2);
-    lua_assert(((delta + 0x40000) >> 19) == 0);
-    /* asm_guardcc already inverted the b.cc and patched the final bl. */
-    p[-2] |= ((uint32_t)delta & 0x7ffff) << 5;
+    /* asm_guard* already inverted the bcc/tnb/cnb and patched the final b. */
+    p[-2] |= ((uint32_t)delta & mask) << 5;
   } else {
     ptrdiff_t delta = target - (p - 1);
     p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu);
@@ -1795,18 +1830,32 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
   MCode *mcarea = lj_mcode_patch(J, p, 0);
   MCode *px = exitstub_trace_addr(T, exitno);
   for (; p < pe; p++) {
-    /* Look for bcc/b exitstub, replace with bcc/b target. */
+    /* Look for exitstub branch, replace with branch to target. */
     uint32_t ins = *p;
     if ((ins & 0xff000000u) == 0x54000000u &&
 	((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) {
+      /* Patch bcc exitstub. */
       *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u);
       cend = p+1;
       if (!cstart) cstart = p;
     } else if ((ins & 0xfc000000u) == 0x14000000u &&
 	       ((ins ^ (px-p)) & 0x03ffffffu) == 0) {
+      /* Patch b exitstub. */
       *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu);
       cend = p+1;
       if (!cstart) cstart = p;
+    } else if ((ins & 0x7e000000u) == 0x34000000u &&
+	       ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) {
+      /* Patch cbz/cbnz exitstub. */
+      *p = (ins & 0xff00001f) | (((target-p)<<5) & 0x00ffffe0u);
+      cend = p+1;
+      if (!cstart) cstart = p;
+    } else if ((ins & 0x7e000000u) == 0x36000000u &&
+	       ((ins ^ ((px-p)<<5)) & 0x0007ffe0u) == 0) {
+      /* Patch tbz/tbnz exitstub. */
+      *p = (ins & 0xfff8001fu) | (((target-p)<<5) & 0x0007ffe0u);
+      cend = p+1;
+      if (!cstart) cstart = p;
     }
   }
   lua_assert(cstart != NULL);

+ 19 - 0
src/lj_emit_arm64.h

@@ -321,6 +321,25 @@ static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
   as->mcp = p;
 }
 
+/* Emit a test-bit-and-branch instruction (ai is A64I_TBZ or A64I_TBNZ).
+** r is the register to test, bit is the bit number (0..63) and target is
+** the branch destination. The word displacement from the new instruction
+** to target must fit into the signed 14 bit field. The instruction is
+** stored in the slot just below as->mcp, which is then moved down.
+*/
+static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - (p - 1);  /* Relative to the new instruction. */
+  /* tbz/tbnz can test any of the bits 0..63, so bit 63 must be accepted. */
+  lua_assert(bit < 64);
+  lua_assert(((delta + 0x2000) >> 14) == 0);
+  /* Bits 32..63: the low 5 bits go into the bit field below, the top bit is
+  ** carried by A64I_X (assumed to be the 64 bit operand flag -- confirm
+  ** against its definition).
+  */
+  if (bit > 31) ai |= A64I_X;
+  *--p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r;
+  as->mcp = p;
+}
+
+/* Emit a compare-with-zero-and-branch instruction (ai selects cbz/cbnz)
+** on register r. The word displacement from the new instruction to target
+** must fit into the signed 19 bit field. The instruction is stored in the
+** slot just below as->mcp, which is then moved down.
+*/
+static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target)
+{
+  MCode *slot = as->mcp - 1;  /* Machine code is emitted downwards. */
+  ptrdiff_t disp = target - slot;
+  lua_assert(((disp + 0x40000) >> 19) == 0);
+  *slot = ai | A64F_S19((uint32_t)disp & 0x7ffff) | r;
+  as->mcp = slot;
+}
+
 #define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))
 
 static void emit_call(ASMState *as, void *target)

+ 6 - 0
src/lj_target_arm64.h

@@ -127,7 +127,9 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno)
 #define A64F_U12(x)	((x) << 10)
 #define A64F_S26(x)	(x)
 #define A64F_S19(x)	((x) << 5)
+#define A64F_S14(x)	((x) << 5)
 #define A64F_S9(x)	((x) << 12)
+#define A64F_BIT(x)	((x) << 19)
 #define A64F_SH(sh, x)	(((sh) << 22) | ((x) << 10))
 #define A64F_EX(ex)	(A64I_EX | ((ex) << 13))
 #define A64F_EXSH(ex,x)	(A64I_EX | ((ex) << 13) | ((x) << 10))
@@ -235,6 +237,10 @@ typedef enum A64Ins {
   A64I_BL = 0x94000000,
   A64I_BR = 0xd61f0000,
   A64I_BLR = 0xd63f0000,
+  A64I_TBZ = 0x36000000,
+  A64I_TBNZ = 0x37000000,
+  A64I_CBZ = 0x34000000,
+  A64I_CBNZ = 0x35000000,
 
   A64I_NOP = 0xd503201f,