Pārlūkot izejas kodu

x86: Generate BMI2 shifts and rotates, if available.

Contributed by Peter Cawley.
Mike Pall 9 gadi atpakaļ
vecāks
revīzija
892887e584
5 mainīti faili ar 51 papildinājumiem un 7 dzēšanām
  1. 3 0
      src/jit/dis_x86.lua
  2. 4 1
      src/lj_asm.c
  3. 22 6
      src/lj_asm_x86.h
  4. 11 0
      src/lj_emit_x86.h
  5. 11 0
      src/lj_target_x86.h

+ 3 - 0
src/jit/dis_x86.lua

@@ -244,6 +244,7 @@ nil,"||psrlvVSXrvm","||psravdXrvm","||psllvVSXrvm",
 [0xde] = "||aesdecXrvm", [0xdf] = "||aesdeclastXrvm",
 --Fx
 [0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt",
+[0xf7] = "|sarxVrmv|shlxVrmv|shrxVrmv",
 },
 
 ["3a"] = { -- [66] 0f 3a xx
@@ -273,6 +274,8 @@ nil,nil,nil,nil,
 [0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu",
 [0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu",
 [0xdf] = "||aeskeygenassistXrmu",
+--Fx
+[0xf0] = "|||rorxVrmu",
 },
 }
 

+ 4 - 1
src/lj_asm.c

@@ -2150,7 +2150,10 @@ static void asm_setup_regsp(ASMState *as)
 #endif
 #if LJ_TARGET_X86ORX64
     /* Non-constant shift counts need to be in RID_ECX on x86/x64. */
-    case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
+    case IR_BSHL: case IR_BSHR: case IR_BSAR:
+      if ((as->flags & JIT_F_BMI2))  /* Except if BMI2 is available. */
+	break;
+    case IR_BROL: case IR_BROR:
       if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r)) {
 	IR(ir->op2)->r = REGSP_HINT(RID_ECX);
 	if (inloop)

+ 22 - 6
src/lj_asm_x86.h

@@ -1956,7 +1956,7 @@ static void asm_bswap(ASMState *as, IRIns *ir)
 #define asm_bor(as, ir)		asm_intarith(as, ir, XOg_OR)
 #define asm_bxor(as, ir)	asm_intarith(as, ir, XOg_XOR)
 
-static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
+static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv)
 {
   IRRef rref = ir->op2;
   IRIns *irr = IR(rref);
@@ -1965,11 +1965,27 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
     int shift;
     dest = ra_dest(as, ir, RSET_GPR);
     shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
+    if (!xv && shift && (as->flags & JIT_F_BMI2)) {
+      Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t));
+      if (left != dest) {  /* BMI2 rotate right by constant. */
+	emit_i8(as, xs == XOg_ROL ? -shift : shift);
+	emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left);
+	return;
+      }
+    }
     switch (shift) {
     case 0: break;
     case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
     default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
     }
+  } else if ((as->flags & JIT_F_BMI2) && xv) {	/* BMI2 variable shifts. */
+    Reg left, right;
+    dest = ra_dest(as, ir, RSET_GPR);
+    right = ra_alloc1(as, rref, RSET_GPR);
+    left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right),
+			 irt_is64(ir->t));
+    emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left);
+    return;
   } else {  /* Variable shifts implicitly use register cl (i.e. ecx). */
     Reg right;
     dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX));
@@ -1995,11 +2011,11 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
   */
 }
 
-#define asm_bshl(as, ir)	asm_bitshift(as, ir, XOg_SHL)
-#define asm_bshr(as, ir)	asm_bitshift(as, ir, XOg_SHR)
-#define asm_bsar(as, ir)	asm_bitshift(as, ir, XOg_SAR)
-#define asm_brol(as, ir)	asm_bitshift(as, ir, XOg_ROL)
-#define asm_bror(as, ir)	asm_bitshift(as, ir, XOg_ROR)
+#define asm_bshl(as, ir)	asm_bitshift(as, ir, XOg_SHL, XV_SHLX)
+#define asm_bshr(as, ir)	asm_bitshift(as, ir, XOg_SHR, XV_SHRX)
+#define asm_bsar(as, ir)	asm_bitshift(as, ir, XOg_SAR, XV_SARX)
+#define asm_brol(as, ir)	asm_bitshift(as, ir, XOg_ROL, 0)
+#define asm_bror(as, ir)	asm_bitshift(as, ir, XOg_ROR, 0)
 
 /* -- Comparisons --------------------------------------------------------- */
 

+ 11 - 0
src/lj_emit_x86.h

@@ -13,10 +13,12 @@
       if (rex != 0x40) *--(p) = rex; }
 #define FORCE_REX		0x200
 #define REX_64			(FORCE_REX|0x080000)
+#define VEX_64			0x800000
 #else
 #define REXRB(p, rr, rb)	((void)0)
 #define FORCE_REX		0
 #define REX_64			0
+#define VEX_64			0
 #endif
 
 #define emit_i8(as, i)		(*--as->mcp = (MCode)(i))
@@ -31,6 +33,13 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
 				 MCode *p, int delta)
 {
   int n = (int8_t)xo;
+  if (n == -60) {  /* VEX-encoded instruction */
+#if LJ_64
+    xo ^= (((rr>>1)&4)+((rx>>2)&2)+((rb>>3)&1))<<13;
+#endif
+    *(uint32_t *)(p+delta-5) = (uint32_t)xo;
+    return p+delta-5;
+  }
 #if defined(__GNUC__)
   if (__builtin_constant_p(xo) && n == -2)
     p[delta-2] = (MCode)(xo >> 24);
@@ -412,8 +421,10 @@ static void emit_call_(ASMState *as, MCode *target)
 /* Use 64 bit operations to handle 64 bit IR types. */
 #if LJ_64
 #define REX_64IR(ir, r)		((r) + (irt_is64((ir)->t) ? REX_64 : 0))
+#define VEX_64IR(ir, r)		((r) + (irt_is64((ir)->t) ? VEX_64 : 0))
 #else
 #define REX_64IR(ir, r)		(r)
+#define VEX_64IR(ir, r)		(r)
 #endif
 
 /* Generic move between two regs. */

+ 11 - 0
src/lj_target_x86.h

@@ -189,6 +189,11 @@ typedef struct {
 #define XO_f20f(o)	((uint32_t)(0x0ff2fc + (0x##o<<24)))
 #define XO_f30f(o)	((uint32_t)(0x0ff3fc + (0x##o<<24)))
 
+#define XV_660f38(o)	((uint32_t)(0x79e2c4 + (0x##o<<24)))
+#define XV_f20f38(o)	((uint32_t)(0x7be2c4 + (0x##o<<24)))
+#define XV_f20f3a(o)	((uint32_t)(0x7be3c4 + (0x##o<<24)))
+#define XV_f30f38(o)	((uint32_t)(0x7ae2c4 + (0x##o<<24)))
+
 /* This list of x86 opcodes is not intended to be complete. Opcodes are only
 ** included when needed. Take a look at DynASM or jit.dis_x86 to see the
 ** whole mess.
@@ -231,6 +236,12 @@ typedef enum {
   XI_FSCALE =	0xfdd9,
   XI_FYL2X =	0xf1d9,
 
+  /* VEX-encoded instructions. XV_* prefix. */
+  XV_RORX =	XV_f20f3a(f0),
+  XV_SARX =	XV_f30f38(f7),
+  XV_SHLX =	XV_660f38(f7),
+  XV_SHRX =	XV_f20f38(f7),
+
   /* Variable-length opcodes. XO_* prefix. */
   XO_MOV =	XO_(8b),
   XO_MOVto =	XO_(89),