
x64: Various fixes for CET IBT.

Also add ELF notes. #1391
Mike Pall, 1 month ago
commit e34a78acf6
6 changed files with 101 additions and 38 deletions:

  1. src/Makefile        +7 -3
  2. src/lj_arch.h       +15 -3
  3. src/lj_asm.c        +2 -2
  4. src/lj_ccallback.c  +18 -6
  5. src/lj_emit_x86.h   +2 -2
  6. src/vm_x64.dasc     +57 -22

src/Makefile  +7 -3

@@ -446,9 +446,13 @@ ifneq (,$(findstring LJ_ABI_PAUTH 1,$(TARGET_TESTARCH)))
   DASM_AFLAGS+= -D PAUTH
   TARGET_ARCH+= -DLJ_ABI_PAUTH=1
 endif
-ifneq (,$(findstring LJ_CET_BR 1,$(TARGET_TESTARCH)))
-  DASM_AFLAGS+= -D CET_BR
-  TARGET_ARCH+= -DLJ_CET_BR=1
+ifneq (,$(findstring LJ_ABI_BRANCH_TRACK 1,$(TARGET_TESTARCH)))
+  DASM_AFLAGS+= -D BRANCH_TRACK
+  TARGET_ARCH+= -DLJ_ABI_BRANCH_TRACK=1
+endif
+ifneq (,$(findstring LJ_ABI_SHADOW_STACK 1,$(TARGET_TESTARCH)))
+  DASM_AFLAGS+= -D SHADOW_STACK
+  TARGET_ARCH+= -DLJ_ABI_SHADOW_STACK=1
 endif
 DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subst LJ_ARCH_VERSION ,LJ_ARCH_VERSION_,$(TARGET_TESTARCH))))
 ifeq (Windows,$(TARGET_SYS))

src/lj_arch.h  +15 -3

@@ -219,15 +219,27 @@
 #error "macOS requires GC64 -- don't disable it"
 #endif
 
-#if (__CET__ & 1) && defined(LUAJIT_ENABLE_CET_BR)
+#if !defined(LJ_ABI_BRANCH_TRACK) && (__CET__ & 1) && \
+    LJ_TARGET_GC64 && defined(LUAJIT_ENABLE_CET_BR)
 /*
 ** Control-Flow Enforcement Technology (CET) indirect branch tracking (IBT).
 ** This is not enabled by default because it causes a notable slowdown of
 ** the interpreter on all x64 CPUs, whether they have CET enabled or not.
 ** If your toolchain enables -fcf-protection=branch by default, you need
-** to build with: make XCFLAGS=-DLUAJIT_ENABLE_CET_BR
+** to build with: make amalg XCFLAGS=-DLUAJIT_ENABLE_CET_BR
 */
-#define LJ_CET_BR		1
+#define LJ_ABI_BRANCH_TRACK	1
+#endif
+
+#if !defined(LJ_ABI_SHADOW_STACK) && (__CET__ & 2)
+/*
+** Control-Flow Enforcement Technology (CET) shadow stack (CET-SS).
+** It has no code overhead and doesn't cause any slowdowns when unused.
+** It can also be unconditionally enabled since all code already follows
+** a strict CALL to RET correspondence for performance reasons (all modern
+** CPUs use a (non-enforcing) shadow stack for return branch prediction).
+*/
+#define LJ_ABI_SHADOW_STACK	1
 #endif
 
 #elif LUAJIT_TARGET == LUAJIT_ARCH_ARM
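
For context (an annotation, not part of the commit): GCC and Clang predefine
__CET__ when -fcf-protection is active, with bit 0 set for branch tracking
(-fcf-protection=branch) and bit 1 for shadow stack (-fcf-protection=return),
which is what the two new guards test. A minimal C sketch:

    /* Sketch: how the __CET__ bits map to -fcf-protection modes.
    ** Assumes GCC/Clang semantics: __CET__ is 1 for =branch, 2 for =return,
    ** 3 for =full. */
    #include <stdio.h>

    int main(void)
    {
    #ifdef __CET__
      printf("branch tracking (IBT): %s\n", (__CET__ & 1) ? "on" : "off");
      printf("shadow stack (SHSTK):  %s\n", (__CET__ & 2) ? "on" : "off");
    #else
      printf("-fcf-protection not enabled\n");
    #endif
      return 0;
    }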

src/lj_asm.c  +2 -2

@@ -2586,8 +2586,8 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
       asm_head_side(as);
     else
       asm_head_root(as);
-#if LJ_CET_BR
-    emit_endbr(as);
+#if LJ_ABI_BRANCH_TRACK
+    emit_branch_track(as);
 #endif
     asm_phi_fixup(as);
 

src/lj_ccallback.c  +18 -6

@@ -34,22 +34,29 @@
 
 #elif LJ_TARGET_X86ORX64
 
+#if LJ_ABI_BRANCH_TRACK
+#define CALLBACK_MCODE_SLOTSZ	8
+#else
+#define CALLBACK_MCODE_SLOTSZ	4
+#endif
+#define CALLBACK_MCODE_NSLOT	(128 / CALLBACK_MCODE_SLOTSZ)
+
 #define CALLBACK_MCODE_HEAD	(LJ_64 ? 8 : 0)
 #define CALLBACK_MCODE_GROUP	(-2+1+2+(LJ_GC64 ? 10 : 5)+(LJ_64 ? 6 : 5))
 
 #define CALLBACK_SLOT2OFS(slot) \
-  (CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/32) + 4*(slot))
+  (CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/CALLBACK_MCODE_NSLOT) + CALLBACK_MCODE_SLOTSZ*(slot))
 
 static MSize CALLBACK_OFS2SLOT(MSize ofs)
 {
   MSize group;
   ofs -= CALLBACK_MCODE_HEAD;
-  group = ofs / (32*4 + CALLBACK_MCODE_GROUP);
-  return (ofs % (32*4 + CALLBACK_MCODE_GROUP))/4 + group*32;
+  group = ofs / (128 + CALLBACK_MCODE_GROUP);
+  return (ofs % (128 + CALLBACK_MCODE_GROUP))/CALLBACK_MCODE_SLOTSZ + group*CALLBACK_MCODE_NSLOT;
 }
 
 #define CALLBACK_MAX_SLOT \
-  (((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+4*32))*32)
+  (((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+128))*CALLBACK_MCODE_NSLOT)
 
 #elif LJ_TARGET_ARM
 
@@ -118,9 +125,13 @@ static void *callback_mcode_init(global_State *g, uint8_t *page)
   *(void **)p = target; p += 8;
 #endif
   for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {
+#if LJ_ABI_BRANCH_TRACK
+    *(uint32_t *)p = XI_ENDBR64; p += 4;
+#endif
     /* mov al, slot; jmp group */
     *p++ = XI_MOVrib | RID_EAX; *p++ = (uint8_t)slot;
-    if ((slot & 31) == 31 || slot == CALLBACK_MAX_SLOT-1) {
+    if ((slot & (CALLBACK_MCODE_NSLOT-1)) == (CALLBACK_MCODE_NSLOT-1) ||
+	slot == CALLBACK_MAX_SLOT-1) {
       /* push ebp/rbp; mov ah, slot>>8; mov ebp, &g. */
       *p++ = XI_PUSH + RID_EBP;
       *p++ = XI_MOVrib | (RID_EAX+4); *p++ = (uint8_t)(slot >> 8);
@@ -140,7 +151,8 @@ static void *callback_mcode_init(global_State *g, uint8_t *page)
       *p++ = XI_JMP; *(int32_t *)p = target-(p+4); p += 4;
 #endif
     } else {
-      *p++ = XI_JMPs; *p++ = (uint8_t)((2+2)*(31-(slot&31)) - 2);
+      *p++ = XI_JMPs;
+      *p++ = (uint8_t)(CALLBACK_MCODE_SLOTSZ*(CALLBACK_MCODE_NSLOT-1-(slot&(CALLBACK_MCODE_NSLOT-1))) - 2);
     }
   }
   return p;
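
To make the new constants concrete (a worked sketch derived from the
definitions above, not part of the commit): without IBT a callback slot is
4 bytes (mov al,slot is 2 bytes, jmp rel8 is 2 bytes); with IBT the leading
endbr64 adds 4 bytes, giving 8-byte slots. A group still spans 128 bytes of
slots, so CALLBACK_MCODE_NSLOT drops from 32 to 16:

    /* Worked sketch of CALLBACK_SLOT2OFS for both slot sizes.
    ** x64/GC64 values assumed: HEAD = 8, GROUP = -2+1+2+10+6 = 17. */
    #include <stdio.h>

    static unsigned slot2ofs(unsigned slot, unsigned slotsz)
    {
      unsigned nslot = 128/slotsz;  /* 32 without IBT, 16 with IBT. */
      return 8 + 17*(slot/nslot) + slotsz*slot;
    }

    int main(void)
    {
      printf("slot 0:  %u (no IBT) vs %u (IBT)\n", slot2ofs(0, 4), slot2ofs(0, 8));
      printf("slot 33: %u (no IBT) vs %u (IBT)\n", slot2ofs(33, 4), slot2ofs(33, 8));
      return 0;
    }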

src/lj_emit_x86.h  +2 -2

@@ -70,8 +70,8 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
   return p;
 }
 
-#if LJ_CET_BR
-static void emit_endbr(ASMState *as)
+#if LJ_ABI_BRANCH_TRACK
+static void emit_branch_track(ASMState *as)
 {
   emit_u32(as, XI_ENDBR64);
 }
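
Two annotations on this helper (not from the patch): endbr64 encodes as the
four bytes F3 0F 1E FA and executes as a NOP on CPUs without CET, so emitting
it unconditionally in IBT builds is safe. And because LuaJIT's trace assembler
emits machine code backwards, calling emit_branch_track() after asm_head_side()
or asm_head_root() in lj_asm.c above places the endbr64 at the very start of
the finished trace, where indirect branches from the dispatch table land.
Assuming emit_u32 stores a little-endian 32-bit word, the opcode constant
would be:

    /* Sketch (assumption: mirrors the XI_ENDBR64 definition in
    ** lj_target_x86.h): bytes F3 0F 1E FA as one little-endian word. */
    #define XI_ENDBR64	0xfa1e0ff3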

src/vm_x64.dasc  +57 -22

@@ -191,7 +191,7 @@
 |
 |//-- Control-Flow Enforcement Technology (CET) -------------------------
 |
-|.if CET_BR
+|.if BRANCH_TRACK
 |.macro endbr; endbr64; .endmacro
 |.else
 |.macro endbr; .endmacro
@@ -200,13 +200,13 @@
 |//-----------------------------------------------------------------------
 |
 |// Instruction headers.
-|.macro ins_A; endbr; .endmacro
-|.macro ins_AD; endbr; .endmacro
-|.macro ins_AJ; endbr; .endmacro
-|.macro ins_ABC; endbr; movzx RBd, RCH; movzx RCd, RCL; .endmacro
-|.macro ins_AB_; endbr; movzx RBd, RCH; .endmacro
-|.macro ins_A_C; endbr; movzx RCd, RCL; .endmacro
-|.macro ins_AND; endbr; not RD; .endmacro
+|.macro ins_A; .endmacro
+|.macro ins_AD; .endmacro
+|.macro ins_AJ; .endmacro
+|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro
+|.macro ins_AB_; movzx RBd, RCH; .endmacro
+|.macro ins_A_C; movzx RCd, RCL; .endmacro
+|.macro ins_AND; not RD; .endmacro
 |
 |// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
 |.macro ins_NEXT
@@ -487,13 +487,12 @@ static void build_subroutines(BuildCtx *ctx)
   |  jmp <3
   |
   |->vm_unwind_yield:
-  |  endbr
   |  mov al, LUA_YIELD
   |  jmp ->vm_unwind_c_eh
   |
   |->vm_unwind_c:			// Unwind C stack, return from vm_pcall.
-  |  endbr
   |  // (void *cframe, int errcode)
+  |  endbr
   |  mov eax, CARG2d			// Error return status for vm_pcall.
   |  mov rsp, CARG1
   |->vm_unwind_c_eh:			// Landing pad for external unwinder.
@@ -513,8 +512,8 @@ static void build_subroutines(BuildCtx *ctx)
   |.endif
   |
   |->vm_unwind_ff:			// Unwind C stack, return from ff pcall.
-  |  endbr
   |  // (void *cframe)
+  |  endbr
   |  and CARG1, CFRAME_RAWMASK
   |  mov rsp, CARG1
   |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
@@ -689,7 +688,6 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Continuation dispatch ----------------------------------------------
   |
   |->cont_dispatch:
-  |  endbr
   |  // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
   |  add RA, BASE
   |  and PC, -8
@@ -1152,7 +1150,7 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |.macro .ffunc, name
   |->ff_ .. name:
-  | endbr
+  |  endbr
   |.endmacro
   |
   |.macro .ffunc_1, name
@@ -2338,8 +2336,8 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |->cont_stitch:			// Trace stitching.
   |.if JIT
-  |  endbr
   |  // BASE = base, RC = result, RB = mbase
+  |  endbr
   |  mov TRACE:ITYPE, [RB-40]		// Save previous trace.
   |  cleartp TRACE:ITYPE
   |  mov TMPRd, MULTRES
@@ -2460,8 +2458,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  jmp >1
   |.endif
   |->vm_exit_interp:
-  |  endbr
   |  // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
+  |  endbr
   |.if JIT
   |  // Restore additional callee-save registers only used in compiled code.
   |.if X64WIN
@@ -2849,6 +2847,26 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   |=>defop:
 
   switch (op) {
+#if !LJ_HASJIT
+  case BC_FORL:
+  case BC_JFORI:
+  case BC_JFORL:
+  case BC_ITERL:
+  case BC_JITERL:
+  case BC_LOOP:
+  case BC_JLOOP:
+  case BC_FUNCF:
+  case BC_JFUNCF:
+  case BC_JFUNCV:
+#endif
+  case BC_FUNCV:  /* NYI: compiled vararg functions. */
+    break;  /* Avoid redundant endbr instructions. */
+  default:
+    |  endbr
+    break;
+  }
+
+  switch (op) {
 
   /* -- Comparison ops ---------------------------------------------------- */
 
@@ -4119,7 +4137,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_ITERN:
     |.if JIT
-    |  endbr
     |  hotloop RBd
     |.endif
     |->vm_IITERN:
@@ -4299,7 +4316,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  jnz >7				// Not returning to a fixarg Lua func?
     switch (op) {
     case BC_RET:
-      |  endbr
       |->BC_RET_Z:
       |  mov KBASE, BASE		// Use KBASE for result move.
       |  sub RDd, 1
@@ -4318,12 +4334,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  ja >6
       break;
     case BC_RET1:
-      |  endbr
       |  mov RB, [BASE+RA]
       |  mov [BASE-16], RB
       /* fallthrough */
     case BC_RET0:
-      |  endbr
       |5:
       |  cmp PC_RB, RDL			// More results expected?
       |  ja >6
@@ -4370,7 +4384,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_FORL:
     |.if JIT
-    |  endbr
     |  hotloop RBd
     |.endif
     | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
@@ -4522,7 +4535,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_ITERL:
     |.if JIT
-    |  endbr
     |  hotloop RBd
     |.endif
     | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
@@ -4616,7 +4628,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_FUNCF:
     |.if JIT
-    |  endbr
     |  hotcall RBd
     |.endif
   case BC_FUNCV:  /* NYI: compiled vararg functions. */
@@ -4886,6 +4897,30 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.align 8\n"
 	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
 #endif
+#endif
+#if LJ_TARGET_LINUX && (LJ_ABI_BRANCH_TRACK || LJ_ABI_SHADOW_STACK)
+    fprintf(ctx->fp,
+	"\t.section .note.gnu.property,\"a\"\n"
+	"\t.align 8\n"
+	"\t.long 4\n"
+	"\t.long 16\n"
+	"\t.long 5\n"
+	"\t.long 0x00554e47\n"
+	"\t.long 0xc0000002\n"
+	"\t.long 4\n"
+	"\t.long %d\n"
+	"\t.long 0\n",
+#if LJ_ABI_BRANCH_TRACK
+	1|
+#else
+	0|
+#endif
+#if LJ_ABI_SHADOW_STACK
+	2
+#else
+	0
+#endif
+	);
 #endif
     break;
 #if !LJ_NO_UNWIND
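
To decode the raw .long directives in the new .note.gnu.property section
(a sketch based on the ELF gABI and the x86-64 psABI, not part of the
commit): the note declares which CET features this object supports. The
static linker ANDs these bits across all input objects, so without the note
the assembled VM would strip IBT/SHSTK from any binary it is linked into,
defeating the point of the new defines. Laid out as a C struct:

    /* Sketch: the GNU property note emitted above (field names are
    ** illustrative, values match the .long directives). */
    #include <stdint.h>

    typedef struct {
      uint32_t namesz;    /* 4: strlen("GNU") + NUL */
      uint32_t descsz;    /* 16: one property, padded to 8-byte alignment */
      uint32_t type;      /* 5: NT_GNU_PROPERTY_TYPE_0 */
      char     name[4];   /* "GNU\0", i.e. 0x00554e47 as a little-endian word */
      uint32_t pr_type;   /* 0xc0000002: GNU_PROPERTY_X86_FEATURE_1_AND */
      uint32_t pr_datasz; /* 4: one 32-bit feature word */
      uint32_t pr_bits;   /* bit 0 (1) = IBT, bit 1 (2) = SHSTK */
      uint32_t pr_pad;    /* 0: pads descsz up to 8 */
    } GNUPropertyNote;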