
ARM64: Add big-endian support.

Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
Sponsored by Cisco Systems, Inc.
Mike Pall committed 8 years ago (commit 3143b21894)
14 changed files with 149 additions and 63 deletions
  1. Makefile (+2 -2)
  2. src/Makefile (+3 -0)
  3. src/host/buildvm_asm.c (+6 -2)
  4. src/jit/bcsave.lua (+4 -4)
  5. src/jit/dis_arm64be.lua (+12 -0)
  6. src/lj_arch.h (+6 -4)
  7. src/lj_asm.c (+3 -0)
  8. src/lj_asm_arm64.h (+28 -14)
  9. src/lj_ccall.c (+17 -3)
  10. src/lj_ccall.h (+2 -2)
  11. src/lj_ccallback.c (+11 -7)
  12. src/lj_emit_arm64.h (+1 -1)
  13. src/lj_target_arm64.h (+8 -1)
  14. src/vm_arm64.dasc (+46 -23)

+ 2 - 2
Makefile

@@ -87,8 +87,8 @@ FILE_PC= luajit.pc
 FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h
 FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \
 	      dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \
-	      dis_ppc.lua dis_mips.lua dis_mipsel.lua dis_mips64.lua \
-	      dis_mips64el.lua vmdef.lua
+	      dis_arm64be.lua dis_ppc.lua dis_mips.lua dis_mipsel.lua \
+	      dis_mips64.lua dis_mips64el.lua vmdef.lua
 
 ifeq (,$(findstring Windows,$(OS)))
   HOST_SYS:= $(shell uname -s)

+ 3 - 0
src/Makefile

@@ -242,6 +242,9 @@ ifneq (,$(findstring LJ_TARGET_ARM ,$(TARGET_TESTARCH)))
   TARGET_LJARCH= arm
 else
 ifneq (,$(findstring LJ_TARGET_ARM64 ,$(TARGET_TESTARCH)))
+  ifneq (,$(findstring __AARCH64EB__ ,$(TARGET_TESTARCH)))
+    TARGET_ARCH= -D__AARCH64EB__=1
+  endif
   TARGET_LJARCH= arm64
 else
 ifneq (,$(findstring LJ_TARGET_PPC ,$(TARGET_TESTARCH)))
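
The added test keys on the compiler's predefined macros carried in TARGET_TESTARCH: GCC and Clang define __AARCH64EB__ when targeting big-endian AArch64 and __AARCH64EL__ for little-endian. A standalone probe along the same lines (illustrative only, not part of the build):

#include <stdio.h>

int main(void)
{
#if defined(__AARCH64EB__)
  puts("aarch64, big-endian");        /* e.g. cross-compiled with -mbig-endian */
#elif defined(__AARCH64EL__)
  puts("aarch64, little-endian");
#else
  puts("not aarch64 (or unknown byte order)");
#endif
  return 0;
}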

+ 6 - 2
src/host/buildvm_asm.c

@@ -93,10 +93,14 @@ static void emit_asm_words(BuildCtx *ctx, uint8_t *p, int n)
 {
   int i;
   for (i = 0; i < n; i += 4) {
+    uint32_t ins = *(uint32_t *)(p+i);
+#if LJ_TARGET_ARM64 && LJ_BE
+    ins = lj_bswap(ins);  /* ARM64 instructions are always little-endian. */
+#endif
     if ((i & 15) == 0)
-      fprintf(ctx->fp, "\t.long 0x%08x", *(uint32_t *)(p+i));
+      fprintf(ctx->fp, "\t.long 0x%08x", ins);
     else
-      fprintf(ctx->fp, ",0x%08x", *(uint32_t *)(p+i));
+      fprintf(ctx->fp, ",0x%08x", ins);
     if ((i & 15) == 12) putc('\n', ctx->fp);
   }
   if ((n & 15) != 0) putc('\n', ctx->fp);
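
For reference, lj_bswap() from lj_def.h usually maps to a compiler byte-swap builtin; a portable sketch of the 32-bit swap used here (not LuaJIT's actual definition) is:

#include <stdint.h>
#include <stdio.h>

static uint32_t bswap32(uint32_t x)
{
  return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
         ((x << 8) & 0x00ff0000u) | (x << 24);
}

int main(void)
{
  printf("0x%08x\n", bswap32(0x12345678u));  /* 0x78563412 */
  return 0;
}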

+ 4 - 4
src/jit/bcsave.lua

@@ -63,8 +63,8 @@ local map_type = {
 }
 
 local map_arch = {
-  x86 = true, x64 = true, arm = true, arm64 = true, ppc = true,
-  mips = true, mipsel = true,
+  x86 = true, x64 = true, arm = true, arm64 = true, arm64be = true,
+  ppc = true, mips = true, mipsel = true,
 }
 
 local map_os = {
@@ -200,7 +200,7 @@ typedef struct {
 ]]
   local symname = LJBC_PREFIX..ctx.modname
   local is64, isbe = false, false
-  if ctx.arch == "x64" or ctx.arch == "arm64" then
+  if ctx.arch == "x64" or ctx.arch == "arm64" or ctx.arch == "arm64be" then
     is64 = true
   elseif ctx.arch == "ppc" or ctx.arch == "mips" then
     isbe = true
@@ -237,7 +237,7 @@ typedef struct {
   hdr.eendian = isbe and 2 or 1
   hdr.eversion = 1
   hdr.type = f16(1)
-  hdr.machine = f16(({ x86=3, x64=62, arm=40, arm64=183, ppc=20, mips=8, mipsel=8 })[ctx.arch])
+  hdr.machine = f16(({ x86=3, x64=62, arm=40, arm64=183, arm64be=183, ppc=20, mips=8, mipsel=8 })[ctx.arch])
   if ctx.arch == "mips" or ctx.arch == "mipsel" then
     hdr.flags = f32(0x50001006)
   end
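
For context, the ELF constants involved: EM_AARCH64 is 183 for both byte orders, so arm64 and arm64be share the same e_machine value, and what marks an object file as big-endian is the EI_DATA byte of e_ident (1 = ELFDATA2LSB, 2 = ELFDATA2MSB). A small reference sketch of those standard values (not code taken from bcsave.lua):

#include <stdio.h>

/* Standard ELF identification values relevant to the arm64/arm64be split. */
enum { ELFCLASS64 = 2, ELFDATA2LSB = 1, ELFDATA2MSB = 2, EM_AARCH64 = 183 };

int main(void)
{
  printf("arm64:   class=%d data=%d machine=%d\n",
         ELFCLASS64, ELFDATA2LSB, EM_AARCH64);
  printf("arm64be: class=%d data=%d machine=%d\n",
         ELFCLASS64, ELFDATA2MSB, EM_AARCH64);
  return 0;
}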

+ 12 - 0
src/jit/dis_arm64be.lua

@@ -0,0 +1,12 @@
+----------------------------------------------------------------------------
+-- LuaJIT ARM64BE disassembler wrapper module.
+--
+-- Copyright (C) 2005-2017 Mike Pall. All rights reserved.
+-- Released under the MIT license. See Copyright Notice in luajit.h
+----------------------------------------------------------------------------
+-- ARM64 instructions are always little-endian. So just forward to the
+-- common ARM64 disassembler module. All the interesting stuff is there.
+------------------------------------------------------------------------------
+
+return require((string.match(..., ".*%.") or "").."dis_arm64")
+

+ 6 - 4
src/lj_arch.h

@@ -215,9 +215,14 @@
 
 #elif LUAJIT_TARGET == LUAJIT_ARCH_ARM64
 
-#define LJ_ARCH_NAME		"arm64"
 #define LJ_ARCH_BITS		64
+#if defined(__AARCH64EB__)
+#define LJ_ARCH_NAME		"arm64be"
+#define LJ_ARCH_ENDIAN		LUAJIT_BE
+#else
+#define LJ_ARCH_NAME		"arm64"
 #define LJ_ARCH_ENDIAN		LUAJIT_LE
+#endif
 #define LJ_TARGET_ARM64		1
 #define LJ_TARGET_EHRETREG	0
 #define LJ_TARGET_JUMPRANGE	27	/* +-2^27 = +-128MB */
@@ -409,9 +414,6 @@
 #error "Only ARM EABI or iOS 3.0+ ABI is supported"
 #endif
 #elif LJ_TARGET_ARM64
-#if defined(__AARCH64EB__)
-#error "No support for big-endian ARM64"
-#endif
 #if defined(_ILP32)
 #error "No support for ILP32 model on ARM64"
 #endif
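
LJ_ARCH_ENDIAN is the switch the rest of the tree keys off: lj_arch.h derives LJ_LE/LJ_BE and the LJ_ENDIAN_SELECT/LJ_ENDIAN_LOHI helpers from it. A paraphrased sketch of that derivation (not a verbatim quote of the header):

/* Sketch only: how LJ_ARCH_ENDIAN fans out into the endianness helpers. */
#if LJ_ARCH_ENDIAN == LUAJIT_BE
#define LJ_LE  0
#define LJ_BE  1
#define LJ_ENDIAN_SELECT(le, be)  be
#define LJ_ENDIAN_LOHI(lo, hi)    hi lo
#else
#define LJ_LE  1
#define LJ_BE  0
#define LJ_ENDIAN_SELECT(le, be)  le
#define LJ_ENDIAN_LOHI(lo, hi)    lo hi
#endif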

+ 3 - 0
src/lj_asm.c

@@ -2393,6 +2393,9 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
   if (!as->loopref)
     asm_tail_fixup(as, T->link);  /* Note: this may change as->mctop! */
   T->szmcode = (MSize)((char *)as->mctop - (char *)as->mcp);
+#if LJ_TARGET_MCODE_FIXUP
+  asm_mcode_fixup(T->mcode, T->szmcode);
+#endif
   lj_mcode_sync(T->mcode, origtop);
 }
 

+ 28 - 14
src/lj_asm_arm64.h

@@ -56,11 +56,11 @@ static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
     asm_mclimit(as);
   /* 1: str lr,[sp]; bl ->vm_exit_handler; movz w0,traceno; bl <1; bl <1; ... */
   for (i = nexits-1; (int32_t)i >= 0; i--)
-    *--mxp = A64I_BL|((-3-i)&0x03ffffffu);
-  *--mxp = A64I_MOVZw|A64F_U16(as->T->traceno);
+    *--mxp = A64I_LE(A64I_BL|((-3-i)&0x03ffffffu));
+  *--mxp = A64I_LE(A64I_MOVZw|A64F_U16(as->T->traceno));
   mxp--;
-  *mxp = A64I_BL|(((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu);
-  *--mxp = A64I_STRx|A64F_D(RID_LR)|A64F_N(RID_SP);
+  *mxp = A64I_LE(A64I_BL|(((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu));
+  *--mxp = A64I_LE(A64I_STRx|A64F_D(RID_LR)|A64F_N(RID_SP));
   as->mctop = mxp;
 }
 
@@ -431,7 +431,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 	  fpr++;
 	} else {
 	  Reg r = ra_alloc1(as, ref, RSET_FPR);
-	  emit_spstore(as, ir, r, ofs);
+	  emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_isnum(ir->t)) ? 4 : 0));
 	  ofs += 8;
 	}
       } else {
@@ -441,7 +441,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 	  gpr++;
 	} else {
 	  Reg r = ra_alloc1(as, ref, RSET_GPR);
-	  emit_spstore(as, ir, r, ofs);
+	  emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_is64(ir->t)) ? 4 : 0));
 	  ofs += 8;
 	}
       }
@@ -1082,7 +1082,7 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
 	src = ra_alloc1(as, ir->op2, allow);
 	rset_clear(allow, src);
 	if (irt_isinteger(ir->t))
-	  type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
+	  type = ra_allock(as, (uint64_t)(int32_t)LJ_TISNUM << 47, allow);
 	else
 	  type = ra_allock(as, irt_toitype(ir->t), allow);
       } else {
@@ -1179,7 +1179,8 @@ dotypecheck:
   }
   if (ra_hasreg(dest)) {
     emit_lso(as, irt_isnum(t) ? A64I_LDRd :
-	     (irt_isint(t) ? A64I_LDRw : A64I_LDRx), (dest & 31), base, ofs);
+	     (irt_isint(t) ? A64I_LDRw : A64I_LDRx), (dest & 31), base,
+	     ofs ^ ((LJ_BE && irt_isint(t) ? 4 : 0)));
   }
 }
 
@@ -1909,7 +1910,7 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk)
   /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */
   int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED));
   if (spadj == 0) {
-    *--p = A64I_NOP;
+    *--p = A64I_LE(A64I_NOP);
     as->mctop = p;
   } else {
     /* Patch stack adjustment. */
@@ -1962,6 +1963,19 @@ static void asm_setup_target(ASMState *as)
   asm_exitstub_setup(as, as->T->nsnap + (as->parent ? 1 : 0));
 }
 
+#if LJ_BE
+/* ARM64 instructions are always little-endian. Swap for ARM64BE. */
+static void asm_mcode_fixup(MCode *mcode, MSize size)
+{
+  MCode *pe = (MCode *)((char *)mcode + size);
+  while (mcode < pe) {
+    MCode ins = *mcode;
+    *mcode++ = lj_bswap(ins);
+  }
+}
+#define LJ_TARGET_MCODE_FIXUP	1
+#endif
+
 /* -- Trace patching ------------------------------------------------------ */
 
 /* Patch exit jumps of existing machine code to a new target. */
@@ -1974,29 +1988,29 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
   MCode *px = exitstub_trace_addr(T, exitno);
   for (; p < pe; p++) {
     /* Look for exitstub branch, replace with branch to target. */
-    uint32_t ins = *p;
+    MCode ins = A64I_LE(*p);
     if ((ins & 0xff000000u) == 0x54000000u &&
 	((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) {
       /* Patch bcc exitstub. */
-      *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u);
+      *p = A64I_LE((ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u));
       cend = p+1;
       if (!cstart) cstart = p;
     } else if ((ins & 0xfc000000u) == 0x14000000u &&
 	       ((ins ^ (px-p)) & 0x03ffffffu) == 0) {
       /* Patch b exitstub. */
-      *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu);
+      *p = A64I_LE((ins & 0xfc000000u) | ((target-p) & 0x03ffffffu));
       cend = p+1;
       if (!cstart) cstart = p;
     } else if ((ins & 0x7e000000u) == 0x34000000u &&
 	       ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) {
       /* Patch cbz/cbnz exitstub. */
-      *p = (ins & 0xff00001f) | (((target-p)<<5) & 0x00ffffe0u);
+      *p = A64I_LE((ins & 0xff00001f) | (((target-p)<<5) & 0x00ffffe0u));
       cend = p+1;
       if (!cstart) cstart = p;
     } else if ((ins & 0x7e000000u) == 0x36000000u &&
 	       ((ins ^ ((px-p)<<5)) & 0x0007ffe0u) == 0) {
       /* Patch tbz/tbnz exitstub. */
-      *p = (ins & 0xfff8001fu) | (((target-p)<<5) & 0x0007ffe0u);
+      *p = A64I_LE((ins & 0xfff8001fu) | (((target-p)<<5) & 0x0007ffe0u));
       cend = p+1;
       if (!cstart) cstart = p;
     }
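
A worked example of the "Patch b exitstub" case above: an A64 unconditional branch keeps its opcode in the top six bits (the 0xfc000000u mask) and a signed 26-bit offset, counted in instructions, in the low bits, so retargeting is a mask-and-or. On ARM64BE each word is first brought into this little-endian encoding via A64I_LE, which is why every load and store in the patch loop is wrapped. The helper below is illustrative, not LuaJIT API:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Re-encode an A64 "b" instruction with a new offset (in instructions). */
static uint32_t patch_b(uint32_t ins, ptrdiff_t delta)
{
  return (ins & 0xfc000000u) | ((uint32_t)delta & 0x03ffffffu);
}

int main(void)
{
  uint32_t b_fwd = 0x14000004u;            /* b .+16 (4 instructions ahead) */
  printf("0x%08x\n", patch_b(b_fwd, -2));  /* b .-8  ->  0x17fffffe */
  return 0;
}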

+ 17 - 3
src/lj_ccall.c

@@ -301,7 +301,7 @@
   unsigned int cl = ccall_classify_struct(cts, ctr); \
   if ((cl & 4)) { /* Combine float HFA from separate registers. */ \
     CTSize i = (cl >> 8) - 1; \
-    do { ((uint32_t *)dp)[i] = cc->fpr[i].u32; } while (i--); \
+    do { ((uint32_t *)dp)[i] = cc->fpr[i].lo; } while (i--); \
   } else { \
     if (cl > 1) sp = (uint8_t *)&cc->fpr[0]; \
     memcpy(dp, sp, ctr->size); \
@@ -359,6 +359,13 @@
     } \
   }
 
+#if LJ_BE
+#define CCALL_HANDLE_RET \
+  if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
+    sp = (uint8_t *)&cc->fpr[0].f;
+#endif
+
+
 #elif LJ_TARGET_PPC
 /* -- PPC calling conventions --------------------------------------------- */
 
@@ -1033,9 +1040,16 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
 	*(int32_t *)dp = d->size == 1 ? (int32_t)*(int8_t *)dp :
 					(int32_t)*(int16_t *)dp;
     }
+#if LJ_TARGET_ARM64 && LJ_BE
+    if (isfp && d->size == sizeof(float))
+      ((float *)dp)[1] = ((float *)dp)[0];  /* Floats occupy high slot. */
+#endif
+#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE)
+    if ((ctype_isinteger_or_bool(d->info) || ctype_isenum(d->info)
 #if LJ_TARGET_MIPS64
-    if ((ctype_isinteger_or_bool(d->info) || ctype_isenum(d->info) ||
-	 (isfp && nsp == 0)) && d->size <= 4) {
+	 || (isfp && nsp == 0)
+#endif
+	 ) && d->size <= 4) {
       *(int64_t *)dp = (int64_t)*(int32_t *)dp;  /* Sign-extend to 64 bit. */
     }
 #endif
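
Both big-endian adjustments above deal with how a narrow value sits inside an 8-byte argument slot: a small integer is sign-extended so the callee reads the same value whether it loads 32 or 64 bits, and a 4-byte float is duplicated into the upper half of the slot, which is where a big-endian callee looks for it. A standalone sketch of the two adjustments (illustrative helpers, not the real argument-marshalling code):

#include <stdint.h>
#include <stdio.h>

/* Widen a small integer into a full 64-bit slot, sign-extended (as above). */
static void put_small_int(uint8_t *dp, int8_t v)
{
  *(int32_t *)dp = (int32_t)v;
  *(int64_t *)dp = (int64_t)*(int32_t *)dp;
}

/* Duplicate a float into both halves of the slot; on big-endian ARM64 the
** callee picks it up from the high half. */
static void put_float(uint8_t *dp, float v)
{
  ((float *)dp)[0] = v;
  ((float *)dp)[1] = v;
}

int main(void)
{
  union { int64_t i64; float f[2]; uint8_t b[8]; } slot;
  put_small_int(slot.b, -5);
  printf("%lld\n", (long long)slot.i64);    /* -5, valid read as 32 or 64 bit */
  put_float(slot.b, 1.5f);
  printf("%g %g\n", slot.f[0], slot.f[1]);  /* 1.5 1.5 */
  return 0;
}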

+ 2 - 2
src/lj_ccall.h

@@ -79,8 +79,8 @@ typedef union FPRArg {
 typedef intptr_t GPRArg;
 typedef union FPRArg {
   double d;
-  float f;
-  uint32_t u32;
+  struct { LJ_ENDIAN_LOHI(float f; , float g;) };
+  struct { LJ_ENDIAN_LOHI(uint32_t lo; , uint32_t hi;) };
 } FPRArg;
 
 #elif LJ_TARGET_PPC
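
What the new FPRArg layout buys: .f and .lo always alias the 32 bits that hold a single-precision value or low word when the register is spilled as a 64-bit quantity, on either byte order. A sketch of the expansion, assuming LJ_ENDIAN_LOHI(lo, hi) emits its first argument first on little-endian and second on big-endian (as in the lj_arch.h sketch above):

#include <stdint.h>

typedef union FPRArg_sketch {
  double d;
#if defined(__AARCH64EB__)
  struct { float g; float f; };          /* .f at byte offset 4 */
  struct { uint32_t hi; uint32_t lo; };  /* .lo at byte offset 4 */
#else
  struct { float f; float g; };          /* .f at byte offset 0 */
  struct { uint32_t lo; uint32_t hi; };
#endif
} FPRArg_sketch;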

+ 11 - 7
src/lj_ccallback.c

@@ -173,16 +173,16 @@ static void callback_mcode_init(global_State *g, uint32_t *page)
   uint32_t *p = page;
   void *target = (void *)lj_vm_ffi_callback;
   MSize slot;
-  *p++ = A64I_LDRLx | A64F_D(RID_X11) | A64F_S19(4);
-  *p++ = A64I_LDRLx | A64F_D(RID_X10) | A64F_S19(5);
-  *p++ = A64I_BR | A64F_N(RID_X11);
-  *p++ = A64I_NOP;
+  *p++ = A64I_LE(A64I_LDRLx | A64F_D(RID_X11) | A64F_S19(4));
+  *p++ = A64I_LE(A64I_LDRLx | A64F_D(RID_X10) | A64F_S19(5));
+  *p++ = A64I_LE(A64I_BR | A64F_N(RID_X11));
+  *p++ = A64I_LE(A64I_NOP);
   ((void **)p)[0] = target;
   ((void **)p)[1] = g;
   p += 4;
   for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {
-    *p++ = A64I_MOVZw | A64F_D(RID_X9) | A64F_U16(slot);
-    *p = A64I_B | A64F_S26((page-p) & 0x03ffffffu);
+    *p++ = A64I_LE(A64I_MOVZw | A64F_D(RID_X9) | A64F_U16(slot));
+    *p = A64I_LE(A64I_B | A64F_S26((page-p) & 0x03ffffffu));
     p++;
   }
   lua_assert(p - page <= CALLBACK_MCODE_SIZE);
@@ -623,6 +623,10 @@ static void callback_conv_result(CTState *cts, lua_State *L, TValue *o)
 #if CCALL_NUM_FPR
     if (ctype_isfp(ctr->info))
       dp = (uint8_t *)&cts->cb.fpr[0];
+#endif
+#if LJ_TARGET_ARM64 && LJ_BE
+    if (ctype_isfp(ctr->info) && ctr->size == sizeof(float))
+      dp = (uint8_t *)&cts->cb.fpr[0].f[1];
 #endif
     lj_cconv_ct_tv(cts, ctr, dp, o, 0);
 #ifdef CALLBACK_HANDLE_RET
@@ -637,7 +641,7 @@ static void callback_conv_result(CTState *cts, lua_State *L, TValue *o)
 	*(int32_t *)dp = ctr->size == 1 ? (int32_t)*(int8_t *)dp :
 					  (int32_t)*(int16_t *)dp;
     }
-#if LJ_TARGET_MIPS64
+#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE)
     /* Always sign-extend results to 64 bits. Even a soft-fp 'float'. */
     if (ctr->size <= 4 &&
 	(LJ_ABI_SOFTFP || ctype_isinteger_or_bool(ctr->info)))

+ 1 - 1
src/lj_emit_arm64.h

@@ -140,7 +140,7 @@ static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
     } else {
       goto nopair;
     }
-    if (ofsm >= (-64<<sc) && ofsm <= (63<<sc)) {
+    if (ofsm >= (int)((unsigned int)-64<<sc) && ofsm <= (63<<sc)) {
       *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
 	(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
       return;
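
The rewritten lower bound sidesteps left-shifting a negative signed integer, which is undefined behavior in C; doing the shift in unsigned arithmetic and converting back gives the intended -64*2^sc on the two's-complement targets LuaJIT supports. A minimal illustration:

#include <stdio.h>

int main(void)
{
  unsigned int sc = 3;  /* scale taken from the load/store size */
  /* (-64 << sc) would be undefined; the unsigned form is well defined. */
  int lo = (int)((unsigned int)-64 << sc);
  printf("%d\n", lo);   /* -512 on two's-complement targets */
  return 0;
}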

+ 8 - 1
src/lj_target_arm64.h

@@ -107,7 +107,7 @@ typedef struct {
 /* Return the address of a per-trace exit stub. */
 static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno)
 {
-  while (*p == 0xd503201f) p++;  /* Skip A64I_NOP. */
+  while (*p == (LJ_LE ? 0xd503201f : 0x1f2003d5)) p++;  /* Skip A64I_NOP. */
   return p + 3 + exitno;
 }
 /* Avoid dependence on lj_jit.h if only including lj_target.h. */
@@ -116,6 +116,13 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno)
 
 /* -- Instructions -------------------------------------------------------- */
 
+/* ARM64 instructions are always little-endian. Swap for ARM64BE. */
+#if LJ_BE
+#define A64I_LE(x)	(lj_bswap(x))
+#else
+#define A64I_LE(x)	(x)
+#endif
+
 /* Instruction fields. */
 #define A64F_D(r)	(r)
 #define A64F_N(r)	((r) << 5)
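
A quick check of the constant in exitstub_trace_addr_() above: instruction memory is always little-endian, so a data load of A64I_NOP on a big-endian host returns its byte-swapped value, and A64I_LE performs exactly that swap. Standalone sketch using the GCC/Clang builtin (lj_bswap computes the same swap):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint32_t nop = 0xd503201fu;  /* A64I_NOP, little-endian instruction encoding */
  printf("0x%08x\n", (uint32_t)__builtin_bswap32(nop));  /* 0x1f2003d5 */
  return 0;
}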

+ 46 - 23
src/vm_arm64.dasc

@@ -151,6 +151,21 @@
 |.define FRAME_FUNC,	#-16
 |.define FRAME_PC,	#-8
 |
+|// Endian-specific defines.
+|.if ENDIAN_LE
+|.define LO,		0
+|.define OFS_RD,	2
+|.define OFS_RB,	3
+|.define OFS_RA,	1
+|.define OFS_OP,	0
+|.else
+|.define LO,		4
+|.define OFS_RD,	0
+|.define OFS_RB,	0
+|.define OFS_RA,	2
+|.define OFS_OP,	3
+|.endif
+|
 |.macro decode_RA, dst, ins; ubfx dst, ins, #8, #8; .endmacro
 |.macro decode_RB, dst, ins; ubfx dst, ins, #24, #8; .endmacro
 |.macro decode_RC, dst, ins; ubfx dst, ins, #16, #8; .endmacro
@@ -717,7 +732,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  cmp CRET1, #1
   |  bhi ->vmeta_binop
   |4:
-  |   ldrh RBw, [PC, #2]
+  |   ldrh RBw, [PC, # OFS_RD]
   |    add PC, PC, #4
   |   add RB, PC, RB, lsl #2
   |   sub RB, RB, #0x20000
@@ -1500,7 +1515,12 @@ static void build_subroutines(BuildCtx *ctx)
   |  bne ->fff_fallback
   |  checkint CARG1, ->fff_fallback
   |  mov CARG3, #1
-  |  mov CARG2, BASE			// Points to stack. Little-endian.
+  |  // Point to the char inside the integer in the stack slot.
+  |.if ENDIAN_LE
+  |  mov CARG2, BASE
+  |.else
+  |  add CARG2, BASE, #7
+  |.endif
   |->fff_newstr:
   |  // CARG2 = str, CARG3 = len.
   |   str BASE, L->base
@@ -1703,7 +1723,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  ands TMP0, PC, #FRAME_TYPE
   |   and TMP1, PC, #~FRAME_TYPEP
   |  bne >3
-  |  ldrb RAw, [PC, #-3]
+  |  ldrb RAw, [PC, #-4+OFS_RA]
   |  lsl RA, RA, #3
   |  add TMP1, RA, #16
   |3:
@@ -1838,7 +1858,7 @@ static void build_subroutines(BuildCtx *ctx)
   |->cont_stitch:			// Trace stitching.
   |.if JIT
   |  // RA = resultptr, CARG4 = meta base
-  |   ldr RB, SAVE_MULTRES
+  |   ldr RBw, SAVE_MULTRES
   |  ldr INSw, [PC, #-4]
   |    ldr TRACE:CARG3, [CARG4, #-40]	// Save previous trace.
   |   subs RB, RB, #8
@@ -1869,7 +1889,7 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |  // Stitch a new trace to the previous trace.
   |  mov CARG1, #GL_J(exitno)
-  |  str RA, [GL, CARG1]
+  |  str RAw, [GL, CARG1]
   |  mov CARG1, #GL_J(L)
   |  str L, [GL, CARG1]
   |  str BASE, L->base
@@ -1936,6 +1956,9 @@ static void build_subroutines(BuildCtx *ctx)
   |  sub CARG1, CARG1, #2
   |   ldr CARG2w, [lr]		// Load trace number.
   |    st_vmstate CARG4
+  |.if ENDIAN_BE
+  |   rev32 CARG2, CARG2
+  |.endif
   |   str BASE, L->base
   |  ubfx CARG2w, CARG2w, #5, #16
   |  str CARG1w, [GL, #GL_J(exitno)]
@@ -1967,14 +1990,14 @@ static void build_subroutines(BuildCtx *ctx)
   |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
   |    movn TISNIL, #0
   |  and LFUNC:CARG2, CARG2, #LJ_GCVMASK
-  |   str RC, SAVE_MULTRES
+  |   str RCw, SAVE_MULTRES
   |   str BASE, L->base
   |  ldr CARG2, LFUNC:CARG2->pc
   |   str xzr, GL->jit_base
   |    mv_vmstate CARG4, INTERP
   |  ldr KBASE, [CARG2, #PC2PROTO(k)]
   |  // Modified copy of ins_next which handles function header dispatch, too.
-  |  ldrb RBw, [PC]
+  |  ldrb RBw, [PC, # OFS_OP]
   |   ldr INSw, [PC], #4
   |    st_vmstate CARG4
   |  cmp RBw, #BC_FUNCC+2		// Fast function?
@@ -2000,7 +2023,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  ands CARG2, CARG1, #FRAME_TYPE
   |  bne <2			// Trace stitching continuation?
   |  // Otherwise set KBASE for Lua function below fast function.
-  |  ldr CARG3, [CARG1, #-4]
+  |  ldr CARG3w, [CARG1, #-4]
   |  decode_RA CARG1, CARG3
   |  sub CARG2, BASE, CARG1, lsl #3
   |  ldr LFUNC:CARG3, [CARG2, #-32]
@@ -2153,7 +2176,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
     |  // RA = src1, RC = src2, JMP with RC = target
     |  ldr CARG1, [BASE, RA, lsl #3]
-    |    ldrh RBw, [PC, #2]
+    |    ldrh RBw, [PC, # OFS_RD]
     |   ldr CARG2, [BASE, RC, lsl #3]
     |    add PC, PC, #4
     |    add RB, PC, RB, lsl #2
@@ -2210,7 +2233,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // RA = src1, RC = src2, JMP with RC = target
     |  ldr CARG1, [BASE, RA, lsl #3]
     |   add RC, BASE, RC, lsl #3
-    |    ldrh RBw, [PC, #2]
+    |    ldrh RBw, [PC, # OFS_RD]
     |   ldr CARG3, [RC]
     |    add PC, PC, #4
     |    add RB, PC, RB, lsl #2
@@ -2271,7 +2294,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // RA = src, RC = str_const (~), JMP with RC = target
     |  ldr CARG1, [BASE, RA, lsl #3]
     |   mvn RC, RC
-    |    ldrh RBw, [PC, #2]
+    |    ldrh RBw, [PC, # OFS_RD]
     |   ldr CARG2, [KBASE, RC, lsl #3]
     |    add PC, PC, #4
     |   movn TMP0, #~LJ_TSTR
@@ -2299,7 +2322,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // RA = src, RC = num_const (~), JMP with RC = target
     |  ldr CARG1, [BASE, RA, lsl #3]
     |   add RC, KBASE, RC, lsl #3
-    |    ldrh RBw, [PC, #2]
+    |    ldrh RBw, [PC, # OFS_RD]
     |   ldr CARG3, [RC]
     |    add PC, PC, #4
     |    add RB, PC, RB, lsl #2
@@ -2359,7 +2382,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     vk = op == BC_ISEQP;
     |  // RA = src, RC = primitive_type (~), JMP with RC = target
     |  ldr TMP0, [BASE, RA, lsl #3]
-    |   ldrh RBw, [PC, #2]
+    |   ldrh RBw, [PC, # OFS_RD]
     |   add PC, PC, #4
     |  add RC, RC, #1
     |   add RB, PC, RB, lsl #2
@@ -2384,7 +2407,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
     |  // RA = dst or unused, RC = src, JMP with RC = target
-    |   ldrh RBw, [PC, #2]
+    |   ldrh RBw, [PC, # OFS_RD]
     |  ldr TMP0, [BASE, RC, lsl #3]
     |   add PC, PC, #4
     |  mov_false TMP1
@@ -2631,7 +2654,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   str PC, SAVE_PC
     |  bl extern lj_meta_cat		// (lua_State *L, TValue *top, int left)
     |  // Returns NULL (finished) or TValue * (metamethod).
-    |  ldrb RBw, [PC, #-1]
+    |  ldrb RBw, [PC, #-4+OFS_RB]
     |   ldr BASE, L->base
     |   cbnz CRET1, ->vmeta_binop
     |  ldr TMP0, [BASE, RB, lsl #3]
@@ -3262,7 +3285,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_callt
     |
     |5:  // Tailcall to a fast function with a Lua frame below.
-    |  ldrb RAw, [PC, #-3]
+    |  ldrb RAw, [PC, #-4+OFS_RA]
     |  sub CARG1, BASE, RA, lsl #3
     |  ldr LFUNC:CARG1, [CARG1, #-32]
     |  and LFUNC:CARG1, CARG1, #LJ_GCVMASK
@@ -3303,8 +3326,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.endif
     |  add RA, BASE, RA, lsl #3
     |  ldr TAB:RB, [RA, #-16]
-    |    ldrh TMP3w, [PC, #2]
-    |  ldr CARG1w, [RA, #-8]		// Get index from control var.
+    |    ldrh TMP3w, [PC, # OFS_RD]
+    |  ldr CARG1w, [RA, #-8+LO]		// Get index from control var.
     |    add PC, PC, #4
     |    add TMP3, PC, TMP3, lsl #2
     |  and TAB:RB, RB, #LJ_GCVMASK
@@ -3323,7 +3346,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   stp CARG1, TMP0, [RA]
     |    add CARG1, CARG1, #1
     |3:
-    |    str CARG1w, [RA, #-8]		// Update control var.
+    |    str CARG1w, [RA, #-8+LO]	// Update control var.
     |  mov PC, TMP3
     |4:
     |  ins_next
@@ -3369,8 +3392,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |5:  // Despecialize bytecode if any of the checks fail.
     |  mov TMP0, #BC_JMP
     |   mov TMP1, #BC_ITERC
-    |  strb TMP0w, [PC, #-4]
-    |   strb TMP1w, [RC]
+    |  strb TMP0w, [PC, #-4+OFS_OP]
+    |   strb TMP1w, [RC, # OFS_OP]
     |  b <1
     break;
 
@@ -3576,7 +3599,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  csel PC, RC, PC, gt
     } else if (op == BC_JFORI) {
       |  mov PC, RC
-      |  ldrh RCw, [RC, #-2]
+      |  ldrh RCw, [RC, #-4+OFS_RD]
     } else if (op == BC_IFORL) {
       |  csel PC, RC, PC, le
     }
@@ -3617,7 +3640,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     if (op == BC_FORI) {
       |  csel PC, RC, PC, hi
     } else if (op == BC_JFORI) {
-      |  ldrh RCw, [RC, #-2]
+      |  ldrh RCw, [RC, #-4+OFS_RD]
       |  bls =>BC_JLOOP
     } else if (op == BC_IFORL) {
       |  csel PC, RC, PC, ls
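
The OFS_*/LO defines fall out of how a 32-bit bytecode instruction (opcode in bits 0-7, RA in bits 8-15, RD in bits 16-31) lands in memory: the opcode byte sits at offset 0 on little-endian but at offset 3 on big-endian, and a 32-bit payload inside an 8-byte slot starts at offset 4 instead of 0 (the LO define). An endian-independent check of the instruction layout, with illustrative field values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  /* Hypothetical instruction word: RD=0x1234, RA=0xab, opcode=0x42. */
  uint32_t ins = (0x1234u << 16) | (0xabu << 8) | 0x42u;
  uint8_t le[4], be[4];
  for (int i = 0; i < 4; i++) {
    le[i] = (uint8_t)(ins >> (8*i));        /* little-endian memory image */
    be[i] = (uint8_t)(ins >> (24 - 8*i));   /* big-endian memory image */
  }
  printf("LE: OP at +0 = 0x%02x, RA at +1 = 0x%02x, RD at +2..+3\n", le[0], le[1]);
  printf("BE: OP at +3 = 0x%02x, RA at +2 = 0x%02x, RD at +0..+1\n", be[3], be[2]);
  return 0;
}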