/* lj_emit_arm64.h */
  1. /*
  2. ** ARM64 instruction emitter.
  3. ** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
  4. **
  5. ** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
  6. ** Sponsored by Cisco Systems, Inc.
  7. */
  8. /* -- Constant encoding --------------------------------------------------- */
  9. static uint64_t get_k64val(ASMState *as, IRRef ref)
  10. {
  11. IRIns *ir = IR(ref);
  12. if (ir->o == IR_KINT64) {
  13. return ir_kint64(ir)->u64;
  14. } else if (ir->o == IR_KGC) {
  15. return (uint64_t)ir_kgc(ir);
  16. } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
  17. return (uint64_t)ir_kptr(ir);
  18. } else {
  19. lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL,
  20. "bad 64 bit const IR op %d", ir->o);
  21. return (uint32_t)ir->i; /* Zero-extended. */
  22. }
  23. }
  24. /* Encode constant in K12 format for data processing instructions. */
  25. static uint32_t emit_isk12(int64_t n)
  26. {
  27. uint64_t k = n < 0 ? ~(uint64_t)n+1u : (uint64_t)n;
  28. uint32_t m = n < 0 ? 0x40000000 : 0;
  29. if (k < 0x1000) {
  30. return (uint32_t)(A64I_K12|m|A64F_U12(k));
  31. } else if ((k & 0xfff000) == k) {
  32. return (uint32_t)(A64I_K12|m|0x400000|A64F_U12(k>>12));
  33. }
  34. return 0;
  35. }
/* Count leading/trailing zeros of a 64 bit value. */
#define emit_clz64(n)	(lj_fls64(n)^63)
#define emit_ctz64(n)	lj_ffs64(n)

/* Encode constant in K13 format for logical data processing instructions.
** Returns 0 if the value is not encodable as a logical (bitmask) immediate,
** i.e. not a rotated repetition of a run of ones.
*/
static uint32_t emit_isk13(uint64_t n, int is64)
{
  /* Thanks to: https://dougallj.wordpress.com/2021/10/30/ */
  int rot, ones, size, immr, imms;
  if (!is64) n = ((uint64_t)n << 32) | (uint32_t)n;  /* Replicate 32 bit value. */
  if ((n+1u) <= 1u) return 0;  /* Neither all-zero nor all-ones are allowed. */
  rot = (n & (n+1u)) ? emit_ctz64(n & (n+1u)) : 64;  /* Align pattern via rotation. */
  n = lj_ror(n, rot & 63);
  ones = emit_ctz64(~n);  /* Length of the run of ones. */
  size = emit_clz64(n) + ones;  /* Element size of the repeating pattern. */
  if (lj_ror(n, size & 63) != n) return 0;  /* Non-repeating? */
  immr = -rot & (size - 1);
  /* size & 64 sets bit 6, which encodes the N bit for full 64 bit patterns. */
  imms = (-(size << 1) | (ones - 1)) & 63;
  return A64I_K13 | A64F_IMMR(immr | (size & 64)) | A64F_IMMS(imms);
}
  54. static uint32_t emit_isfpk64(uint64_t n)
  55. {
  56. uint64_t etop9 = ((n >> 54) & 0x1ff);
  57. if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
  58. return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
  59. }
  60. return ~0u;
  61. }
/* -- Emit basic instructions --------------------------------------------- */

/* Note: machine code is emitted backwards, so *--as->mcp prepends each
** instruction word in front of the previously emitted code.
*/

/* Emit instruction with D/N/M/A register operands. */
static void emit_dnma(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm, Reg ra)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm) | A64F_A(ra);
}

/* Emit instruction with D/N/M register operands. */
static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm);
}

/* Emit instruction with D/M register operands. */
static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_M(rm);
}

/* Emit instruction with D/N register operands. */
static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn);
}

/* Emit instruction with N/M register operands. */
static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm)
{
  *--as->mcp = ai | A64F_N(rn) | A64F_M(rm);
}

/* Emit instruction with D register operand. */
static void emit_d(ASMState *as, A64Ins ai, Reg rd)
{
  *--as->mcp = ai | A64F_D(rd);
}

/* Emit instruction with N register operand. */
static void emit_n(ASMState *as, A64Ins ai, Reg rn)
{
  *--as->mcp = ai | A64F_N(rn);
}
  91. static int emit_checkofs(A64Ins ai, int64_t ofs)
  92. {
  93. int scale = (ai >> 30) & 3;
  94. if (ofs < 0 || (ofs & ((1<<scale)-1))) {
  95. return (ofs >= -256 && ofs <= 255) ? -1 : 0;
  96. } else {
  97. return (ofs < (4096<<scale)) ? 1 : 0;
  98. }
  99. }
  100. static LJ_AINLINE uint32_t emit_lso_pair_candidate(A64Ins ai, int ofs, int sc)
  101. {
  102. if (ofs >= 0) {
  103. return ai | A64F_U12(ofs>>sc); /* Subsequent lj_ror checks ofs. */
  104. } else if (ofs >= -256) {
  105. return (ai^A64I_LS_U) | A64F_S9(ofs & 0x1ff);
  106. } else {
  107. return A64F_D(31); /* Will mismatch prev. */
  108. }
  109. }
/* Emit a load/store of rd with base rn and offset ofs64. Opportunistically
** rewrites an adjacent, previously emitted LDR/STR into a single LDP/STP.
*/
static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs64)
{
  int ot = emit_checkofs(ai, ofs64), sc = (ai >> 30) & 3, ofs = (int)ofs64;
  lj_assertA(ot, "load/store offset %d out of range", ofs);
  /* Combine LDR/STR pairs to LDP/STP. */
  if ((sc == 2 || sc == 3) &&  /* Only 32/64 bit accesses can pair. */
      (!(ai & 0x400000) || rd != rn) &&  /* Bit 22: presumably the load bit --
					 ** don't pair a load into its own base
					 ** reg. TODO confirm vs. A64 ISA. */
      as->mcp != as->mcloop) {  /* Don't fuse across the loop entry point. */
    uint32_t prev = *as->mcp & ~A64F_D(31);  /* Previous ins, D field masked. */
    int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
    A64Ins aip;
    if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsm, sc)) {
      /* Prev accesses the slot just below: prev reg -> D, our reg -> A. */
      aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
    } else if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsp, sc)) {
      /* Prev accesses the slot just above: our reg -> D, prev reg -> A. */
      aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
      ofsm = ofs;
    } else {
      goto nopair;
    }
    /* Check the lower offset fits the scaled signed 7 bit LDP/STP field. */
    if (lj_ror((unsigned int)ofsm + (64u<<sc), sc) <= 127u) {
      /* Rewrite the previous ins in place, morphing its opcode to LDP/STP. */
      *as->mcp = aip | A64F_N(rn) | (((ofsm >> sc) & 0x7f) << 15) |
	(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
      return;
    }
  }
nopair:
  if (ot == 1)  /* Scaled unsigned 12 bit offset. */
    *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc);
  else  /* Unscaled signed 9 bit offset. */
    *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff);
}
/* -- Emit loads/stores --------------------------------------------------- */

/* Prefer rematerialization of BASE/L from global_State over spills. */
#define emit_canremat(ref)	((ref) <= REF_BASE)

/* Try to find a one-step delta relative to other consts.
** Returns 1 and emits a single MOV/ADD/SUB from an already-allocated
** constant register if possible, 0 otherwise.
*/
static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int is64)
{
  /* Walk all occupied GPRs, plus GL (which always holds a known address). */
  RegSet work = (~as->freeset & RSET_GPR) | RID2RSET(RID_GL);
  while (work) {
    Reg r = rset_picktop(work);
    IRRef ref = regcost_ref(as->cost[r]);
    lj_assertA(r != rd, "dest reg %d not free", rd);
    if (ref < REF_TRUE) {  /* Register holds a constant? */
      uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
				     get_k64val(as, ref);
      int64_t delta = (int64_t)(k - kx);
      if (!is64) delta = (int64_t)(int32_t)delta;  /* Sign-extend. */
      if (delta == 0) {  /* Same value: plain register move suffices. */
	emit_dm(as, is64|A64I_MOVw, rd, r);
	return 1;
      } else {
	/* Absolute delta encodable as a 12 bit immediate? */
	uint32_t k12 = emit_isk12(delta < 0 ? (int64_t)(~(uint64_t)delta+1u) : delta);
	if (k12) {
	  emit_dn(as, (delta < 0 ? A64I_SUBw : A64I_ADDw)^is64^k12, rd, r);
	  return 1;
	}
	/* Do other ops or multi-step deltas pay off? Probably not.
	** E.g. XOR rarely helps with pointer consts.
	*/
      }
    }
    rset_clear(work, r);
  }
  return 0;  /* Failed. */
}
/* Offset of address k relative to the global_State (reachable via RID_GL). */
#define glofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
/* Offset of address k relative to the next instruction to be emitted. */
#define mcpofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
/* Check PC-relative offset fits a 19 bit signed, 4 byte scaled field. */
#define checkmcpofs(as, k) \
  (A64F_S_OK(mcpofs(as, k)>>2, 19))
/* Try to form a const as ADR or ADRP or ADRP + ADD.
** Returns 1 on success, 0 if the address is out of PC-relative range.
*/
static int emit_kadrp(ASMState *as, Reg rd, uint64_t k)
{
  A64Ins ai = A64I_ADR;
  int64_t ofs = mcpofs(as, k);
  if (!A64F_S_OK((uint64_t)ofs, 21)) {  /* Out of 21 bit range for ADR? */
    /* Try page-relative ADRP, with an ADD for the low 12 bits if needed. */
    uint64_t kpage = k & ~0xfffull;
    MCode *adrp = as->mcp - 1 - (k != kpage);  /* Account for the extra ADD. */
    ofs = (int64_t)(kpage - ((uint64_t)adrp & ~0xfffull)) >> 12;
    if (!A64F_S_OK(ofs, 21))
      return 0;  /* Failed. */
    if (k != kpage)  /* Add the sub-page part of the address. */
      emit_dn(as, (A64I_ADDx^A64I_K12)|A64F_U12(k - kpage), rd, rd);
    ai = A64I_ADRP;
  }
  /* Low 2 bits of ofs go into immlo (bits 29-30), the rest into immhi. */
  emit_d(as, ai|(((uint32_t)ofs&3)<<29)|A64F_S19(ofs>>2), rd);
  return 1;
}
/* Load a constant into a GPR with as few instructions as possible.
** Tries single-instruction encodings (logical immediate ORR, delta from
** another constant register, ADR/ADRP) before falling back to a
** MOVZ/MOVN plus MOVK sequence.
*/
static void emit_loadk(ASMState *as, Reg rd, uint64_t u64)
{
  int zeros = 0, ones = 0, neg, lshift = 0;
  int is64 = (u64 >> 32) ? A64I_X : 0, i = is64 ? 4 : 2;
  /* Count non-homogeneous 16 bit fragments. */
  while (--i >= 0) {
    uint32_t frag = (u64 >> i*16) & 0xffff;
    zeros += (frag != 0);  /* Fragments a MOVZ sequence must fill. */
    ones += (frag != 0xffff);  /* Fragments a MOVN sequence must fill. */
  }
  neg = ones < zeros;  /* Use MOVN if it pays off. */
  if ((neg ? ones : zeros) > 1) {  /* Need 2+ ins. Try 1 ins encodings. */
    uint32_t k13 = emit_isk13(u64, is64);
    if (k13) {  /* Logical immediate: ORR rd, xzr, #imm. */
      emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
      return;
    }
    if (emit_kdelta(as, rd, u64, is64)) {  /* Delta from another const reg. */
      return;
    }
    if (emit_kadrp(as, rd, u64)) {  /* Either 1 or 2 ins. */
      return;
    }
  }
  if (neg) {
    u64 = ~u64;  /* MOVN-based sequence operates on the inverted value. */
    if (!is64) u64 = (uint32_t)u64;
  }
  if (u64) {
    /* Find first/last fragment to be filled. */
    int shift = (63-emit_clz64(u64)) & ~15;
    lshift = emit_ctz64(u64) & ~15;
    for (; shift > lshift; shift -= 16) {
      uint32_t frag = (u64 >> shift) & 0xffff;
      if (frag == 0) continue;  /* Will be correctly filled by MOVN/MOVZ. */
      if (neg) frag ^= 0xffff;  /* MOVK requires the original value. */
      emit_d(as, is64 | A64I_MOVKw | A64F_U16(frag) | A64F_LSL16(shift), rd);
    }
  }
  /* But MOVN needs an inverted value. */
  emit_d(as, is64 | (neg ? A64I_MOVNw : A64I_MOVZw) |
	 A64F_U16((u64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
}
/* Load a 32 bit constant into a GPR. */
#define emit_loadi(as, rd, i)	emit_loadk(as, rd, (uint32_t)i)
/* Load a 64 bit constant into a GPR. */
#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i)

/* Forward declaration: used below to materialize constants in registers. */
static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
/* Get/set from constant pointer.
** Picks the cheapest addressing mode: GL-relative, PC-relative literal,
** or an allocated base register plus a 15 bit offset.
*/
static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
{
  Reg base = RID_GL;
  int64_t ofs = glofs(as, p);
  if (emit_checkofs(ai, ofs)) {
    /* GL + offset, might subsequently fuse to LDP/STP. */
  } else if (ai == A64I_LDRx && checkmcpofs(as, p)) {
    /* IP + offset is cheaper than allock, but address must be in range. */
    emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
    return;
  } else {  /* Split up into base reg + offset. */
    int64_t i64 = i64ptr(p);
    base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
    ofs = i64 & 0x7fffull;  /* Low 15 bits always fit emit_lso's ranges. */
  }
  emit_lso(as, ai, r, base, ofs);
}
/* Load 64 bit IR constant into register.
** FPRs try the 8 bit FP immediate first; otherwise the constant is loaded
** GL-relative, PC-relative, or fully materialized (via RID_TMP for FPRs).
*/
static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
{
  const uint64_t *k = &ir_k64(ir)->u64;
  int64_t ofs;
  if (r >= RID_MAX_GPR) {  /* FP register? */
    uint32_t fpk = emit_isfpk64(*k);
    if (fpk != ~0u) {  /* Encodable as an FMOV immediate? */
      emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31));
      return;
    }
  }
  ofs = glofs(as, k);
  if (emit_checkofs(A64I_LDRx, ofs)) {  /* Constant reachable from GL? */
    emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx,
	     (r & 31), RID_GL, ofs);
  } else {
    if (r >= RID_MAX_GPR) {  /* Load into RID_TMP, then move to the FPR. */
      emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP);
      r = RID_TMP;
    }
    if (checkmcpofs(as, k))  /* PC-relative LDR literal, if in range. */
      emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r);
    else
      emit_loadu64(as, r, *k);  /* Otherwise build the constant inline. */
  }
}
/* Get/set global_State fields. */
#define emit_getgl(as, r, field) \
  emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
#define emit_setgl(as, r, field) \
  emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)

/* Trace number is determined from pc of exit instruction. */
#define emit_setvmstate(as, i)	UNUSED(i)

/* -- Emit control-flow instructions -------------------------------------- */

/* Label for internal jumps. */
typedef MCode *MCLabel;

/* Return label pointing to current PC. */
#define emit_label(as)		((as)->mcp)
  304. static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
  305. {
  306. MCode *p = --as->mcp;
  307. ptrdiff_t delta = target - p;
  308. lj_assertA(A64F_S_OK(delta, 19), "branch target out of range");
  309. *p = A64I_BCC | A64F_S19(delta) | cond;
  310. }
  311. static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
  312. {
  313. MCode *p = --as->mcp;
  314. ptrdiff_t delta = target - p;
  315. lj_assertA(A64F_S_OK(delta, 26), "branch target out of range");
  316. *p = ai | A64F_S26(delta);
  317. }
  318. static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target)
  319. {
  320. MCode *p = --as->mcp;
  321. ptrdiff_t delta = target - p;
  322. lj_assertA(bit < 63, "bit number out of range");
  323. lj_assertA(A64F_S_OK(delta, 14), "branch target out of range");
  324. if (bit > 31) ai |= A64I_X;
  325. *p = ai | A64F_BIT(bit & 31) | A64F_S14(delta) | r;
  326. }
  327. static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target)
  328. {
  329. MCode *p = --as->mcp;
  330. ptrdiff_t delta = target - p;
  331. lj_assertA(A64F_S_OK(delta, 19), "branch target out of range");
  332. *p = ai | A64F_S19(delta) | r;
  333. }
  334. #define emit_jmp(as, target) emit_branch(as, A64I_B, (target))
/* Emit a call to an ASM function: direct BL if in range, otherwise an
** indirect branch-and-link through a non-argument scratch register.
*/
static void emit_call(ASMState *as, ASMFunction target)
{
  MCode *p = --as->mcp;
#if LJ_ABI_PAUTH
  /* Strip the pointer authentication code to get the raw target address. */
  char *targetp = ptrauth_auth_data((char *)target,
				    ptrauth_key_function_pointer, 0);
#else
  char *targetp = (char *)target;
#endif
  ptrdiff_t delta = targetp - (char *)p;
  if (A64F_S_OK(delta>>2, 26)) {  /* 26 bit signed, 4 byte scaled range. */
    *p = A64I_BL | A64F_S26(delta>>2);
  } else {  /* Target out of range: need indirect call. But don't use R0-R7. */
    Reg r = ra_allock(as, i64ptr(target),
		      RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
    *p = A64I_BLR_AUTH | A64F_N(r);
  }
}
/* -- Emit generic operations --------------------------------------------- */

/* Generic move between two regs. */
static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
{
  if (dst >= RID_MAX_GPR) {  /* FPR <- FPR move. */
    emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S,
	    (dst & 31), (src & 31));
    return;
  }
  if (as->mcp != as->mcloop) {  /* Swap early registers for loads/stores. */
    /* If the previously emitted ins is a load/store whose base (N) or
    ** store-data (D) field equals dst, flip that field to src by XORing
    ** with src^dst. NOTE(review): masks 0xbf800000/0xb9000000 and bit
    ** 0x00400000 presumably match the LDR/STR (unsigned imm) encodings
    ** and the load bit -- confirm against the A64 ISA.
    */
    MCode ins = *as->mcp, swp = (src^dst);
    if ((ins & 0xbf800000) == 0xb9000000) {
      if (!((ins ^ (dst << 5)) & 0x000003e0))
	*as->mcp = ins ^ (swp << 5);  /* Swap N in load/store. */
      if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f))
	*as->mcp = ins ^ swp;  /* Swap D in store. */
    }
  }
  emit_dm(as, A64I_MOVx, dst, src);
}
  373. /* Generic load of register with base and (small) offset address. */
  374. static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
  375. {
  376. if (r >= RID_MAX_GPR)
  377. emit_lso(as, irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs, (r & 31), base, ofs);
  378. else
  379. emit_lso(as, irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw, r, base, ofs);
  380. }
  381. /* Generic store of register with base and (small) offset address. */
  382. static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
  383. {
  384. if (r >= RID_MAX_GPR)
  385. emit_lso(as, irt_isnum(ir->t) ? A64I_STRd : A64I_STRs, (r & 31), base, ofs);
  386. else
  387. emit_lso(as, irt_is64(ir->t) ? A64I_STRx : A64I_STRw, r, base, ofs);
  388. }
  389. /* Emit an arithmetic operation with a constant operand. */
  390. static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src,
  391. int32_t i, RegSet allow)
  392. {
  393. uint32_t k = emit_isk12(i);
  394. if (k)
  395. emit_dn(as, ai^k, dest, src);
  396. else
  397. emit_dnm(as, ai, dest, src, ra_allock(as, i, allow));
  398. }
  399. /* Add offset to pointer. */
  400. static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
  401. {
  402. if (ofs)
  403. emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r,
  404. ofs < 0 ? (int32_t)(~(uint32_t)ofs+1u) : ofs,
  405. rset_exclude(RSET_GPR, r));
  406. }
  407. #define emit_spsub(as, ofs) emit_addptr(as, RID_SP, -(ofs))