aoptarm.pas 130 KB


  1. {
  2. Copyright (c) 1998-2020 by Jonas Maebe and Florian Klaempfl, members of the Free Pascal
  3. Development Team
  4. This unit implements an ARM optimizer object used commonly for ARM and AArch64
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
  18. Unit aoptarm;
  19. {$i fpcdefs.inc}
  20. { $define DEBUG_PREREGSCHEDULER}
  21. {$ifdef EXTDEBUG}
  22. {$define DEBUG_AOPTCPU}
  23. {$endif EXTDEBUG}
  24. Interface
  25. uses
  26. cgbase, cgutils, globtype, cpubase, aasmtai, aasmcpu,aopt, aoptobj;
  { Optimizer class shared by the ARM and AArch64 back ends. }
  27. Type
  28. { although ARM and AArch64 do not look very similar at first glance,
  29. several optimizations can be shared between both }
  30. TARMAsmOptimizer = class(TAsmOptimizer)
  { Inserts s as an assembler comment in front of p when DEBUG_AOPTCPU is
    defined; otherwise the implementation is an empty inline body. }
  31. procedure DebugMsg(const s : string; p : tai);
  { Folds a following "mov regX,reg0" (movp) into p by retargeting the
    destination of p to regX and deleting movp. optimizer is only used in the
    debug message. Returns True when the mov was removed. }
  32. function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
  { Forwards the source register of the register-to-register mov at p into
    following instructions and removes p where possible. hp1 is set to nil when
    the instruction it points to is freed. }
  33. function RedundantMovProcess(var p: tai; var hp1: tai): boolean;
  { Walks forward from Current and returns True with Next set to the first
    instruction that references reg. Below optimisation level 3 only the
    directly following instruction is examined. }
  34. function GetNextInstructionUsingReg(Current: tai; out Next: tai; const reg: TRegister): Boolean;
  35. {$ifdef AARCH64}
  { Merges the extension at p into a following ADD/SUB/CMP/CMN (hp1) as an
    extended-register operand that uses shiftmode. }
  36. function USxtOp2Op(var p, hp1: tai; shiftmode: tshiftmode): Boolean;
  37. {$endif AARCH64}
  38. function OptPreSBFXUBFX(var p: tai): Boolean;
  { Pass 1 peephole optimizations for a UXTB instruction at p. }
  39. function OptPass1UXTB(var p: tai): Boolean;
  40. function OptPass1UXTH(var p: tai): Boolean;
  41. function OptPass1SXTB(var p: tai): Boolean;
  42. function OptPass1SXTH(var p: tai): Boolean;
  43. function OptPass1LDR(var p: tai): Boolean; virtual;
  44. function OptPass1STR(var p: tai): Boolean; virtual;
  45. function OptPass1And(var p: tai): Boolean; virtual;
  46. function OptPass2Bitwise(var p: tai): Boolean;
  47. function OptPass2TST(var p: tai): Boolean;
  48. { Common code that tries to merge constant writes to sequential memory }
  49. function TryConstMerge(var p: tai; hp1: tai): Boolean;
  50. protected
  { Removes the extension at p when the immediate arithmetic/bitwise operation
    at hp1 on the same register makes it unnecessary. }
  51. function DoXTArithOp(var p: tai; hp1: tai): Boolean;
  52. End;
  { True if instr is an instruction whose opcode is in op, whose condition is
    in cond and whose postfix is in postfix. An empty set matches anything. }
  53. function MatchInstruction(const instr: tai; const op: TCommonAsmOps; const cond: TAsmConds; const postfix: TOpPostfixes): boolean;
  { As above, but the opcode must be exactly op. }
  54. function MatchInstruction(const instr: tai; const op: TAsmOp; const cond: TAsmConds; const postfix: TOpPostfixes): boolean;
  55. {$ifdef AARCH64}
  { True if instr is an instruction whose opcode is one of ops and whose postfix
    is in postfix (empty set matches anything). The condition is not checked. }
  56. function MatchInstruction(const instr: tai; const ops : array of TAsmOp; const postfix: TOpPostfixes): boolean;
  57. {$endif AARCH64}
  { True if instr is an instruction with opcode op and a postfix in postfix
    (empty set matches anything). The condition is not checked. }
  58. function MatchInstruction(const instr: tai; const op: TAsmOp; const postfix: TOpPostfixes): boolean;
  { Field-by-field comparison of two references. Never True when either
    reference has a non-empty volatility set. }
  59. function RefsEqual(const r1, r2: treference): boolean;
  { True if oper is a register operand holding reg. }
  60. function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
  { True if both operands have the same type and equal contents. Only constant,
    register, condition code, real constant and reference operands can match. }
  61. function MatchOperand(const oper1: TOper; const oper2: TOper): boolean; inline;
  { True if oper is a constant operand with value a. }
  62. function MatchOperand(const oper: TOper; const a: TCGInt): boolean; inline;
  63. Implementation
  64. uses
  65. cutils,verbose,globals,aoptutils,
  66. systems,
  67. cpuinfo,
  68. cgobj,procinfo,
  69. aasmbase,aasmdata,itcpugas;
  const
  {$ifdef DEBUG_AOPTCPU}
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
  {$else DEBUG_AOPTCPU}
    { An empty untyped constant lets the compiler fold away string
      concatenations whose result is never shown on release builds. [Kit] }
    SPeepholeOptimization = '';
  {$endif DEBUG_AOPTCPU}


  { Debug builds: inserts s as an assembler comment in front of p.
    Release builds: empty inline routine. }
  procedure TARMAsmOptimizer.DebugMsg(const s: string;p : tai);{$ifndef DEBUG_AOPTCPU}inline;{$endif not DEBUG_AOPTCPU}
    begin
  {$ifdef DEBUG_AOPTCPU}
      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
  {$endif DEBUG_AOPTCPU}
    end;
  { Tests instr against a set of common opcodes, a set of conditions and a set
    of postfixes. An empty set means "do not care" for that criterion.
    Opcodes above LastCommonAsmOp never match a non-empty opcode set. }
  function MatchInstruction(const instr: tai; const op: TCommonAsmOps; const cond: TAsmConds; const postfix: TOpPostfixes): boolean;
    begin
      Result := False;
      if instr.typ <> ait_instruction then
        Exit;
      { The range test must come first: the opcode may not fit into the set type }
      if (op <> []) and
         ((taicpu(instr).opcode > LastCommonAsmOp) or not (taicpu(instr).opcode in op)) then
        Exit;
      if (cond <> []) and not (taicpu(instr).condition in cond) then
        Exit;
      Result := (postfix = []) or (taicpu(instr).oppostfix in postfix);
    end;
  { Tests instr against one opcode, a set of conditions and a set of postfixes.
    An empty condition or postfix set means "do not care". }
  function MatchInstruction(const instr: tai; const op: TAsmOp; const cond: TAsmConds; const postfix: TOpPostfixes): boolean;
    begin
      Result := False;
      if instr.typ <> ait_instruction then
        Exit;
      if taicpu(instr).opcode <> op then
        Exit;
      if (cond <> []) and not (taicpu(instr).condition in cond) then
        Exit;
      Result := (postfix = []) or (taicpu(instr).oppostfix in postfix);
    end;
  {$ifdef AARCH64}
  { Tests instr against a list of opcodes and a set of postfixes (empty set
    means "do not care"). The condition of the instruction is not examined. }
  function MatchInstruction(const instr: tai; const ops : array of TAsmOp; const postfix: TOpPostfixes): boolean;
    var
      idx : Integer;
    begin
      Result := False;
      if instr.typ <> ait_instruction then
        Exit;
      { The postfix test does not depend on the opcode, so do it once up front }
      if (postfix <> []) and not (taicpu(instr).oppostfix in postfix) then
        Exit;
      for idx := Low(ops) to High(ops) do
        if taicpu(instr).opcode = ops[idx] then
          Exit(True);
    end;
  {$endif AARCH64}
  { Tests instr against one opcode and a set of postfixes (empty set means
    "do not care"). The condition of the instruction is not examined. }
  function MatchInstruction(const instr: tai; const op: TAsmOp; const postfix: TOpPostfixes): boolean;
    begin
      Result := False;
      if instr.typ <> ait_instruction then
        Exit;
      if taicpu(instr).opcode <> op then
        Exit;
      Result := (postfix = []) or (taicpu(instr).oppostfix in postfix);
    end;
  { True when oper is a register operand and that register is reg. }
  function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
    begin
      if oper.typ = top_reg then
        Result := (oper.reg = reg)
      else
        Result := False;
    end;
  { Compares two references field by field. References that carry any
    volatility flag are never reported as equal. }
  function RefsEqual(const r1, r2: treference): boolean;
    begin
      Result := False;
      if (r1.volatility <> []) or (r2.volatility <> []) then
        Exit;
      { Address components }
      if (r1.offset <> r2.offset) or
         (r1.base <> r2.base) or
         (r1.index <> r2.index) or
         (r1.scalefactor <> r2.scalefactor) then
        Exit;
      { Symbolic parts }
      if (r1.symbol <> r2.symbol) or
         (r1.refaddr <> r2.refaddr) or
         (r1.relsymbol <> r2.relsymbol) then
        Exit;
  {$ifdef ARM}
      if r1.signindex <> r2.signindex then
        Exit;
  {$endif ARM}
      { Addressing and shift details }
      Result :=
        (r1.shiftimm = r2.shiftimm) and
        (r1.addressmode = r2.addressmode) and
        (r1.shiftmode = r2.shiftmode);
    end;
  { True when both operands have the same type and equal contents. Operand
    types other than the ones listed below never match. }
  function MatchOperand(const oper1: TOper; const oper2: TOper): boolean; inline;
    begin
      if oper1.typ <> oper2.typ then
        Result := False
      else
        case oper1.typ of
          top_reg:
            Result := (oper1.reg = oper2.reg);
          top_const:
            Result := (oper1.val = oper2.val);
          top_ref:
            Result := RefsEqual(oper1.ref^, oper2.ref^);
          top_conditioncode:
            Result := (oper1.cc = oper2.cc);
          top_realconst:
            Result := (oper1.val_real = oper2.val_real);
          else
            Result := False;
        end;
    end;
  { True when oper is a constant operand whose value is a. }
  function MatchOperand(const oper: TOper; const a: TCGInt): boolean; inline;
    begin
      if oper.typ = top_const then
        Result := (oper.val = a)
      else
        Result := False;
    end;
  {$ifdef AARCH64}
  { Folds the extension instruction at p into the following instruction hp1:

      ADD/SUB rd, rn, reg   (reg = destination of p, rn different from it)
      CMP/CMN rn, reg       (reg = destination of p)

    The last register operand of hp1 is replaced by the source register of p
    and an extend operand with the given shiftmode is appended. This is only
    done when the destination of p dies at hp1 and the source of p is not
    modified between p and hp1. On success p is removed and the result of
    RemoveCurrentP is returned. }
  function TARMAsmOptimizer.USxtOp2Op(var p,hp1: tai; shiftmode: tshiftmode): Boolean;
    var
      ExtendOp: tshifterop;
      SrcIdx: Integer;
      SrcReg: TRegister;
      IsArith, IsCompare: Boolean;
    begin
      Result:=false;

      IsArith:=
        MatchInstruction(hp1, [A_ADD,A_SUB], [C_None], [PF_None,PF_S]) and
        (taicpu(hp1).ops=3) and
        MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) and
        not(MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg));

      if IsArith then
        IsCompare:=false
      else
        IsCompare:=
          MatchInstruction(hp1, [A_CMP,A_CMN], [C_None], [PF_None]) and
          (taicpu(hp1).ops=2) and
          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg);

      if not(IsArith or IsCompare) then
        exit;
      if not RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) then
        exit;
      { the source register must not be modified in between }
      if RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1) then
        exit;

      SrcReg:=taicpu(p).oper[1]^.reg;
      DebugMsg('Peephole '+gas_op2str[taicpu(p).opcode]+gas_op2str[taicpu(hp1).opcode]+'2'+gas_op2str[taicpu(hp1).opcode]+' done', p);
      AllocRegBetween(SrcReg,p,hp1,UsedRegs);

      { Index of the register operand that gets replaced: the third operand
        of ADD/SUB, the second operand of CMP/CMN }
      if IsArith then
        SrcIdx:=2
      else
        SrcIdx:=1;

      taicpu(hp1).loadReg(SrcIdx,SrcReg);
      if not(shiftmode in [SM_SXTX,SM_UXTX,SM_LSL]) then
        setsubreg(taicpu(hp1).oper[SrcIdx]^.reg,R_SUBD);

      { One more operand: the extend specifier directly after the register }
      taicpu(hp1).ops:=SrcIdx+2;
      shifterop_reset(ExtendOp);
      ExtendOp.shiftmode:=shiftmode;
      ExtendOp.shiftimm:=0;
      taicpu(hp1).loadshifterop(SrcIdx+1,ExtendOp);

      Result:=RemoveCurrentP(p);
    end;
  {$endif AARCH64}
  { Searches forward from Current for an instruction that references reg.

    Returns True with Next pointing at that instruction. Returns False when
    the instruction stream ends, when the optimisation level is below 3 and
    the directly following instruction does not use reg, or when the search
    runs into a non-instruction, a call/jump or (ARM only) an instruction
    that modifies PC. In the False case Next holds the last item visited. }
  function TARMAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
    Out Next: tai; const reg: TRegister): Boolean;
    begin
      Result := False;
      Next := Current;
      while GetNextInstruction(Next,Next) do
        begin
          if RegInInstruction(reg,Next) then
            { Found something }
            Exit(True);
          { Only look further than one instruction on -O3 and above }
          if not(cs_opt_level3 in current_settings.optimizerswitches) then
            Break;
          if Next.typ<>ait_instruction then
            Break;
          if is_calljmp(taicpu(Next).opcode) then
            Break;
  {$ifdef ARM}
          if RegModifiedByInstruction(NR_PC,Next) then
            Break;
  {$endif ARM}
        end;
    end;
  { Tries to turn

        <op>  reg0, ...        (p)
        mov   reg2, reg0       (movp)
        dealloc reg0

    into

        <op>  reg2, ...

    p         - instruction that loads a new value into its first operand
    movp      - candidate two-operand MOV with the same condition as p
    optimizer - name of the calling optimization, only used in the debug message

    Returns True when movp was removed and p was retargeted. Nothing is
    changed when False is returned. }
  function TARMAsmOptimizer.RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string):boolean;
    var
      alloc,
      dealloc : tai_regalloc;
    begin
      Result:=false;
      if MatchInstruction(movp, A_MOV, [taicpu(p).condition], [PF_None]) and
        { We can't optimize if there is a shiftop }
        (taicpu(movp).ops=2) and
        MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
        { don't mess with moves to fp }
        (taicpu(movp).oper[0]^.reg<>current_procinfo.framepointer) and
        { the destination register of the mov must not be used between p and movp }
        not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
  {$ifdef ARM}
        { PC should be changed only by moves }
        (taicpu(movp).oper[0]^.reg<>NR_PC) and
        { cb[n]z are thumb instructions which require specific registers, with no wide forms }
        (taicpu(p).opcode<>A_CBZ) and
        (taicpu(p).opcode<>A_CBNZ) and
        { There is a special requirement for MUL and MLA, oper[0] and oper[1] are not allowed to be the same }
        not (
          (taicpu(p).opcode in [A_MLA, A_MUL]) and
          (taicpu(p).oper[1]^.reg = taicpu(movp).oper[0]^.reg) and
          (current_settings.cputype < cpu_armv6)
        ) and
  {$endif ARM}
        { Take care to only do this for instructions which REALLY load to the first register.
          Otherwise
            str reg0, [reg1]
            mov reg2, reg0
          will be optimized to
            str reg2, [reg1]
        }
        RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
        begin
          { The temporary register must die after movp, otherwise its value is still needed }
          dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
          if assigned(dealloc) then
            begin
              DebugMsg('Peephole '+optimizer+' removed superfluous mov', movp);
              result:=true;

              { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
                and remove it if possible }
              asml.Remove(dealloc);
              alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
              if assigned(alloc) then
                begin
                  asml.Remove(alloc);
                  alloc.free;
                  dealloc.free;
                end
              else
                { No allocation found: keep the deallocation, but move it directly behind p }
                asml.InsertAfter(dealloc,p);

              AllocRegBetween(taicpu(movp).oper[0]^.reg,p,movp,UsedRegs);

              { finally get rid of the mov }
              taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);

              { Remove preindexing and postindexing for LDR in some cases.
                For example:
                  ldr reg2,[reg1, xxx]!
                  mov reg1,reg2
                must be translated to:
                  ldr reg1,[reg1, xxx]

                Preindexing must be removed there, since the same register is used as the base and as the target.
                Such case is not allowed for ARM CPU and produces crash. }
              if (taicpu(p).opcode = A_LDR) and (taicpu(p).oper[1]^.typ = top_ref)
                and (taicpu(movp).oper[0]^.reg = taicpu(p).oper[1]^.ref^.base)
              then
                taicpu(p).oper[1]^.ref^.addressmode:=AM_OFFSET;

              asml.remove(movp);
              movp.free;
            end;
        end;
    end;
  { Tries to bypass or eliminate the two-operand register-to-register MOV at p.

    p   - the MOV under examination. When the MOV is removed, p is either set
          to hp1 (first strategy) or passed to RemoveCurrentP (second strategy).
    hp1 - candidate instruction following p. It is set to nil whenever this
          routine frees the instruction it points to.

    Two strategies are tried, in this order:
      1. hp1 is an arithmetic/logical/move instruction with the same condition
         that writes the MOV destination and reads it as a source: the source
         is replaced by the MOV source and the MOV is deleted.
      2. Otherwise, for an unconditional MOV, the following instructions are
         scanned (only one instruction below optimisation level 3) and uses of
         the MOV destination in LDR/STR references, STR sources and MOV sources
         are replaced by the MOV source; the MOV is deleted once its
         destination is no longer needed.

    Returns True only when p itself was removed. Later instructions may have
    been modified or deleted even when False is returned. }
  303. function TARMAsmOptimizer.RedundantMovProcess(var p: tai; var hp1: tai):boolean;
  304. var
  305. I: Integer;
  306. current_hp, next_hp: tai;
  307. LDRChange: Boolean;
  308. begin
  309. Result:=false;
  310. {
  311. change
  312. mov r1, r0
  313. add r1, r1, #1
  314. to
  315. add r1, r0, #1
  316. Todo: Make it work for mov+cmp too
  317. CAUTION! If this one is successful p might not be a mov instruction anymore!
  318. }
  319. if (taicpu(p).ops = 2) and
  320. (taicpu(p).oper[1]^.typ = top_reg) and
  321. (taicpu(p).oppostfix = PF_NONE) then
  322. begin
  323. if
  324. MatchInstruction(hp1, [A_ADD, A_ADC,
  325. {$ifdef ARM}
  326. A_RSB, A_RSC,
  327. {$endif ARM}
  328. A_SUB, A_SBC,
  329. A_AND, A_BIC, A_EOR, A_ORR, A_MOV, A_MVN],
  330. [taicpu(p).condition], []) and
  331. { MOV and MVN might only have 2 ops }
  332. (taicpu(hp1).ops >= 2) and
  333. MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg) and
  334. (taicpu(hp1).oper[1]^.typ = top_reg) and
  335. (
  336. (taicpu(hp1).ops = 2) or
  337. (taicpu(hp1).oper[2]^.typ in [top_reg, top_const, top_shifterop])
  338. ) and
  339. {$ifdef AARCH64}
  340. (taicpu(p).oper[1]^.reg<>NR_SP) and
  341. { in this case you have to transform it to movk or the like }
  342. (getsupreg(taicpu(p).oper[1]^.reg)<>RS_XZR) and
  343. {$endif AARCH64}
  344. not(RegUsedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
  345. begin
  346. { When we get here we still don't know if the registers match }
  347. for I:=1 to 2 do
  348. {
  349. If the first loop was successful p will be replaced with hp1.
  350. The checks will still be ok, because all required information
  351. will also be in hp1 then.
  352. }
  353. if (taicpu(hp1).ops > I) and
  354. MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg)
  355. {$ifdef ARM}
  356. { prevent certain combinations on thumb(2), this is only a safe approximation }
  357. and (not(GenerateThumbCode or GenerateThumb2Code) or
  358. ((getsupreg(taicpu(p).oper[1]^.reg)<>RS_R13) and
  359. (getsupreg(taicpu(p).oper[1]^.reg)<>RS_R15)))
  360. {$endif ARM}
  361. then
  362. begin
  363. DebugMsg('Peephole RedundantMovProcess done', hp1);
  364. taicpu(hp1).oper[I]^.reg := taicpu(p).oper[1]^.reg;
  365. if p<>hp1 then
  366. begin
  367. asml.remove(p);
  368. p.free;
  369. p:=hp1;
  370. Result:=true;
  371. end;
  372. end;
  373. if Result then Exit;
  374. end
  375. { Change: Change:
  376. mov r1, r0 mov r1, r0
  377. ... ...
  378. ldr/str r2, [r1, etc.] mov r2, r1
  379. To: To:
  380. ldr/str r2, [r0, etc.] mov r2, r0
  381. }
  382. else if (taicpu(p).condition = C_None) and (taicpu(p).oper[1]^.typ = top_reg)
  383. {$ifdef ARM}
  384. and not (getsupreg(taicpu(p).oper[0]^.reg) in [RS_PC, RS_R14, RS_STACK_POINTER_REG])
  385. and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_PC)
  386. { Thumb does not support references with base and index one being SP }
  387. and (not(GenerateThumbCode) or (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG))
  388. {$endif ARM}
  389. {$ifdef AARCH64}
  390. and (getsupreg(taicpu(p).oper[0]^.reg) <> RS_STACK_POINTER_REG)
  391. {$endif AARCH64}
  392. then
  393. begin
  394. current_hp := p;
  { NOTE(review): presumably snapshots the current register usage into
    TmpUsedRegs so that the scan below can advance it independently of
    UsedRegs - confirm against TransferUsedRegs }
  395. TransferUsedRegs(TmpUsedRegs);
  396. { Search local instruction block }
  397. while GetNextInstruction(current_hp, next_hp) and (next_hp <> BlockEnd) and (next_hp.typ = ait_instruction) do
  398. begin
  399. UpdateUsedRegs(TmpUsedRegs, tai(current_hp.Next));
  400. LDRChange := False;
  401. if (taicpu(next_hp).opcode in [A_LDR,A_STR]) and (taicpu(next_hp).ops = 2)
  402. {$ifdef AARCH64}
  403. { If r0 is the zero register, then this sequence of instructions will cause
  404. an access violation, but that's better than an assembler error caused by
  405. changing r0 to xzr inside the reference (Where it's illegal). [Kit] }
  406. and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_XZR)
  407. {$endif AARCH64}
  408. then
  409. begin
  410. { Change the registers from r1 to r0 }
  411. if (taicpu(next_hp).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) and
  412. {$ifdef ARM}
  413. { This optimisation conflicts with something and raises
  414. an access violation - needs further investigation. [Kit] }
  415. (taicpu(next_hp).opcode <> A_LDR) and
  416. {$endif ARM}
  417. { Don't mess around with the base register if the
  418. reference is pre- or post-indexed }
  419. (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_OFFSET) then
  420. begin
  421. taicpu(next_hp).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
  422. LDRChange := True;
  423. end;
  424. if taicpu(next_hp).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
  425. begin
  426. taicpu(next_hp).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
  427. LDRChange := True;
  428. end;
  429. if LDRChange then
  430. DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 1)', next_hp);
  431. { Drop out if we're dealing with pre-indexed references }
  432. if (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_PREINDEXED) and
  433. (
  434. RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) or
  435. RegInRef(taicpu(p).oper[1]^.reg, taicpu(next_hp).oper[1]^.ref^)
  436. ) then
  437. begin
  438. { Remember to update register allocations }
  439. if LDRChange then
  440. AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
  441. Break;
  442. end;
  443. { The register being stored can be potentially changed (as long as it's not the stack pointer) }
  444. if (taicpu(next_hp).opcode = A_STR) and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
  445. MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then
  446. begin
  447. DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 2)', next_hp);
  448. taicpu(next_hp).oper[0]^.reg := taicpu(p).oper[1]^.reg;
  449. LDRChange := True;
  450. end;
  451. if LDRChange and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) then
  452. begin
  453. AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
  454. if (taicpu(p).oppostfix = PF_None) and
  455. (
  456. (
  457. (taicpu(next_hp).opcode = A_LDR) and
  458. MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg)
  459. ) or
  460. not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs)
  461. ) and
  462. { Double-check to see if the old registers were actually
  463. changed (e.g. if the super registers matched, but not
  464. the sizes, they won't be changed). }
  465. (
  466. (taicpu(next_hp).opcode = A_LDR) or
  467. not RegInOp(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[0]^)
  468. ) and
  469. not RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) then
  470. begin
  471. DebugMsg('Peephole Optimization: RedundantMovProcess 2a done', p);
  472. RemoveCurrentP(p);
  473. Result := True;
  474. Exit;
  475. end;
  476. end;
  477. end
  478. else if (taicpu(next_hp).opcode = A_MOV) and (taicpu(next_hp).oppostfix = PF_None) and
  479. (taicpu(next_hp).ops = 2) then
  480. begin
  481. if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then
  482. begin
  483. { mov r0,r1; mov r1,r1 - remove second MOV here so
  484. "RedundantMovProcess 2b" doesn't get erroneously
  485. applied }
  486. if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(next_hp).oper[1]^.reg) then
  487. begin
  488. DebugMsg(SPeepholeOptimization + 'Mov2None 2a done', next_hp);
  489. if (next_hp = hp1) then
  490. { Don't let hp1 become a dangling pointer }
  491. hp1 := nil;
  492. asml.Remove(next_hp);
  493. next_hp.Free;
  494. Continue;
  495. end;
  496. { Found another mov that writes entirely to the register }
  497. if RegUsedBetween(taicpu(p).oper[0]^.reg, p, next_hp) then
  498. begin
  499. { Register was used beforehand }
  500. if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[1]^.reg) then
  501. begin
  502. { This MOV is exactly the same as the first one.
  503. Since none of the registers have changed value
  504. at this point, we can remove it. }
  505. DebugMsg(SPeepholeOptimization + 'RedundantMovProcess 3a done', next_hp);
  506. if (next_hp = hp1) then
  507. { Don't let hp1 become a dangling pointer }
  508. hp1 := nil;
  509. asml.Remove(next_hp);
  510. next_hp.Free;
  511. { We still have the original p, so we can continue optimising;
  512. if it was -O2 or below, this instruction appeared immediately
  513. after the first MOV, so we're technically not looking more
  514. than one instruction ahead after it's removed! [Kit] }
  515. Continue;
  516. end
  517. else
  518. { Register changes value - drop out }
  519. Break;
  520. end;
  521. { We can delete the first MOV (only if the second MOV is unconditional) }
  522. {$ifdef ARM}
  523. if (taicpu(p).oppostfix = PF_None) and
  524. (taicpu(next_hp).condition = C_None) then
  525. {$endif ARM}
  526. begin
  527. DebugMsg('Peephole Optimization: RedundantMovProcess 2b done', p);
  528. RemoveCurrentP(p);
  529. Result := True;
  530. end;
  531. Exit;
  532. end
  533. else if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[0]^.reg) then
  534. begin
  535. if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg)
  536. { Be careful - if the entire register is not used, removing this
  537. instruction will leave the unused part uninitialised }
  538. {$ifdef AARCH64}
  539. and (getsubreg(taicpu(p).oper[1]^.reg) = R_SUBQ)
  540. {$endif AARCH64}
  541. then
  542. begin
  543. { Instruction will become mov r1,r1 }
  544. DebugMsg(SPeepholeOptimization + 'Mov2None 2 done', next_hp);
  545. { Allocate r1 between the instructions; not doing
  546. so may cause problems when removing superfluous
  547. MOVs later (i38055) }
  548. AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
  549. if (next_hp = hp1) then
  550. { Don't let hp1 become a dangling pointer }
  551. hp1 := nil;
  552. asml.Remove(next_hp);
  553. next_hp.Free;
  554. Continue;
  555. end;
  556. { Change the old register (checking the first operand again
  557. forces it to be left alone if the full register is not
  558. used, lest mov w1,w1 gets optimised out by mistake. [Kit] }
  559. {$ifdef AARCH64}
  560. if not MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg) then
  561. {$endif AARCH64}
  562. begin
  563. DebugMsg(SPeepholeOptimization + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovMov2Mov 2)', next_hp);
  564. taicpu(next_hp).oper[1]^.reg := taicpu(p).oper[1]^.reg;
  565. AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
  566. { If this was the only reference to the old register,
  567. then we can remove the original MOV now }
  568. if (taicpu(p).oppostfix = PF_None) and
  569. { A bit of a hack - sometimes registers aren't tracked properly, so do not
  570. remove if the register was apparently not allocated when its value is
  571. first set at the MOV command (this is especially true for the stack
  572. register). [Kit] }
  573. (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
  574. RegInUsedRegs(taicpu(p).oper[0]^.reg, UsedRegs) and
  575. not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs) then
  576. begin
  577. DebugMsg(SPeepholeOptimization + 'RedundantMovProcess 2c done', p);
  578. RemoveCurrentP(p);
  579. Result := True;
  580. Exit;
  581. end;
  582. end;
  583. end;
  584. end;
  585. { On low optimisation settings, don't search more than one instruction ahead }
  586. if not(cs_opt_level3 in current_settings.optimizerswitches) or
  587. { Stop at procedure calls and jumps }
  588. is_calljmp(taicpu(next_hp).opcode) or
  589. { If the read register has changed value, or the MOV
  590. destination register has been used, drop out }
  591. RegInInstruction(taicpu(p).oper[0]^.reg, next_hp) or
  592. RegModifiedByInstruction(taicpu(p).oper[1]^.reg, next_hp) then
  593. Break;
  { Nothing stopped the scan: continue behind the instruction just examined }
  594. current_hp := next_hp;
  595. end;
  596. end;
  597. end;
  598. end;
  { Removes the extension instruction at p (UXTB/SXTB/UXTH/SXTH whose source
    and destination are the same register) when the following instruction hp1
    is an unconditional three-operand arithmetic or bitwise operation with an
    immediate that reads and writes that same register, and either
      - hp1 is an AND whose immediate lies in 0..ConstLimit, or
      - the next instruction using the register narrows the value again (an
        extension instruction from the accepted set, or a store with a
        postfix from ValidPostFixes) and the register dies there.

    p   - the extension instruction; removed through RemoveCurrentP on success
    hp1 - the instruction following p that uses the register

    Returns the result of RemoveCurrentP when the optimization is applied and
    False otherwise. Raises internal error 2024051401 when p is not one of the
    four extension opcodes. }
  599. function TARMAsmOptimizer.DoXTArithOp(var p: tai; hp1: tai): Boolean;
  600. var
  601. hp2: tai;
  602. ConstLimit: TCGInt;
  603. ValidPostFixes: TOpPostFixes;
  604. FirstCode, SecondCode, ThirdCode, FourthCode: TAsmOp;
  605. begin
  606. Result := False;
  607. { Change:
  608. uxtb/h reg1,reg1
  609. (operation on reg1 with immediate operand where the upper 24/56
  610. bits don't affect the state of the first 8 bits )
  611. uxtb/h reg1,reg1
  612. Remove first uxtb/h
  613. }
  { Select the width-specific constant limit, the store postfixes and the
    extension opcodes that are accepted as the second, re-narrowing instruction }
  614. case taicpu(p).opcode of
  615. A_UXTB,
  616. A_SXTB:
  617. begin
  618. ConstLimit := $FF;
  619. ValidPostFixes := [PF_B];
  620. FirstCode := A_UXTB;
  621. SecondCode := A_SXTB;
  622. ThirdCode := A_UXTB; { Used to indicate no other valid codes }
  623. FourthCode := A_SXTB;
  624. end;
  625. A_UXTH,
  626. A_SXTH:
  627. begin
  628. ConstLimit := $FFFF;
  629. ValidPostFixes := [PF_B, PF_H];
  630. FirstCode := A_UXTH;
  631. SecondCode := A_SXTH;
  632. ThirdCode := A_UXTB;
  633. FourthCode := A_SXTB;
  634. end;
  635. else
  636. InternalError(2024051401);
  637. end;
  638. {$ifndef AARCH64}
  639. { Regular ARM doesn't have the multi-instruction MatchInstruction available }
  640. if (hp1.typ = ait_instruction) and (taicpu(hp1).oppostfix = PF_None) then
  641. case taicpu(hp1).opcode of
  642. A_ADD, A_SUB, A_MUL, A_LSL, A_AND, A_ORR, A_EOR, A_BIC, A_ORN:
  643. {$endif AARCH64}
  644. if
  645. (taicpu(p).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
  646. {$ifdef AARCH64}
  647. MatchInstruction(hp1, [A_ADD, A_SUB, A_MUL, A_LSL, A_AND, A_ORR, A_EOR, A_BIC, A_ORN, A_EON], [PF_None]) and
  648. {$endif AARCH64}
  649. (taicpu(hp1).condition = C_None) and
  650. (taicpu(hp1).ops = 3) and
  651. (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
  652. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
  653. (taicpu(hp1).oper[2]^.typ = top_const) and
  654. (
  655. (
  656. { If the AND immediate is 8-bit, then this essentially performs
  657. the functionality of the second UXTB and so its presence is
  658. not required }
  659. (taicpu(hp1).opcode = A_AND) and
  660. (taicpu(hp1).oper[2]^.val >= 0) and
  661. (taicpu(hp1).oper[2]^.val <= ConstLimit)
  662. ) or
  663. (
  664. GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[0]^.reg) and
  665. (hp2.typ = ait_instruction) and
  666. (taicpu(hp2).ops = 2) and
  667. (taicpu(hp2).condition = C_None) and
  668. (
  669. (
  670. (taicpu(hp2).opcode in [FirstCode, SecondCode, ThirdCode, FourthCode]) and
  671. (taicpu(hp2).oppostfix = PF_None) and
  672. (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[0]^.reg)
  673. { Destination is allowed to be different in this case, but
  674. only if the source is no longer in use (it being the same as
  675. the source is covered by RegEndOfLife as well) }
  676. ) or
  677. (
  678. { STRB essentially fills the same role as the second UXTB
  679. as long as the register is deallocated afterwards }
  680. MatchInstruction(hp2, A_STR, [C_None], ValidPostFixes) and
  681. (taicpu(hp2).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
  682. not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp2).oper[1]^)
  683. )
  684. ) and
  685. RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp2))
  686. )
  687. ) then
  688. begin
  689. DebugMsg(SPeepholeOptimization + 'S/Uxtb/hArithUxtb/h2ArithS/Uxtb/h done', p);
  690. Result := RemoveCurrentP(p);
  691. { Simplify bitwise constants if able }
  692. {$ifdef AARCH64}
  693. if (taicpu(hp1).opcode in [A_AND, A_ORR, A_EOR, A_BIC, A_ORN, A_EON]) and
  694. is_shifter_const(taicpu(hp1).oper[2]^.val and ConstLimit, OS_32) then
  695. {$else AARCH64}
  696. if (
  697. (ConstLimit = $FF) or
  698. (taicpu(hp1).oper[2]^.val <= $100)
  699. ) and
  700. (taicpu(hp1).opcode in [A_AND, A_ORR, A_EOR, A_BIC, A_ORN]) then
  701. {$endif AARCH64}
  702. taicpu(hp1).oper[2]^.val := taicpu(hp1).oper[2]^.val and ConstLimit;
  703. end;
  { Non-AArch64 builds only: closes the case statement opened above; every
    other opcode at hp1 is ignored }
  704. {$ifndef AARCH64}
  705. else
  706. ;
  707. end;
  708. {$endif not AARCH64}
  709. end;
  { Pass 1 peephole optimizations for a two-operand UXTB without postfix at p.

    With hp1 being the next instruction that uses the destination of p, the
    following rewrites are tried in order for an unconditional p:
      - uxtb + strb              -> strb of the original source register
      - uxtb + uxth              -> single uxtb
      - uxtb + uxtb              -> single uxtb
      - uxtb + and #imm          -> uxtb, when the low 8 bits of imm are all set
      - uxtb + immediate arithmetic that is narrowed again (DoXTArithOp)
      - AArch64 only: uxtb folded into ADD/SUB/CMP/CMN (USxtOp2Op)
    If none of these applied, a following superfluous MOV is folded into p;
    this last step is also tried for a conditional p.

    Returns True when the instruction stream was changed. p may refer to a
    different instruction afterwards. }
  function TARMAsmOptimizer.OptPass1UXTB(var p : tai) : Boolean;
    var
      hp1: tai;
    begin
      Result:=false;
      if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
        (taicpu(p).oppostfix = PF_None) and
        (taicpu(p).ops = 2) then
        begin
          if (taicpu(p).condition = C_None) then
            begin
              {
                change
                uxtb reg2,reg1
                strb reg2,[...]
                dealloc reg2
                to
                strb reg1,[...]
              }
              if MatchInstruction(hp1, A_STR, [C_None], [PF_B]) and
                assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
                { the reference in strb must not use reg2 }
                not(RegInRef(taicpu(p).oper[0]^.reg,taicpu(hp1).oper[1]^.ref^)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole UxtbStrb2Strb done', p);
                  taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
              {
                change
                uxtb reg2,reg1
                uxth reg3,reg2
                dealloc reg2
                to
                uxtb reg3,reg1
              }
              else if MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]) and
                (taicpu(hp1).ops = 2) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole UxtbUxth2Uxtb done', p);
                  AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
                  taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
                  asml.remove(hp1);
                  hp1.free;
                  result:=true;
                end
              {
                change
                uxtb reg2,reg1
                uxtb reg3,reg2
                dealloc reg2
                to
                uxtb reg3,reg1
              }
              else if MatchInstruction(hp1, A_UXTB, [C_None], [PF_None]) and
                (taicpu(hp1).ops = 2) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole UxtbUxtb2Uxtb done', p);
                  AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
                  taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
                  asml.remove(hp1);
                  hp1.free;
                  result:=true;
                end
              {
                change
                uxtb reg2,reg1
                and reg3,reg2,#0x*FF
                dealloc reg2
                to
                uxtb reg3,reg1
              }
              else if MatchInstruction(hp1, A_AND, [C_None], [PF_None]) and
                (taicpu(hp1).ops=3) and
                (taicpu(hp1).oper[2]^.typ=top_const) and
                ((taicpu(hp1).oper[2]^.val and $FF)=$FF) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole UxtbAndImm2Uxtb done', p);
                  { Reuse the AND as the surviving UXTB; its immediate operand is dropped }
                  taicpu(hp1).opcode:=A_UXTB;
                  taicpu(hp1).ops:=2;
                  taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
              else if DoXTArithOp(p, hp1) then
                Result:=true
  {$ifdef AARCH64}
              else if USxtOp2Op(p,hp1,SM_UXTB) then
                Result:=true
  {$endif AARCH64}
            end;
          { Condition doesn't have to be C_None }
          if not Result and
            RemoveSuperfluousMove(p, hp1, 'UxtbMov2Uxtb') then
            Result:=true;
        end;
    end;
  { Pass 1 peephole optimisations for "uxth reg2,reg1" (p).

    Looks at the next instruction that uses reg2 and, when p is unconditional,
    tries the following rewrites in order:
      - uxth + strh            -> strh reg1,[...]
      - uxth + uxth            -> uxth reg3,reg1
      - uxth + and #..FFFF     -> uxth reg3,reg1
      - DoXTArithOp and, under AArch64, USxtOp2Op with SM_UXTH
    If none of them reported success, RemoveSuperfluousMove is tried regardless
    of the condition of p.

    The result is True when one of the rewrites reported success. }
  function TARMAsmOptimizer.OptPass1UXTH(var p : tai) : Boolean;
    var
      NextUse: tai;
      so: tshifterop;
      ExtReg, SrcReg: TRegister;
    begin
      Result:=false;

      { Only the plain two-operand form is handled, and only when a later
        instruction using the destination register can be found }
      if not(GetNextInstructionUsingReg(p,NextUse,taicpu(p).oper[0]^.reg)) or
        (taicpu(p).oppostfix<>PF_None) or
        (taicpu(p).ops<>2) then
        exit;

      { reg2 (destination of the extension) and reg1 (its source) }
      ExtReg:=taicpu(p).oper[0]^.reg;
      SrcReg:=taicpu(p).oper[1]^.reg;

      if taicpu(p).condition=C_None then
        begin
          {
            change
              uxth reg2,reg1
              strh reg2,[...]
              dealloc reg2
            to
              strh reg1,[...]
          }
          if MatchInstruction(NextUse, A_STR, [C_None], [PF_H]) and
            RegEndofLife(ExtReg,taicpu(NextUse)) and
            { the reference of the strh must not use reg2 }
            not(RegInRef(ExtReg,taicpu(NextUse).oper[1]^.ref^)) and
            { reg1 must not be modified in between }
            not(RegModifiedBetween(SrcReg,p,NextUse)) then
            begin
              DebugMsg('Peephole UXTHStrh2Strh done', p);
              taicpu(NextUse).loadReg(0,SrcReg);
              Result:=RemoveCurrentP(p);
            end
          {
            change
              uxth reg2,reg1
              uxth reg3,reg2
              dealloc reg2
            to
              uxth reg3,reg1
          }
          else if MatchInstruction(NextUse, A_UXTH, [C_None], [PF_None]) and
            (taicpu(NextUse).ops=2) and
            MatchOperand(taicpu(NextUse).oper[1]^, ExtReg) and
            RegEndofLife(ExtReg,taicpu(NextUse)) and
            { reg1 must not be modified in between }
            not(RegModifiedBetween(SrcReg,p,NextUse)) then
            begin
              DebugMsg('Peephole UxthUxth2Uxth done', p);
              AllocRegBetween(SrcReg,p,NextUse,UsedRegs);
              taicpu(NextUse).opcode:=A_UXTH;
              taicpu(NextUse).loadReg(1,SrcReg);
              Result:=RemoveCurrentP(p);
            end
          {
            change
              uxth reg2,reg1
              and reg3,reg2,#0x*FFFF
              dealloc reg2
            to
              uxth reg3,reg1

            Any mask whose low 16 bits are all set is fine here because the
            upper bits of reg2 are already zero after the uxth.
          }
          else if MatchInstruction(NextUse, A_AND, [C_None], [PF_None]) and
            (taicpu(NextUse).ops=3) and
            (taicpu(NextUse).oper[2]^.typ=top_const) and
            ((taicpu(NextUse).oper[2]^.val and $FFFF)=$FFFF) and
            MatchOperand(taicpu(NextUse).oper[1]^, ExtReg) and
            RegEndofLife(ExtReg,taicpu(NextUse)) and
            { reg1 must not be modified in between }
            not(RegModifiedBetween(SrcReg,p,NextUse)) then
            begin
              DebugMsg('Peephole UxthAndImm2Uxth done', p);
              taicpu(NextUse).opcode:=A_UXTH;
              taicpu(NextUse).ops:=2;
              taicpu(NextUse).loadReg(1,SrcReg);
              Result:=RemoveCurrentP(p);
            end
          else if DoXTArithOp(p, NextUse) then
            Result:=true
{$ifdef AARCH64}
          else if USxtOp2Op(p,NextUse,SM_UXTH) then
            Result:=true
{$endif AARCH64}
        end;

      { This one does not require the condition to be C_None }
      if not(Result) and
        RemoveSuperfluousMove(p, NextUse, 'UxthMov2Data') then
        Result:=true;
    end;
  { Pass 1 peephole optimisations for "sxtb reg2,reg1" (p).

    Looks at the next instruction that uses reg2 and, when p is unconditional,
    tries the following rewrites in order:
      - sxtb + strb          -> strb reg1,[...]
      - sxtb + sxth          -> sxtb reg3,reg1
      - sxtb + sxtb          -> sxtb reg3,reg1
      - sxtb + and #0xFF     -> uxtb reg3,reg1
      - DoXTArithOp and, under AArch64, USxtOp2Op with SM_SXTB
    If none of them reported success, RemoveSuperfluousMove is tried regardless
    of the condition of p.

    The result is True when one of the rewrites reported success. }
  function TARMAsmOptimizer.OptPass1SXTB(var p : tai) : Boolean;
    var
      hp1: tai;
    begin
      Result:=false;
      if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
        (taicpu(p).oppostfix = PF_None) and
        (taicpu(p).ops = 2) then
        begin
          if (taicpu(p).condition = C_None) then
            begin
              {
                change
                  sxtb reg2,reg1
                  strb reg2,[...]
                  dealloc reg2
                to
                  strb reg1,[...]
              }
              if MatchInstruction(hp1, A_STR, [C_None], [PF_B]) and
                assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
                { the reference of the strb must not use reg2 }
                not(RegInRef(taicpu(p).oper[0]^.reg,taicpu(hp1).oper[1]^.ref^)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxtbStrb2Strb done', p);
                  taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
              {
                change
                  sxtb reg2,reg1
                  sxth reg3,reg2
                  dealloc reg2
                to
                  sxtb reg3,reg1
              }
              else if MatchInstruction(hp1, A_SXTH, [C_None], [PF_None]) and
                (taicpu(hp1).ops = 2) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxtbSxth2Sxtb done', p);
                  AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
                  taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
                  asml.remove(hp1);
                  hp1.free;
                  result:=true;
                end
              {
                change
                  sxtb reg2,reg1
                  sxtb reg3,reg2
                  dealloc reg2
                to
                  sxtb reg3,reg1
              }
              else if MatchInstruction(hp1, A_SXTB, [C_None], [PF_None]) and
                (taicpu(hp1).ops = 2) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxtbSxtb2Sxtb done', p);
                  AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
                  taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
                  asml.remove(hp1);
                  hp1.free;
                  result:=true;
                end
              {
                change
                  sxtb reg2,reg1
                  and reg3,reg2,#0xFF
                  dealloc reg2
                to
                  uxtb reg3,reg1

                The mask has to be exactly $FF: after the sign extension the
                bits above bit 7 of reg2 are copies of the sign bit, so a wider
                mask (e.g. $FFFF) would keep some of them while uxtb clears
                them all.
              }
              else if MatchInstruction(hp1, A_AND, [C_None], [PF_None]) and
                (taicpu(hp1).ops=3) and
                (taicpu(hp1).oper[2]^.typ=top_const) and
                (taicpu(hp1).oper[2]^.val=$FF) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxtbAndImm2Uxtb done', p);
                  taicpu(hp1).opcode:=A_UXTB;
                  taicpu(hp1).ops:=2;
                  taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
              else if DoXTArithOp(p, hp1) then
                Result:=true
{$ifdef AARCH64}
              else if USxtOp2Op(p,hp1,SM_SXTB) then
                Result:=true
{$endif AARCH64}
            end;

          { Condition doesn't have to be C_None }
          if not Result and
            RemoveSuperfluousMove(p, hp1, 'SxtbMov2Sxtb') then
            Result:=true;
        end;
    end;
  { Pass 1 peephole optimisations for "sxth reg2,reg1" (p).

    Looks at the next instruction that uses reg2 and, when p is unconditional,
    tries the following rewrites in order:
      - sxth + strh            -> strh reg1,[...]
      - sxth + sxth            -> sxth reg3,reg1
      - sxth + sxtw (AArch64)  -> sxth reg3,reg1
      - sxth + and #0xFFFF     -> uxth reg3,reg1
      - DoXTArithOp and, under AArch64, USxtOp2Op with SM_SXTH
    If none of them reported success, RemoveSuperfluousMove is tried regardless
    of the condition of p.

    The result is True when one of the rewrites reported success. }
  function TARMAsmOptimizer.OptPass1SXTH(var p : tai) : Boolean;
    var
      hp1: tai;
    begin
      Result:=false;
      if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
        (taicpu(p).oppostfix = PF_None) and
        (taicpu(p).ops = 2) then
        begin
          if (taicpu(p).condition = C_None) then
            begin
              { From here on p is known to be an unconditional two-operand
                instruction without postfix, so only the opcode of p still has
                to be verified where a rewrite depends on it }
              {
                change
                  sxth reg2,reg1
                  strh reg2,[...]
                  dealloc reg2
                to
                  strh reg1,[...]
              }
              if MatchInstruction(hp1, A_STR, [C_None], [PF_H]) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { the reference of the strh must not use reg2 }
                not(RegInRef(taicpu(p).oper[0]^.reg,taicpu(hp1).oper[1]^.ref^)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxthStrh2Strh done', p);
                  taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
              {
                change
                  sxth reg2,reg1
                  sxth reg3,reg2
                  dealloc reg2
                to
                  sxth reg3,reg1
              }
              else if MatchInstruction(p, A_SXTH, [C_None], [PF_None]) and
                MatchInstruction(hp1, A_SXTH, [C_None], [PF_None]) and
                (taicpu(hp1).ops=2) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxthSxth2Sxth done', p);
                  AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
                  taicpu(hp1).opcode:=A_SXTH;
                  taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
{$ifdef AARCH64}
              {
                change
                  sxth reg2,reg1
                  sxtw reg3,reg2
                  dealloc reg2
                to
                  sxth reg3,reg1
              }
              else if MatchInstruction(p, A_SXTH, [C_None], [PF_None]) and
                MatchInstruction(hp1, A_SXTW, [C_None], [PF_None]) and
                (taicpu(hp1).ops=2) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxthSxtw2Sxth done', p);
                  AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
                  taicpu(hp1).opcode:=A_SXTH;
                  taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
{$endif AARCH64}
              {
                change
                  sxth reg2,reg1
                  and reg3,reg2,#65535
                  dealloc reg2
                to
                  uxth reg3,reg1

                The mask has to be exactly $FFFF: after the sign extension the
                bits above bit 15 of reg2 are copies of the sign bit, so a
                wider mask (e.g. $FFFFFF) would keep some of them while uxth
                clears them all.
              }
              else if MatchInstruction(p, A_SXTH, [C_None], [PF_None]) and
                MatchInstruction(hp1, A_AND, [C_None], [PF_None]) and
                (taicpu(hp1).ops=3) and
                (taicpu(hp1).oper[2]^.typ=top_const) and
                (taicpu(hp1).oper[2]^.val=$FFFF) and
                MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                { reg1 must not be modified in between }
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  DebugMsg('Peephole SxthAndImm2Uxth done', p);
                  taicpu(hp1).opcode:=A_UXTH;
                  taicpu(hp1).ops:=2;
                  taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                  result:=RemoveCurrentP(p);
                end
              else if DoXTArithOp(p, hp1) then
                Result:=true
{$ifdef AARCH64}
              else if USxtOp2Op(p,hp1,SM_SXTH) then
                Result:=true
{$endif AARCH64}
            end;

          { Condition doesn't have to be C_None }
          if not Result and
            RemoveSuperfluousMove(p, hp1, 'SxthMov2Sxth') then
            Result:=true;
        end;
    end;
  { Pre-pass simplification of a bitfield extract that covers the whole
    register:

      s/ubfx reg1,reg2,#0,#64   (or #32 for 32-bit registers)
    becomes
      mov reg1,reg2

    Returns True if p was rewritten. }
  function TARMAsmOptimizer.OptPreSBFXUBFX(var p: tai): Boolean;
    var
      CoversWholeReg: Boolean;
    begin
      Result := False;

      { The extracted field has to start at bit 0 }
      if taicpu(p).oper[2]^.val <> 0 then
        Exit;

      { ... and its width has to equal the width of the destination register }
{$ifdef AARCH64}
      case getsubreg(taicpu(p).oper[0]^.reg) of
        R_SUBQ:
          CoversWholeReg := (taicpu(p).oper[3]^.val = 64);
        R_SUBD:
          CoversWholeReg := (taicpu(p).oper[3]^.val = 32);
        else
          CoversWholeReg := False;
      end;
{$else AARCH64}
      CoversWholeReg := (taicpu(p).oper[3]^.val = 32);
{$endif AARCH64}

      if not CoversWholeReg then
        Exit;

      DebugMsg(SPeepholeOptimization + 'SBFX or UBFX -> MOV (full bitfield extract)', p);
      taicpu(p).opcode := A_MOV;
      taicpu(p).ops := 2;
      { Drop the two constant operands (lsb and width) }
      taicpu(p).clearop(2);
      taicpu(p).clearop(3);
      Result := True;
    end;
  { Pass 1 peephole optimisations for an unconditional two-operand
    "ldr reg1,[ref]" (p) that is directly followed by an unconditional
    load or store with the same postfix and an equal reference:

      ldr reg1,[ref]; str reg1,[ref]   -> the str is removed
      ldr reg1,[ref]; ldr reg1,[ref]   -> the second ldr is removed
      ldr reg1,[ref]; ldr reg2,[ref]   -> the second ldr becomes "mov reg2,reg1"

    Only references with addressmode AM_OFFSET that do not use reg1 are
    considered.  Every rewrite removes the memory access of the second
    instruction, therefore none of them is applied when that instruction's
    reference is marked volatile (this mirrors the checks in OptPass1STR).

    Returns True if the instruction stream was changed. }
  function TARMAsmOptimizer.OptPass1LDR(var p : tai) : Boolean;
    var
      hp1: tai;
      Reference: TReference;
    begin
      Result := False;
      if (taicpu(p).ops <> 2) or (taicpu(p).condition <> C_None) then
        Exit;

      Reference := taicpu(p).oper[1]^.ref^;
      if (Reference.addressmode = AM_OFFSET) and
        not RegInRef(taicpu(p).oper[0]^.reg, Reference) and
        { Delay calling GetNextInstruction for as long as possible }
        GetNextInstruction(p, hp1) and
        (hp1.typ = ait_instruction) and
        (taicpu(hp1).condition = C_None) and
        (taicpu(hp1).oppostfix = taicpu(p).oppostfix) then
        begin
          if (taicpu(hp1).opcode = A_STR) and
            RefsEqual(taicpu(hp1).oper[1]^.ref^, Reference) and
            { a volatile store must always be performed }
            (taicpu(hp1).oper[1]^.ref^.volatility=[]) and
            (getregtype(taicpu(p).oper[0]^.reg) = getregtype(taicpu(hp1).oper[0]^.reg)) then
            begin
              { With:
                  ldr reg1,[ref]
                  str reg2,[ref]

                If reg1 = reg2, Remove str
              }
              if taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg then
                begin
                  DebugMsg(SPeepholeOptimization + 'Removed redundant store instruction (load/store -> load/nop)', hp1);
                  RemoveInstruction(hp1);
                  Result := True;
                  Exit;
                end;
            end
          else if (taicpu(hp1).opcode = A_LDR) and
            RefsEqual(taicpu(hp1).oper[1]^.ref^, Reference) and
            { a volatile load must always be performed }
            (taicpu(hp1).oper[1]^.ref^.volatility=[]) then
            begin
              { With:
                  ldr reg1,[ref]
                  ldr reg2,[ref]

                If reg1 = reg2, delete the second ldr
                If reg1 <> reg2, changing the 2nd ldr to a mov might introduce
                  a dependency, but it will likely open up new optimisations, so
                  do it for now and handle any new dependencies later.
              }
              if taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg then
                begin
                  DebugMsg(SPeepholeOptimization + 'Removed duplicate load instruction (load/load -> load/nop)', hp1);
                  RemoveInstruction(hp1);
                  Result := True;
                  Exit;
                end
              else if
                (getregtype(taicpu(p).oper[0]^.reg) = R_INTREGISTER) and
                (getregtype(taicpu(hp1).oper[0]^.reg) = R_INTREGISTER) and
                (getsubreg(taicpu(p).oper[0]^.reg) = getsubreg(taicpu(hp1).oper[0]^.reg)) then
                begin
                  DebugMsg(SPeepholeOptimization + 'Changed second ldr' + oppostfix2str[taicpu(hp1).oppostfix] + ' to mov (load/load -> load/move)', hp1);
                  taicpu(hp1).opcode := A_MOV;
                  taicpu(hp1).oppostfix := PF_None;
                  taicpu(hp1).loadreg(1, taicpu(p).oper[0]^.reg);
                  { reg1 is now read by the mov, so keep it allocated up to there }
                  AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, UsedRegs);
                  Result := True;
                  Exit;
                end;
            end;
        end;
    end;
  { Pass 1 peephole optimisations for an unconditional two-operand
    "str reg1,[ref]" (p) that is directly followed by an unconditional
    load or store with the same postfix, a register as first operand and an
    equal reference:

      str reg1,[ref]; ldr reg1,[ref]   -> the ldr is removed (no postfix only)
      str reg1,[ref]; ldr reg2,[ref]   -> the ldr becomes a mov/extension of reg1
      str reg1,[ref]; str reg1,[ref]   -> the second str is removed
      str reg1,[ref]; str reg2,[ref]   -> the first str is removed

    Only references with addressmode AM_OFFSET that do not use reg1 are
    considered, and accesses through volatile references are left alone.

    Returns True if the instruction stream was changed. }
  function TARMAsmOptimizer.OptPass1STR(var p : tai) : Boolean;
    var
      NextP: tai;
      StoreRef: TReference;
      SizeMismatch: Boolean;
      StoredReg, NextReg: TRegister;
      NewOp: TAsmOp;
    begin
      Result := False;
      if (taicpu(p).ops <> 2) or (taicpu(p).condition <> C_None) then
        Exit;

      StoreRef := taicpu(p).oper[1]^.ref^;

      if not(
          (StoreRef.addressmode = AM_OFFSET) and
          not RegInRef(taicpu(p).oper[0]^.reg, StoreRef) and
          { Delay calling GetNextInstruction for as long as possible }
          GetNextInstruction(p, NextP) and
          (NextP.typ = ait_instruction) and
          (taicpu(NextP).condition = C_None) and
          (taicpu(NextP).oppostfix = taicpu(p).oppostfix) and
          (taicpu(NextP).ops>0) and (taicpu(NextP).oper[0]^.typ=top_reg)
        ) then
        Exit;

      { Saves constant dereferencing }
      StoredReg := taicpu(p).oper[0]^.reg;
      NextReg := taicpu(NextP).oper[0]^.reg;

      if (taicpu(NextP).opcode = A_LDR) and
        RefsEqual(taicpu(NextP).oper[1]^.ref^, StoreRef) and
        (taicpu(NextP).oper[1]^.ref^.volatility=[]) and
        (
          { Note: the guard above already demands identical postfixes, so at
            present only this first alternative can be satisfied }
          (taicpu(NextP).oppostfix = taicpu(p).oppostfix) or
          ((taicpu(p).oppostfix = PF_B) and (taicpu(NextP).oppostfix = PF_SB)) or
          ((taicpu(p).oppostfix = PF_H) and (taicpu(NextP).oppostfix = PF_SH))
{$ifdef AARCH64}
          or ((taicpu(p).oppostfix = PF_W) and (taicpu(NextP).oppostfix = PF_SW))
{$endif AARCH64}
        ) then
        begin
          { With:
              str reg1,[ref]
              ldr reg2,[ref]

            If reg1 = reg2, Remove ldr.
            If reg1 <> reg2, replace ldr with "mov reg2,reg1"
          }
          if (StoredReg = NextReg) and
            { e.g. the ldrb in strb/ldrb is not a null operation as it clears the upper 24 bits }
            (taicpu(p).oppostfix=PF_None) then
            begin
              DebugMsg(SPeepholeOptimization + 'Removed redundant load instruction (store/load -> store/nop)', NextP);
              RemoveInstruction(NextP);
              Result := True;
              Exit;
            end;

          if (getregtype(StoredReg) = R_INTREGISTER) and
            (getregtype(NextReg) = R_INTREGISTER) and
            (getsubreg(StoredReg) = getsubreg(NextReg)) then
            begin
              { Select the register-to-register instruction that reproduces
                what the load would have delivered; A_NONE means "leave the
                load alone" }
              NewOp:=A_NONE;

              if taicpu(NextP).oppostfix=PF_None then
                NewOp:=A_MOV
              else
{$ifdef ARM}
                if (current_settings.cputype < cpu_armv6) then
                  begin
                    { The zero- and sign-extension operations were only
                      introduced under ARMv6 }
                    case taicpu(NextP).oppostfix of
                      PF_B:
                        begin
                          { The if-block afterwards will set the middle operand to the correct register }
                          taicpu(NextP).allocate_oper(3);
                          taicpu(NextP).ops := 3;
                          taicpu(NextP).loadconst(2, $FF);
                          NewOp := A_AND;
                        end;
                      PF_H:
                        { ARMv5 and under doesn't have a concise way of storing the immediate $FFFF, so leave alone };
                      PF_SB,
                      PF_SH:
                        { Do nothing - can't easily encode sign-extensions };
                      else
                        InternalError(2021043002);
                    end;
                  end
                else
{$endif ARM}
                  case taicpu(NextP).oppostfix of
                    PF_B:
                      NewOp := A_UXTB;
                    PF_SB:
                      NewOp := A_SXTB;
                    PF_H:
                      NewOp := A_UXTH;
                    PF_SH:
                      NewOp := A_SXTH;
{$ifdef AARCH64}
                    PF_SW:
                      NewOp := A_SXTW;
                    PF_W:
                      NewOp := A_MOV;
{$endif AARCH64}
                    else
                      InternalError(2021043001);
                  end;

              if (NewOp<>A_None) then
                begin
                  DebugMsg(SPeepholeOptimization + 'Changed ldr' + oppostfix2str[taicpu(NextP).oppostfix] + ' to ' + gas_op2str[NewOp] + ' (store/load -> store/move)', NextP);
                  taicpu(NextP).oppostfix := PF_None;
                  taicpu(NextP).opcode := NewOp;
                  taicpu(NextP).loadreg(1, StoredReg);
                  { reg1 is now read by the new instruction, so keep it allocated up to there }
                  AllocRegBetween(StoredReg, p, NextP, UsedRegs);
                  Result := True;
                  Exit;
                end;
            end;
        end
      else if (taicpu(NextP).opcode = A_STR) and
        RefsEqual(taicpu(NextP).oper[1]^.ref^, StoreRef) then
        begin
          { With:
              str reg1,[ref]
              str reg2,[ref]

            If reg1 <> reg2, delete the first str
            If reg1 = reg2, delete the second str
          }
          if (StoredReg = NextReg) and (taicpu(NextP).oper[1]^.ref^.volatility=[]) then
            begin
              DebugMsg(SPeepholeOptimization + 'Removed duplicate store instruction (store/store -> store/nop)', NextP);
              RemoveInstruction(NextP);
              Result := True;
              Exit;
            end;

          if
            { Registers same byte size? }
            (tcgsize2size[reg_cgsize(StoredReg)] = tcgsize2size[reg_cgsize(NextReg)]) and
            (taicpu(p).oper[1]^.ref^.volatility=[]) then
            begin
              DebugMsg(SPeepholeOptimization + 'Removed dominated store instruction (store/store -> nop/store)', p);
              RemoveCurrentP(p, NextP);
              Result := True;
              Exit;
            end;
        end;
    end;
  { Pass 1 peephole optimisations for "and reg2,reg1,const1" (p).

    For the register/constant form the following rewrites are tried against
    the next instruction using reg2:
      - and + and            -> a single and (or "mov #0" if the masks are disjoint)
      - and #..FF + strb     -> strb reg1,[...]
      - and + uxtb/uxth      -> and reg3,reg1,const1 (mask fits the extension)
      - and + sxtb/sxth      -> and reg3,reg1,const1 (mask below $80)
      - and #2^n-1 + lsl (+ lsr/asr) -> drop either the and or the shift pair
    Independently of the operand types, a following superfluous mov is folded
    via RemoveSuperfluousMove when p has at least three operands.

    Returns True if the instruction stream was changed. }
  function TARMAsmOptimizer.OptPass1And(var p : tai) : Boolean;
    var
      hp1, hp2: tai;
      i: longint;
    begin
      Result:=false;
      {
        optimize
        and reg2,reg1,const1
        ...
      }
      if (taicpu(p).ops>2) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        (taicpu(p).oper[2]^.typ = top_const) then
        begin
          {
            change
            and reg2,reg1,const1
            ...
            and reg3,reg2,const2
            to
            and reg3,reg1,(const1 and const2)
          }
          if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
            MatchInstruction(hp1, A_AND, [taicpu(p).condition], [PF_None]) and
            { oper[2] of hp1 is inspected below, so it has to exist }
            (taicpu(hp1).ops>2) and
            RegEndOfLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
            MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
            (taicpu(hp1).oper[2]^.typ = top_const)
{$ifdef AARCH64}
            { the combined mask has to be encodable for the register size
              (R_SUBQ = 64 bit, R_SUBD = 32 bit), unless it is zero, in which
              case a "mov #0" is generated instead }
            and ((((getsubreg(taicpu(p).oper[0]^.reg)=R_SUBQ) and is_shifter_const(taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val,OS_64)) or
                  ((getsubreg(taicpu(p).oper[0]^.reg)=R_SUBD) and is_shifter_const(taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val,OS_32))
                 ) or
                 ((taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val)=0))
{$endif AARCH64}
            then
            begin
              if not(RegUsedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) then
                begin
                  { reg3 is free between p and hp1: fold everything into p and
                    let p write reg3 directly }
                  DebugMsg('Peephole AndAnd2And done', p);
                  AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
                  if (taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val)=0 then
                    begin
                      DebugMsg('Peephole AndAnd2Mov0 1 done', p);
                      taicpu(p).opcode:=A_MOV;
                      taicpu(p).ops:=2;
                      taicpu(p).loadConst(1,0);
                      taicpu(p).oppostfix:=taicpu(hp1).oppostfix;
                      { hp1 is removed below, so the zero has to end up in
                        its destination register (reg3), not in reg2 }
                      taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
                    end
                  else
                    begin
                      DebugMsg('Peephole AndAnd2And 1 done', p);
                      taicpu(p).loadConst(2,taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val);
                      taicpu(p).oppostfix:=taicpu(hp1).oppostfix;
                      taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
                    end;
                  asml.remove(hp1);
                  hp1.free;
                  Result:=true;
                  exit;
                end
              else if not(RegUsedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                begin
                  { reg1 is untouched between p and hp1: fold everything into
                    hp1 and drop p }
                  if (taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val)=0 then
                    begin
                      DebugMsg('Peephole AndAnd2Mov0 2 done', hp1);
                      taicpu(hp1).opcode:=A_MOV;
                      taicpu(hp1).loadConst(1,0);
                      taicpu(hp1).ops:=2;
                      taicpu(hp1).oppostfix:=taicpu(p).oppostfix;
                    end
                  else
                    begin
                      DebugMsg('Peephole AndAnd2And 2 done', hp1);
                      AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
                      taicpu(hp1).loadConst(2,taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val);
                      taicpu(hp1).oppostfix:=taicpu(p).oppostfix;
                      taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                    end;
                  GetNextInstruction(p, hp1);
                  RemoveCurrentP(p);
                  p:=hp1;
                  Result:=true;
                  exit;
                end;
            end
          {
            change
            and reg2,reg1,$xxxxxxFF
            strb reg2,[...]
            dealloc reg2
            to
            strb reg1,[...]
          }
          else if ((taicpu(p).oper[2]^.val and $FF) = $FF) and
            MatchInstruction(p, A_AND, [C_None], [PF_None]) and
            GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
            MatchInstruction(hp1, A_STR, [C_None], [PF_B]) and
            assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
            { the reference of the strb must not use reg2 }
            not(RegInRef(taicpu(p).oper[0]^.reg,taicpu(hp1).oper[1]^.ref^)) and
            { reg1 must not be modified in between }
            not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
            begin
              DebugMsg('Peephole AndStrb2Strb done', p);
{$ifdef AARCH64}
              taicpu(hp1).loadReg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBD));
{$else AARCH64}
              taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
{$endif AARCH64}
              AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
              RemoveCurrentP(p);
              result:=true;
              exit;
            end
          {
            change
            and reg2,reg1,255
            uxtb/uxth reg3,reg2
            dealloc reg2
            to
            and reg3,reg1,x
          }
          else if MatchInstruction(p, A_AND, [C_None], [PF_None]) and
            GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
            ((((taicpu(p).oper[2]^.val and $ffffff00)=0) and MatchInstruction(hp1, A_UXTB, [C_None], [PF_None])) or
             (((taicpu(p).oper[2]^.val and $ffff0000)=0) and MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]))) and
            (taicpu(hp1).ops = 2) and
            RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
            MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
            { reg1 must not be modified in between }
            not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
            begin
              DebugMsg('Peephole AndUxt2And done', p);
              taicpu(hp1).opcode:=A_AND;
              taicpu(hp1).ops:=3;
              taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
              taicpu(hp1).loadconst(2,taicpu(p).oper[2]^.val);
              GetNextInstruction(p,hp1);
              asml.remove(p);
              p.Free;
              p:=hp1;
              result:=true;
              exit;
            end
          {
            change
            and reg2,reg1,x     (x < $80)
            sxtb/sxth reg3,reg2
            dealloc reg2
            to
            and reg3,reg1,x

            With bit 7 and everything above cleared the sign extension
            cannot change the value.
          }
          else if ((taicpu(p).oper[2]^.val and $ffffff80)=0) and
            MatchInstruction(p, A_AND, [C_None], [PF_None]) and
            GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
            MatchInstruction(hp1, [A_SXTB,A_SXTH], [C_None], [PF_None]) and
            (taicpu(hp1).ops = 2) and
            RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
            MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
            { reg1 must not be modified in between }
            not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
            begin
              DebugMsg('Peephole AndSxt2And done', p);
              taicpu(hp1).opcode:=A_AND;
              taicpu(hp1).ops:=3;
              taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
              { give the source the same sub-register size as the destination }
              setsubreg(taicpu(hp1).oper[1]^.reg,getsubreg(taicpu(hp1).oper[0]^.reg));
              taicpu(hp1).loadconst(2,taicpu(p).oper[2]^.val);
              GetNextInstruction(p,hp1);
              asml.remove(p);
              p.Free;
              p:=hp1;
              result:=true;
              exit;
            end
          {
            from
            and reg1,reg0,2^n-1
            mov reg2,reg1, lsl imm1
            (mov reg3,reg2, lsr/asr imm1)
            remove either the and or the lsl/xsr sequence if possible

            i receives n, the number of bits set in the mask
          }
          else if (taicpu(p).oper[2]^.val < high(int64)) and
            cutils.ispowerof2(taicpu(p).oper[2]^.val+1,i) and
            GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
            MatchInstruction(hp1, A_MOV, [taicpu(p).condition], [PF_None]) and
            (taicpu(hp1).ops=3) and
            MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
            (taicpu(hp1).oper[2]^.typ = top_shifterop) and
{$ifdef ARM}
            (taicpu(hp1).oper[2]^.shifterop^.rs = NR_NO) and
{$endif ARM}
            (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_LSL) and
            RegEndOfLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) then
            begin
              {
                and reg1,reg0,2^n-1
                mov reg2,reg1, lsl imm1
                mov reg3,reg2, lsr/asr imm1
                =>
                and reg3,reg0,2^n-1
                if n < 32-imm1, or n = 32-imm1 and the right shift is lsr

                NOTE(review): hp2 is located by searching for the next use of
                the and's destination rather than of the lsl's destination -
                confirm that this is intended.
              }
              if GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[0]^.reg) and
                MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
                (taicpu(hp2).ops=3) and
                MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) and
                (taicpu(hp2).oper[2]^.typ = top_shifterop) and
{$ifdef ARM}
                (taicpu(hp2).oper[2]^.shifterop^.rs = NR_NO) and
{$endif ARM}
                (taicpu(hp2).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
                (taicpu(hp1).oper[2]^.shifterop^.shiftimm=taicpu(hp2).oper[2]^.shifterop^.shiftimm) and
                RegEndOfLife(taicpu(hp1).oper[0]^.reg,taicpu(hp2)) and
                ((i<32-taicpu(hp1).oper[2]^.shifterop^.shiftimm) or
                 ((i=32-taicpu(hp1).oper[2]^.shifterop^.shiftimm) and
                  (taicpu(hp2).oper[2]^.shifterop^.shiftmode=SM_LSR))) then
                begin
                  DebugMsg('Peephole AndLslXsr2And done', p);
                  taicpu(p).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
                  asml.Remove(hp1);
                  asml.Remove(hp2);
                  hp1.free;
                  hp2.free;
                  result:=true;
                  exit;
                end
              {
                and reg1,reg0,2^n-1
                mov reg2,reg1, lsl imm1
                =>
                mov reg2,reg0, lsl imm1
                if n > 32-imm1
              }
              else if (i>32-taicpu(hp1).oper[2]^.shifterop^.shiftimm) and
                not(RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) then
                begin
                  DebugMsg('Peephole AndLsl2Lsl done', p);
                  taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
                  GetNextInstruction(p, hp1);
                  asml.Remove(p);
                  p.free;
                  p:=hp1;
                  result:=true;
                  exit;
                end
            end;
        end;
      {
        change
        and reg1, ...
        mov reg2, reg1
        to
        and reg2, ...
      }
      if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        (taicpu(p).ops>=3) and
        RemoveSuperfluousMove(p, hp1, 'DataMov2Data') then
        Result:=true;
    end;
  { Pass 2 optimisation that merges a bitwise instruction with a following
    comparison of its result against zero.

      and/bic reg1, ...
      ...
      cmp reg1, #0
      b<ne/eq> @Lbl
    becomes
      ands/bics reg1, ...

    and, if reg1 is no longer needed after the cmp,
      tst reg1, ...
    or, under AArch64 for non-AND instructions,
      bics xzr, reg1, ...

    For ARM, OR, EOR and ORN are included as well.

    If the flags produced by the cmp are not consumed at all (the cmp is
    followed by a label or by an instruction that modifies the flags), the cmp
    is removed, and so is p when its result register ends its life there.

    Returns True if the instruction stream was changed. }
  function TARMAsmOptimizer.OptPass2Bitwise(var p: tai): Boolean;
    var
      CmpInstr, AfterCmp: tai;
      WorkingReg: TRegister;
    begin
      Result := False;

      if not(
          (taicpu(p).condition = C_None) and
          (taicpu(p).ops>=3) and
          GetNextInstructionUsingReg(p, CmpInstr, taicpu(p).oper[0]^.reg) and
          MatchInstruction(CmpInstr, A_CMP, [C_None], [PF_None]) and
          MatchOperand(taicpu(CmpInstr).oper[1]^, 0) and
{$ifdef AARCH64}
          (SuperRegistersEqual(taicpu(CmpInstr).oper[0]^.reg, taicpu(p).oper[0]^.reg)) and
          (
            (getsubreg(taicpu(CmpInstr).oper[0]^.reg) = getsubreg(taicpu(p).oper[0]^.reg))
            or
            (
              { differing sub-register sizes are accepted for a constant in
                the range 0..$FFFFFFFF }
              (taicpu(p).oper[2]^.typ = top_const) and
              (taicpu(p).oper[2]^.val >= 0) and
              (taicpu(p).oper[2]^.val <= $FFFFFFFF)
            )
          ) and
{$else AARCH64}
          (taicpu(CmpInstr).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
{$endif AARCH64}
          not RegModifiedBetween(NR_DEFAULTFLAGS, p, CmpInstr) and
          GetNextInstruction(CmpInstr, AfterCmp)
        ) then
        Exit;

      if MatchInstruction(AfterCmp, [A_B, A_CMP, A_CMN, A_TST{$ifndef AARCH64}, A_TEQ{$endif not AARCH64}], [C_EQ, C_NE], [PF_None]) then
        begin
          { The flags are now produced by p already, so they have to be
            marked as allocated from there on }
          AllocRegBetween(NR_DEFAULTFLAGS, p, CmpInstr, UsedRegs);

          WorkingReg := taicpu(p).oper[0]^.reg;

          if
{$ifndef AARCH64}
            (taicpu(p).opcode = A_AND) and
{$endif AARCH64}
            RegEndOfLife(WorkingReg, taicpu(CmpInstr)) then
            begin
              { Only the flags are needed, the result register is not }
{$ifdef AARCH64}
              if (taicpu(p).opcode <> A_AND) then
                begin
                  { Discard the result by writing it to the zero register }
                  setsupreg(taicpu(p).oper[0]^.reg, RS_XZR);
                  taicpu(p).oppostfix := PF_S;
                  DebugMsg(SPeepholeOptimization + 'BIC; CMP -> BICS ' + gas_regname(taicpu(p).oper[0]^.reg), p);
                end
              else
{$endif AARCH64}
                begin
                  { Turn "and dst,a,b[,shift]" into "tst a,b[,shift]" by
                    moving every source operand one slot down }
                  taicpu(p).opcode := A_TST;
                  taicpu(p).oppostfix := PF_None;
                  taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
                  taicpu(p).loadoper(1, taicpu(p).oper[2]^);
                  if (taicpu(p).ops = 4) then
                    begin
                      { Make sure any shifter operator is also transferred }
                      taicpu(p).loadshifterop(2, taicpu(p).oper[3]^.shifterop^);
                      taicpu(p).ops := 3;
                    end
                  else
                    taicpu(p).ops := 2;

                  DebugMsg(SPeepholeOptimization + 'AND; CMP -> TST', p);
                end;
            end
          else
            begin
              { The result is still needed: just let p set the flags itself }
              taicpu(p).oppostfix := PF_S;
{$ifdef AARCH64}
              DebugMsg(SPeepholeOptimization + 'AND/BIC; CMP -> ANDS/BICS', p);
{$else AARCH64}
              DebugMsg(SPeepholeOptimization + 'Bitwise; CMP -> Bitwise+S', p);
{$endif AARCH64}
            end;

          RemoveInstruction(CmpInstr);

          { If a temporary register was used for and/cmp before, we might be
            able to deallocate the register so it can be used for other
            optimisations later }
          if (taicpu(p).opcode = A_TST) and TryRemoveRegAlloc(WorkingReg, p, p) then
            ExcludeRegFromUsedRegs(WorkingReg, UsedRegs);

          Result := True;
          Exit;
        end;

      if (AfterCmp.typ = ait_label) or
        { Conditional comparison instructions have already been covered }
        RegModifiedByInstruction(NR_DEFAULTFLAGS, AfterCmp) then
        begin
          { The comparison is a null operation }
          if RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(CmpInstr)) then
            begin
              { The result of p is not needed either, so both can go }
              DebugMsg(SPeepholeOptimization + 'Bitwise; CMP -> nop', p);
              RemoveInstruction(CmpInstr);
              RemoveCurrentP(p);
            end
          else
            begin
              DebugMsg(SPeepholeOptimization + 'CMP/BIC -> nop', CmpInstr);
              RemoveInstruction(CmpInstr);
            end;
          Result := True;
          Exit;
        end;
    end;
  { Pass 2 peephole optimisation for a TST instruction.

    p must be a TST instruction (on ARM it must also be unconditional).
    The optimisation only applies when the instruction following p is a
    B.EQ or B.NE; the next instruction after that branch that involves the
    register tested by p (as located by GetNextInstructionUsingReg) is then
    examined:

      - AND with the same operands as the TST: the AND is moved in front of
        the branch as ANDS and replaces the TST (p is updated to point to
        the moved instruction).
      - TST with the same operands: the second TST is removed.

    Returns True if the instruction stream was changed, False otherwise. }
  function TARMAsmOptimizer.OptPass2TST(var p: tai): Boolean;
    var
      hp1, hp2: tai;
    begin
      Result := False;
      if
{$ifndef AARCH64}
        (taicpu(p).condition = C_None) and
{$endif AARCH64}
        GetNextInstruction(p, hp1) and
        MatchInstruction(hp1, A_B, [C_EQ, C_NE], [PF_None]) and
        GetNextInstructionUsingReg(hp1, hp2, taicpu(p).oper[0]^.reg) then
        begin
          { NOTE(review): the "in between" checks below are only made under
            cs_opt_level3; presumably GetNextInstructionUsingReg does not
            skip over instructions at lower levels - confirm against its
            implementation }
          case taicpu(hp2).opcode of
            A_AND:
              { Change:
                  tst  r1,##
                    (r2 not in use, or r2 = r1)
                  b.c  .Lbl
                  ...
                  and  r2,r1,##

                Optimise to:
                  ands r2,r1,##
                  b.c  .Lbl
                  ...
              }
              if (taicpu(hp2).oppostfix in [PF_None, PF_S]) and
{$ifndef AARCH64}
                (taicpu(hp2).condition = C_None) and
{$endif AARCH64}
                { AND has one operand more than TST (the destination) }
                (taicpu(hp2).ops = taicpu(p).ops + 1) and
                not RegInUsedRegs(taicpu(hp2).oper[0]^.reg, UsedRegs) and
                MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^.reg) and
                MatchOperand(taicpu(hp2).oper[2]^, taicpu(p).oper[1]^) and
                (
                  { A fourth operand is only compared when it exists }
                  (taicpu(hp2).ops = 3) or
                  MatchOperand(taicpu(hp2).oper[3]^, taicpu(p).oper[2]^)
                ) and
                (
                  not (cs_opt_level3 in current_settings.optimizerswitches) or
                  (
                    { Make sure the target register isn't used in between }
                    not RegUsedBetween(taicpu(hp2).oper[0]^.reg, hp1, hp2) and
                    (
                      { If the AND already sets the flags, code after it may
                        depend on them.  Once the instruction is moved in
                        front of the branch, anything that writes the flags
                        in between would clobber that result, so make sure
                        the flags aren't modified in between (same check as
                        in the A_TST case below) }
                      (taicpu(hp2).oppostfix <> PF_S) or
                      not RegModifiedBetween(NR_DEFAULTFLAGS, hp1, hp2)
                    ) and
                    (
                      { If the second operand is a register, make sure it isn't modified in between }
                      (taicpu(p).oper[1]^.typ <> top_reg) or
                      not RegModifiedBetween(taicpu(p).oper[1]^.reg, hp1, hp2)
                    )
                  )
                ) then
                begin
                  { The destination register becomes live from p onwards }
                  AllocRegBetween(taicpu(hp2).oper[0]^.reg, p, hp2, UsedRegs);

                  { The flags produced at the new position have to stay
                    allocated up to the old position of the ANDS }
                  if (taicpu(hp2).oppostfix = PF_S) then
                    AllocRegBetween(NR_DEFAULTFLAGS, p, hp2, UsedRegs);

                  DebugMsg(SPeepholeOptimization + 'TST; B.c; AND -> ANDS; B.c (TstBcAnd2AndsBc)', p);
                  taicpu(hp2).oppostfix := PF_S;

                  { Move the AND(S) directly behind the TST, then drop the
                    TST and continue with the moved instruction }
                  Asml.Remove(hp2);
                  Asml.InsertAfter(hp2, p);
                  RemoveCurrentP(p, hp2);

                  Result := True;
                  Exit;
                end;
            A_TST:
              { Change:
                  tst  r1,##
                  b.c  .Lbl
                  ... (flags not modified)
                  tst  r1,##

                Remove second tst
              }
              if
{$ifndef AARCH64}
                (taicpu(hp2).condition = C_None) and
{$endif AARCH64}
                (taicpu(hp2).ops = taicpu(p).ops) and
                MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^.reg) and
                MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) and
                (
                  { A third operand is only compared when it exists }
                  (taicpu(hp2).ops = 2) or
                  MatchOperand(taicpu(hp2).oper[2]^, taicpu(p).oper[2]^)
                ) and
                (
                  not (cs_opt_level3 in current_settings.optimizerswitches) or
                  (
                    { Make sure the flags aren't modified in between }
                    not RegModifiedBetween(NR_DEFAULTFLAGS, hp1, hp2) and
                    (
                      { If the second operand is a register, make sure it isn't modified in between }
                      (taicpu(p).oper[1]^.typ <> top_reg) or
                      not RegModifiedBetween(taicpu(p).oper[1]^.reg, hp1, hp2)
                    )
                  )
                ) then
                begin
                  DebugMsg(SPeepholeOptimization + 'TST; B.c; TST -> TST; B.c (TstBcTst2TstBc)', p);

                  { The flags of the first TST are now needed up to where
                    the second TST used to be }
                  AllocRegBetween(NR_DEFAULTFLAGS, hp1, hp2, UsedRegs);
                  RemoveInstruction(hp2);

                  Result := True;
                  Exit;
                end;
            else
              ;
          end;
        end;
    end;
  1865. function TARMAsmOptimizer.TryConstMerge(var p: tai; hp1: tai): Boolean;
  1866. const
  1867. {$ifdef ARM}
  1868. LO_16_WRITE: TAsmOp = A_MOVW;
  1869. HI_16_WRITE: TAsmOp = A_MOVT;
  1870. {$endif ARM}
  1871. {$ifdef AARCH64}
  1872. LO_16_WRITE: TAsmOp = A_MOVZ;
  1873. HI_16_WRITE: TAsmOp = A_MOVK;
  1874. {$endif AARCH64}
  1875. var
  1876. hp2, hp2_second, hp3, hp3_second, p_second, hp1_second: tai;
  1877. ThisReg: TRegister;
  1878. ThisRef: TReference;
  1879. so: TShifterOp;
  1880. procedure SearchAhead;
  1881. begin
  1882. { If p.opcode = A_STR, then ThisReg will be NR_NO }
  1883. if (
  1884. {$ifdef ARM}
  1885. (p_second.typ = ait_instruction) and
  1886. (taicpu(p_second).condition = taicpu(p).condition) and
  1887. (
  1888. (taicpu(p_second).opcode = A_MOV) or
  1889. (taicpu(p_second).opcode = A_MOVW)
  1890. )
  1891. {$endif ARM}
  1892. {$ifdef AARCH64}
  1893. MatchInstruction(p, A_MOVZ, []) or
  1894. (
  1895. MatchInstruction(p, A_STR, []) and
  1896. SetAndTest(p, hp1)
  1897. )
  1898. {$endif AARCH64}
  1899. ) and
  1900. (
  1901. (
  1902. (ThisReg <> NR_NO) and
  1903. (
  1904. {$ifdef AARCH64}
  1905. (
  1906. (getsubreg(ThisReg) = R_SUBD) and
  1907. MatchInstruction(hp1, A_MOVK, []) and
  1908. (taicpu(hp1).oper[0]^.reg = ThisReg) and
  1909. GetNextInstruction(hp1, hp2) and
  1910. MatchInstruction(hp2, A_STR, []) and
  1911. (taicpu(hp2).oper[0]^.reg = ThisReg) and
  1912. GetNextInstruction(hp2, p_second)
  1913. ) or
  1914. {$endif AARCH64}
  1915. (
  1916. MatchInstruction(hp1, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, []) and
  1917. (taicpu(hp1).oper[0]^.reg = ThisReg) and
  1918. GetNextInstruction(hp1, p_second)
  1919. )
  1920. )
  1921. ) or (
  1922. { Just search one ahead if ThisReg is NR_NO }
  1923. (ThisReg = NR_NO) and
  1924. GetNextInstruction(hp1, p_second)
  1925. )
  1926. ) and
  1927. (
  1928. (
  1929. {$ifdef ARM}
  1930. (p_second.typ = ait_instruction) and
  1931. (taicpu(p_second).condition = taicpu(p).condition) and
  1932. (
  1933. (taicpu(p_second).opcode = A_MOV) or
  1934. (taicpu(p_second).opcode = A_MOVW)
  1935. ) and
  1936. {$endif ARM}
  1937. {$ifdef AARCH64}
  1938. MatchInstruction(p_second, A_MOVZ, []) and
  1939. {$endif AARCH64}
  1940. { Don't use ThisReg because it may be NR_NO }
  1941. GetNextInstruction(p_second, hp1_second) and
  1942. (
  1943. {$ifdef AARCH64}
  1944. (
  1945. MatchInstruction(hp1_second, A_MOVK, []) and
  1946. GetNextInstruction(hp1_second, hp2_second) and
  1947. MatchInstruction(hp2_second, A_STR, [PF_None])
  1948. ) or
  1949. {$endif AARCH64}
  1950. MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
  1951. )
  1952. )
  1953. {$ifdef AARCH64}
  1954. or (
  1955. MatchInstruction(p_second, A_STR, []) and
  1956. (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) and
  1957. { Negate the result because we're setting hp1_second to nil }
  1958. not SetAndTest(nil, hp1_second)
  1959. )
  1960. {$endif AARCH64}
  1961. ) then
  1962. TryConstMerge(p_second, hp1_second);
  1963. end;
  1964. begin
  1965. Result := False;
  1966. {$ifdef ARM}
  1967. { We need a Cortex-A ARM processor that supports MOVW and MOVT }
  1968. if not (CPUARM_HAS_EXTENDED_CONSTANTS in cpu_capabilities[current_settings.cputype]) then
  1969. Exit;
  1970. {$endif ARM}
  1971. ThisReg := NR_NO; { Safe initialisation }
  1972. case taicpu(p).opcode of
  1973. {$ifdef ARM}
  1974. A_MOV,
  1975. A_MOVW:
  1976. if (taicpu(p).opcode <> A_MOV) or (taicpu(p).oper[1]^.typ = top_const) then
  1977. {$endif ARM}
  1978. {$ifdef AARCH64}
  1979. A_MOVZ:
  1980. {$endif AARCH64}
  1981. begin
  1982. ThisReg := taicpu(p).oper[0]^.reg;
  1983. if Assigned(hp1){$ifdef ARM} and (taicpu(hp1).condition = taicpu(p).condition){$endif ARM} then
  1984. case taicpu(hp1).opcode of
  1985. A_STR:
  1986. if {$ifdef ARM}(taicpu(hp1).ops = 2) and {$endif ARM}SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ThisReg) then
  1987. begin
  1988. ThisRef := taicpu(hp1).oper[1]^.ref^;
  1989. if (ThisRef.addressmode = AM_OFFSET) and
  1990. (ThisRef.index = NR_NO) and
  1991. { Only permit writes to the stack, since we can guarantee alignment with that }
  1992. (
  1993. (ThisRef.base = NR_STACK_POINTER_REG) or
  1994. (ThisRef.base = current_procinfo.framepointer)
  1995. ) then
  1996. begin
  1997. case taicpu(hp1).oppostfix of
  1998. PF_B:
  1999. {
  2000. With sequences such as:
  2001. movz w0,x
  2002. strb w0,[sp, #ofs]
  2003. movz w0,y
  2004. strb w0,[sp, #ofs+1]
  2005. Merge the constants to:
  2006. movz w0,x + (y shl 8)
  2007. strh w0,[sp, #ofs]
  2008. Only use the stack pointer or frame pointer and an even offset though
  2009. to guarantee alignment
  2010. }
  2011. if ((ThisRef.offset mod 2) = 0) and
  2012. GetNextInstruction(hp1, p_second) and
  2013. (p_second.typ = ait_instruction)
  2014. {$ifdef ARM}
  2015. and (taicpu(p_second).condition = taicpu(p).condition)
  2016. {$endif ARM}
  2017. then
  2018. begin
  2019. case taicpu(p_second).opcode of
  2020. {$ifdef ARM}
  2021. A_MOV,
  2022. A_MOVW:
  2023. if (taicpu(p_second).oppostfix = PF_None) and
  2024. ((taicpu(p_second).opcode <> A_MOV) or (taicpu(p_second).oper[1]^.typ = top_const)) then
  2025. {$endif ARM}
  2026. {$ifdef AARCH64}
  2027. A_MOVZ:
  2028. {$endif AARCH64}
  2029. begin
  2030. if SuperRegistersEqual(taicpu(p_second).oper[0]^.reg, ThisReg) and
  2031. GetNextInstruction(p_second, hp1_second) and
  2032. MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
  2033. SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
  2034. begin
  2035. { Is the second storage location exactly one byte ahead? }
  2036. Inc(ThisRef.offset);
  2037. if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
  2038. { The final safety check... make sure the register used
  2039. to store the constant isn't used afterwards }
  2040. RegEndOfLife(ThisReg, taicpu(hp1_second)) then
  2041. begin
  2042. { See if we can merge 4 bytes at once (this benefits ARM mostly, but provides a speed boost for AArch64 too) }
  2043. if GetNextInstruction(hp1_second, hp2) and
  2044. (
  2045. {$ifdef ARM}
  2046. MatchInstruction(hp2, A_MOVW, [taicpu(p).condition], []) or
  2047. {$endif ARM}
  2048. (
  2049. MatchInstruction(hp2, LO_16_WRITE{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
  2050. {$ifdef ARM}
  2051. and (taicpu(hp2).oper[1]^.typ = top_const)
  2052. {$endif ARM}
  2053. )
  2054. ) and
  2055. SuperRegistersEqual(taicpu(hp2).oper[0]^.reg, ThisReg) and
  2056. GetNextInstruction(hp2, hp2_second) and
  2057. MatchInstruction(hp2_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
  2058. SuperRegistersEqual(taicpu(hp2_second).oper[0]^.reg, ThisReg) and
  2059. GetNextInstruction(hp2_second, hp3) and
  2060. (
  2061. {$ifdef ARM}
  2062. MatchInstruction(hp3, A_MOVW, [taicpu(p).condition], []) or
  2063. {$endif ARM}
  2064. (
  2065. MatchInstruction(hp3, LO_16_WRITE{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
  2066. {$ifdef ARM}
  2067. and (taicpu(hp3).oper[1]^.typ = top_const)
  2068. {$endif ARM}
  2069. )
  2070. ) and
  2071. SuperRegistersEqual(taicpu(hp3).oper[0]^.reg, ThisReg) and
  2072. GetNextInstruction(hp3, hp3_second) and
  2073. MatchInstruction(hp3_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
  2074. SuperRegistersEqual(taicpu(hp3_second).oper[0]^.reg, ThisReg) then
  2075. begin
  2076. Inc(ThisRef.offset);
  2077. if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) then
  2078. begin
  2079. Inc(ThisRef.offset);
  2080. if RefsEqual(taicpu(hp3_second).oper[1]^.ref^, ThisRef) then
  2081. begin
  2082. { Merge the constants }
  2083. DebugMsg(SPeepholeOptimization + 'Merged four byte-writes to memory into a single word-write (MovzStrbMovzStrbMovzStrbMovzStrb2MovzMovkStr)', p);
  2084. {$ifdef ARM}
  2085. taicpu(p).opcode := A_MOVW;
  2086. {$endif ARM}
  2087. taicpu(p).oper[1]^.val := (taicpu(p).oper[1]^.val and $FF) or ((taicpu(p_second).oper[1]^.val and $FF) shl 8);
  2088. taicpu(hp2).opcode := HI_16_WRITE;
  2089. taicpu(hp2).oper[1]^.val := (taicpu(hp2).oper[1]^.val and $FF) or ((taicpu(hp3).oper[1]^.val and $FF) shl 8);
  2090. so.shiftimm := 16;
  2091. so.shiftmode := SM_LSL;
  2092. taicpu(hp2).loadshifterop(2, so);
  2093. taicpu(hp2).ops := 3;
  2094. taicpu(hp1).oppostfix := PF_None;
  2095. AsmL.Remove(hp2);
  2096. AsmL.InsertAfter(hp2, p);
  2097. RemoveInstruction(p_second);
  2098. RemoveInstruction(hp1_second);
  2099. RemoveInstruction(hp2_second);
  2100. RemoveInstruction(hp3);
  2101. RemoveInstruction(hp3_second);
  2102. Result := True;
  2103. {$ifdef AARCH64}
  2104. { Searching ahead only benefits AArch64 here }
  2105. SearchAhead;
  2106. {$endif AARCH64}
  2107. Exit;
  2108. end;
  2109. { Reset the offset so the range check below is correct }
  2110. Dec(ThisRef.offset);
  2111. end;
  2112. Dec(ThisRef.offset);
  2113. end;
  2114. {$ifdef ARM}
  2115. { Be careful. strb and str support offsets between -4095 and +4095, but
  2116. strh only supports offsets between -255 and +255. However, we might be
  2117. able to bypass this if there are four bytes in a row (for AArch64, just
  2118. use SearchAhead below }
  2119. if { Remember we added 1 to the offset }
  2120. (ThisRef.offset >= -254) and (ThisRef.offset <= 256) then
  2121. {$endif ARM}
  2122. begin
  2123. { Merge the constants and remove the second pair of instructions }
  2124. DebugMsg(SPeepholeOptimization + 'Merged two byte-writes to memory into a single half-write (MovzStrbMovzStrb2MovzStrh)', p);
  2125. {$ifdef ARM}
  2126. taicpu(p).opcode := A_MOVW;
  2127. {$endif ARM}
  2128. taicpu(p).oper[1]^.val := (taicpu(p).oper[1]^.val and $FF) or ((taicpu(p_second).oper[1]^.val and $FF) shl 8);
  2129. taicpu(hp1).oppostfix := PF_H;
  2130. RemoveInstruction(p_second);
  2131. RemoveInstruction(hp1_second);
  2132. Result := True;
  2133. end;
  2134. end;
  2135. end;
  2136. end;
  2137. {$ifdef AARCH64}
  2138. A_STR:
  2139. { Sometimes, the second mov might not be present as we're writing the
  2140. zero register to the next address - that is:
  2141. movz w0,x
  2142. strb w0,[sp, #ofs]
  2143. strb wzr,[sp, #ofs+1]
  2144. Which becomes:
  2145. movz w0,x
  2146. strh w0,[sp, #ofs]
  2147. }
  2148. if RegEndOfLife(ThisReg, taicpu(hp1)) and
  2149. (taicpu(p_second).oppostfix = PF_B) and
  2150. (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
  2151. begin
  2152. { Is the second storage location exactly one byte ahead? }
  2153. Inc(ThisRef.offset);
  2154. if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
  2155. begin
  2156. { Merge the constants and remove the second pair of instructions }
  2157. DebugMsg(SPeepholeOptimization + 'Merged a byte-write and a zero-register byte-write to memory into a single half-write (MovzStrbStrb2MovzStrh 1)', p);
  2158. taicpu(p).oper[1]^.val := taicpu(p).oper[1]^.val and $FF; { In case there's some extraneous bits }
  2159. taicpu(hp1).oppostfix := PF_H;
  2160. RemoveInstruction(p_second);
  2161. Result := True;
  2162. end;
  2163. end;
  2164. {$endif AARCH64}
  2165. else
  2166. ;
  2167. end;
  2168. { Search ahead to see if more bytes are written individually,
  2169. because then we may be able to merge 4 bytes into a full
  2170. word write in a single pass }
  2171. if Result then
  2172. begin
  2173. SearchAhead;
  2174. Exit;
  2175. end;
  2176. end;
  2177. PF_H:
  2178. {
  2179. With sequences such as:
  2180. movz w0,x
  2181. strh w0,[sp, #ofs]
  2182. movz w0,y
  2183. strh w0,[sp, #ofs+2]
  2184. Merge the constants to:
  2185. movz w0,x
  2186. movk w0,y,lsl #16
  2187. str w0,[sp, #ofs]
  2188. Only use the stack pointer or frame pointer and an offset
  2189. that's a multiple of 4 though to guarantee alignment
  2190. }
  2191. if ((ThisRef.offset mod 4) = 0) and
  2192. GetNextInstruction(hp1, p_second) and
  2193. (p_second.typ = ait_instruction)
  2194. {$ifdef ARM}
  2195. and (taicpu(p_second).condition = taicpu(p).condition)
  2196. {$endif ARM}
  2197. then
  2198. begin
  2199. case taicpu(p_second).opcode of
  2200. {$ifdef ARM}
  2201. A_MOV,
  2202. A_MOVW:
  2203. if (taicpu(p).oppostfix = PF_None) and
  2204. ((taicpu(p).opcode <> A_MOV) or (taicpu(p).oper[1]^.typ = top_const)) then
  2205. {$endif ARM}
  2206. {$ifdef AARCH64}
  2207. A_MOVZ:
  2208. {$endif AARCH64}
  2209. begin
  2210. if SuperRegistersEqual(taicpu(p_second).oper[0]^.reg, ThisReg) and
  2211. GetNextInstruction(p_second, hp1_second) and
  2212. MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_H]) and
  2213. SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
  2214. begin
  2215. { Is the second storage location exactly two bytes ahead? }
  2216. Inc(ThisRef.offset, 2);
  2217. if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
  2218. { The final safety check... make sure the register used
  2219. to store the constant isn't used afterwards }
  2220. RegEndOfLife(ThisReg, taicpu(hp1_second)) then
  2221. begin
  2222. { Merge the constants }
  2223. DebugMsg(SPeepholeOptimization + 'Merged two half-writes to memory into a single word-write (MovzStrhMovzStrh2MovzMovkStr)', p);
  2224. { Repurpose the second MOVZ instruction into a MOVK instruction }
  2225. if taicpu(p_second).oper[1]^.val = 0 then
  2226. begin
  2227. { Or just remove it if it's not needed }
  2228. RemoveInstruction(p_second);
  2229. {$ifdef ARM}
  2230. { If within the range 0..255, MOV suffices (256 can also be encoded this way) }
  2231. if (taicpu(p).oper[1]^.val < 0) or (taicpu(p).oper[1]^.val > 256) then
  2232. taicpu(p).opcode := A_MOVW;
  2233. {$endif ARM}
  2234. end
  2235. else
  2236. begin
  2237. asml.Remove(p_second);
  2238. asml.InsertAfter(p_second, p);
  2239. {$ifdef ARM}
  2240. taicpu(p).opcode := A_MOVW;
  2241. {$endif ARM}
  2242. taicpu(p_second).opcode := HI_16_WRITE;
  2243. {$ifdef AARCH64}
  2244. so.shiftmode := SM_LSL;
  2245. so.shiftimm := 16;
  2246. taicpu(p_second).ops := 3;
  2247. taicpu(p_second).loadshifterop(2, so);
  2248. { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
  2249. setsubreg(ThisReg, R_SUBD);
  2250. taicpu(p).oper[0]^.reg := ThisReg;
  2251. taicpu(p_second).oper[0]^.reg := ThisReg;
  2252. taicpu(hp1).oper[0]^.reg := ThisReg;
  2253. {$endif AARCH64}
  2254. { TODO: Confirm that the A_MOVZ / A_MOVK combination is the most efficient }
  2255. end;
  2256. taicpu(hp1).oppostfix := PF_None;
  2257. RemoveInstruction(hp1_second);
  2258. Result := True;
  2259. end;
  2260. end;
  2261. end;
  2262. {$ifdef AARCH64}
  2263. A_STR:
  2264. { Sometimes, the second mov might not be present as we're writing the
  2265. zero register to the next address - that is:
  2266. movz w0,x
  2267. strh w0,[sp, #ofs]
  2268. strh wzr,[sp, #ofs+2]
  2269. Which becomes:
  2270. movz w0,x
  2271. str w0,[sp, #ofs]
  2272. }
  2273. if RegEndOfLife(ThisReg, taicpu(hp1)) and
  2274. (taicpu(p_second).oppostfix = PF_H) and
  2275. (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
  2276. begin
  2277. { Is the second storage location exactly two bytes ahead? }
  2278. Inc(ThisRef.offset, 2);
  2279. if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
  2280. begin
  2281. { Merge the constants and remove the second pair of instructions }
  2282. DebugMsg(SPeepholeOptimization + 'Merged a half-write and a zero-register half-write to memory into a single word-write (MovzStrhStrh2MovzStr)', p);
  2283. { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
  2284. setsubreg(ThisReg, R_SUBD);
  2285. taicpu(p).oper[0]^.reg := ThisReg;
  2286. taicpu(hp1).oper[0]^.reg := ThisReg;
  2287. taicpu(hp1).oppostfix := PF_None;
  2288. RemoveInstruction(p_second);
  2289. Result := True;
  2290. end;
  2291. end;
  2292. {$endif AARCH64}
  2293. else
  2294. ;
  2295. end;
  2296. {$ifdef AARCH64}
  2297. { Search ahead to see if more half-words are written
  2298. individually, because then we may be able to merge
  2299. 4 words into a full extended write in a single pass }
  2300. if Result then
  2301. begin
  2302. SearchAhead;
  2303. Exit;
  2304. end;
  2305. {$endif AARCH64}
  2306. end;
  2307. else
  2308. ;
  2309. end;
  2310. end;
  2311. end;
  2312. {$ifdef AARCH64}
  2313. A_MOVK:
  2314. if (getsubreg(ThisReg) = R_SUBD) and
  2315. (taicpu(hp1).oper[0]^.reg = ThisReg) and
  2316. (taicpu(hp1).ops = 3) and
  2317. (taicpu(hp1).oper[2]^.shifterop^.shiftmode = SM_LSL) and
  2318. (taicpu(hp1).oper[2]^.shifterop^.shiftimm = 16) and
  2319. GetNextInstruction(hp1, hp2) and
  2320. MatchInstruction(hp2, A_STR, [PF_None]) and
  2321. (taicpu(hp2).oper[0]^.reg = ThisReg) then
  2322. begin
  2323. {
  2324. With sequences such as:
  2325. movz w0,x
  2326. movk w0,y,lsl #16
  2327. str w0,[sp, #ofs]
  2328. movz w0,z
  2329. movk w0,q,lsl #16
  2330. str w0,[sp, #ofs+4]
  2331. Merge the constants to:
  2332. movz x0,x
  2333. movk x0,y,lsl #16
  2334. movk x0,z,lsl #32
  2335. movk x0,q,lsl #48
  2336. str x0,[sp, #ofs]
  2337. Only use the stack pointer or frame pointer and an offset
  2338. that's a multiple of 8 though to guarantee alignment
  2339. }
  2340. ThisRef := taicpu(hp2).oper[1]^.ref^;
  2341. if ((ThisRef.offset mod 8) = 0) and
  2342. GetNextInstruction(hp2, p_second) and
  2343. (p_second.typ = ait_instruction) then
  2344. case taicpu(p_second).opcode of
  2345. A_MOVZ:
  2346. if (
  2347. (taicpu(p_second).oper[0]^.reg = ThisReg) or
  2348. (
  2349. RegEndOfLife(ThisReg, taicpu(hp2)) and
  2350. (getsubreg(taicpu(p_second).oper[0]^.reg) = R_SUBD)
  2351. )
  2352. ) and GetNextInstruction(p_second, hp1_second) then
  2353. begin
  2354. case taicpu(hp1_second).opcode of
  2355. A_MOVK:
  2356. if (taicpu(p_second).oper[1]^.val <= $FFFF) and
  2357. (taicpu(hp1_second).oper[0]^.reg = taicpu(p_second).oper[0]^.reg) and
  2358. (taicpu(hp1_second).ops = 3) and
  2359. (taicpu(hp1_second).oper[2]^.shifterop^.shiftmode = SM_LSL) and
  2360. (taicpu(hp1_second).oper[2]^.shifterop^.shiftimm = 16) and
  2361. GetNextInstruction(hp1_second, hp2_second) and
  2362. MatchInstruction(hp2_second, A_STR, [PF_None]) and
  2363. (taicpu(hp1_second).oper[0]^.reg = taicpu(p_second).oper[0]^.reg) then
  2364. begin
  2365. Inc(ThisRef.offset, 4);
  2366. if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) and
  2367. { The final safety check... make sure the register used
  2368. to store the constant isn't used afterwards }
  2369. RegEndOfLife(taicpu(p_second).oper[0]^.reg, taicpu(hp2_second)) then
  2370. begin
  2371. DebugMsg(SPeepholeOptimization + 'Merged two word-writes to memory into a single extended-write (MovzMovkStrMovzMovkStr2MovzMovkMovkMovkStr)', p);
  2372. { Extend register to 64-bit and repurpose second MOVZ to a MOVK with lsl 32 }
  2373. setsubreg(ThisReg, R_SUBQ);
  2374. taicpu(p).oper[0]^.reg := ThisReg;
  2375. taicpu(hp1).oper[0]^.reg := ThisReg;
  2376. { If the 3rd word is zero, we can remove the instruction entirely }
  2377. if taicpu(p_second).oper[1]^.val = 0 then
  2378. RemoveInstruction(p_second)
  2379. else
  2380. begin
  2381. taicpu(p_second).oper[0]^.reg := ThisReg;
  2382. so.shiftimm := 32;
  2383. so.shiftmode := SM_LSL;
  2384. taicpu(p_second).opcode := A_MOVK;
  2385. taicpu(p_second).ops := 3;
  2386. taicpu(p_second).loadshifterop(2, so);
  2387. AsmL.Remove(p_second);
  2388. AsmL.InsertBefore(p_second, hp2);
  2389. end;
  2390. taicpu(hp1_second).oper[0]^.reg := ThisReg;
  2391. taicpu(hp1_second).oper[2]^.shifterop^.shiftimm := 48;
  2392. taicpu(hp2).oper[0]^.reg := ThisReg;
  2393. AsmL.Remove(hp1_second);
  2394. AsmL.InsertBefore(hp1_second, hp2);
  2395. RemoveInstruction(hp2_second);
  2396. Result := True;
  2397. end;
  2398. end;
  2399. else
  2400. ;
  2401. end;
  2402. end;
  2403. A_STR:
  2404. { Sometimes, the second mov might not be present as we're writing the
  2405. zero register to the next address - that is:
  2406. movz w0,x
  2407. movk w0,y,lsl #16
  2408. str w0,[sp, #ofs]
  2409. str wzr,[sp, #ofs+4]
  2410. Which becomes:
  2411. movz x0,x
  2412. movk x0,y,lsl #16
  2413. str x0,[sp, #ofs]
  2414. }
  2415. begin
  2416. { Sometimes, the second mov might not be present as we're writing the
  2417. zero register to the next address - that is:
  2418. movz w0,x
  2419. strh w0,[sp, #ofs]
  2420. strh wzr,[sp, #ofs+1]
  2421. Which becomes:
  2422. movz w0,x
  2423. str w0,[sp, #ofs]
  2424. }
  2425. { Don't need to check end-of-life because the upper 32 bits are zero
  2426. and the overall value isn't being modified }
  2427. if (taicpu(p_second).oppostfix = PF_None) and
  2428. (taicpu(p_second).oper[0]^.reg = NR_WZR) then
  2429. begin
  2430. { Is the second storage location exactly four bytes ahead? }
  2431. Inc(ThisRef.offset, 4);
  2432. if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
  2433. begin
  2434. { Merge the constants and remove the second pair of instructions }
  2435. DebugMsg(SPeepholeOptimization + 'Merged a word-write and a zero-register word-write to memory into a single extended-write (MovzStrStr2MovzStr)', p);
  2436. setsubreg(taicpu(p).oper[0]^.reg, R_SUBQ);
  2437. setsubreg(taicpu(hp1).oper[0]^.reg, R_SUBQ);
  2438. setsubreg(taicpu(hp2).oper[0]^.reg, R_SUBQ);
  2439. RemoveInstruction(p_second);
  2440. Result := True;
  2441. end;
  2442. end;
  2443. end
  2444. else
  2445. ;
  2446. end;
  2447. end;
  2448. {$endif AARCH64}
  2449. else
  2450. ;
  2451. end;
  2452. end;
  2453. {$ifdef AARCH64}
  2454. A_STR:
  2455. { hp1 is probably nil }
  2456. if getsupreg(taicpu(p).oper[0]^.reg) = RS_WZR then
  2457. begin
  2458. ThisRef := taicpu(p).oper[1]^.ref^;
  2459. if (ThisRef.addressmode = AM_OFFSET) and
  2460. (ThisRef.index = NR_NO) and
  2461. { Only permit writes to the stack, since we can guarantee alignment with that }
  2462. (
  2463. (ThisRef.base = NR_STACK_POINTER_REG) or
  2464. (ThisRef.base = current_procinfo.framepointer)
  2465. ) then
  2466. begin
  2467. case taicpu(p).oppostfix of
  2468. PF_B:
  2469. {
  2470. With sequences such as:
  2471. strb wzr,[sp, #ofs]
  2472. movz w0,x
  2473. strb w0,[sp, #ofs+1]
  2474. Merge the constants to:
  2475. movz w0,x shl 8
  2476. strh w0,[sp, #ofs]
  2477. Only use the stack pointer or frame pointer and an even offset though
  2478. to guarantee alignment
  2479. }
  2480. if ((ThisRef.offset mod 2) = 0) and
  2481. GetNextInstruction(p, p_second) and
  2482. (p_second.typ = ait_instruction) then
  2483. begin
  2484. case taicpu(p_second).opcode of
  2485. A_MOVZ:
  2486. begin
  2487. ThisReg := taicpu(p_second).oper[0]^.reg;
  2488. if GetNextInstruction(p_second, hp1_second) and
  2489. MatchInstruction(hp1_second, A_STR, [PF_B]) and
  2490. SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
  2491. begin
  2492. { Is the second storage location exactly one byte ahead? }
  2493. Inc(ThisRef.offset);
  2494. if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
  2495. { The final safety check... make sure the register used
  2496. to store the constant isn't used afterwards }
  2497. RegEndOfLife(ThisReg, taicpu(hp1_second)) then
  2498. begin
  2499. { Merge the constants by repurposing the 2nd move, changing the register in the first STR and removing the second STR }
  2500. DebugMsg(SPeepholeOptimization + 'Merged a zero-register byte-write and a byte-write to memory into a single half-write (MovzStrbStrb2MovzStrh 2)', p);
  2501. taicpu(p_second).oper[1]^.val := (taicpu(p_second).oper[1]^.val and $FF) shl 8;
  2502. taicpu(hp1_second).oppostfix := PF_H;
  2503. Dec(taicpu(hp1_second).oper[1]^.ref^.offset, 1);
  2504. RemoveCurrentP(p, p_second);
  2505. Result := True;
  2506. hp1 := hp1_second; { So SearchAhead works properly below }
  2507. end;
  2508. end;
  2509. end;
  2510. A_STR:
  2511. { Change:
  2512. strb wzr,[sp, #ofs]
  2513. strb wzr,[sp, #ofs+1]
  2514. To:
  2515. strh wzr,[sp, #ofs]
  2516. }
  2517. if (taicpu(p_second).oppostfix = PF_B) and
  2518. (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
  2519. begin
  2520. { Is the second storage location exactly one byte ahead? }
  2521. Inc(ThisRef.offset);
  2522. if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
  2523. begin
  2524. DebugMsg(SPeepholeOptimization + 'Merged two zero-register byte-writes to memory into a single zero-register half-write (StrbStrb2Strh)', p);
  2525. taicpu(p).oppostfix := PF_H;
  2526. RemoveInstruction(p_second);
  2527. Result := True;
  2528. end;
  2529. end;
  2530. else
  2531. ;
  2532. end;
  2533. { Search ahead to see if more bytes are written individually,
  2534. because then we may be able to merge 4 bytes into a full
  2535. word write in a single pass }
  2536. if Result then
  2537. begin
  2538. SearchAhead;
  2539. Exit;
  2540. end;
  2541. end;
  2542. PF_H:
  2543. {
  2544. With sequences such as:
  2545. strh wzr,[sp, #ofs]
  2546. movz w0,x
  2547. strh w0,[sp, #ofs+2]
  2548. Merge the constants to:
  2549. movz w0,#0
  2550. movk w0,x,lsl #16
  2551. str w0,[sp, #ofs]
  2552. Only use the stack pointer or frame pointer and an offset
  2553. that's a multiple of 4 though to guarantee alignment
  2554. }
  2555. if ((ThisRef.offset mod 4) = 0) and
  2556. GetNextInstruction(p, p_second) and
  2557. (p_second.typ = ait_instruction) then
  2558. begin
  2559. case taicpu(p_second).opcode of
  2560. A_MOVZ:
  2561. begin
  2562. ThisReg := taicpu(p_second).oper[0]^.reg;
  2563. if GetNextInstruction(p_second, hp1_second) and
  2564. MatchInstruction(hp1_second, A_STR, [PF_H]) and
  2565. SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
  2566. begin
  2567. { Is the second storage location exactly two bytes ahead? }
  2568. Inc(ThisRef.offset, 2);
  2569. if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
  2570. { The final safety check... make sure the register used
  2571. to store the constant isn't used afterwards }
  2572. RegEndOfLife(ThisReg, taicpu(hp1_second)) then
  2573. begin
  2574. { Merge the constants }
  2575. DebugMsg(SPeepholeOptimization + 'Merged a zero-register half-write and a half-write to memory into a single word-write (StrhMovzStrh2MovzMovkStr)', p);
  2576. { Repurpose the first STR to a MOVZ instruction }
  2577. taicpu(p).opcode := A_MOVZ;
  2578. taicpu(p).oppostfix := PF_None;
  2579. taicpu(p).oper[0]^.reg := ThisReg;
  2580. taicpu(p).loadconst(1, 0);
  2581. so.shiftmode := SM_LSL;
  2582. so.shiftimm := 16;
  2583. taicpu(p_second).opcode := A_MOVK;
  2584. taicpu(p_second).ops := 3;
  2585. taicpu(p_second).loadshifterop(2, so);
  2586. { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
  2587. setsubreg(ThisReg, R_SUBD);
  2588. taicpu(p).oper[0]^.reg := ThisReg;
  2589. taicpu(p_second).oper[0]^.reg := ThisReg;
  2590. taicpu(hp1_second).oper[0]^.reg := ThisReg;
  2591. { TODO: Confirm that the A_MOVZ / A_MOVK combination is the most efficient }
  2592. taicpu(hp1_second).oppostfix := PF_None;
  2593. Dec(taicpu(hp1_second).oper[1]^.ref^.offset, 2);
  2594. Result := True;
  2595. end;
  2596. end;
  2597. end;
  2598. A_STR:
  2599. { Change:
  2600. strh wzr,[sp, #ofs]
  2601. strh wzr,[sp, #ofs+2]
  2602. To:
  2603. str wzr,[sp, #ofs]
  2604. }
  2605. if (taicpu(p_second).oppostfix = PF_H) and
  2606. (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
  2607. begin
  2608. { Is the second storage location exactly two bytes ahead? }
  2609. Inc(ThisRef.offset, 2);
  2610. if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
  2611. begin
  2612. DebugMsg(SPeepholeOptimization + 'Merged two zero-register half-writes to memory into a single zero-register word-write (StrhStrh2Str)', p);
  2613. { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
  2614. taicpu(p).oper[0]^.reg := NR_WZR;
  2615. taicpu(p).oppostfix := PF_None;
  2616. RemoveInstruction(p_second);
  2617. Result := True;
  2618. end;
  2619. end;
  2620. else
  2621. ;
  2622. end;
  2623. end;
  2624. PF_None:
  2625. {
  2626. With sequences such as:
  2627. str wzr,[sp, #ofs]
  2628. movz w0,x
  2629. movk w0,y,lsl #16
  2630. str w0,[sp, #ofs+4]
  2631. Merge the constants to:
  2632. movz x0,#0
  2633. movk x0,x,lsl #32
  2634. movk x0,y,lsl #48
  2635. str x0,[sp, #ofs]
  2636. Only use the stack pointer or frame pointer and an offset
  2637. that's a multiple of 8 though to guarantee alignment
  2638. }
  2639. if ((ThisRef.offset mod 8) = 0) and
  2640. GetNextInstruction(p, p_second) and
  2641. (p_second.typ = ait_instruction) then
  2642. begin
  2643. case taicpu(p_second).opcode of
  2644. A_MOVZ:
  2645. begin
  2646. ThisReg := taicpu(p_second).oper[0]^.reg;
  2647. if GetNextInstruction(p_second, hp1_second) and
  2648. MatchInstruction(hp1_second, A_MOVK, []) and
  2649. GetNextInstruction(hp1_second, hp2_second) and
  2650. MatchInstruction(hp2_second, A_STR, [PF_None]) and
  2651. (taicpu(hp2_second).oper[0]^.reg = ThisReg) then
  2652. begin
  2653. { Is the second storage location exactly four bytes ahead? }
  2654. Inc(ThisRef.offset, 4);
  2655. if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) and
  2656. { The final safety check... make sure the register used
  2657. to store the constant isn't used afterwards }
  2658. RegEndOfLife(ThisReg, taicpu(hp1_second)) then
  2659. begin
  2660. { Merge the constants }
  2661. DebugMsg(SPeepholeOptimization + 'Merged a zero-register word-write and a word-write to memory into a single extended-write (StrMovzMovkStr2MovzMovkMovkStr)', p);
  2662. setsubreg(ThisReg, R_SUBQ);
  2663. { Repurpose the first STR to a MOVZ instruction }
  2664. taicpu(p).opcode := A_MOVZ;
  2665. taicpu(p).oppostfix := PF_None;
  2666. taicpu(p).oper[0]^.reg := ThisReg;
  2667. taicpu(p).loadconst(1, 0);
  2668. { If the 3rd half-word (the MOVZ immediate, destined for bits 32..47) is zero, we can remove the instruction entirely }
  2669. if taicpu(p_second).oper[1]^.val = 0 then
  2670. RemoveInstruction(p_second)
  2671. else
  2672. begin
  2673. so.shiftmode := SM_LSL;
  2674. so.shiftimm := 32;
  2675. taicpu(p_second).opcode := A_MOVK;
  2676. taicpu(p_second).ops := 3;
  2677. taicpu(p_second).loadshifterop(2, so);
  2678. taicpu(p_second).oper[0]^.reg := ThisReg;
  2679. end;
  2680. taicpu(p).oper[0]^.reg := ThisReg;
  2681. taicpu(hp1_second).oper[0]^.reg := ThisReg;
  2682. taicpu(hp1_second).oper[2]^.shifterop^.shiftimm := 48;
  2683. { TODO: Confirm that the A_MOVZ / A_MOVK / A_MOVK combination is the most efficient }
  2684. taicpu(hp2_second).oppostfix := PF_None;
  2685. Dec(taicpu(hp2_second).oper[1]^.ref^.offset, 4);
  2686. taicpu(hp2_second).oper[0]^.reg := ThisReg; { Remember to change the register to its 64-bit counterpart }
  2687. Result := True;
  2688. end;
  2689. end;
  2690. end;
  2691. A_STR:
  2692. { Change:
  2693. str wzr,[sp, #ofs]
  2694. str wzr,[sp, #ofs+4]
  2695. To:
  2696. str xzr,[sp, #ofs]
  2697. }
  2698. if (taicpu(p_second).oppostfix = PF_None) and
  2699. (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
  2700. begin
  2701. { Is the second storage location exactly four bytes ahead? }
  2702. Inc(ThisRef.offset, 4);
  2703. if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
  2704. begin
  2705. DebugMsg(SPeepholeOptimization + 'Merged two zero-register word-writes to memory into a single zero-register extended-write (StrStr2Str)', p);
  2706. taicpu(p).oper[0]^.reg := NR_XZR;
  2707. RemoveInstruction(p_second);
  2708. Result := True;
  2709. end;
  2710. end;
  2711. else
  2712. ;
  2713. end;
  2714. end;
  2715. else
  2716. ;
  2717. end;
  2718. end;
  2719. end;
  2720. {$endif AARCH64}
  2721. else
  2722. ;
  2723. end;
  2724. end;
  2725. end.