aoptcpu.pas 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294
  1. {
  2. Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
  3. Development Team
  4. This unit implements the ARM64 optimizer object
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
  18. Unit aoptcpu;
  19. {$i fpcdefs.inc}
  20. {$ifdef EXTDEBUG}
  21. {$define DEBUG_AOPTCPU}
  22. {$endif EXTDEBUG}
  23. Interface
  24. uses
  25. globtype, globals,
  26. cutils,
  27. cgbase, cpubase, aasmtai, aasmcpu,
  28. aopt, aoptcpub, aoptarm;
Type
  { AArch64-specific peephole optimizer; extends the generic ARM optimizer
    base class with ARM64-only transformations }
  TCpuAsmOptimizer = class(TARMAsmOptimizer)
    { uses the same constructor as TAopObj }
    { entry points for the individual optimizer passes }
    function PrePeepHoleOptsCpu(var p: tai): boolean; override;
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    { register-tracking queries used by the generic optimizer machinery }
    function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
    function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
    { folds a following add/sub of the base register into a post-indexed ldr/str }
    function LookForPostindexedPattern(var p : tai) : boolean;
  public
    { With these routines, there's optimisation code that's general for all ARM platforms }
    function OptPass1LDR(var p: tai): Boolean; override;
    function OptPass1STR(var p: tai): Boolean; override;
  private
    { removes an fmov that merely copies a freshly written FP/vector register }
    function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
    { per-opcode handlers for pass 1, pass 2 and the post pass }
    function OptPass1Shift(var p: tai): boolean;
    function OptPostCMP(var p: tai): boolean;
    function OptPostAnd(var p: tai): Boolean;
    function OptPass1Data(var p: tai): boolean;
    function OptPass1FData(var p: tai): Boolean;
    function OptPass1STP(var p: tai): boolean;
    function OptPass1Mov(var p: tai): boolean;
    function OptPass1MOVZ(var p: tai): boolean;
    function OptPass1FMov(var p: tai): Boolean;
    function OptPass1B(var p: tai): boolean;
    function OptPass1SXTW(var p: tai): Boolean;
    function OptPass2LDRSTR(var p: tai): boolean;
  End;
  58. Implementation
  59. uses
  60. aasmbase,
  61. aoptutils,
  62. cgutils,
  63. verbose;
  64. {$ifdef DEBUG_AOPTCPU}
  65. const
  66. SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
  67. {$else DEBUG_AOPTCPU}
  68. { Empty strings help the optimizer to remove string concatenations that won't
  69. ever appear to the user on release builds. [Kit] }
  70. const
  71. SPeepholeOptimization = '';
  72. {$endif DEBUG_AOPTCPU}
  73. function CanBeCond(p : tai) : boolean;
  74. begin
  75. result:=(p.typ=ait_instruction) and (taicpu(p).condition=C_None);
  76. end;
  77. function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  78. var
  79. p: taicpu;
  80. begin
  81. Result := false;
  82. if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
  83. exit;
  84. p := taicpu(hp);
  85. case p.opcode of
  86. { These operations do not write into a register at all
  87. LDR/STR with post/pre-indexed operations do not need special treatment
  88. because post-/preindexed does not mean that a register
  89. is loaded with a new value, it is only modified }
  90. A_STR, A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
  91. exit;
  92. else
  93. ;
  94. end;
  95. if p.ops=0 then
  96. exit;
  97. case p.oper[0]^.typ of
  98. top_reg:
  99. Result := SuperRegistersEqual(p.oper[0]^.reg,reg);
  100. top_ref:
  101. Result :=
  102. (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
  103. (taicpu(p).oper[0]^.ref^.base = reg);
  104. else
  105. ;
  106. end;
  107. end;
  108. function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
  109. var
  110. p: taicpu;
  111. i: longint;
  112. begin
  113. instructionLoadsFromReg := false;
  114. if not (assigned(hp) and (hp.typ = ait_instruction)) then
  115. exit;
  116. p:=taicpu(hp);
  117. i:=1;
  118. { Start on oper[0]? }
  119. if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
  120. i:=0;
  121. while(i<p.ops) do
  122. begin
  123. case p.oper[I]^.typ of
  124. top_reg:
  125. Result := (p.oper[I]^.reg = reg);
  126. top_ref:
  127. Result :=
  128. (p.oper[I]^.ref^.base = reg) or
  129. (p.oper[I]^.ref^.index = reg);
  130. else
  131. ;
  132. end;
  133. { Bailout if we found something }
  134. if Result then
  135. exit;
  136. Inc(I);
  137. end;
  138. end;
{
  optimize
    ldr/str regX,[reg1]
    ...
    add/sub reg1,reg1,regY/const
  into
    ldr/str regX,[reg1], regY/const
}
function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
  var
    hp1 : tai;
  begin
    Result:=false;
    { p must address plain [reg1]: offset mode, no index, zero offset }
    if (taicpu(p).oper[1]^.typ = top_ref) and
      (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
      (taicpu(p).oper[1]^.ref^.index=NR_NO) and
      (taicpu(p).oper[1]^.ref^.offset=0) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
      { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
      MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
      { the add/sub must update the base register in place: reg1:=reg1+/-... }
      (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
      (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
      (
        { valid offset? post-indexed immediates must be small constants }
        (taicpu(hp1).oper[2]^.typ=top_const) and
        (taicpu(hp1).oper[2]^.val>=-256) and
        (abs(taicpu(hp1).oper[2]^.val)<256)
      ) and
      { don't apply the optimization if the base register is loaded }
      (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
      not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
      { NOTE(review): oper[2] is constrained to top_const above, so reading
        oper[2]^.reg here looks dubious — confirm this check is intended }
      not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
      begin
        if taicpu(p).opcode = A_LDR then
          DebugMsg(SPeepholeOptimization + 'LdrAdd/Sub2Ldr Postindex done', p)
        else
          DebugMsg(SPeepholeOptimization + 'StrAdd/Sub2Str Postindex done', p);
        { fold the add/sub into a post-indexed writeback and drop it }
        taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
        if taicpu(hp1).opcode=A_ADD then
          taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
        else
          taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
        asml.Remove(hp1);
        hp1.Free;
        Result:=true;
      end;
  end;
{ If movp is "fmov regB,regA" where p freshly wrote regA and regA dies at
  movp, rewrite p to target regB directly and delete the fmov. The register
  allocation/deallocation markers are moved accordingly. The "optimizer"
  string is only used for the debug message. }
function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string):boolean;
  var
    alloc,
    dealloc : tai_regalloc;
    hp1 : tai;
  begin
    Result:=false;
    if ((MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
         ((getregtype(taicpu(movp).oper[0]^.reg)=R_MMREGISTER) { or (taicpu(p).opcode in [A_LDUR])})
        ) { or
        (((taicpu(p).oppostfix in [PF_F64F32,PF_F64S16,PF_F64S32,PF_F64U16,PF_F64U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFD)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F64])) or
        (((taicpu(p).oppostfix in [PF_F32F64,PF_F32S16,PF_F32S32,PF_F32U16,PF_F32U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFS)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F32])) }
       ) and
      (taicpu(movp).ops=2) and
      MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
      { the destination register of the mov might not be used between p and movp }
      not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
      { Take care to only do this for instructions which REALLY load to the first register.
        Otherwise
          str reg0, [reg1]
          fmov reg2, reg0
        will be optimized to
          str reg2, [reg1]
      }
      RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
      begin
        { only proceed if the copied register is deallocated after the fmov,
          i.e. it really dies there }
        dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
        if assigned(dealloc) then
          begin
            DebugMsg(SPeepholeOptimization + optimizer+' removed superfluous vmov', movp);
            result:=true;
            { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
              and remove it if possible }
            asml.Remove(dealloc);
            alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                alloc.free;
                dealloc.free;
              end
            else
              { no matching allocation found: keep the deallocation, but
                right after p where the register's last write now happens }
              asml.InsertAfter(dealloc,p);
            { try to move the allocation of the target register }
            GetLastInstruction(movp,hp1);
            alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                asml.InsertBefore(alloc,p);
                { adjust used regs }
                IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
              end;
            { change
                vldr reg0,[reg1]
                vmov reg2,reg0
              into
                ldr reg2,[reg1]
              if reg2 is an int register
              if (taicpu(p).opcode=A_VLDR) and (getregtype(taicpu(movp).oper[0]^.reg)=R_INTREGISTER) then
                taicpu(p).opcode:=A_LDR;
            }
            { finally get rid of the mov }
            taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
            asml.remove(movp);
            movp.free;
          end;
      end;
  end;
  255. function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean;
  256. var
  257. hp1: tai;
  258. begin
  259. Result := False;
  260. if inherited OptPass1LDR(p) or
  261. LookForPostindexedPattern(p) then
  262. Exit(True)
  263. else if (taicpu(p).oppostfix in [PF_B,PF_SB,PF_H,PF_SH,PF_None]) and
  264. GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  265. RemoveSuperfluousMove(p, hp1, 'Ldr<Postfix>Mov2Ldr<Postfix>') then
  266. Exit(true);
  267. end;
  268. function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean;
  269. begin
  270. Result := False;
  271. if inherited OptPass1STR(p) or
  272. LookForPostindexedPattern(p) then
  273. Exit(True);
  274. end;
{ Pass-1 handler for LSL/LSR/ASR/ROR: folds an immediate shift into the
  shifter operand of a following data-processing instruction, or removes a
  superfluous mov after the shift. }
function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
  var
    hp1,hp2: tai;
    I2, I: Integer;
    shifterop: tshifterop;
  begin
    Result:=false;
    { This folds shifterops into following instructions
        <shiftop> r0, r1, #imm
        <op> r2, r3, r0
      to
        <op> r2, r3, r1, <shiftop> #imm
    }
    { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
    if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                             A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                             A_SUB, A_TST], [PF_None]) and
      RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
      (taicpu(hp1).ops >= 2) and
      { Currently we can't fold into another shifterop }
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
      { SP does not work completely with shifted registers, as I didn't find the exact rules,
        we do not operate on SP }
      (taicpu(hp1).oper[0]^.reg<>NR_SP) and
      (taicpu(hp1).oper[1]^.reg<>NR_SP) and
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
      { reg1 might not be modified inbetween }
      not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
      (
        { Only ONE of the two src operands is allowed to match }
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
      ) and
      { for SUB, the last operand must match, there is no RSB on AArch64 }
      ((taicpu(hp1).opcode<>A_SUB) or
       MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
      begin
        { for the two operand instructions, start also at the second operand as they are not always commutative
          (depends on the flags tested later on) and thus the operands cannot be swapped }
        I2:=1;
        for I:=I2 to taicpu(hp1).ops-1 do
          if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
            begin
              { If the parameter matched on the second op from the RIGHT
                we have to switch the parameters, this will not happen for CMP
                were we're only evaluating the most right parameter
              }
              { translate the shift opcode into the equivalent shifter mode }
              shifterop_reset(shifterop);
              case taicpu(p).opcode of
                A_LSL:
                  shifterop.shiftmode:=SM_LSL;
                A_ROR:
                  shifterop.shiftmode:=SM_ROR;
                A_LSR:
                  shifterop.shiftmode:=SM_LSR;
                A_ASR:
                  shifterop.shiftmode:=SM_ASR;
                else
                  InternalError(2019090401);
              end;
              shifterop.shiftimm:=taicpu(p).oper[2]^.val;
              { rebuild hp1 with the shift source register plus shifterop in
                place of the shift's destination register }
              if I <> taicpu(hp1).ops-1 then
                begin
                  if taicpu(hp1).ops = 3 then
                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                         taicpu(p).oper[1]^.reg, shifterop)
                  else
                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                         shifterop);
                end
              else
                if taicpu(hp1).ops = 3 then
                  hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                       taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                       taicpu(p).oper[1]^.reg,shifterop)
                else
                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                       taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                       shifterop);
              { Make sure the register used in the shifting is tracked all
                the way through, otherwise it may become deallocated while
                it's still live and cause incorrect optimisations later }
              if (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[1]^.reg) then
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  ALlocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, TmpUsedRegs);
                end;
              taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
              asml.insertbefore(hp2, hp1);
              RemoveInstruction(hp1);
              RemoveCurrentp(p);
              DebugMsg(SPeepholeOptimization + 'FoldShiftProcess done', hp2);
              Result:=true;
              break;
            end;
      end
    else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
      Result:=true;
  end;
  382. function TCpuAsmOptimizer.OptPass1Data(var p : tai): boolean;
  383. var
  384. hp1: tai;
  385. begin
  386. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  387. RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
  388. end;
  389. function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
  390. var
  391. hp1: tai;
  392. begin
  393. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  394. RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
  395. end;
{ Pass-1 handler for STP: turns a full frame setup around a single call
  followed by frame teardown and ret into a tail-call branch. }
function TCpuAsmOptimizer.OptPass1STP(var p : tai): boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
    {
      change
        stp x29,x30,[sp, #-16]!
        mov x29,sp
        bl abc
        ldp x29,x30,[sp], #16
        ret
      into
        b abc
    }
    { hp1=mov, hp2=bl, hp3=ldp, hp4=ret; every operand of the prologue and
      epilogue is matched exactly so the frame is known to be untouched }
    if MatchInstruction(p, A_STP, [C_None], [PF_None]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_ref) and
      (taicpu(p).oper[0]^.reg = NR_X29) and
      (taicpu(p).oper[1]^.reg = NR_X30) and
      (taicpu(p).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[2]^.ref^.index=NR_NO) and
      (taicpu(p).oper[2]^.ref^.offset=-16) and
      (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
      GetNextInstruction(hp1, hp2) and
      SkipEntryExitMarker(hp2, hp2) and
      MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
      (taicpu(hp2).oper[0]^.typ = top_ref) and
      GetNextInstruction(hp2, hp3) and
      SkipEntryExitMarker(hp3, hp3) and
      MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
      MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
      (taicpu(hp3).oper[0]^.reg = NR_X29) and
      (taicpu(hp3).oper[1]^.reg = NR_X30) and
      (taicpu(hp3).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp3).oper[2]^.ref^.index=NR_NO) and
      (taicpu(hp3).oper[2]^.ref^.offset=16) and
      (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
      GetNextInstruction(hp3, hp4) and
      MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
      (taicpu(hp4).ops = 0) then
      begin
        { drop prologue, epilogue and ret; convert the call into a branch }
        asml.Remove(p);
        asml.Remove(hp1);
        asml.Remove(hp3);
        asml.Remove(hp4);
        taicpu(hp2).opcode:=A_B;
        p.free;
        hp1.free;
        hp3.free;
        hp4.free;
        p:=hp2;
        DebugMsg(SPeepholeOptimization + 'Bl2B done', p);
        Result:=true;
      end;
  end;
{ Pass-1 handler for MOV: removes self-moves, folds a 32-bit mov into a
  following add/sub as a UXTW extended operand, and removes redundant mov
  sequences. }
function TCpuAsmOptimizer.OptPass1Mov(var p : tai): boolean;
  var
    hp1: tai;
    so: tshifterop;
  begin
    Result:=false;
    { mov reg,reg with no postfix is a no-op }
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
      (taicpu(p).oppostfix=PF_None) then
      begin
        RemoveCurrentP(p);
        { NOTE(review): p already points at the next instruction here, so the
          message is attached to the successor — confirm this is intended }
        DebugMsg(SPeepholeOptimization + 'Mov2None done', p);
        Result:=true;
      end
    { mov wD,wS + add/sub xR,xA,xD -> add/sub xR,xA,wS,UXTW
      (the 32-bit mov zero-extends, which UXTW reproduces) }
    else if (taicpu(p).ops=2) and
      (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBD) and
      GetNextInstruction(p, hp1) and
      { Faster to get it out of the way than go through MatchInstruction }
      (hp1.typ=ait_instruction) and
      (taicpu(hp1).ops=3) and
      MatchInstruction(hp1,[A_ADD,A_SUB],[taicpu(p).condition], [PF_None,PF_S]) and
      { NOTE(review): oper[2] of the add/sub may also be a constant —
        getsubreg on a non-register operand looks unchecked here; verify }
      (getsubreg(taicpu(hp1).oper[2]^.reg)=R_SUBQ) and
      (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[2]^.reg)) and
      RegEndOfLife(taicpu(hp1).oper[2]^.reg,taicpu(hp1)) then
      begin
        DebugMsg(SPeepholeOptimization + 'MovOp2AddUtxw 1 done', p);
        shifterop_reset(so);
        so.shiftmode:=SM_UXTW;
        taicpu(hp1).ops:=4;
        taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
        taicpu(hp1).loadshifterop(3,so);
        RemoveCurrentP(p);
        Result:=true;
        exit;
      end
    {
      optimize
        mov rX, yyyy
        ....
    }
    else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
      begin
        if RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
          Result:=true
        else if (taicpu(p).ops = 2) and
          (tai(hp1).typ = ait_instruction) and
          RedundantMovProcess(p,hp1) then
          Result:=true
      end;
  end;
{ Pass-1 handler for MOVZ: rewrites "movz reg,#0" to a zero-register mov
  (unless a fusable MOVK/MOVN follows) and removes a dead movz that is
  overwritten by a later movz to the same register. }
function TCpuAsmOptimizer.OptPass1MOVZ(var p: tai): boolean;
  var
    hp1: tai;
    ZeroReg: TRegister;
  begin
    Result := False;
    hp1 := nil;
    if (taicpu(p).oppostfix = PF_None) and (taicpu(p).condition = C_None) then
      begin
        if
          { Check next instruction first so hp1 gets set to something, then
            if it remains nil, we know for sure that there's no valid next
            instruction. }
          not GetNextInstruction(p, hp1) or
          { MOVZ and MOVK/MOVN instructions undergo macro-fusion. }
          not MatchInstruction(hp1, [A_MOVK, A_MOVN], [C_None], [PF_None]) or
          (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[0]^.reg) then
          begin
            if (taicpu(p).oper[1]^.val = 0) then
              begin
                { Change;
                    movz reg,#0
                    (no movk or movn)
                  To:
                    mov reg,xzr (or wzr)
                  Easier to perform other optimisations with registers
                }
                DebugMsg(SPeepholeOptimization + 'Movz0ToMovZeroReg', p);
                { Make sure the zero register is the correct size }
                ZeroReg := taicpu(p).oper[0]^.reg;
                setsupreg(ZeroReg, RS_XZR);
                taicpu(p).opcode := A_MOV;
                taicpu(p).loadreg(1, ZeroReg);
                Result := True;
                Exit;
              end;
          end;
        {
          remove the first (dead) movz from
            movz reg,...
            movz reg,...
          (hp1 is the next instruction using reg, so nothing in between
          reads reg and the first write is never observed)
        }
        if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
          MatchInstruction(hp1,A_MOVZ,[C_None],[PF_none]) and
          MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovzMovz2Movz', p);
            RemoveCurrentP(p);
            Result:=true;
            exit;
          end;
      end;
  end;
  558. function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
  559. var
  560. hp1: tai;
  561. alloc, dealloc: tai_regalloc;
  562. begin
  563. {
  564. change
  565. fmov reg0,reg1
  566. fmov reg1,reg0
  567. into
  568. fmov reg0,reg1
  569. }
  570. Result := False;
  571. while GetNextInstruction(p, hp1) and
  572. MatchInstruction(hp1, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
  573. MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
  574. MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) do
  575. begin
  576. asml.Remove(hp1);
  577. hp1.free;
  578. DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov 1 done', p);
  579. Result:=true;
  580. end;
  581. { change
  582. fmov reg0,const
  583. fmov reg1,reg0
  584. dealloc reg0
  585. into
  586. fmov reg1,const
  587. }
  588. if MatchOpType(taicpu(p),top_reg,top_realconst) and
  589. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  590. (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
  591. MatchInstruction(hp1,A_FMOV,[taicpu(p).condition],[taicpu(p).oppostfix]) and
  592. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  593. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^.reg) and
  594. (not RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1)) and
  595. assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next)))
  596. then
  597. begin
  598. DebugMsg('Peephole FMovFMov2FMov 2 done', p);
  599. taicpu(hp1).loadrealconst(1,taicpu(p).oper[1]^.val_real);
  600. alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.Previous));
  601. dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next));
  602. if assigned(alloc) and assigned(dealloc) then
  603. begin
  604. asml.Remove(alloc);
  605. alloc.Free;
  606. asml.Remove(dealloc);
  607. dealloc.Free;
  608. end;
  609. { p will be removed, update used register as we continue
  610. with the next instruction after p }
  611. result:=RemoveCurrentP(p);
  612. end;
  613. { not enabled as apparently not happening
  614. if MatchOpType(taicpu(p),top_reg,top_reg) and
  615. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  616. MatchInstruction(hp1, [A_FSUB,A_FADD,A_FNEG,A_FMUL,A_FSQRT,A_FDIV,A_FABS], [PF_None]) and
  617. (MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) or
  618. ((taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^))
  619. ) and
  620. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  621. not(RegUsedBetween(taicpu(p).oper[0]^.reg,p,hp1)) then
  622. begin
  623. DebugMsg(SPeepholeOptimization + 'FMovFOp2FOp done', hp1);
  624. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
  625. if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
  626. taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
  627. if (taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^) then
  628. taicpu(hp1).oper[2]^.reg:=taicpu(p).oper[1]^.reg;
  629. RemoveCurrentP(p);
  630. Result:=true;
  631. exit;
  632. end;
  633. }
  634. end;
  635. function TCpuAsmOptimizer.OptPass1SXTW(var p : tai) : Boolean;
  636. var
  637. hp1: tai;
  638. GetNextInstructionUsingReg_hp1: Boolean;
  639. begin
  640. Result:=false;
  641. if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) then
  642. begin
  643. {
  644. change
  645. sxtw reg2,reg1
  646. str reg2,[...]
  647. dealloc reg2
  648. to
  649. str reg1,[...]
  650. }
  651. if MatchInstruction(p, taicpu(p).opcode, [C_None], [PF_None]) and
  652. (taicpu(p).ops=2) and
  653. MatchInstruction(hp1, A_STR, [C_None], [PF_None]) and
  654. (getsubreg(taicpu(hp1).oper[0]^.reg)=R_SUBD) and
  655. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  656. { the reference in strb might not use reg2 }
  657. not(RegInRef(taicpu(p).oper[0]^.reg,taicpu(hp1).oper[1]^.ref^)) and
  658. { reg1 might not be modified inbetween }
  659. not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
  660. begin
  661. DebugMsg('Peephole SXTHStr2Str done', p);
  662. taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
  663. result:=RemoveCurrentP(p);
  664. end
  665. {
  666. change
  667. sxtw reg2,reg1
  668. sxtw reg3,reg2
  669. dealloc reg2
  670. to
  671. sxtw reg3,reg1
  672. }
  673. else if MatchInstruction(p, A_SXTW, [C_None], [PF_None]) and
  674. (taicpu(p).ops=2) and
  675. MatchInstruction(hp1, A_SXTW, [C_None], [PF_None]) and
  676. (taicpu(hp1).ops=2) and
  677. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
  678. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  679. { reg1 might not be modified inbetween }
  680. not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
  681. begin
  682. DebugMsg('Peephole SxtwSxtw2Sxtw done', p);
  683. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
  684. taicpu(hp1).opcode:=A_SXTW;
  685. taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
  686. result:=RemoveCurrentP(p);
  687. end
  688. else if USxtOp2Op(p,hp1,SM_SXTW) then
  689. Result:=true
  690. else if RemoveSuperfluousMove(p, hp1, 'SxtwMov2Data') then
  691. Result:=true;
  692. end;
  693. end;
  694. function TCpuAsmOptimizer.OptPass1B(var p: tai): boolean;
  695. var
  696. hp1, hp2, hp3, hp4, hp5: tai;
  697. Invert: Boolean;
  698. begin
  699. Result := False;
  700. {
  701. convert
  702. b<c> .L1
  703. movz reg,#1`
  704. b .L2
  705. .L1
  706. movz reg,#0 (or mov reg,xzr)
  707. .L2
  708. into
  709. cset reg,<not(c)>
  710. Also do the same if the constants are reversed, instead converting it to:
  711. cset reg,<c>
  712. }
  713. if (taicpu(p).condition <> C_None) and
  714. (taicpu(p).oper[0]^.typ = top_ref) and
  715. GetNextInstruction(p, hp1) and
  716. { Check individually instead of using MatchInstruction in order to save time }
  717. (hp1.typ = ait_instruction) and
  718. (taicpu(hp1).condition = C_None) and
  719. (taicpu(hp1).oppostfix = PF_None) and
  720. (taicpu(hp1).ops = 2) and
  721. (
  722. (
  723. (taicpu(hp1).opcode = A_MOVZ) and
  724. (taicpu(hp1).oper[1]^.val in [0, 1])
  725. ) or
  726. (
  727. (taicpu(hp1).opcode = A_MOV) and
  728. (getsupreg(taicpu(hp1).oper[1]^.reg) = RS_XZR)
  729. )
  730. ) and
  731. GetNextInstruction(hp1, hp2) and
  732. MatchInstruction(hp2, A_B, [PF_None]) and
  733. (taicpu(hp2).condition = C_None) and
  734. (taicpu(hp2).oper[0]^.typ = top_ref) and
  735. GetNextInstruction(hp2, hp3) and
  736. SkipAligns(hp3, hp3) and
  737. (hp3.typ = ait_label) and
  738. (tasmlabel(taicpu(p).oper[0]^.ref^.symbol) = tai_label(hp3).labsym) and
  739. GetNextInstruction(hp3, hp4) and
  740. { As before, check individually instead of using MatchInstruction in order to save time }
  741. (hp4.typ = ait_instruction) and
  742. (taicpu(hp4).condition = C_None) and
  743. (taicpu(hp4).oppostfix = PF_None) and
  744. (taicpu(hp4).ops = 2) and
  745. (taicpu(hp4).oper[0]^.reg = taicpu(hp1).oper[0]^.reg) and
  746. (
  747. (
  748. (taicpu(hp4).opcode = A_MOVZ) and
  749. (
  750. (
  751. { Check to confirm the following:
  752. - First mov is either "movz reg,#0" or "mov reg,xzr"
  753. - Second mov is "movz reg,#1"
  754. }
  755. (
  756. (taicpu(hp1).oper[1]^.typ = top_reg) { Will be the zero register } or
  757. (taicpu(hp1).oper[1]^.val = 0)
  758. ) and
  759. (taicpu(hp4).oper[1]^.val = 1)
  760. ) or
  761. (
  762. { Check to confirm the following:
  763. - First mov is "movz reg,#1"
  764. - Second mov is "movz reg,#0"
  765. }
  766. MatchOperand(taicpu(hp1).oper[1]^, 1) and
  767. (taicpu(hp4).oper[1]^.val = 0)
  768. )
  769. )
  770. ) or
  771. (
  772. { Check to confirm the following:
  773. - First mov is "movz reg,#1"
  774. - Second mov is "mov reg,xzr"
  775. }
  776. (taicpu(hp4).opcode = A_MOV) and
  777. (getsupreg(taicpu(hp4).oper[1]^.reg) = RS_XZR) and
  778. MatchOperand(taicpu(hp1).oper[1]^, 1)
  779. )
  780. ) and
  781. GetNextInstruction(hp4, hp5) and
  782. SkipAligns(hp5, hp5) and
  783. (hp5.typ = ait_label) and
  784. (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol) = tai_label(hp5).labsym) then
  785. begin
  786. Invert := MatchOperand(taicpu(hp1).oper[1]^, 1); { if true, hp4 will be mov reg,0 in some form }
  787. if Invert then
  788. taicpu(p).condition := inverse_cond(taicpu(p).condition);
  789. tai_label(hp3).labsym.DecRefs;
  790. { If this isn't the only reference to the middle label, we can
  791. still make a saving - only that the first jump and everything
  792. that follows will remain. }
  793. if (tai_label(hp3).labsym.getrefs = 0) then
  794. begin
  795. if Invert then
  796. DebugMsg(SPeepholeOptimization + 'B(c)Movz1BMovz0 -> Cset(~c)',p)
  797. else
  798. DebugMsg(SPeepholeOptimization + 'B(c)Movz0bMovZ1 -> Cset(c)',p);
  799. { remove jump, first label and second MOV (also catching any aligns) }
  800. repeat
  801. if not GetNextInstruction(hp2, hp3) then
  802. InternalError(2022070801);
  803. RemoveInstruction(hp2);
  804. hp2 := hp3;
  805. until hp2 = hp5;
  806. { Don't decrement reference count before the removal loop
  807. above, otherwise GetNextInstruction won't stop on the
  808. the label }
  809. tai_label(hp5).labsym.DecRefs;
  810. end
  811. else
  812. begin
  813. if Invert then
  814. DebugMsg(SPeepholeOptimization + 'B(c)Movz1BMovz0 -> Cset(~c) (partial)',p)
  815. else
  816. DebugMsg(SPeepholeOptimization + 'B(c)Movz0BMovz1 -> Cset(c) (partial)',p);
  817. end;
  818. taicpu(hp1).opcode := A_CSET;
  819. taicpu(hp1).loadconditioncode(1, taicpu(p).condition);
  820. RemoveCurrentP(p, hp1);
  821. Result:=true;
  822. exit;
  823. end;
  824. end;
{ Pass-2 optimisation for LDR/STR: merges two adjacent same-direction,
  same-base memory accesses whose offsets differ by exactly one register
  width into a single LDP/STP.  Returns True if a merge was performed. }
function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
  var
    hp1, hp1_last: tai;
    ThisRegister: TRegister;
    OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
    TargetOpcode: TAsmOp;
  begin
    Result := False;
    ThisRegister := taicpu(p).oper[0]^.reg;
    case taicpu(p).opcode of
      A_LDR:
        TargetOpcode := A_LDP;
      A_STR:
        TargetOpcode := A_STP;
      else
        InternalError(2020081501);
    end;
    { reg appearing in ref invalidates these optimisations }
    if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
      begin
        { LDP/STP has a smaller permitted offset range than LDR/STR.
          TODO: For a group of out-of-range LDR/STR instructions, can
          we declare a temporary register equal to the offset base
          address, modify the STR instructions to use that register
          and then convert them to STP instructions?  Note that STR
          generally takes 2 cycles (on top of the memory latency),
          while LDP/STP takes 3.
        }
        { ValidOffset = required distance between the two accesses;
          MinOffset/MaxOffset = signed offset range LDP/STP accepts }
        if (getsubreg(ThisRegister) = R_SUBQ) then
          begin
            { 64-bit registers }
            ValidOffset := 8;
            MinOffset := -512;
            MaxOffset := 504;
          end
        else
          begin
            { 32-bit registers }
            ValidOffset := 4;
            MinOffset := -256;
            MaxOffset := 252;
          end;
        hp1_last := p;
        { Look for nearby LDR/STR instructions }
        if (taicpu(p).oppostfix = PF_NONE) and
          (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
          { If SkipGetNext is True, GextNextInstruction isn't called }
          while GetNextInstruction(hp1_last, hp1) do
            begin
              if (hp1.typ <> ait_instruction) then
                Break;
              if (taicpu(hp1).opcode = taicpu(p).opcode) then
                begin
                  if (taicpu(hp1).oppostfix = PF_NONE) and
                    { Registers need to be the same size }
                    (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                    (
                      (TargetOpcode = A_STP) or
                      { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                        though such an LDR pair should have been optimised
                        out by now.  STP is okay }
                      (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                    ) and
                    (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                    (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                    (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                    { Make sure the address registers haven't changed }
                    not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                    (
                      (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                    ) and
                    { Don't need to check "RegInRef" because the base registers are identical,
                      and the first one was checked already. [Kit] }
                    (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                     ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                    begin
                      { Can we convert these two LDR/STR instructions into a
                        single LDR/STP? }
                      OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                      if (OffsetVal = ValidOffset) then
                        begin
                          { second access lies one register width above the first }
                          if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                            begin
                              { Convert:
                                  LDR/STR reg0, [reg2, #ofs]
                                  ...
                                  LDR/STR reg1. [reg2, #ofs + 8] // 4 if registers are 32-bit
                                To:
                                  LDP/STP reg0, reg1, [reg2, #ofs]
                              }
                              taicpu(p).opcode := TargetOpcode;
                              if TargetOpcode = A_STP then
                                DebugMsg(SPeepholeOptimization + 'StrStr2Stp', p)
                              else
                                DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp', p);
                              taicpu(p).ops := 3;
                              taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                              taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                              asml.Remove(hp1);
                              hp1.Free;
                              Result := True;
                              Exit;
                            end;
                        end
                      else if (OffsetVal = -ValidOffset) then
                        begin
                          { second access lies one register width below the first }
                          if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                            begin
                              { Convert:
                                  LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                  ...
                                  LDR/STR reg1. [reg2, #ofs]
                                To:
                                  LDP/STP reg1, reg0, [reg2, #ofs]
                              }
                              taicpu(p).opcode := TargetOpcode;
                              if TargetOpcode = A_STP then
                                DebugMsg(SPeepholeOptimization + 'StrStr2Stp (reverse)', p)
                              else
                                DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp (reverse)', p);
                              taicpu(p).ops := 3;
                              taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                              taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                              taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                              asml.Remove(hp1);
                              hp1.Free;
                              Result := True;
                              Exit;
                            end;
                        end;
                    end;
                end
              else
                Break;
              { Don't continue looking for LDR/STR pairs if the address register
                gets modified }
              if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
                Break;
              hp1_last := hp1;
            end;
      end;
  end;
{ Post-pass optimisation for AND: folds an AND with a single-bit mask
  followed by a compare-with-zero and a conditional branch into a single
  TBZ/TBNZ bit-test branch.  Returns True if the fold was performed. }
function TCpuAsmOptimizer.OptPostAnd(var p: tai): Boolean;
  var
    hp1, hp2: tai;
    hp3: taicpu;
    bitval : cardinal;
  begin
    Result:=false;
    {
      and reg1,reg0,<const=power of 2>
      cmp reg1,#0
      <reg1 end of life>
      b.e/b.ne label
      into
      tb(n)z reg0,<power of 2>,label
    }
    if MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
      { the AND mask must have exactly one bit set }
      (PopCnt(QWord(taicpu(p).oper[2]^.val))=1) and
      GetNextInstruction(p,hp1) and
      MatchInstruction(hp1,A_CMP,[PF_None]) and
      MatchOpType(taicpu(hp1),top_reg,top_const) and
      (taicpu(hp1).oper[1]^.val=0) and
      { the compare must test the AND's destination register }
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
      { the masked value must not be used after the compare }
      RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
      GetNextInstruction(hp1,hp2) and
      MatchInstruction(hp2,A_B,[PF_None]) and
      (taicpu(hp2).condition in [C_EQ,C_NE]) then
      begin
        { index of the single set bit in the mask }
        bitval:=BsfQWord(qword(taicpu(p).oper[2]^.val));
        case taicpu(hp2).condition of
          C_NE:
            hp3:=taicpu.op_reg_const_ref(A_TBNZ,taicpu(p).oper[1]^.reg,bitval,taicpu(hp2).oper[0]^.ref^);
          C_EQ:
            hp3:=taicpu.op_reg_const_ref(A_TBZ,taicpu(p).oper[1]^.reg,bitval,taicpu(hp2).oper[0]^.ref^);
          else
            Internalerror(2021100201);
        end;
        taicpu(hp3).fileinfo:=taicpu(hp1).fileinfo;
        asml.insertbefore(hp3, hp1);
        RemoveInstruction(hp1);
        RemoveInstruction(hp2);
        { advances p to the newly inserted TBZ/TBNZ }
        RemoveCurrentP(p);
        DebugMsg(SPeepholeOptimization + 'AndCmpB.E/NE2Tbnz/Tbz done', p);
        Result:=true;
      end;
  end;
  1011. function TCpuAsmOptimizer.OptPostCMP(var p : tai): boolean;
  1012. var
  1013. hp1,hp2: tai;
  1014. begin
  1015. Result:=false;
  1016. {
  1017. cmp reg0,#0
  1018. b.e/b.ne label
  1019. into
  1020. cb(n)z reg0,label
  1021. }
  1022. if MatchOpType(taicpu(p),top_reg,top_const) and
  1023. (taicpu(p).oper[0]^.reg<>NR_SP) and
  1024. (taicpu(p).oper[1]^.val=0) and
  1025. GetNextInstruction(p,hp1) and
  1026. MatchInstruction(hp1,A_B,[PF_None]) and
  1027. (taicpu(hp1).condition in [C_EQ,C_NE]) then
  1028. begin
  1029. case taicpu(hp1).condition of
  1030. C_NE:
  1031. hp2:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
  1032. C_EQ:
  1033. hp2:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
  1034. else
  1035. Internalerror(2019090801);
  1036. end;
  1037. taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
  1038. asml.insertbefore(hp2, hp1);
  1039. asml.remove(p);
  1040. asml.remove(hp1);
  1041. p.free;
  1042. hp1.free;
  1043. p:=hp2;
  1044. DebugMsg(SPeepholeOptimization + 'CMPB.E/NE2CBNZ/CBZ done', p);
  1045. Result:=true;
  1046. end;
  1047. end;
  1048. function TCpuAsmOptimizer.PrePeepHoleOptsCpu(var p: tai): boolean;
  1049. begin
  1050. result := false;
  1051. if p.typ=ait_instruction then
  1052. begin
  1053. case taicpu(p).opcode of
  1054. A_SBFX,
  1055. A_UBFX:
  1056. Result:=OptPreSBFXUBFX(p);
  1057. else
  1058. ;
  1059. end;
  1060. end;
  1061. end;
  1062. function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
  1063. begin
  1064. result := false;
  1065. if p.typ=ait_instruction then
  1066. begin
  1067. case taicpu(p).opcode of
  1068. A_B:
  1069. Result:=OptPass1B(p);
  1070. A_LDR:
  1071. Result:=OptPass1LDR(p);
  1072. A_STR:
  1073. Result:=OptPass1STR(p);
  1074. A_MOV:
  1075. Result:=OptPass1Mov(p);
  1076. A_MOVZ:
  1077. Result:=OptPass1MOVZ(p);
  1078. A_STP:
  1079. Result:=OptPass1STP(p);
  1080. A_LSR,
  1081. A_ROR,
  1082. A_ASR,
  1083. A_LSL:
  1084. Result:=OptPass1Shift(p);
  1085. A_AND:
  1086. Result:=OptPass1And(p);
  1087. A_NEG,
  1088. A_CSEL,
  1089. A_ADD,
  1090. A_ADC,
  1091. A_SUB,
  1092. A_SBC,
  1093. A_BIC,
  1094. A_EOR,
  1095. A_ORR,
  1096. A_MUL:
  1097. Result:=OptPass1Data(p);
  1098. A_UXTB:
  1099. Result:=OptPass1UXTB(p);
  1100. A_UXTH:
  1101. Result:=OptPass1UXTH(p);
  1102. A_SXTB:
  1103. Result:=OptPass1SXTB(p);
  1104. A_SXTH:
  1105. Result:=OptPass1SXTH(p);
  1106. A_SXTW:
  1107. Result:=OptPass1SXTW(p);
  1108. // A_VLDR,
  1109. A_FMADD,
  1110. A_FMSUB,
  1111. A_FNMADD,
  1112. A_FNMSUB,
  1113. A_FNMUL,
  1114. A_FADD,
  1115. A_FMUL,
  1116. A_FDIV,
  1117. A_FSUB,
  1118. A_FSQRT,
  1119. A_FNEG,
  1120. A_FCVT,
  1121. A_FABS:
  1122. Result:=OptPass1FData(p);
  1123. A_FMOV:
  1124. Result:=OptPass1FMov(p);
  1125. else
  1126. ;
  1127. end;
  1128. end;
  1129. end;
  1130. function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
  1131. begin
  1132. result := false;
  1133. if p.typ=ait_instruction then
  1134. begin
  1135. case taicpu(p).opcode of
  1136. A_LDR,
  1137. A_STR:
  1138. Result:=OptPass2LDRSTR(p);
  1139. else
  1140. ;
  1141. end;
  1142. end;
  1143. end;
  1144. function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
  1145. begin
  1146. result := false;
  1147. if p.typ=ait_instruction then
  1148. begin
  1149. case taicpu(p).opcode of
  1150. A_CMP:
  1151. Result:=OptPostCMP(p);
  1152. A_AND:
  1153. Result:=OptPostAnd(p);
  1154. else
  1155. ;
  1156. end;
  1157. end;
  1158. end;
begin
  { Register this class as the assembler optimizer for the current target }
  casmoptimizer:=TCpuAsmOptimizer;
End.