{
    Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
    Development Team

    This unit implements the ARM64 optimizer object

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
Unit aoptcpu;

{$i fpcdefs.inc}

{ $define DEBUG_AOPTCPU}

Interface

uses
  globtype, globals,
  cutils,
  cgbase, cpubase, aasmtai, aasmcpu,
  aopt, aoptcpub, aoptarm;

Type
  TCpuAsmOptimizer = class(TARMAsmOptimizer)
    { uses the same constructor as TAopObj }
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean; override;
    function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean; override;
    function LookForPostindexedPattern(var p: tai): boolean;
  private
    function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
    function OptPass1Shift(var p: tai): boolean;
    function OptPostCMP(var p: tai): boolean;
    function OptPass1Data(var p: tai): boolean;
    function OptPass1FData(var p: tai): Boolean;
    function OptPass1STP(var p: tai): boolean;
    function OptPass1Mov(var p: tai): boolean;
    function OptPass1FMov(var p: tai): Boolean;
    function OptPass2LDRSTR(var p: tai): boolean;
  End;
Implementation

uses
  aasmbase,
  aoptutils,
  cgutils,
  verbose;

{$ifdef DEBUG_AOPTCPU}
  const
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
  { Empty strings help the optimizer to remove string concatenations that won't
    ever appear to the user on release builds. [Kit] }
  const
    SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  function CanBeCond(p: tai): boolean;
    begin
      result:=(p.typ=ait_instruction) and (taicpu(p).condition=C_None);
    end;
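
  { Returns true if the instruction hp overwrites reg with a new value, i.e.
    reg is the destination (first) operand of hp. Compares, branches, MSR,
    FCMP and plain stores never do. }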
  function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
    var
      p: taicpu;
    begin
      Result := false;
      if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
        exit;
      p := taicpu(hp);

      case p.opcode of
        { These instructions do not write into a register at all }
        A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
          exit;
        { Take care of post-/preindexed stores and loads, they will change their base register }
        A_STR, A_LDR:
          begin
            Result := false;
            { actually, this does not apply here because post-/preindexed does not mean that a register
              is loaded with a new value, it is only modified
              (taicpu(p).oper[1]^.typ=top_ref) and
              (taicpu(p).oper[1]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
              (taicpu(p).oper[1]^.ref^.base = reg);
            }
            { STR does not load into its first register }
            if p.opcode = A_STR then
              exit;
          end;
        else
          ;
      end;

      if Result then
        exit;

      case p.oper[0]^.typ of
        top_reg:
          Result := (p.oper[0]^.reg = reg);
        top_ref:
          Result :=
            (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
            (taicpu(p).oper[0]^.ref^.base = reg);
        else
          ;
      end;
    end;
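
  { Returns true if the instruction hp reads reg, either directly as a source
    register operand or as the base or index register of a memory reference. }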
  function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
    var
      p: taicpu;
      i: longint;
    begin
      instructionLoadsFromReg := false;
      if not (assigned(hp) and (hp.typ = ait_instruction)) then
        exit;
      p:=taicpu(hp);

      i:=1;
      { Start on oper[0]? }
      if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
        i:=0;

      while(i<p.ops) do
        begin
          case p.oper[I]^.typ of
            top_reg:
              Result := (p.oper[I]^.reg = reg);
            top_ref:
              Result :=
                (p.oper[I]^.ref^.base = reg) or
                (p.oper[I]^.ref^.index = reg);
            else
              ;
          end;
          { Bail out if we found something }
          if Result then
            exit;
          Inc(I);
        end;
    end;
  {
    optimize
      ldr/str regX,[reg1]
      ...
      add/sub reg1,reg1,regY/const
    into
      ldr/str regX,[reg1], regY/const
  }
  function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai): boolean;
    var
      hp1 : tai;
    begin
      Result:=false;
      if (taicpu(p).oper[1]^.typ = top_ref) and
        (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
        (taicpu(p).oper[1]^.ref^.index=NR_NO) and
        (taicpu(p).oper[1]^.ref^.offset=0) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
        { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
        MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
        (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
        (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
        (
          { valid offset? }
          (taicpu(hp1).oper[2]^.typ=top_const) and
          (taicpu(hp1).oper[2]^.val>=-256) and
          (abs(taicpu(hp1).oper[2]^.val)<256)
        ) and
        { don't apply the optimization if the base register is loaded }
        (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
        not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
        not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
        begin
          if taicpu(p).opcode = A_LDR then
            DebugMsg('Peephole LdrAdd/Sub2Ldr Postindex done', p)
          else
            DebugMsg('Peephole StrAdd/Sub2Str Postindex done', p);

          taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
          if taicpu(hp1).opcode=A_ADD then
            taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
          else
            taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;

          asml.Remove(hp1);
          hp1.Free;
          Result:=true;
        end;
    end;
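
  { Removes an FMOV (movp) that merely copies the result of p into another MM
    register: if that destination is unused between p and movp, p really writes
    its first operand, and p's original destination dies right after movp, then
    p is retargeted to write the FMOV's destination directly and movp is
    removed. }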
  function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
    var
      alloc,
      dealloc : tai_regalloc;
      hp1 : tai;
    begin
      Result:=false;
      if ((MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
           ((getregtype(taicpu(movp).oper[0]^.reg)=R_MMREGISTER) { or (taicpu(p).opcode in [A_LDUR])})
          ) { or
          (((taicpu(p).oppostfix in [PF_F64F32,PF_F64S16,PF_F64S32,PF_F64U16,PF_F64U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFD)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F64])) or
          (((taicpu(p).oppostfix in [PF_F32F64,PF_F32S16,PF_F32S32,PF_F32U16,PF_F32U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFS)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F32])) }
         ) and
         (taicpu(movp).ops=2) and
         MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
         { the destination register of the mov must not be used between p and movp }
         not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
         { Take care to only do this for instructions which REALLY load to the first register.
           Otherwise
             str reg0, [reg1]
             fmov reg2, reg0
           will be optimized to
             str reg2, [reg1]
         }
         RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
        begin
          dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
          if assigned(dealloc) then
            begin
              DebugMsg('Peephole '+optimizer+' removed superfluous fmov', movp);
              result:=true;

              { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
                and remove it if possible }
              asml.Remove(dealloc);
              alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
              if assigned(alloc) then
                begin
                  asml.Remove(alloc);
                  alloc.free;
                  dealloc.free;
                end
              else
                asml.InsertAfter(dealloc,p);

              { try to move the allocation of the target register }
              GetLastInstruction(movp,hp1);
              alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
              if assigned(alloc) then
                begin
                  asml.Remove(alloc);
                  asml.InsertBefore(alloc,p);
                  { adjust used regs }
                  IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
                end;

              { change
                  vldr reg0,[reg1]
                  vmov reg2,reg0
                into
                  ldr reg2,[reg1]
                if reg2 is an int register
              if (taicpu(p).opcode=A_VLDR) and (getregtype(taicpu(movp).oper[0]^.reg)=R_INTREGISTER) then
                taicpu(p).opcode:=A_LDR;
              }

              { finally get rid of the mov }
              taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
              asml.remove(movp);
              movp.free;
            end;
        end;
    end;
  function TCpuAsmOptimizer.OptPass1Shift(var p: tai): boolean;
    var
      hp1,hp2: tai;
      I2, I: Integer;
      shifterop: tshifterop;
    begin
      Result:=false;
      { This folds shifterops into following instructions
          <shiftop> r0, r1, #imm
          <op> r2, r3, r0
        to
          <op> r2, r3, r1, <shiftop> #imm
      }
      { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
      if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
        MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                               A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                               A_SUB, A_TST], [PF_None]) and
        RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
        (taicpu(hp1).ops >= 2) and
        { Currently we can't fold into another shifterop }
        (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
        { SP does not work completely with shifted registers; as I didn't find the exact rules,
          we do not operate on SP }
        (taicpu(hp1).oper[0]^.reg<>NR_SP) and
        (taicpu(hp1).oper[1]^.reg<>NR_SP) and
        (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
        { reg1 must not be modified in between }
        not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
        (
          { Only ONE of the two src operands is allowed to match }
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
        ) and
        { for SUB, the last operand must match, there is no RSB on AArch64 }
        ((taicpu(hp1).opcode<>A_SUB) or
         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
        begin
          { for the two-operand instructions, also start at the second operand, as they are not
            always commutative (depends on the flags tested later on) and thus the operands
            cannot be swapped }
          I2:=1;
          for I:=I2 to taicpu(hp1).ops-1 do
            if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
              begin
                { If the parameter matched on the second op from the RIGHT
                  we have to switch the parameters, this will not happen for CMP
                  where we're only evaluating the rightmost parameter
                }
                shifterop_reset(shifterop);
                case taicpu(p).opcode of
                  A_LSL:
                    shifterop.shiftmode:=SM_LSL;
                  A_ROR:
                    shifterop.shiftmode:=SM_ROR;
                  A_LSR:
                    shifterop.shiftmode:=SM_LSR;
                  A_ASR:
                    shifterop.shiftmode:=SM_ASR;
                  else
                    InternalError(2019090401);
                end;
                shifterop.shiftimm:=taicpu(p).oper[2]^.val;
                if I <> taicpu(hp1).ops-1 then
                  begin
                    if taicpu(hp1).ops = 3 then
                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                        taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                        taicpu(p).oper[1]^.reg, shifterop)
                    else
                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                        taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                        shifterop);
                  end
                else
                  if taicpu(hp1).ops = 3 then
                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                      taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                      taicpu(p).oper[1]^.reg,shifterop)
                  else
                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                      taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                      shifterop);
                { Make sure the register used in the shifting is tracked all
                  the way through, otherwise it may become deallocated while
                  it's still live and cause incorrect optimisations later }
                if (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[1]^.reg) then
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                    AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, TmpUsedRegs);
                  end;
                taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                asml.insertbefore(hp2, hp1);
                RemoveInstruction(hp1);
                RemoveCurrentp(p);
                DebugMsg('Peephole FoldShiftProcess done', hp2);
                Result:=true;
                break;
              end;
        end
      else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR, A_ROR],[PF_None]) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
        Result:=true;
    end;
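
  { Folds a register-to-register MOV of the result of a data instruction
    (ADD, SUB, AND, ORR, MUL, ...) back into that instruction by renaming its
    destination; see RemoveSuperfluousMove. }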
  function TCpuAsmOptimizer.OptPass1Data(var p: tai): boolean;
    var
      hp1: tai;
    begin
      Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
    end;
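
  { Same folding as OptPass1Data, but for floating-point instructions whose
    result is copied away with an FMOV; see RemoveSuperfluousFMov. }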
  function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
    var
      hp1: tai;
    begin
      Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
    end;
  function TCpuAsmOptimizer.OptPass1STP(var p: tai): boolean;
    var
      hp1, hp2, hp3, hp4: tai;
    begin
      Result:=false;
      {
        change
          stp x29,x30,[sp, #-16]!
          mov x29,sp
          bl abc
          ldp x29,x30,[sp], #16
          ret
        into
          b abc
      }
      if MatchInstruction(p, A_STP, [C_None], [PF_None]) and
        MatchOpType(taicpu(p),top_reg,top_reg,top_ref) and
        (taicpu(p).oper[0]^.reg = NR_X29) and
        (taicpu(p).oper[1]^.reg = NR_X30) and
        (taicpu(p).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
        (taicpu(p).oper[2]^.ref^.index=NR_NO) and
        (taicpu(p).oper[2]^.ref^.offset=-16) and
        (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
        GetNextInstruction(p, hp1) and
        MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
        MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
        (taicpu(hp1).oper[1]^.typ = top_reg) and
        (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
        GetNextInstruction(hp1, hp2) and
        SkipEntryExitMarker(hp2, hp2) and
        MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
        (taicpu(hp2).oper[0]^.typ = top_ref) and
        GetNextInstruction(hp2, hp3) and
        SkipEntryExitMarker(hp3, hp3) and
        MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
        MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
        (taicpu(hp3).oper[0]^.reg = NR_X29) and
        (taicpu(hp3).oper[1]^.reg = NR_X30) and
        (taicpu(hp3).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
        (taicpu(hp3).oper[2]^.ref^.index=NR_NO) and
        (taicpu(hp3).oper[2]^.ref^.offset=16) and
        (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
        GetNextInstruction(hp3, hp4) and
        MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
        (taicpu(hp4).ops = 0) then
        begin
          asml.Remove(p);
          asml.Remove(hp1);
          asml.Remove(hp3);
          asml.Remove(hp4);
          taicpu(hp2).opcode:=A_B;
          p.free;
          hp1.free;
          hp3.free;
          hp4.free;
          p:=hp2;
          DebugMsg('Peephole Bl2B done', p);
          Result:=true;
        end;
    end;
  function TCpuAsmOptimizer.OptPass1Mov(var p: tai): boolean;
    var
      hp1: tai;
    begin
      Result:=false;
      if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
        (taicpu(p).oppostfix=PF_None) then
        begin
          RemoveCurrentP(p);
          DebugMsg('Peephole Mov2None done', p);
          Result:=true;
        end
      {
        optimize
          mov rX, yyyy
          ....
        either by merging it with a later mov that copies rX again
        (RemoveSuperfluousMove) or by removing it when RedundantMovProcess
        finds it redundant
      }
      else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
        begin
          if RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
            Result:=true
          else if (taicpu(p).ops = 2) and
            (tai(hp1).typ = ait_instruction) and
            RedundantMovProcess(p,hp1) then
            Result:=true;
        end;
    end;
  function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
    var
      hp1: tai;
    begin
      {
        change
          fmov reg0,reg1
          fmov reg1,reg0
        into
          fmov reg0,reg1
      }
      Result := False;
      while GetNextInstruction(p, hp1) and
        MatchInstruction(hp1, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
        MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) do
        begin
          asml.Remove(hp1);
          hp1.free;
          DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov done', p);
          Result:=true;
        end;
      { not enabled as apparently not happening
      if MatchOpType(taicpu(p),top_reg,top_reg) and
        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
        MatchInstruction(hp1, [A_FSUB,A_FADD,A_FNEG,A_FMUL,A_FSQRT,A_FDIV,A_FABS], [PF_None]) and
        (MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) or
         ((taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^))
        ) and
        RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
        not(RegUsedBetween(taicpu(p).oper[0]^.reg,p,hp1)) then
        begin
          DebugMsg(SPeepholeOptimization + 'FMovFOp2FOp done', hp1);
          AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
          if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
            taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
          if (taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^) then
            taicpu(hp1).oper[2]^.reg:=taicpu(p).oper[1]^.reg;
          RemoveCurrentP(p);
          Result:=true;
          exit;
        end;
      }
    end;
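
  { Combines two LDRs or two STRs that access adjacent memory locations through
    the same (unmodified) base/index registers into a single LDP/STP, as long
    as the offsets fit into the smaller LDP/STP offset range. }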
  function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
    var
      hp1, hp1_last: tai;
      ThisRegister: TRegister;
      OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
      TargetOpcode: TAsmOp;
    begin
      Result := False;

      ThisRegister := taicpu(p).oper[0]^.reg;

      case taicpu(p).opcode of
        A_LDR:
          TargetOpcode := A_LDP;
        A_STR:
          TargetOpcode := A_STP;
        else
          InternalError(2020081501);
      end;

      { reg appearing in ref invalidates these optimisations }
      if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
        begin
          { LDP/STP has a smaller permitted offset range than LDR/STR.

            TODO: For a group of out-of-range LDR/STR instructions, can
            we declare a temporary register equal to the offset base
            address, modify the STR instructions to use that register
            and then convert them to STP instructions? Note that STR
            generally takes 2 cycles (on top of the memory latency),
            while LDP/STP takes 3.
          }
          if (getsubreg(ThisRegister) = R_SUBQ) then
            begin
              ValidOffset := 8;
              MinOffset := -512;
              MaxOffset := 504;
            end
          else
            begin
              ValidOffset := 4;
              MinOffset := -256;
              MaxOffset := 252;
            end;

          hp1_last := p;

          { Look for nearby LDR/STR instructions }
          if (taicpu(p).oppostfix = PF_NONE) and
            (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
            { If SkipGetNext is True, GetNextInstruction isn't called }
            while GetNextInstruction(hp1_last, hp1) do
              begin
                if (hp1.typ <> ait_instruction) then
                  Break;

                if (taicpu(hp1).opcode = taicpu(p).opcode) then
                  begin
                    if (taicpu(hp1).oppostfix = PF_NONE) and
                      { Registers need to be the same size }
                      (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                      (
                        (TargetOpcode = A_STP) or
                        { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                          though such an LDR pair should have been optimised
                          out by now. STP is okay }
                        (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                      ) and
                      (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                      (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                      (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                      { Make sure the address registers haven't changed }
                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                      (
                        (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                        not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                      ) and
                      { Don't need to check "RegInRef" because the base registers are identical,
                        and the first one was checked already. [Kit] }
                      (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                       ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                      begin
                        { Can we convert these two LDR/STR instructions into a
                          single LDP/STP? }
                        OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                        if (OffsetVal = ValidOffset) then
                          begin
                            if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs]
                                    ...
                                    LDR/STR reg1, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                  To:
                                    LDP/STP reg0, reg1, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg('Peephole Optimization: StrStr2Stp', p)
                                else
                                  DebugMsg('Peephole Optimization: LdrLdr2Ldp', p);
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end
                        else if (OffsetVal = -ValidOffset) then
                          begin
                            if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                    ...
                                    LDR/STR reg1, [reg2, #ofs]
                                  To:
                                    LDP/STP reg1, reg0, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg('Peephole Optimization: StrStr2Stp (reverse)', p)
                                else
                                  DebugMsg('Peephole Optimization: LdrLdr2Ldp (reverse)', p);
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                                taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end;
                      end;
                  end
                else
                  Break;

                { Don't continue looking for LDR/STR pairs if the address register
                  gets modified }
                if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
                  Break;

                hp1_last := hp1;
              end;
        end;
    end;
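
  { Converts a compare against zero followed by a conditional branch into a
    single compare-and-branch instruction:
        cmp reg,#0            cmp reg,#0
        b.ne label    and     b.eq label
    become
        cbnz reg,label        cbz reg,label
  }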
  function TCpuAsmOptimizer.OptPostCMP(var p: tai): boolean;
    var
      hp1,hp2: tai;
    begin
      Result:=false;
      if MatchOpType(taicpu(p),top_reg,top_const) and
        (taicpu(p).oper[1]^.val=0) and
        GetNextInstruction(p,hp1) and
        MatchInstruction(hp1,A_B,[PF_None]) and
        (taicpu(hp1).condition in [C_EQ,C_NE]) then
        begin
          case taicpu(hp1).condition of
            C_NE:
              hp2:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
            C_EQ:
              hp2:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
            else
              Internalerror(2019090801);
          end;
          taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
          asml.insertbefore(hp2, hp1);
          asml.remove(p);
          asml.remove(hp1);
          p.free;
          hp1.free;
          p:=hp2;
          DebugMsg('Peephole CMPB.E/NE2CBNZ/CBZ done', p);
          Result:=true;
        end;
    end;
  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ=ait_instruction then
        begin
          case taicpu(p).opcode of
            A_LDR,
            A_STR:
              Result:=LookForPostindexedPattern(p);
            A_MOV:
              Result:=OptPass1Mov(p);
            A_STP:
              Result:=OptPass1STP(p);
            A_LSR,
            A_ROR,
            A_ASR,
            A_LSL:
              Result:=OptPass1Shift(p);
            A_AND:
              Result:=OptPass1And(p);
            A_ADD,
            A_ADC,
            A_SUB,
            A_SBC,
            A_BIC,
            A_EOR,
            A_ORR,
            A_MUL:
              Result:=OptPass1Data(p);
            A_UXTB:
              Result:=OptPass1UXTB(p);
            A_UXTH:
              Result:=OptPass1UXTH(p);
            A_SXTB:
              Result:=OptPass1SXTB(p);
            A_SXTH:
              Result:=OptPass1SXTH(p);
            // A_VLDR,
            A_FMADD,
            A_FMSUB,
            A_FNMADD,
            A_FNMSUB,
            A_FNMUL,
            A_FADD,
            A_FMUL,
            A_FDIV,
            A_FSUB,
            A_FSQRT,
            A_FNEG,
            A_FCVT,
            A_FABS:
              Result:=OptPass1FData(p);
            A_FMOV:
              Result:=OptPass1FMov(p);
            else
              ;
          end;
        end;
    end;
  function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ=ait_instruction then
        begin
          case taicpu(p).opcode of
            A_LDR,
            A_STR:
              Result:=OptPass2LDRSTR(p);
            else
              ;
          end;
        end;
    end;

  function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ=ait_instruction then
        begin
          case taicpu(p).opcode of
            A_CMP:
              Result:=OptPostCMP(p);
            else
              ;
          end;
        end;
    end;
begin
  casmoptimizer:=TCpuAsmOptimizer;
End.