aoptcpu.pas 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847
  1. {
  2. Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
  3. Development Team
  4. This unit implements the ARM64 optimizer object
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
  18. Unit aoptcpu;
  19. {$i fpcdefs.inc}
  20. { $define DEBUG_AOPTCPU}
  21. Interface
  22. uses
  23. globtype, globals,
  24. cutils,
  25. cgbase, cpubase, aasmtai, aasmcpu,
  26. aopt, aoptcpub, aoptarm;
  type
    { CPU-specific assembler optimizer for this target. It derives from
      TARMAsmOptimizer and declares no constructor of its own (the original
      note says it uses the same constructor as TAopObj). }
    TCpuAsmOptimizer = class(TARMAsmOptimizer)
      { Per-pass entry points; each dispatches on the opcode of p. }
      function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
      function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
      function PostPeepHoleOptsCpu(var p: tai): boolean; override;

      { Register usage queries on a single instruction. }
      function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean; override;
      function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean; override;

      { Folds "ldr/str ...,[base]" + "add/sub base,base,#imm" into a
        post-indexed access. }
      function LookForPostindexedPattern(var p : tai) : boolean;
    private
      { Individual peephole patterns used by the pass entry points. }
      function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
      function OptPass1Shift(var p: tai): boolean;
      function OptPostCMP(var p: tai): boolean;
      function OptPass1Data(var p: tai): boolean;
      function OptPass1FData(var p: tai): boolean;
      function OptPass1STP(var p: tai): boolean;
      function OptPass1Mov(var p: tai): boolean;
      function OptPass1FMov(var p: tai): boolean;
      function OptPass2LDRSTR(var p: tai): boolean;
    end;
  47. Implementation
  48. uses
  49. aasmbase,
  50. aoptutils,
  51. cgutils,
  52. verbose;
  {$ifndef DEBUG_AOPTCPU}
  { Release builds: an empty prefix lets the compiler drop string
    concatenations that would never be shown to the user. [Kit] }
  const
    SPeepholeOptimization = '';
  {$else DEBUG_AOPTCPU}
  { Debug builds: prefix put in front of peephole debug messages. }
  const
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
  {$endif DEBUG_AOPTCPU}
  { Returns true when p is an instruction whose condition field is C_None,
    false for any other list entry or for an instruction that already
    carries a condition. }
  function CanBeCond(p : tai) : boolean;
    begin
      if p.typ<>ait_instruction then
        result:=false
      else
        result:=(taicpu(p).condition=C_None);
    end;
  { Reports whether instruction hp overwrites reg with a new value.

    reg - register to look for
    hp  - list entry to inspect; nil or a non-instruction yields false

    Only the first operand is inspected:
      * a register operand counts when it equals reg;
      * a reference operand counts when it is pre-/post-indexed and its base
        register equals reg.
    Post-/pre-indexed LDR/STR do not count for their base register in
    operand 1: that register is only modified, not loaded with a new value.

    NOTE(review): other store-like or multi-destination instructions (e.g.
    STP/LDP) are not special-cased here - confirm whether callers need that. }
  function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
    var
      p: taicpu;
    begin
      Result := false;
      if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
        exit;
      p := taicpu(hp);
      case p.opcode of
        { These instructions do not write their first operand at all;
          STR only reads its first register. }
        A_STR, A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
          exit;
        else
          ;
      end;
      { Instructions without operands cannot load anything; without this
        guard oper[0] would be dereferenced although it is not in use. }
      if p.ops = 0 then
        exit;
      case p.oper[0]^.typ of
        top_reg:
          Result := (p.oper[0]^.reg = reg);
        top_ref:
          Result :=
            (p.oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
            (p.oper[0]^.ref^.base = reg);
        else
          ;
      end;
    end;
  { Reports whether instruction hp mentions reg in an operand that is
    considered read.

    Scanning starts at operand 1, or at operand 0 when
    spilling_get_operation_type(0) says that operand is read or read-write.
    A register operand matches when it equals reg; a reference operand
    matches when reg is its base or index register. nil or a non-instruction
    yields false. }
  function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
    var
      instr: taicpu;
      first, opidx: longint;
    begin
      Result := false;
      if (not assigned(hp)) or (hp.typ <> ait_instruction) then
        exit;

      instr := taicpu(hp);

      { Decide whether operand 0 takes part in the scan }
      if instr.spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
        first := 0
      else
        first := 1;

      for opidx := first to instr.ops - 1 do
        begin
          case instr.oper[opidx]^.typ of
            top_reg:
              Result := (instr.oper[opidx]^.reg = reg);
            top_ref:
              Result :=
                (instr.oper[opidx]^.ref^.base = reg) or
                (instr.oper[opidx]^.ref^.index = reg);
            else
              ;
          end;
          { Stop at the first operand that reads reg }
          if Result then
            exit;
        end;
    end;
  {
    optimize
      ldr/str    regX,[reg1]
      ...
      add/sub    reg1,reg1,#const
    into
      ldr/str    regX,[reg1], #const

    p must be an LDR/STR instruction. Returns true when the add/sub was
    folded into p (p becomes post-indexed and the add/sub is removed from
    the list and freed).

    Only the plain three-operand immediate form of ADD/SUB is accepted:
      * a form with a trailing shifter operand (e.g. "#imm, lsl #12") has
        four operands, and its effective value is not oper[2]^.val;
      * the third operand is a constant, so there is no third register whose
        modification between p and the add/sub would have to be checked.
  }
  function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
    var
      hp1 : tai;
    begin
      Result:=false;
      if (taicpu(p).oper[1]^.typ = top_ref) and
        (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
        (taicpu(p).oper[1]^.ref^.index=NR_NO) and
        (taicpu(p).oper[1]^.ref^.offset=0) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
        { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
        MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
        { exactly "op reg,reg,#const": rejects shifted/extended variants }
        (taicpu(hp1).ops=3) and
        (taicpu(hp1).oper[0]^.typ=top_reg) and
        (taicpu(hp1).oper[1]^.typ=top_reg) and
        (taicpu(hp1).oper[2]^.typ=top_const) and
        (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
        (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
        { valid offset? }
        (taicpu(hp1).oper[2]^.val>=-256) and
        (abs(taicpu(hp1).oper[2]^.val)<256) and
        { don't apply the optimization if the base register is loaded }
        (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
        { the base register must keep its value between p and the add/sub }
        not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) then
        begin
          if taicpu(p).opcode = A_LDR then
            DebugMsg('Peephole LdrAdd/Sub2Ldr Postindex done', p)
          else
            DebugMsg('Peephole StrAdd/Sub2Str Postindex done', p);
          taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
          { SUB becomes a negative post-index offset }
          if taicpu(hp1).opcode=A_ADD then
            taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
          else
            taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
          asml.Remove(hp1);
          hp1.Free;
          Result:=true;
        end;
    end;
  { Tries to eliminate
        <op>  reg0, ...
        fmov  reg2, reg0
    by letting <op> write reg2 directly.

    p         - instruction producing reg0 (its first operand)
    movp      - candidate FMOV
    optimizer - name of the calling optimization, used in the debug message

    Returns true when movp was removed. The register (de)allocation markers
    for reg0 and reg2 are adjusted along the way.

    The disabled variants inherited from the 32-bit ARM code (A_VMOV with
    PF_F64/PF_F32 depending on p's postfix or sub-register, the A_LDUR
    exception, and turning A_VLDR into A_LDR for an integer target) are not
    implemented here. }
  function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string):boolean;
    var
      regalloc,
      regdealloc : tai_regalloc;
      prev : tai;
    begin
      Result:=false;

      { movp has to be an FMOV with p's condition and postfix ... }
      if not MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) then
        exit;
      { ... targeting an MM register ... }
      if getregtype(taicpu(movp).oper[0]^.reg)<>R_MMREGISTER then
        exit;
      { ... with exactly two operands, the source being p's destination }
      if taicpu(movp).ops<>2 then
        exit;
      if not MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) then
        exit;
      { the destination register of the mov might not be used between p and movp }
      if RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp) then
        exit;
      { Only do this for instructions which REALLY load to the first register.
        Otherwise
          str reg0, [reg1]
          fmov reg2, reg0
        would be optimized to
          str reg2, [reg1]
      }
      if not RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
        exit;

      { reg0 must be released after movp, otherwise it is still needed }
      regdealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
      if not assigned(regdealloc) then
        exit;

      DebugMsg('Peephole '+optimizer+' removed superfluous vmov', movp);
      Result:=true;

      { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
        and remove it if possible }
      asml.Remove(regdealloc);
      regalloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
      if assigned(regalloc) then
        begin
          asml.Remove(regalloc);
          regalloc.free;
          regdealloc.free;
        end
      else
        { no allocation found: keep the deallocation, but right after p }
        asml.InsertAfter(regdealloc,p);

      { try to move the allocation of the target register }
      GetLastInstruction(movp,prev);
      regalloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(prev.Next));
      if assigned(regalloc) then
        begin
          asml.Remove(regalloc);
          asml.InsertBefore(regalloc,p);
          { adjust used regs }
          IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
        end;

      { finally get rid of the mov }
      taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
      asml.remove(movp);
      movp.free;
    end;
  { Pass 1 optimization for LSL/LSR/ASR/ROR.

    Folds an immediate shift into the instruction that consumes its result:
        <shiftop> r0, r1, #imm
        <op>      r2, r3, r0
    becomes
        <op>      r2, r3, r1, <shiftop> #imm
    On success both original instructions are removed and freed, p is set to
    the instruction that followed the shift, and true is returned.

    If the fold does not apply, the shift is handed to RemoveSuperfluousMove
    ('ShiftMov2Shift').

    Only the last source operand of an AArch64 data-processing instruction
    can carry a shift. When the shifted value feeds the first source operand
    the two sources have to be swapped, which is only valid for commutative
    operations. SUB, BIC and ORN are not commutative (there is no RSB on
    AArch64, and BIC/ORN invert only their second source), so for them the
    fold is restricted to the case where the last operand matches. }
  function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
    var
      hp1,hp2: tai;
      I: Integer;
      shifterop: tshifterop;
    begin
      Result:=false;
      { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
      if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
         MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
         GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
         MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                                A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                                A_SUB, A_TST], [PF_None]) and
         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
         (taicpu(hp1).ops >= 2) and
         { Currently we can't fold into another shifterop }
         (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
         { SP does not work completely with shifted registers, as I didn't find the exact rules,
           we do not operate on SP }
         (taicpu(hp1).oper[0]^.reg<>NR_SP) and
         (taicpu(hp1).oper[1]^.reg<>NR_SP) and
         (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
         { reg1 might not be modified inbetween }
         not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
         (
           { Only ONE of the two src operands is allowed to match }
           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
         ) and
         { for the non-commutative SUB, BIC and ORN the last operand must
           match, because the source operands cannot be swapped }
         (not(taicpu(hp1).opcode in [A_SUB, A_BIC, A_ORN]) or
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
        begin
          { for the two operand instructions, start also at the second operand as they are not always commutative
            (depends on the flags tested later on) and thus the operands cannot be swapped }
          for I:=1 to taicpu(hp1).ops-1 do
            if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
              begin
                { Translate the shift instruction into the matching shifter operand }
                shifterop_reset(shifterop);
                case taicpu(p).opcode of
                  A_LSL:
                    shifterop.shiftmode:=SM_LSL;
                  A_ROR:
                    shifterop.shiftmode:=SM_ROR;
                  A_LSR:
                    shifterop.shiftmode:=SM_LSR;
                  A_ASR:
                    shifterop.shiftmode:=SM_ASR;
                  else
                    InternalError(2019090401);
                end;
                shifterop.shiftimm:=taicpu(p).oper[2]^.val;

                if taicpu(hp1).ops = 3 then
                  begin
                    { If the shifted value matched the second operand from the
                      RIGHT, the two sources are switched so that the shifted
                      register ends up last; otherwise the first source stays.
                      This cannot happen for CMP, where only the rightmost
                      parameter is evaluated. }
                    if I <> taicpu(hp1).ops-1 then
                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                           taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                           taicpu(p).oper[1]^.reg, shifterop)
                    else
                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                           taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                           taicpu(p).oper[1]^.reg, shifterop);
                  end
                else
                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                       taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                       shifterop);

                taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                asml.insertbefore(hp2, hp1);
                { continue with whatever follows the removed shift }
                GetNextInstruction(p, hp2);
                asml.remove(p);
                asml.remove(hp1);
                p.free;
                hp1.free;
                p:=hp2;
                DebugMsg('Peephole FoldShiftProcess done', p);
                Result:=true;
                break;
              end;
        end
      else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
        Result:=true;
    end;
  { Pass 1 handling of integer data-processing instructions: locates the
    next instruction using p's destination register and passes the pair to
    RemoveSuperfluousMove ('DataMov2Data'). Returns that call's result, or
    false when no such instruction is found. }
  function TCpuAsmOptimizer.OptPass1Data(var p : tai): boolean;
    var
      nextuse: tai;
    begin
      if GetNextInstructionUsingReg(p, nextuse, taicpu(p).oper[0]^.reg) then
        Result := RemoveSuperfluousMove(p, nextuse, 'DataMov2Data')
      else
        Result := false;
    end;
  { Pass 1 handling of floating point data-processing instructions: locates
    the next instruction using p's destination register and passes the pair
    to RemoveSuperfluousFMov ('FOpFMov2FOp'). Returns that call's result, or
    false when no such instruction is found. }
  function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
    var
      nextuse: tai;
    begin
      if GetNextInstructionUsingReg(p, nextuse, taicpu(p).oper[0]^.reg) then
        Result := RemoveSuperfluousFMov(p, nextuse, 'FOpFMov2FOp')
      else
        Result := false;
    end;
  { Pass 1 optimization for STP: turns a frame that only wraps a call into a
    plain branch.

      change
        stp     x29,x30,[sp, #-16]!
        mov     x29,sp
        bl      abc
        ldp     x29,x30,[sp], #16
        ret
      into
        b       abc

    On success the stp, mov, ldp and ret are removed and freed, the BL is
    rewritten to B, p is set to that branch and true is returned. }
  function TCpuAsmOptimizer.OptPass1STP(var p : tai): boolean;
    var
      movfp, call, restore, ret: tai;
    begin
      Result:=false;

      { stp x29,x30,[sp,#-16]! }
      if not MatchInstruction(p, A_STP, [C_None], [PF_None]) then
        exit;
      if not MatchOpType(taicpu(p),top_reg,top_reg,top_ref) then
        exit;
      if (taicpu(p).oper[0]^.reg <> NR_X29) or
         (taicpu(p).oper[1]^.reg <> NR_X30) then
        exit;
      if (taicpu(p).oper[2]^.ref^.base<>NR_STACK_POINTER_REG) or
         (taicpu(p).oper[2]^.ref^.index<>NR_NO) or
         (taicpu(p).oper[2]^.ref^.offset<>-16) or
         (taicpu(p).oper[2]^.ref^.addressmode<>AM_PREINDEXED) then
        exit;

      { mov x29,sp }
      if not GetNextInstruction(p, movfp) then
        exit;
      if not MatchInstruction(movfp, A_MOV, [C_None], [PF_NONE]) then
        exit;
      if not MatchOperand(taicpu(movfp).oper[0]^,taicpu(p).oper[0]^) then
        exit;
      if (taicpu(movfp).oper[1]^.typ <> top_reg) or
         (taicpu(movfp).oper[1]^.reg <> NR_STACK_POINTER_REG) then
        exit;

      { bl abc (entry/exit markers in front of it are skipped) }
      if not GetNextInstruction(movfp, call) then
        exit;
      if not SkipEntryExitMarker(call, call) then
        exit;
      if not MatchInstruction(call, A_BL, [C_None], [PF_NONE]) then
        exit;
      if taicpu(call).oper[0]^.typ <> top_ref then
        exit;

      { ldp x29,x30,[sp],#16 }
      if not GetNextInstruction(call, restore) then
        exit;
      if not SkipEntryExitMarker(restore, restore) then
        exit;
      if not MatchInstruction(restore, A_LDP, [C_None], [PF_NONE]) then
        exit;
      if not MatchOpType(taicpu(restore),top_reg,top_reg,top_ref) then
        exit;
      if (taicpu(restore).oper[0]^.reg <> NR_X29) or
         (taicpu(restore).oper[1]^.reg <> NR_X30) then
        exit;
      if (taicpu(restore).oper[2]^.ref^.base<>NR_STACK_POINTER_REG) or
         (taicpu(restore).oper[2]^.ref^.index<>NR_NO) or
         (taicpu(restore).oper[2]^.ref^.offset<>16) or
         (taicpu(restore).oper[2]^.ref^.addressmode<>AM_POSTINDEXED) then
        exit;

      { ret without operands }
      if not GetNextInstruction(restore, ret) then
        exit;
      if not MatchInstruction(ret, A_RET, [C_None], [PF_None]) then
        exit;
      if taicpu(ret).ops <> 0 then
        exit;

      { Everything matched: keep only the call, as a tail branch }
      asml.Remove(p);
      asml.Remove(movfp);
      asml.Remove(restore);
      asml.Remove(ret);
      taicpu(call).opcode:=A_B;
      p.free;
      movfp.free;
      restore.free;
      ret.free;
      p:=call;
      DebugMsg('Peephole Bl2B done', p);
      Result:=true;
    end;
  { Pass 1 optimization for MOV.

    1. "mov rX, rX" without postfix is removed via RemoveCurrentP.
    2. Otherwise the next instruction using the destination register is
       looked up and handed to RemoveSuperfluousMove ('MovMov2Mov'); if that
       does nothing and p has two operands, RedundantMovProcess is tried.

    Returns true when any of these changed the instruction list. }
  function TCpuAsmOptimizer.OptPass1Mov(var p : tai): boolean;
    var
      nextuse: tai;
    begin
      Result:=false;

      { mov onto itself: nothing to keep }
      if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
        (taicpu(p).oppostfix=PF_None) then
        begin
          RemoveCurrentP(p);
          DebugMsg('Peephole Mov2None done', p);
          Result:=true;
          exit;
        end;

      {
        optimize
        mov rX, yyyy
        ....
      }
      if not GetNextInstructionUsingReg(p, nextuse, taicpu(p).oper[0]^.reg) then
        exit;

      if RemoveSuperfluousMove(p, nextuse, 'MovMov2Mov') then
        begin
          Result:=true;
          exit;
        end;

      if (taicpu(p).ops = 2) and
         (tai(nextuse).typ = ait_instruction) and
         RedundantMovProcess(p,nextuse) then
        Result:=true;
    end;
  { Pass 1 optimization for FMOV.

      change
        fmov reg0,reg1
        fmov reg1,reg0
      into
        fmov reg0,reg1

    Every directly following FMOV that merely copies the value back (same
    condition and postfix as p) is removed and freed. Returns true when at
    least one instruction was removed. }
  function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
    var
      nextinstr: tai;
    begin
      Result := False;
      repeat
        if not GetNextInstruction(p, nextinstr) then
          break;
        if not MatchInstruction(nextinstr, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) then
          break;
        { the follower must be the exact reverse copy of p }
        if not MatchOperand(taicpu(p).oper[0]^, taicpu(nextinstr).oper[1]^) then
          break;
        if not MatchOperand(taicpu(p).oper[1]^, taicpu(nextinstr).oper[0]^) then
          break;

        asml.Remove(nextinstr);
        nextinstr.free;
        DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov done', p);
        Result:=true;
      until false;

      { not enabled as apparently not happening
      if MatchOpType(taicpu(p),top_reg,top_reg) and
        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
        MatchInstruction(hp1, [A_FSUB,A_FADD,A_FNEG,A_FMUL,A_FSQRT,A_FDIV,A_FABS], [PF_None]) and
        (MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) or
         ((taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^))
        ) and
        RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
        not(RegUsedBetween(taicpu(p).oper[0]^.reg,p,hp1)) then
        begin
          DebugMsg(SPeepholeOptimization + 'FMovFOp2FOp done', hp1);
          AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
          if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
            taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
          if (taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^) then
            taicpu(hp1).oper[2]^.reg:=taicpu(p).oper[1]^.reg;
          RemoveCurrentP(p);
          Result:=true;
          exit;
        end;
      }
    end;
  { Pass 2 optimization for LDR/STR: merges two accesses to neighbouring
    slots off the same base register into one LDP/STP.

    p must be an LDR or STR instruction (anything else raises an internal
    error). Starting at p, following instructions with the same opcode are
    scanned for a partner whose offset differs from p's by exactly one
    register size. On success p is rewritten in place into the pair
    instruction, the partner is removed and freed, and true is returned.

    Store ordering: merging hoists the partner store up to p, i.e. above any
    STR that was skipped in between. That is only safe if such an
    intervening STR cannot write to the bytes of the partner slot. Since no
    alias information is available, the scan stops at an intervening STR
    unless it is a plain offset access off the very same base register that
    lies completely outside both candidate partner slots. Loads need no such
    check because only loads can lie between them. }
  function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
    const
      { Assumed upper bound for the number of bytes one STR writes (a 128 bit
        vector register); used for the conservative overlap test. }
      MaxStoreBytes = 16;
    var
      hp1, hp1_last: tai;
      ThisRegister: TRegister;
      OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
      TargetOpcode: TAsmOp;
    begin
      Result := False;
      ThisRegister := taicpu(p).oper[0]^.reg;
      case taicpu(p).opcode of
        A_LDR:
          TargetOpcode := A_LDP;
        A_STR:
          TargetOpcode := A_STP;
        else
          InternalError(2020081501);
      end;
      { reg appearing in ref invalidates these optimisations }
      if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
        begin
          { LDP/STP has a smaller permitted offset range than LDR/STR.
            TODO: For a group of out-of-range LDR/STR instructions, can
            we declare a temporary register equal to the offset base
            address, modify the STR instructions to use that register
            and then convert them to STP instructions?  Note that STR
            generally takes 2 cycles (on top of the memory latency),
            while LDP/STP takes 3.
          }
          if (getsubreg(ThisRegister) = R_SUBQ) then
            begin
              ValidOffset := 8;
              MinOffset := -512;
              MaxOffset := 504;
            end
          else
            begin
              ValidOffset := 4;
              MinOffset := -256;
              MaxOffset := 252;
            end;
          hp1_last := p;
          { Look for nearby LDR/STR instructions }
          if (taicpu(p).oppostfix = PF_NONE) and
            (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
            while GetNextInstruction(hp1_last, hp1) do
              begin
                if (hp1.typ <> ait_instruction) then
                  Break;
                if (taicpu(hp1).opcode = taicpu(p).opcode) then
                  begin
                    if (taicpu(hp1).oppostfix = PF_NONE) and
                      { Registers need to be the same size }
                      (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                      (
                        (TargetOpcode = A_STP) or
                        { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                          though such an LDR pair should have been optimised
                          out by now. STP is okay }
                        (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                      ) and
                      (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                      (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                      (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                      { Make sure the address registers haven't changed }
                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                      (
                        (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                        not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                      ) and
                      { Don't need to check "RegInRef" because the base registers are identical,
                        and the first one was checked already. [Kit] }
                      (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                       ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                      begin
                        { Can we convert these two LDR/STR instructions into a
                          single LDP/STP? }
                        OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                        if (OffsetVal = ValidOffset) then
                          begin
                            if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs]
                                    ...
                                    LDR/STR reg1, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                  To:
                                    LDP/STP reg0, reg1, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg('Peephole Optimization: StrStr2Stp', p)
                                else
                                  DebugMsg('Peephole Optimization: LdrLdr2Ldp', p);
                                { the reference moves to operand 2, the partner
                                  register takes its place in operand 1 }
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end
                        else if (OffsetVal = -ValidOffset) then
                          begin
                            if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                    ...
                                    LDR/STR reg1, [reg2, #ofs]
                                  To:
                                    LDP/STP reg1, reg0, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg('Peephole Optimization: StrStr2Stp (reverse)', p)
                                else
                                  DebugMsg('Peephole Optimization: LdrLdr2Ldp (reverse)', p);
                                { the partner has the lower address, so its
                                  reference and register come first }
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                                taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end;
                      end;
                  end
                else
                  Break;
                { Don't continue looking for LDR/STR pairs if the address register
                  gets modified }
                if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
                  Break;
                { hp1 was not merged. For stores, a partner found further down
                  would be executed before hp1 after the merge, so hp1 must
                  provably stay clear of both candidate partner slots
                  [ofs-size, ofs) and [ofs+size, ofs+2*size). }
                if (TargetOpcode = A_STP) and
                  (
                    (taicpu(hp1).oper[1]^.typ <> top_ref) or
                    (taicpu(hp1).oper[1]^.ref^.addressmode <> AM_OFFSET) or
                    (taicpu(hp1).oper[1]^.ref^.base <> taicpu(p).oper[1]^.ref^.base) or
                    (taicpu(hp1).oper[1]^.ref^.index <> NR_NO) or
                    (
                      (taicpu(hp1).oper[1]^.ref^.offset <
                        taicpu(p).oper[1]^.ref^.offset + 2 * ValidOffset) and
                      (taicpu(hp1).oper[1]^.ref^.offset + MaxStoreBytes >
                        taicpu(p).oper[1]^.ref^.offset - ValidOffset)
                    )
                  ) then
                  Break;
                hp1_last := hp1;
              end;
        end;
    end;
  { Post peephole optimization for CMP:

        cmp   reg, #0
        b.eq  label        (or b.ne)
    becomes
        cbz   reg, label   (or cbnz)

    On success the compare and the branch are removed and freed, p is set to
    the new instruction and true is returned.

    NOTE(review): nothing here checks whether the flags set by the CMP are
    still read after the branch - confirm that callers guarantee this. }
  function TCpuAsmOptimizer.OptPostCMP(var p : tai): boolean;
    var
      branch, fused: tai;
      fusedop: TAsmOp;
    begin
      Result:=false;

      { cmp reg,#0 ... }
      if not MatchOpType(taicpu(p),top_reg,top_const) then
        exit;
      if taicpu(p).oper[1]^.val<>0 then
        exit;

      { ... directly followed by a branch }
      if not GetNextInstruction(p,branch) then
        exit;
      if not MatchInstruction(branch,A_B,[PF_None]) then
        exit;

      { only equality tests have a compare-and-branch counterpart }
      case taicpu(branch).condition of
        C_NE:
          fusedop:=A_CBNZ;
        C_EQ:
          fusedop:=A_CBZ;
        else
          exit;
      end;

      fused:=taicpu.op_reg_sym_ofs(fusedop,taicpu(p).oper[0]^.reg,
        taicpu(branch).oper[0]^.ref^.symbol,taicpu(branch).oper[0]^.ref^.offset);
      taicpu(fused).fileinfo:=taicpu(branch).fileinfo;
      asml.insertbefore(fused, branch);
      asml.remove(p);
      asml.remove(branch);
      p.free;
      branch.free;
      p:=fused;
      DebugMsg('Peephole CMPB.E/NE2CBNZ/CBZ done', p);
      Result:=true;
    end;
  { First peephole pass: dispatches instruction p to the optimization
    routine registered for its opcode. Returns the routine's result, or
    false for non-instructions and for opcodes without a handler.

    OptPass1And and the OptPass1UXTB/UXTH/SXTB/SXTH handlers are not defined
    in this unit; presumably they are inherited from TARMAsmOptimizer. }
  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ<>ait_instruction then
        exit;

      case taicpu(p).opcode of
        { memory accesses }
        A_LDR, A_STR:
          Result:=LookForPostindexedPattern(p);
        A_STP:
          Result:=OptPass1STP(p);

        { moves and shifts }
        A_MOV:
          Result:=OptPass1Mov(p);
        A_LSL, A_LSR, A_ASR, A_ROR:
          Result:=OptPass1Shift(p);

        { integer data processing }
        A_AND:
          Result:=OptPass1And(p);
        A_ADD, A_ADC, A_SUB, A_SBC,
        A_BIC, A_EOR, A_ORR, A_MUL:
          Result:=OptPass1Data(p);

        { extensions }
        A_UXTB:
          Result:=OptPass1UXTB(p);
        A_UXTH:
          Result:=OptPass1UXTH(p);
        A_SXTB:
          Result:=OptPass1SXTB(p);
        A_SXTH:
          Result:=OptPass1SXTH(p);

        { floating point (A_VLDR is not handled) }
        A_FMADD, A_FMSUB, A_FNMADD, A_FNMSUB, A_FNMUL,
        A_FADD, A_FMUL, A_FDIV, A_FSUB,
        A_FSQRT, A_FNEG, A_FCVT, A_FABS:
          Result:=OptPass1FData(p);
        A_FMOV:
          Result:=OptPass1FMov(p);
        else
          ;
      end;
    end;
  { Second peephole pass: only LDR and STR instructions are processed, via
    OptPass2LDRSTR. Returns false for everything else. }
  function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ<>ait_instruction then
        exit;
      if (taicpu(p).opcode=A_LDR) or (taicpu(p).opcode=A_STR) then
        Result:=OptPass2LDRSTR(p);
    end;
  { Post peephole pass: only CMP instructions are processed, via OptPostCMP.
    Returns false for everything else. }
  function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ<>ait_instruction then
        exit;
      if taicpu(p).opcode=A_CMP then
        Result:=OptPostCMP(p);
    end;
  { Unit start-up: assigns this unit's optimizer class to casmoptimizer. }
  initialization
    casmoptimizer:=TCpuAsmOptimizer;
  end.