{
    Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
    Development Team

    This unit implements the ARM64 optimizer object

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
Unit aoptcpu;

{$i fpcdefs.inc}

{ $define DEBUG_AOPTCPU}

Interface

uses
  globtype, globals,
  cutils,
  cgbase, cpubase, aasmtai, aasmcpu,
  aopt, aoptcpub, aoptarm;

Type
  TCpuAsmOptimizer = class(TARMAsmOptimizer)
    { uses the same constructor as TAopObj }
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean; override;
    function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean; override;
    function LookForPostindexedPattern(var p: tai): boolean;
  private
    function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
    function OptPass1Shift(var p: tai): boolean;
    function OptPostCMP(var p: tai): boolean;
    function OptPass1Data(var p: tai): boolean;
    function OptPass1FData(var p: tai): Boolean;
    function OptPass1STP(var p: tai): boolean;
    function OptPass1Mov(var p: tai): boolean;
    function OptPass1FMov(var p: tai): Boolean;
    function OptPass2LDRSTR(var p: tai): boolean;
  End;

Implementation

uses
  aasmbase,
  aoptutils,
  cgutils,
  verbose;

{$ifdef DEBUG_AOPTCPU}
const
  SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
{ Empty strings help the optimizer to remove string concatenations that won't
  ever appear to the user on release builds. [Kit] }
const
  SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}

function CanBeCond(p: tai): boolean;
begin
  result:=(p.typ=ait_instruction) and (taicpu(p).condition=C_None);
end;

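{ Returns true if the instruction hp writes a new value into reg (used e.g.
  by RemoveSuperfluousFMov below to make sure the preceding instruction
  really produces the value that is being moved); compares, branches and
  plain stores never qualify. }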
function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
var
  p: taicpu;
begin
  p := taicpu(hp);
  Result := false;
  if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
    exit;
  case p.opcode of
    { These instructions do not write into a register at all }
    A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
      exit;
    { Take care of post-/preindexed stores and loads, they will change their base register }
    A_STR, A_LDR:
      begin
        Result := false;
        { actually, this does not apply here because post-/preindexed does not mean that a register
          is loaded with a new value, it is only modified
          (taicpu(p).oper[1]^.typ=top_ref) and
          (taicpu(p).oper[1]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
          (taicpu(p).oper[1]^.ref^.base = reg);
        }
        { STR does not load into its first register }
        if p.opcode = A_STR then
          exit;
      end;
    else
      ;
  end;
  if Result then
    exit;
  case p.oper[0]^.typ of
    top_reg:
      Result := (p.oper[0]^.reg = reg);
    top_ref:
      Result :=
        (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
        (taicpu(p).oper[0]^.ref^.base = reg);
    else
      ;
  end;
end;

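{ Returns true if the instruction hp reads reg, either directly as a source
  operand or as the base/index register of a memory reference. }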
function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
var
  p: taicpu;
  i: longint;
begin
  Result := false;
  if not (assigned(hp) and (hp.typ = ait_instruction)) then
    exit;
  p:=taicpu(hp);
  i:=1;
  { Start on oper[0]? }
  if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
    i:=0;
  while (i<p.ops) do
    begin
      case p.oper[I]^.typ of
        top_reg:
          Result := (p.oper[I]^.reg = reg);
        top_ref:
          Result :=
            (p.oper[I]^.ref^.base = reg) or
            (p.oper[I]^.ref^.index = reg);
        else
          ;
      end;
      { Bail out if we found something }
      if Result then
        exit;
      Inc(I);
    end;
end;

{
  optimize
    ldr/str regX,[reg1]
    ...
    add/sub reg1,reg1,regY/const
  into
    ldr/str regX,[reg1], regY/const
}
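{ For example (the immediate has to fit into the signed 9-bit post-index
  range checked below):
    ldr x0,[x2]          ldr x0,[x2],#8
    ...             =>   ...
    add x2,x2,#8
}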
function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai): boolean;
var
  hp1 : tai;
begin
  Result:=false;
  if (taicpu(p).oper[1]^.typ = top_ref) and
    (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
    (taicpu(p).oper[1]^.ref^.index=NR_NO) and
    (taicpu(p).oper[1]^.ref^.offset=0) and
    GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
    { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
    MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
    (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
    (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
    (
      { valid offset? }
      (taicpu(hp1).oper[2]^.typ=top_const) and
      (taicpu(hp1).oper[2]^.val>=-256) and
      (abs(taicpu(hp1).oper[2]^.val)<256)
    ) and
    { don't apply the optimization if the base register is loaded }
    (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
    not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
    not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
    begin
      if taicpu(p).opcode = A_LDR then
        DebugMsg('Peephole LdrAdd/Sub2Ldr Postindex done', p)
      else
        DebugMsg('Peephole StrAdd/Sub2Str Postindex done', p);
      taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
      if taicpu(hp1).opcode=A_ADD then
        taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
      else
        taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
      asml.Remove(hp1);
      hp1.Free;
      Result:=true;
    end;
end;

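{ Changes
    <op>   reg0,...
    fmov   reg1,reg0
  so that <op> writes directly to reg1 and the fmov is removed, provided
  reg1 is unused between the two instructions and reg0 is deallocated
  after the fmov. }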
function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
var
  alloc,
  dealloc : tai_regalloc;
  hp1 : tai;
begin
  Result:=false;
  if ((MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
       ((getregtype(taicpu(movp).oper[0]^.reg)=R_MMREGISTER) { or (taicpu(p).opcode in [A_LDUR])})
      ) { or
      (((taicpu(p).oppostfix in [PF_F64F32,PF_F64S16,PF_F64S32,PF_F64U16,PF_F64U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFD)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F64])) or
      (((taicpu(p).oppostfix in [PF_F32F64,PF_F32S16,PF_F32S32,PF_F32U16,PF_F32U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFS)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F32])) }
     ) and
     (taicpu(movp).ops=2) and
     MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
     { the destination register of the mov must not be used between p and movp }
     not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
     { Take care to only do this for instructions which REALLY load to the first register.
       Otherwise
         str reg0, [reg1]
         fmov reg2, reg0
       will be optimized to
         str reg2, [reg1]
     }
     RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
    begin
      dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
      if assigned(dealloc) then
        begin
          DebugMsg('Peephole '+optimizer+' removed superfluous fmov', movp);
          result:=true;
          { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
            and remove it if possible }
          asml.Remove(dealloc);
          alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
          if assigned(alloc) then
            begin
              asml.Remove(alloc);
              alloc.free;
              dealloc.free;
            end
          else
            asml.InsertAfter(dealloc,p);
          { try to move the allocation of the target register }
          GetLastInstruction(movp,hp1);
          alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
          if assigned(alloc) then
            begin
              asml.Remove(alloc);
              asml.InsertBefore(alloc,p);
              { adjust used regs }
              IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
            end;
          { change
              vldr reg0,[reg1]
              vmov reg2,reg0
            into
              ldr reg2,[reg1]
            if reg2 is an int register
          if (taicpu(p).opcode=A_VLDR) and (getregtype(taicpu(movp).oper[0]^.reg)=R_INTREGISTER) then
            taicpu(p).opcode:=A_LDR;
          }
          { finally get rid of the mov }
          taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
          asml.remove(movp);
          movp.free;
        end;
    end;
end;

function TCpuAsmOptimizer.OptPass1Shift(var p: tai): boolean;
var
  hp1,hp2: tai;
  I2, I: Integer;
  shifterop: tshifterop;
begin
  Result:=false;
  { This folds shifterops into following instructions
      <shiftop> r0, r1, #imm
      <op> r2, r3, r0
    to
      <op> r2, r3, r1, <shiftop> #imm
  }
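  { For example (assuming x0 is not used after the add):
      lsl x0, x1, #2
      add x2, x3, x0
    becomes
      add x2, x3, x1, lsl #2
  }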
  { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
  if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
    MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
    GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
    MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                           A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                           A_SUB, A_TST], [PF_None]) and
    RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
    (taicpu(hp1).ops >= 2) and
    { Currently we can't fold into another shifterop }
    (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
    { SP does not work completely with shifted registers, and as I didn't find the exact rules,
      we do not operate on SP }
    (taicpu(hp1).oper[0]^.reg<>NR_SP) and
    (taicpu(hp1).oper[1]^.reg<>NR_SP) and
    (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
    { reg1 must not be modified in between }
    not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
    (
      { Only ONE of the two src operands is allowed to match }
      MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
      MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
    ) and
    { for SUB, the last operand must match, there is no RSB on AArch64 }
    ((taicpu(hp1).opcode<>A_SUB) or
     MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
    begin
      { for two-operand instructions, also start at the second operand, as they are not always
        commutative (depends on the flags tested later on) and thus the operands cannot be swapped }
      I2:=1;
      for I:=I2 to taicpu(hp1).ops-1 do
        if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
          begin
            { If the parameter matched on the second op from the RIGHT
              we have to switch the parameters, this will not happen for CMP
              where we're only evaluating the rightmost parameter
            }
            shifterop_reset(shifterop);
            case taicpu(p).opcode of
              A_LSL:
                shifterop.shiftmode:=SM_LSL;
              A_ROR:
                shifterop.shiftmode:=SM_ROR;
              A_LSR:
                shifterop.shiftmode:=SM_LSR;
              A_ASR:
                shifterop.shiftmode:=SM_ASR;
              else
                InternalError(2019090401);
            end;
            shifterop.shiftimm:=taicpu(p).oper[2]^.val;
            if I <> taicpu(hp1).ops-1 then
              begin
                if taicpu(hp1).ops = 3 then
                  hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                    taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                    taicpu(p).oper[1]^.reg, shifterop)
                else
                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                    taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                    shifterop);
              end
            else
              if taicpu(hp1).ops = 3 then
                hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                  taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                  taicpu(p).oper[1]^.reg,shifterop)
              else
                hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                  taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                  shifterop);
            taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
            asml.insertbefore(hp2, hp1);
            GetNextInstruction(p, hp2);
            asml.remove(p);
            asml.remove(hp1);
            p.free;
            hp1.free;
            p:=hp2;
            DebugMsg('Peephole FoldShiftProcess done', p);
            Result:=true;
            break;
          end;
    end
  else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
    GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
    RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
    Result:=true;
end;

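{ Analogously to RemoveSuperfluousFMov above, lets RemoveSuperfluousMove (see
  aoptarm) fold a following mov that merely copies the result of the data
  instruction into another register, e.g.
    add x1,x2,x3
    mov x0,x1
  becomes
    add x0,x2,x3
  when x1 is not used afterwards. }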
function TCpuAsmOptimizer.OptPass1Data(var p: tai): boolean;
var
  hp1: tai;
begin
  Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
    RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
end;

function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
var
  hp1: tai;
begin
  Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
    RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
end;

function TCpuAsmOptimizer.OptPass1STP(var p: tai): boolean;
var
  hp1, hp2, hp3, hp4: tai;
begin
  Result:=false;
  {
    change
      stp x29,x30,[sp, #-16]!
      mov x29,sp
      bl abc
      ldp x29,x30,[sp], #16
      ret
    into
      b abc
  }
  if MatchInstruction(p, A_STP, [C_None], [PF_None]) and
    MatchOpType(taicpu(p),top_reg,top_reg,top_ref) and
    (taicpu(p).oper[0]^.reg = NR_X29) and
    (taicpu(p).oper[1]^.reg = NR_X30) and
    (taicpu(p).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
    (taicpu(p).oper[2]^.ref^.index=NR_NO) and
    (taicpu(p).oper[2]^.ref^.offset=-16) and
    (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
    GetNextInstruction(p, hp1) and
    MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
    MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
    (taicpu(hp1).oper[1]^.typ = top_reg) and
    (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
    GetNextInstruction(hp1, hp2) and
    SkipEntryExitMarker(hp2, hp2) and
    MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
    (taicpu(hp2).oper[0]^.typ = top_ref) and
    GetNextInstruction(hp2, hp3) and
    SkipEntryExitMarker(hp3, hp3) and
    MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
    MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
    (taicpu(hp3).oper[0]^.reg = NR_X29) and
    (taicpu(hp3).oper[1]^.reg = NR_X30) and
    (taicpu(hp3).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
    (taicpu(hp3).oper[2]^.ref^.index=NR_NO) and
    (taicpu(hp3).oper[2]^.ref^.offset=16) and
    (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
    GetNextInstruction(hp3, hp4) and
    MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
    (taicpu(hp4).ops = 0) then
    begin
      asml.Remove(p);
      asml.Remove(hp1);
      asml.Remove(hp3);
      asml.Remove(hp4);
      taicpu(hp2).opcode:=A_B;
      p.free;
      hp1.free;
      hp3.free;
      hp4.free;
      p:=hp2;
      DebugMsg('Peephole Bl2B done', p);
      Result:=true;
    end;
end;

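{ Removes "mov reg,reg" (a no-op) and otherwise tries to fold the mov into,
  or remove it in favour of, a later instruction that uses its destination
  register (via RemoveSuperfluousMove/RedundantMovProcess from aoptarm). }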
function TCpuAsmOptimizer.OptPass1Mov(var p: tai): boolean;
var
  hp1: tai;
begin
  Result:=false;
  if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
    (taicpu(p).oppostfix=PF_None) then
    begin
      RemoveCurrentP(p);
      DebugMsg('Peephole Mov2None done', p);
      Result:=true;
    end
  {
    optimize
    mov rX, yyyy
    ....
  }
  else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
    begin
      if RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
        Result:=true
      else if (taicpu(p).ops = 2) and
        (tai(hp1).typ = ait_instruction) and
        RedundantMovProcess(p,hp1) then
        Result:=true;
    end;
end;

function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
var
  hp1: tai;
begin
  {
    change
      fmov reg0,reg1
      fmov reg1,reg0
    into
      fmov reg0,reg1
  }
  Result := False;
  while GetNextInstruction(p, hp1) and
    MatchInstruction(hp1, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
    MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
    MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) do
    begin
      asml.Remove(hp1);
      hp1.free;
      DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov done', p);
      Result:=true;
    end;
end;

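{ Tries to merge two LDR (or two STR) instructions that access adjacent
  memory locations into a single LDP (or STP), e.g.
    ldr x0,[sp,#16]
    ldr x1,[sp,#24]
  into
    ldp x0,x1,[sp,#16]
  The two offsets must differ by exactly the register size (4 or 8 bytes)
  and lie within the LDP/STP immediate range. }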
function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
var
  hp1, hp1_last: tai;
  ThisRegister: TRegister;
  OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
  TargetOpcode: TAsmOp;
  Breakout: Boolean;
begin
  Result := False;
  ThisRegister := taicpu(p).oper[0]^.reg;
  case taicpu(p).opcode of
    A_LDR:
      TargetOpcode := A_LDP;
    A_STR:
      TargetOpcode := A_STP;
    else
      InternalError(2020081501);
  end;
  { reg appearing in ref invalidates these optimisations }
  if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
    begin
      { LDP/STP has a smaller permitted offset range than LDR/STR.
        TODO: For a group of out-of-range LDR/STR instructions, can
        we declare a temporary register equal to the offset base
        address, modify the STR instructions to use that register
        and then convert them to STP instructions? Note that STR
        generally takes 2 cycles (on top of the memory latency),
        while LDP/STP takes 3.
      }
      if (getsubreg(ThisRegister) = R_SUBQ) then
        begin
          ValidOffset := 8;
          MinOffset := -512;
          MaxOffset := 504;
        end
      else
        begin
          ValidOffset := 4;
          MinOffset := -256;
          MaxOffset := 252;
        end;
      hp1_last := p;
      { Look for nearby LDR/STR instructions }
      if (taicpu(p).oppostfix = PF_NONE) and
        (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
        { If SkipGetNext is True, GetNextInstruction isn't called }
        while GetNextInstruction(hp1_last, hp1) do
          begin
            if (hp1.typ <> ait_instruction) then
              Break;
            if (taicpu(hp1).opcode = taicpu(p).opcode) then
              begin
                Breakout := False;
                if (taicpu(hp1).oppostfix = PF_NONE) and
                  { Registers need to be the same size }
                  (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                  (
                    (TargetOpcode = A_STP) or
                    { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                      though such an LDR pair should have been optimised
                      out by now. STP is okay }
                    (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                  ) and
                  (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                  (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                  (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                  { Make sure the address registers haven't changed }
                  not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                  (
                    (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                    not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                  ) and
                  { Don't need to check "RegInRef" because the base registers are identical,
                    and the first one was checked already. [Kit] }
                  (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                   ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                  begin
                    { Can we convert these two LDR/STR instructions into a
                      single LDP/STP? }
                    OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                    if (OffsetVal = ValidOffset) then
                      begin
                        if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                          begin
                            { Convert:
                                LDR/STR reg0, [reg2, #ofs]
                                ...
                                LDR/STR reg1, [reg2, #ofs + 8] // 4 if registers are 32-bit
                              To:
                                LDP/STP reg0, reg1, [reg2, #ofs]
                            }
                            taicpu(p).opcode := TargetOpcode;
                            if TargetOpcode = A_STP then
                              DebugMsg('Peephole Optimization: StrStr2Stp', p)
                            else
                              DebugMsg('Peephole Optimization: LdrLdr2Ldp', p);
                            taicpu(p).ops := 3;
                            taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                            taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                            asml.Remove(hp1);
                            hp1.Free;
                            Result := True;
                            Exit;
                          end;
                      end
                    else if (OffsetVal = -ValidOffset) then
                      begin
                        if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                          begin
                            { Convert:
                                LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                ...
                                LDR/STR reg1, [reg2, #ofs]
                              To:
                                LDP/STP reg1, reg0, [reg2, #ofs]
                            }
                            taicpu(p).opcode := TargetOpcode;
                            if TargetOpcode = A_STP then
                              DebugMsg('Peephole Optimization: StrStr2Stp (reverse)', p)
                            else
                              DebugMsg('Peephole Optimization: LdrLdr2Ldp (reverse)', p);
                            taicpu(p).ops := 3;
                            taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                            taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                            taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                            asml.Remove(hp1);
                            hp1.Free;
                            Result := True;
                            Exit;
                          end;
                      end;
                  end;
              end
            else
              Break;
            { Don't continue looking for LDR/STR pairs if the address register
              gets modified }
            if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
              Break;
            hp1_last := hp1;
          end;
    end;
end;

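{ Replaces a compare against zero followed by a conditional branch with a
  compare-and-branch instruction:
    cmp  reg,#0              cbz  reg,label   (for b.eq)
    b.eq/b.ne  label    =>   cbnz reg,label   (for b.ne)
}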
function TCpuAsmOptimizer.OptPostCMP(var p: tai): boolean;
var
  hp1,hp2: tai;
begin
  Result:=false;
  if MatchOpType(taicpu(p),top_reg,top_const) and
    (taicpu(p).oper[1]^.val=0) and
    GetNextInstruction(p,hp1) and
    MatchInstruction(hp1,A_B,[PF_None]) and
    (taicpu(hp1).condition in [C_EQ,C_NE]) then
    begin
      case taicpu(hp1).condition of
        C_NE:
          hp2:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
        C_EQ:
          hp2:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
        else
          Internalerror(2019090801);
      end;
      taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
      asml.insertbefore(hp2, hp1);
      asml.remove(p);
      asml.remove(hp1);
      p.free;
      hp1.free;
      p:=hp2;
      DebugMsg('Peephole CMPB.E/NE2CBNZ/CBZ done', p);
      Result:=true;
    end;
end;

function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
var
  hp1: tai;
begin
  result := false;
  if p.typ=ait_instruction then
    begin
      case taicpu(p).opcode of
        A_LDR,
        A_STR:
          Result:=LookForPostindexedPattern(p);
        A_MOV:
          Result:=OptPass1Mov(p);
        A_STP:
          Result:=OptPass1STP(p);
        A_LSR,
        A_ROR,
        A_ASR,
        A_LSL:
          Result:=OptPass1Shift(p);
        A_AND:
          Result:=OptPass1And(p);
        A_ADD,
        A_ADC,
        A_SUB,
        A_SBC,
        A_BIC,
        A_EOR,
        A_ORR,
        A_MUL:
          Result:=OptPass1Data(p);
        A_UXTB:
          Result:=OptPass1UXTB(p);
        A_UXTH:
          Result:=OptPass1UXTH(p);
        A_SXTB:
          Result:=OptPass1SXTB(p);
        A_SXTH:
          Result:=OptPass1SXTH(p);
//        A_VLDR,
        A_FADD,
        A_FMUL,
        A_FDIV,
        A_FSUB,
        A_FSQRT,
        A_FNEG,
        A_FCVT,
        A_FABS:
          Result:=OptPass1FData(p);
        A_FMOV:
          Result:=OptPass1FMov(p);
        else
          ;
      end;
    end;
end;

function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
var
  hp1: tai;
begin
  result := false;
  if p.typ=ait_instruction then
    begin
      case taicpu(p).opcode of
        A_LDR,
        A_STR:
          Result:=OptPass2LDRSTR(p);
        else
          ;
      end;
    end;
end;

function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
begin
  result := false;
  if p.typ=ait_instruction then
    begin
      case taicpu(p).opcode of
        A_CMP:
          Result:=OptPostCMP(p);
        else
          ;
      end;
    end;
end;

begin
  casmoptimizer:=TCpuAsmOptimizer;
End.