{
    Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
    Development Team

    This unit implements the ARM64 optimizer object

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
  18. Unit aoptcpu;
  19. {$i fpcdefs.inc}
  20. { $define DEBUG_AOPTCPU}
  21. Interface
  22. uses
  23. globtype, globals,
  24. cutils,
  25. cgbase, cpubase, aasmtai, aasmcpu,
  26. aopt, aoptcpub, aoptarm;
  Type
    { AArch64-specific peephole optimizer. Optimisations shared with the
      32 bit ARM targets are inherited from TARMAsmOptimizer. }
    TCpuAsmOptimizer = class(TARMAsmOptimizer)
      { uses the same constructor as TAopObj }

      { Entry points for the optimizer passes; dispatch on the opcode of p }
      function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
      function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
      function PostPeepHoleOptsCpu(var p: tai): boolean; override;

      { Register tracking queries used by the generic optimizer framework }
      function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
      function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;

      { Folds a later ADD/SUB of the base register into a post-indexed
        LDR/STR addressing mode }
      function LookForPostindexedPattern(var p : tai) : boolean;
    public
      { With these routines, there's optimisation code that's general for all ARM platforms }
      function OptPass1LDR(var p: tai): Boolean; override;
      function OptPass1STR(var p: tai): Boolean; override;
    private
      { Removes an FMOV at movp that merely copies the result of p }
      function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
      { Per-opcode-group helpers for the individual passes }
      function OptPass1Shift(var p: tai): boolean;
      function OptPostCMP(var p: tai): boolean;
      function OptPass1Data(var p: tai): boolean;
      function OptPass1FData(var p: tai): Boolean;
      function OptPass1STP(var p: tai): boolean;
      function OptPass1Mov(var p: tai): boolean;
      function OptPass1FMov(var p: tai): Boolean;
      function OptPass2LDRSTR(var p: tai): boolean;
    End;
  51. Implementation
  52. uses
  53. aasmbase,
  54. aoptutils,
  55. cgutils,
  56. verbose;
{$ifdef DEBUG_AOPTCPU}
  { Prefix prepended to every DebugMsg emitted by this unit }
  const
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
  { Empty strings help the optimizer to remove string concatenations that won't
    ever appear to the user on release builds. [Kit] }
  const
    SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  { Returns True if p is an instruction that currently carries no condition
    (and could therefore be made conditional). }
  function CanBeCond(p : tai) : boolean;
    begin
      if p.typ<>ait_instruction then
        exit(false);
      result:=taicpu(p).condition=C_None;
    end;
  { Returns True if instruction hp writes a complete new value into reg
    (as opposed to only reading or partially modifying it). Used by the
    framework to decide when earlier values of reg become dead. }
  function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
    var
      p: taicpu;
    begin
      Result := false;
      if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
        exit;
      p := taicpu(hp);

      case p.opcode of
        { These operands do not write into a register at all }
        A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
          exit;
        { Take care of post/preincremented store and loads, they will change their base register }
        A_STR, A_LDR:
          begin
            Result := false;
            { actually, this does not apply here because post-/preindexed does not mean that a register
              is loaded with a new value, it is only modified
              (taicpu(p).oper[1]^.typ=top_ref) and
              (taicpu(p).oper[1]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
              (taicpu(p).oper[1]^.ref^.base = reg);
            }
            { STR does not load into its first register }
            if p.opcode = A_STR then
              exit;
          end;
        else
          ;
      end;

      if Result then
        exit;

      { For the remaining opcodes the first operand receives the result }
      case p.oper[0]^.typ of
        top_reg:
          Result := (p.oper[0]^.reg = reg);
        top_ref:
          { a pre-/post-indexed reference writes back into its base register }
          Result :=
            (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
            (taicpu(p).oper[0]^.ref^.base = reg);
        else
          ;
      end;
    end;
  { Returns True if instruction hp reads reg, either directly as a register
    operand or as the base/index register of a memory reference. }
  function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
    var
      p: taicpu;
      i: longint;
    begin
      instructionLoadsFromReg := false;
      if not (assigned(hp) and (hp.typ = ait_instruction)) then
        exit;
      p:=taicpu(hp);

      i:=1;
      { Start on oper[0]? Only when the first operand is itself read (or
        read-modified); for most instructions oper[0] is purely the
        destination and must be skipped }
      if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
        i:=0;

      while(i<p.ops) do
        begin
          case p.oper[I]^.typ of
            top_reg:
              Result := (p.oper[I]^.reg = reg);
            top_ref:
              { a memory reference reads both its base and index registers }
              Result :=
                (p.oper[I]^.ref^.base = reg) or
                (p.oper[I]^.ref^.index = reg);
            else
              ;
          end;
          { Bailout if we found something }
          if Result then
            exit;
          Inc(I);
        end;
    end;
  143. {
  144. optimize
  145. ldr/str regX,[reg1]
  146. ...
  147. add/sub reg1,reg1,regY/const
  148. into
  149. ldr/str regX,[reg1], regY/const
  150. }
  151. function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
  152. var
  153. hp1 : tai;
  154. begin
  155. Result:=false;
  156. if (taicpu(p).oper[1]^.typ = top_ref) and
  157. (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
  158. (taicpu(p).oper[1]^.ref^.index=NR_NO) and
  159. (taicpu(p).oper[1]^.ref^.offset=0) and
  160. GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
  161. { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
  162. MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
  163. (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
  164. (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
  165. (
  166. { valid offset? }
  167. (taicpu(hp1).oper[2]^.typ=top_const) and
  168. (taicpu(hp1).oper[2]^.val>=-256) and
  169. (abs(taicpu(hp1).oper[2]^.val)<256)
  170. ) and
  171. { don't apply the optimization if the base register is loaded }
  172. (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
  173. not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
  174. not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
  175. begin
  176. if taicpu(p).opcode = A_LDR then
  177. DebugMsg(SPeepholeOptimization + 'LdrAdd/Sub2Ldr Postindex done', p)
  178. else
  179. DebugMsg(SPeepholeOptimization + 'StrAdd/Sub2Str Postindex done', p);
  180. taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
  181. if taicpu(hp1).opcode=A_ADD then
  182. taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
  183. else
  184. taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
  185. asml.Remove(hp1);
  186. hp1.Free;
  187. Result:=true;
  188. end;
  189. end;
  { If movp is an FMOV that only copies the result of p into another MM
    register and that result is not used in between, rewrites p to target
    the FMOV's destination directly and removes the FMOV.
    optimizer: caller name used in the debug message. }
  function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string):boolean;
    var
      alloc,
      dealloc : tai_regalloc;
      hp1 : tai;
    begin
      Result:=false;
      if ((MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
           ((getregtype(taicpu(movp).oper[0]^.reg)=R_MMREGISTER) { or (taicpu(p).opcode in [A_LDUR])})
          ) { or
          (((taicpu(p).oppostfix in [PF_F64F32,PF_F64S16,PF_F64S32,PF_F64U16,PF_F64U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFD)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F64])) or
          (((taicpu(p).oppostfix in [PF_F32F64,PF_F32S16,PF_F32S32,PF_F32U16,PF_F32U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFS)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F32])) }
         ) and
         (taicpu(movp).ops=2) and
         MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
         { the destination register of the mov might not be used between p and movp }
         not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
         { Take care to only do this for instructions which REALLY load to the first register.
           Otherwise
             str reg0, [reg1]
             fmov reg2, reg0
           will be optimized to
             str reg2, [reg1]
         }
         RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
        begin
          { only safe if the old destination is deallocated right after movp }
          dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
          if assigned(dealloc) then
            begin
              DebugMsg(SPeepholeOptimization + optimizer+' removed superfluous vmov', movp);
              result:=true;

              { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
                and remove it if possible }
              asml.Remove(dealloc);
              alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
              if assigned(alloc) then
                begin
                  { allocation found: drop both markers entirely }
                  asml.Remove(alloc);
                  alloc.free;
                  dealloc.free;
                end
              else
                { no allocation found: keep the deallocation but move it
                  directly after p, where the register's last use now is }
                asml.InsertAfter(dealloc,p);

              { try to move the allocation of the target register }
              GetLastInstruction(movp,hp1);
              alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
              if assigned(alloc) then
                begin
                  asml.Remove(alloc);
                  asml.InsertBefore(alloc,p);
                  { adjust used regs }
                  IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
                end;

              { change
                  vldr reg0,[reg1]
                  vmov reg2,reg0
                into
                  ldr reg2,[reg1]
                if reg2 is an int register
                if (taicpu(p).opcode=A_VLDR) and (getregtype(taicpu(movp).oper[0]^.reg)=R_INTREGISTER) then
                  taicpu(p).opcode:=A_LDR;
              }

              { finally get rid of the mov: retarget p and delete movp }
              taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
              asml.remove(movp);
              movp.free;
            end;
        end;
    end;
  { Pass-1 handler for LDR: run the generic ARM-family LDR optimisations
    first; if none applied, try to fold a following ADD/SUB of the base
    register into a post-indexed load. }
  function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean;
    begin
      if inherited OptPass1LDR(p) then
        Result := True
      else
        Result := LookForPostindexedPattern(p);
    end;
  { Pass-1 handler for STR: run the generic ARM-family STR optimisations
    first; if none applied, try to fold a following ADD/SUB of the base
    register into a post-indexed store. }
  function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean;
    begin
      if inherited OptPass1STR(p) then
        Result := True
      else
        Result := LookForPostindexedPattern(p);
    end;
  { Pass-1 handler for LSL/LSR/ASR/ROR: folds an immediate shift into a
    following data instruction as a shifter operand. }
  function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
    var
      hp1,hp2: tai;
      I2, I: Integer;
      shifterop: tshifterop;
    begin
      Result:=false;
      { This folds shifterops into following instructions
          <shiftop> r0, r1, #imm
          <op> r2, r3, r0
        to
          <op> r2, r3, r1, <shiftop> #imm
      }
      { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
      if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
        MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                               A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                               A_SUB, A_TST], [PF_None]) and
        { the shift result must die at hp1, else it is still needed }
        RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
        (taicpu(hp1).ops >= 2) and
        { Currently we can't fold into another shifterop }
        (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
        { SP does not work completely with shifted registers, as I didn't find the exact rules,
          we do not operate on SP }
        (taicpu(hp1).oper[0]^.reg<>NR_SP) and
        (taicpu(hp1).oper[1]^.reg<>NR_SP) and
        (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
        { reg1 might not be modified inbetween }
        not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
        (
          { Only ONE of the two src operands is allowed to match }
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
        ) and
        { for SUB, the last operand must match, there is no RSB on AArch64 }
        ((taicpu(hp1).opcode<>A_SUB) or
         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
        begin
          { for the two operand instructions, start also at the second operand as they are not always commutative
            (depends on the flags tested later on) and thus the operands cannot be swapped }
          I2:=1;
          for I:=I2 to taicpu(hp1).ops-1 do
            if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
              begin
                { If the parameter matched on the second op from the RIGHT
                  we have to switch the parameters, this will not happen for CMP
                  where we're only evaluating the most right parameter
                }
                { translate the shift opcode into the shifter-operand mode }
                shifterop_reset(shifterop);
                case taicpu(p).opcode of
                  A_LSL:
                    shifterop.shiftmode:=SM_LSL;
                  A_ROR:
                    shifterop.shiftmode:=SM_ROR;
                  A_LSR:
                    shifterop.shiftmode:=SM_LSR;
                  A_ASR:
                    shifterop.shiftmode:=SM_ASR;
                  else
                    InternalError(2019090401);
                end;
                shifterop.shiftimm:=taicpu(p).oper[2]^.val;
                if I <> taicpu(hp1).ops-1 then
                  begin
                    { match was on the second-to-last operand: operand order
                      of the rebuilt instruction is swapped }
                    if taicpu(hp1).ops = 3 then
                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                           taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                           taicpu(p).oper[1]^.reg, shifterop)
                    else
                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                           taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                           shifterop);
                  end
                else
                  { match was on the last operand: keep the operand order }
                  if taicpu(hp1).ops = 3 then
                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                         taicpu(p).oper[1]^.reg,shifterop)
                  else
                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                         shifterop);
                { Make sure the register used in the shifting is tracked all
                  the way through, otherwise it may become deallocated while
                  it's still live and cause incorrect optimisations later }
                if (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[1]^.reg) then
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                    ALlocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, TmpUsedRegs);
                  end;
                { replace hp1 by the merged instruction and drop the shift }
                taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                asml.insertbefore(hp2, hp1);
                RemoveInstruction(hp1);
                RemoveCurrentp(p);
                DebugMsg(SPeepholeOptimization + 'FoldShiftProcess done', hp2);
                Result:=true;
                break;
              end;
        end
      else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
        Result:=true;
    end;
  { Pass-1 handler for integer data instructions (ADD, SUB, MUL, ...):
    if the destination register is merely copied by a later MOV, fold the
    copy into this instruction. }
  function TCpuAsmOptimizer.OptPass1Data(var p : tai): boolean;
    var
      hp1: tai;
    begin
      Result := false;
      if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
        Result := RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
    end;
  { Pass-1 handler for floating-point data instructions (FADD, FMUL, ...):
    if the destination register is merely copied by a later FMOV, fold the
    copy into this instruction. }
  function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
    var
      hp1: tai;
    begin
      Result := false;
      if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
        Result := RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
    end;
  { Pass-1 handler for STP: turns a full frame setup + call + teardown + ret
    sequence into a plain tail branch. }
  function TCpuAsmOptimizer.OptPass1STP(var p : tai): boolean;
    var
      hp1, hp2, hp3, hp4: tai;
    begin
      Result:=false;
      {
        change
        stp	x29,x30,[sp, #-16]!
        mov	x29,sp
        bl	abc
        ldp	x29,x30,[sp], #16
        ret
        into
        b	abc
      }
      { frame setup: stp x29,x30,[sp, #-16]! }
      if MatchInstruction(p, A_STP, [C_None], [PF_None]) and
        MatchOpType(taicpu(p),top_reg,top_reg,top_ref) and
        (taicpu(p).oper[0]^.reg = NR_X29) and
        (taicpu(p).oper[1]^.reg = NR_X30) and
        (taicpu(p).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
        (taicpu(p).oper[2]^.ref^.index=NR_NO) and
        (taicpu(p).oper[2]^.ref^.offset=-16) and
        (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
        { frame pointer load: mov x29,sp }
        GetNextInstruction(p, hp1) and
        MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
        MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
        (taicpu(hp1).oper[1]^.typ = top_reg) and
        (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
        { the call itself: bl abc }
        GetNextInstruction(hp1, hp2) and
        SkipEntryExitMarker(hp2, hp2) and
        MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
        (taicpu(hp2).oper[0]^.typ = top_ref) and
        { frame teardown: ldp x29,x30,[sp], #16 }
        GetNextInstruction(hp2, hp3) and
        SkipEntryExitMarker(hp3, hp3) and
        MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
        MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
        (taicpu(hp3).oper[0]^.reg = NR_X29) and
        (taicpu(hp3).oper[1]^.reg = NR_X30) and
        (taicpu(hp3).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
        (taicpu(hp3).oper[2]^.ref^.index=NR_NO) and
        (taicpu(hp3).oper[2]^.ref^.offset=16) and
        (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
        { plain ret with no operands }
        GetNextInstruction(hp3, hp4) and
        MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
        (taicpu(hp4).ops = 0) then
        begin
          { drop setup/teardown/ret and turn the BL into a tail branch }
          asml.Remove(p);
          asml.Remove(hp1);
          asml.Remove(hp3);
          asml.Remove(hp4);
          taicpu(hp2).opcode:=A_B;
          p.free;
          hp1.free;
          hp3.free;
          hp4.free;
          p:=hp2;
          DebugMsg(SPeepholeOptimization + 'Bl2B done', p);
          Result:=true;
        end;
    end;
  { Pass-1 handler for MOV: removes no-op moves (mov rX,rX) and folds
    redundant register copies. }
  function TCpuAsmOptimizer.OptPass1Mov(var p : tai): boolean;
    var
      hp1: tai;
    begin
      Result:=false;
      { mov rX,rX with no postfix does nothing: delete it }
      if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
        (taicpu(p).oppostfix=PF_None) then
        begin
          RemoveCurrentP(p);
          { NOTE(review): p now points at the instruction after the removed
            MOV, so the debug message is attached there — confirm intended }
          DebugMsg(SPeepholeOptimization + 'Mov2None done', p);
          Result:=true;
        end
      {
        optimize
        mov rX, yyyy
        ....
      }
      else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
        begin
          if RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
            Result:=true
          else if (taicpu(p).ops = 2) and
            (tai(hp1).typ = ait_instruction) and
            RedundantMovProcess(p,hp1) then
            Result:=true;
        end;
    end;
  { Pass-1 handler for FMOV: removes an immediately following FMOV that
    copies back in the opposite direction. }
  function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
    var
      hp1: tai;
    begin
      {
        change
        fmov reg0,reg1
        fmov reg1,reg0
        into
        fmov reg0,reg1
      }
      Result := False;
      { loop: several such reverse copies may follow one another }
      while GetNextInstruction(p, hp1) and
        MatchInstruction(hp1, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
        MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) do
        begin
          asml.Remove(hp1);
          hp1.free;
          DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov done', p);
          Result:=true;
        end;
      { not enabled as apparently not happening
      if MatchOpType(taicpu(p),top_reg,top_reg) and
        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
        MatchInstruction(hp1, [A_FSUB,A_FADD,A_FNEG,A_FMUL,A_FSQRT,A_FDIV,A_FABS], [PF_None]) and
        (MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) or
         ((taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^))
        ) and
        RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
        not(RegUsedBetween(taicpu(p).oper[0]^.reg,p,hp1)) then
        begin
          DebugMsg(SPeepholeOptimization + 'FMovFOp2FOp done', hp1);
          AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
          if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
            taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
          if (taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^) then
            taicpu(hp1).oper[2]^.reg:=taicpu(p).oper[1]^.reg;
          RemoveCurrentP(p);
          Result:=true;
          exit;
        end;
      }
    end;
  { Pass-2 handler for LDR/STR: merges two loads/stores at adjacent offsets
    from the same base into a single LDP/STP. }
  function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
    var
      hp1, hp1_last: tai;
      ThisRegister: TRegister;
      OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
      TargetOpcode: TAsmOp;
    begin
      Result := False;
      ThisRegister := taicpu(p).oper[0]^.reg;

      case taicpu(p).opcode of
        A_LDR:
          TargetOpcode := A_LDP;
        A_STR:
          TargetOpcode := A_STP;
        else
          InternalError(2020081501);
      end;

      { reg appearing in ref invalidates these optimisations }
      if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
        begin
          { LDP/STP has a smaller permitted offset range than LDR/STR.

            TODO: For a group of out-of-range LDR/STR instructions, can
            we declare a temporary register equal to the offset base
            address, modify the STR instructions to use that register
            and then convert them to STP instructions?  Note that STR
            generally takes 2 cycles (on top of the memory latency),
            while LDP/STP takes 3.
          }
          { offset limits depend on whether the register is 64 bit (X) or
            32 bit (W); the second slot must be exactly one register width
            further on }
          if (getsubreg(ThisRegister) = R_SUBQ) then
            begin
              ValidOffset := 8;
              MinOffset := -512;
              MaxOffset := 504;
            end
          else
            begin
              ValidOffset := 4;
              MinOffset := -256;
              MaxOffset := 252;
            end;

          hp1_last := p;

          { Look for nearby LDR/STR instructions }
          if (taicpu(p).oppostfix = PF_NONE) and
            (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
            { scan forward until something other than a matching LDR/STR
              appears or the base register is modified }
            while GetNextInstruction(hp1_last, hp1) do
              begin
                if (hp1.typ <> ait_instruction) then
                  Break;

                if (taicpu(hp1).opcode = taicpu(p).opcode) then
                  begin
                    if (taicpu(hp1).oppostfix = PF_NONE) and
                      { Registers need to be the same size }
                      (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                      (
                        (TargetOpcode = A_STP) or
                        { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                          though such an LDR pair should have been optimised
                          out by now.  STP is okay }
                        (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                      ) and
                      (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                      (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                      (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                      { Make sure the address registers haven't changed }
                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                      (
                        (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                        not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                      ) and
                      { Don't need to check "RegInRef" because the base registers are identical,
                        and the first one was checked already. [Kit] }
                      (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                       ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                      begin
                        { Can we convert these two LDR/STR instructions into a
                          single LDR/STP? }
                        OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                        if (OffsetVal = ValidOffset) then
                          begin
                            if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs]
                                    ...
                                    LDR/STR reg1, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                  To:
                                    LDP/STP reg0, reg1, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg(SPeepholeOptimization + 'StrStr2Stp', p)
                                else
                                  DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp', p);
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end
                        else if (OffsetVal = -ValidOffset) then
                          begin
                            if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                    ...
                                    LDR/STR reg1, [reg2, #ofs]
                                  To:
                                    LDP/STP reg1, reg0, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg(SPeepholeOptimization + 'StrStr2Stp (reverse)', p)
                                else
                                  DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp (reverse)', p);
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                                taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end;
                      end;
                  end
                else
                  Break;

                { Don't continue looking for LDR/STR pairs if the address register
                  gets modified }
                if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
                  Break;

                hp1_last := hp1;
              end;
        end;
    end;
  { Post-pass handler for CMP: turns "cmp reg,#0" followed by a conditional
    branch on EQ/NE into a single CBZ/CBNZ. }
  function TCpuAsmOptimizer.OptPostCMP(var p : tai): boolean;
    var
      hp1,hp2: tai;
    begin
      Result:=false;
      if MatchOpType(taicpu(p),top_reg,top_const) and
        (taicpu(p).oper[1]^.val=0) and
        GetNextInstruction(p,hp1) and
        MatchInstruction(hp1,A_B,[PF_None]) and
        (taicpu(hp1).condition in [C_EQ,C_NE]) then
        begin
          { build the compare-and-branch replacement with the branch's target }
          case taicpu(hp1).condition of
            C_NE:
              hp2:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
            C_EQ:
              hp2:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
            else
              Internalerror(2019090801);
          end;
          taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
          { replace cmp+b by the single cbz/cbnz }
          asml.insertbefore(hp2, hp1);
          asml.remove(p);
          asml.remove(hp1);
          p.free;
          hp1.free;
          p:=hp2;
          DebugMsg(SPeepholeOptimization + 'CMPB.E/NE2CBNZ/CBZ done', p);
          Result:=true;
        end;
    end;
  { Pass-1 dispatcher: routes each instruction to its opcode-specific
    optimisation routine. Returns True if p was changed. }
  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ=ait_instruction then
        begin
          case taicpu(p).opcode of
            A_LDR:
              Result:=OptPass1LDR(p);
            A_STR:
              Result:=OptPass1STR(p);
            A_MOV:
              Result:=OptPass1Mov(p);
            A_STP:
              Result:=OptPass1STP(p);
            A_LSR,
            A_ROR,
            A_ASR,
            A_LSL:
              Result:=OptPass1Shift(p);
            A_AND:
              Result:=OptPass1And(p);
            A_CSEL,
            A_ADD,
            A_ADC,
            A_SUB,
            A_SBC,
            A_BIC,
            A_EOR,
            A_ORR,
            A_MUL:
              Result:=OptPass1Data(p);
            A_UXTB:
              Result:=OptPass1UXTB(p);
            A_UXTH:
              Result:=OptPass1UXTH(p);
            A_SXTB:
              Result:=OptPass1SXTB(p);
            A_SXTH:
              Result:=OptPass1SXTH(p);
  //          A_VLDR,
            A_FMADD,
            A_FMSUB,
            A_FNMADD,
            A_FNMSUB,
            A_FNMUL,
            A_FADD,
            A_FMUL,
            A_FDIV,
            A_FSUB,
            A_FSQRT,
            A_FNEG,
            A_FCVT,
            A_FABS:
              Result:=OptPass1FData(p);
            A_FMOV:
              Result:=OptPass1FMov(p);
            else
              ;
          end;
        end;
    end;
  { Pass-2 dispatcher: currently only LDR/STR pairing into LDP/STP.
    Returns True if p was changed. }
  function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ=ait_instruction then
        begin
          case taicpu(p).opcode of
            A_LDR,
            A_STR:
              Result:=OptPass2LDRSTR(p);
            else
              ;
          end;
        end;
    end;
  { Post-pass dispatcher: currently only CMP-to-CBZ/CBNZ conversion.
    Returns True if p was changed. }
  function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ=ait_instruction then
        begin
          case taicpu(p).opcode of
            A_CMP:
              Result:=OptPostCMP(p);
            else
              ;
          end;
        end;
    end;
begin
  { Register this class so the compiler instantiates it as the AArch64
    assembler optimizer }
  casmoptimizer:=TCpuAsmOptimizer;
End.