{ aoptcpu.pas }
  1. {
  2. Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
  3. Development Team
  4. This unit implements the ARM64 optimizer object
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
  18. Unit aoptcpu;
  19. {$i fpcdefs.inc}
  20. {$ifdef EXTDEBUG}
  21. {$define DEBUG_AOPTCPU}
  22. {$endif EXTDEBUG}
  23. Interface
  24. uses
  25. globtype, globals,
  26. cutils,
  27. cgbase, cpubase, aasmtai, aasmcpu,
  28. aopt, aoptcpub, aoptarm;
Type
  { AArch64-specific peephole optimizer class.  Optimisations shared with the
    other ARM targets live in the TARMAsmOptimizer ancestor; this class adds
    the A64-only patterns (post-indexed addressing, LDP/STP merging,
    CBZ/CBNZ and TBZ/TBNZ folding, shifter-operand folding). }
  TCpuAsmOptimizer = class(TARMAsmOptimizer)
    { uses the same constructor as TAopObj }
    { Main entry points called once per instruction by the generic optimizer. }
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    { Data-flow queries used by the generic framework. }
    function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
    function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
    { Folds a following add/sub of the base register into a post-indexed ldr/str. }
    function LookForPostindexedPattern(var p : tai) : boolean;
  public
    { With these routines, there's optimisation code that's general for all ARM platforms }
    function OptPass1LDR(var p: tai): Boolean; override;
    function OptPass1STR(var p: tai): Boolean; override;
  private
    { Per-opcode pass 1/2/post helpers; each returns True when it changed code. }
    function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
    function OptPass1Shift(var p: tai): boolean;
    function OptPostCMP(var p: tai): boolean;
    function OptPostAnd(var p: tai): Boolean;
    function OptPass1Data(var p: tai): boolean;
    function OptPass1FData(var p: tai): Boolean;
    function OptPass1STP(var p: tai): boolean;
    function OptPass1Mov(var p: tai): boolean;
    function OptPass1FMov(var p: tai): Boolean;
    function OptPass2LDRSTR(var p: tai): boolean;
  End;
  54. Implementation
  55. uses
  56. aasmbase,
  57. aoptutils,
  58. cgutils,
  59. verbose;
{$ifdef DEBUG_AOPTCPU}
  { Prefix prepended to every DebugMsg emitted by this unit. }
  const
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
  { Empty strings help the optimizer to remove string concatenations that won't
    ever appear to the user on release builds. [Kit] }
  const
    SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  69. function CanBeCond(p : tai) : boolean;
  70. begin
  71. result:=(p.typ=ait_instruction) and (taicpu(p).condition=C_None);
  72. end;
{ Returns True if instruction hp overwrites reg with a completely new value
  (as opposed to merely reading or partially modifying it).  Used e.g. to
  decide whether a preceding value of reg is dead. }
function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  var
    p: taicpu;
  begin
    Result := false;
    { Only real instructions can write registers. }
    if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
      exit;
    p := taicpu(hp);
    case p.opcode of
      { These operations do not write into a register at all
        LDR/STR with post/pre-indexed operations do not need special treatment
        because post-/preindexed does not mean that a register
        is loaded with a new value, it is only modified }
      A_STR, A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
        exit;
      else
        ;
    end;
    if p.ops=0 then
      exit;
    { oper[0] is the destination: either a plain register write, or a
      pre-/post-indexed reference whose base register gets updated. }
    case p.oper[0]^.typ of
      top_reg:
        Result := SuperRegistersEqual(p.oper[0]^.reg,reg);
      top_ref:
        Result :=
          (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
          (taicpu(p).oper[0]^.ref^.base = reg);
      else
        ;
    end;
  end;
{ Returns True if instruction hp reads register reg, either directly as an
  operand or as the base/index register of a memory reference. }
function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
  var
    p: taicpu;
    i: longint;
  begin
    instructionLoadsFromReg := false;
    if not (assigned(hp) and (hp.typ = ait_instruction)) then
      exit;
    p:=taicpu(hp);
    { Operand 0 is normally the destination, so start scanning at 1 ... }
    i:=1;
    { Start on oper[0]?  (only when operand 0 is also read) }
    if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
      i:=0;
    while(i<p.ops) do
      begin
        case p.oper[I]^.typ of
          top_reg:
            Result := (p.oper[I]^.reg = reg);
          top_ref:
            { A reference reads its base and index registers. }
            Result :=
              (p.oper[I]^.ref^.base = reg) or
              (p.oper[I]^.ref^.index = reg);
          else
            ;
        end;
        { Bailout if we found something }
        if Result then
          exit;
        Inc(I);
      end;
  end;
{
  optimize
    ldr/str regX,[reg1]
    ...
    add/sub reg1,reg1,regY/const
  into
    ldr/str regX,[reg1], regY/const
  Returns True and deletes the add/sub when the fold was performed.
}
function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
  var
    hp1 : tai;
  begin
    Result:=false;
    { p must be a plain [reg1] reference with no index and zero offset. }
    if (taicpu(p).oper[1]^.typ = top_ref) and
      (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
      (taicpu(p).oper[1]^.ref^.index=NR_NO) and
      (taicpu(p).oper[1]^.ref^.offset=0) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
      { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
      MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
      { add/sub must update the base register in place: reg1 := reg1 +/- imm }
      (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
      (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
      (
        { valid offset?  Post-indexed immediates are limited to [-256,255]. }
        (taicpu(hp1).oper[2]^.typ=top_const) and
        (taicpu(hp1).oper[2]^.val>=-256) and
        (abs(taicpu(hp1).oper[2]^.val)<256)
      ) and
      { don't apply the optimization if the base register is loaded }
      (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
      not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
      { NOTE(review): oper[2] was just constrained to top_const above, so
        reading .reg here looks dubious (likely inherited from the ARM32
        variant where oper[2] may be a register) — verify against upstream. }
      not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
      begin
        if taicpu(p).opcode = A_LDR then
          DebugMsg(SPeepholeOptimization + 'LdrAdd/Sub2Ldr Postindex done', p)
        else
          DebugMsg(SPeepholeOptimization + 'StrAdd/Sub2Str Postindex done', p);
        { Turn the reference into post-indexed form carrying the add/sub
          amount (negated for A_SUB), then drop the now-redundant add/sub. }
        taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
        if taicpu(hp1).opcode=A_ADD then
          taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
        else
          taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
        asml.Remove(hp1);
        hp1.Free;
        Result:=true;
      end;
  end;
{ If movp is an fmov that merely copies the result of p into another MM
  register (and the copy source dies there), rewrite p to target the fmov's
  destination directly and delete the fmov.  The 'optimizer' string is only
  used to label the debug message.  Returns True when the fmov was removed. }
function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string):boolean;
  var
    alloc,
    dealloc : tai_regalloc;
    hp1 : tai;
  begin
    Result:=false;
    if ((MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
         ((getregtype(taicpu(movp).oper[0]^.reg)=R_MMREGISTER) { or (taicpu(p).opcode in [A_LDUR])})
        ) { or
        (((taicpu(p).oppostfix in [PF_F64F32,PF_F64S16,PF_F64S32,PF_F64U16,PF_F64U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFD)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F64])) or
        (((taicpu(p).oppostfix in [PF_F32F64,PF_F32S16,PF_F32S32,PF_F32U16,PF_F32U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFS)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F32])) }
       ) and
      (taicpu(movp).ops=2) and
      { the fmov must copy exactly the register p wrote }
      MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
      { the destination register of the mov might not be used between p and movp }
      not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
      { Take care to only do this for instructions which REALLY load to the first register.
        Otherwise
          str reg0, [reg1]
          fmov reg2, reg0
        will be optimized to
          str reg2, [reg1]
      }
      RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
      begin
        { Only proceed when p's destination is deallocated right after the
          fmov, i.e. the copy source really dies there. }
        dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
        if assigned(dealloc) then
          begin
            DebugMsg(SPeepholeOptimization + optimizer+' removed superfluous vmov', movp);
            result:=true;
            { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
              and remove it if possible }
            asml.Remove(dealloc);
            alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                alloc.free;
                dealloc.free;
              end
            else
              { no matching alloc found: keep lifetime info consistent by
                re-inserting the dealloc right after p }
              asml.InsertAfter(dealloc,p);
            { try to move the allocation of the target register }
            GetLastInstruction(movp,hp1);
            alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                asml.InsertBefore(alloc,p);
                { adjust used regs }
                IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
              end;
            { change
                vldr reg0,[reg1]
                vmov reg2,reg0
              into
                ldr reg2,[reg1]
              if reg2 is an int register
              if (taicpu(p).opcode=A_VLDR) and (getregtype(taicpu(movp).oper[0]^.reg)=R_INTREGISTER) then
                taicpu(p).opcode:=A_LDR;
            }
            { finally get rid of the mov }
            taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
            asml.remove(movp);
            movp.free;
          end;
      end;
  end;
  251. function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean;
  252. var
  253. hp1: tai;
  254. begin
  255. Result := False;
  256. if inherited OptPass1LDR(p) or
  257. LookForPostindexedPattern(p) then
  258. Exit(True)
  259. else if (taicpu(p).oppostfix in [PF_B,PF_SB,PF_H,PF_SH,PF_None]) and
  260. GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  261. RemoveSuperfluousMove(p, hp1, 'Ldr<Postfix>Mov2Ldr<Postfix>') then
  262. Exit(true);
  263. end;
  264. function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean;
  265. begin
  266. Result := False;
  267. if inherited OptPass1STR(p) or
  268. LookForPostindexedPattern(p) then
  269. Exit(True);
  270. end;
{ Folds an immediate shift instruction into a following data-processing
  instruction as a shifter operand, or merges a following mov.
  Returns True when code was changed. }
function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
  var
    hp1,hp2: tai;
    I2, I: Integer;
    shifterop: tshifterop;
  begin
    Result:=false;
    { This folds shifterops into following instructions
        <shiftop> r0, r1, #imm
        <op> r2, r3, r0
      to
        <op> r2, r3, r1, <shiftop> #imm
    }
    { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
    if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
        A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
        A_SUB, A_TST], [PF_None]) and
      { the shift result must die at hp1, else it is still needed }
      RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
      (taicpu(hp1).ops >= 2) and
      { Currently we can't fold into another shifterop }
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
      { SP does not work completely with shifted registers, as I didn't find the exact rules,
        we do not operate on SP }
      (taicpu(hp1).oper[0]^.reg<>NR_SP) and
      (taicpu(hp1).oper[1]^.reg<>NR_SP) and
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
      { reg1 might not be modified inbetween }
      not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
      (
        { Only ONE of the two src operands is allowed to match }
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
      ) and
      { for SUB, the last operand must match, there is no RSB on AArch64 }
      ((taicpu(hp1).opcode<>A_SUB) or
       MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
      begin
        { for the two operand instructions, start also at the second operand as they are not always commutative
          (depends on the flags tested later on) and thus the operands cannot be swapped }
        I2:=1;
        for I:=I2 to taicpu(hp1).ops-1 do
          if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
            begin
              { If the parameter matched on the second op from the RIGHT
                we have to switch the parameters, this will not happen for CMP
                where we're only evaluating the most right parameter
              }
              shifterop_reset(shifterop);
              { Translate the shift opcode into the equivalent shifter mode. }
              case taicpu(p).opcode of
                A_LSL:
                  shifterop.shiftmode:=SM_LSL;
                A_ROR:
                  shifterop.shiftmode:=SM_ROR;
                A_LSR:
                  shifterop.shiftmode:=SM_LSR;
                A_ASR:
                  shifterop.shiftmode:=SM_ASR;
                else
                  InternalError(2019090401);
              end;
              shifterop.shiftimm:=taicpu(p).oper[2]^.val;
              { Rebuild hp1 with the shifted register substituted; the operand
                order differs depending on which source operand matched. }
              if I <> taicpu(hp1).ops-1 then
                begin
                  if taicpu(hp1).ops = 3 then
                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                      taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                      taicpu(p).oper[1]^.reg, shifterop)
                  else
                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                      taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                      shifterop);
                end
              else
                if taicpu(hp1).ops = 3 then
                  hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                    taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                    taicpu(p).oper[1]^.reg,shifterop)
                else
                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                    taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                    shifterop);
              { Make sure the register used in the shifting is tracked all
                the way through, otherwise it may become deallocated while
                it's still live and cause incorrect optimisations later }
              if (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[1]^.reg) then
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  ALlocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, TmpUsedRegs);
                end;
              taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
              asml.insertbefore(hp2, hp1);
              RemoveInstruction(hp1);
              RemoveCurrentp(p);
              DebugMsg(SPeepholeOptimization + 'FoldShiftProcess done', hp2);
              Result:=true;
              break;
            end;
      end
    { Otherwise try to merge a following mov of the shift result. }
    else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
      Result:=true;
  end;
  378. function TCpuAsmOptimizer.OptPass1Data(var p : tai): boolean;
  379. var
  380. hp1: tai;
  381. begin
  382. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  383. RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
  384. end;
  385. function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
  386. var
  387. hp1: tai;
  388. begin
  389. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  390. RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
  391. end;
{ Replaces a full frame setup + call + teardown + ret sequence with a plain
  tail jump when nothing else happens in the function. }
function TCpuAsmOptimizer.OptPass1STP(var p : tai): boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
    {
      change
        stp x29,x30,[sp, #-16]!
        mov x29,sp
        bl abc
        ldp x29,x30,[sp], #16
        ret
      into
        b abc
    }
    { p: stp x29,x30,[sp, #-16]!  (frame record push) }
    if MatchInstruction(p, A_STP, [C_None], [PF_None]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_ref) and
      (taicpu(p).oper[0]^.reg = NR_X29) and
      (taicpu(p).oper[1]^.reg = NR_X30) and
      (taicpu(p).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[2]^.ref^.index=NR_NO) and
      (taicpu(p).oper[2]^.ref^.offset=-16) and
      (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
      { hp1: mov x29,sp  (frame pointer setup) }
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
      { hp2: bl <target>  (the only real work in the function) }
      GetNextInstruction(hp1, hp2) and
      SkipEntryExitMarker(hp2, hp2) and
      MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
      (taicpu(hp2).oper[0]^.typ = top_ref) and
      { hp3: ldp x29,x30,[sp], #16  (frame record pop) }
      GetNextInstruction(hp2, hp3) and
      SkipEntryExitMarker(hp3, hp3) and
      MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
      MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
      (taicpu(hp3).oper[0]^.reg = NR_X29) and
      (taicpu(hp3).oper[1]^.reg = NR_X30) and
      (taicpu(hp3).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp3).oper[2]^.ref^.index=NR_NO) and
      (taicpu(hp3).oper[2]^.ref^.offset=16) and
      (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
      { hp4: ret }
      GetNextInstruction(hp3, hp4) and
      MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
      (taicpu(hp4).ops = 0) then
      begin
        { Drop prologue/epilogue/ret and turn the call into a tail jump. }
        asml.Remove(p);
        asml.Remove(hp1);
        asml.Remove(hp3);
        asml.Remove(hp4);
        taicpu(hp2).opcode:=A_B;
        p.free;
        hp1.free;
        hp3.free;
        hp4.free;
        p:=hp2;
        DebugMsg(SPeepholeOptimization + 'Bl2B done', p);
        Result:=true;
      end;
  end;
{ Pass-1 optimisations for MOV: removes no-op moves, folds a 32-bit move
  feeding a 64-bit add/sub into a UXTW extended-register operand, and merges
  redundant mov chains. }
function TCpuAsmOptimizer.OptPass1Mov(var p : tai): boolean;
  var
    hp1: tai;
    so: tshifterop;
  begin
    Result:=false;
    { mov reg,reg with no postfix is a no-op and can be deleted. }
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
      (taicpu(p).oppostfix=PF_None) then
      begin
        RemoveCurrentP(p);
        DebugMsg(SPeepholeOptimization + 'Mov2None done', p);
        Result:=true;
      end
    { mov wX,wY followed by add/sub ...,xX where xX dies: replace the third
      operand by wY with a UXTW extension and drop the mov. }
    else if GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,[A_ADD,A_SUB],[taicpu(p).condition], [PF_None,PF_S]) and
      (taicpu(p).ops=2) and
      (taicpu(hp1).ops=3) and
      (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBD) and
      (getsubreg(taicpu(hp1).oper[2]^.reg)=R_SUBQ) and
      (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[2]^.reg)) and
      RegEndOfLife(taicpu(hp1).oper[2]^.reg,taicpu(hp1)) then
      begin
        DebugMsg(SPeepholeOptimization + 'MovOp2AddUtxw 1 done', p);
        shifterop_reset(so);
        so.shiftmode:=SM_UXTW;
        taicpu(hp1).ops:=4;
        taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
        taicpu(hp1).loadshifterop(3,so);
        RemoveCurrentP(p);
        Result:=true;
        exit;
      end
    {
      optimize
        mov rX, yyyy
        .... (next use of rX)
    }
    else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
      begin
        if RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
          Result:=true
        else if (taicpu(p).ops = 2) and
          (tai(hp1).typ = ait_instruction) and
          RedundantMovProcess(p,hp1) then
          Result:=true
      end;
  end;
  499. function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
  500. var
  501. hp1: tai;
  502. alloc, dealloc: tai_regalloc;
  503. begin
  504. {
  505. change
  506. fmov reg0,reg1
  507. fmov reg1,reg0
  508. into
  509. fmov reg0,reg1
  510. }
  511. Result := False;
  512. while GetNextInstruction(p, hp1) and
  513. MatchInstruction(hp1, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
  514. MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
  515. MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) do
  516. begin
  517. asml.Remove(hp1);
  518. hp1.free;
  519. DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov 1 done', p);
  520. Result:=true;
  521. end;
  522. { change
  523. fmov reg0,const
  524. fmov reg1,reg0
  525. dealloc reg0
  526. into
  527. fmov reg1,const
  528. }
  529. if MatchOpType(taicpu(p),top_reg,top_realconst) and
  530. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  531. (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
  532. MatchInstruction(hp1,A_FMOV,[taicpu(p).condition],[taicpu(p).oppostfix]) and
  533. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  534. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^.reg) and
  535. (not RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1)) and
  536. assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next)))
  537. then
  538. begin
  539. DebugMsg('Peephole FMovFMov2FMov 2 done', p);
  540. taicpu(hp1).loadrealconst(1,taicpu(p).oper[1]^.val_real);
  541. alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.Previous));
  542. dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next));
  543. if assigned(alloc) and assigned(dealloc) then
  544. begin
  545. asml.Remove(alloc);
  546. alloc.Free;
  547. asml.Remove(dealloc);
  548. dealloc.Free;
  549. end;
  550. { p will be removed, update used register as we continue
  551. with the next instruction after p }
  552. result:=RemoveCurrentP(p);
  553. end;
  554. { not enabled as apparently not happening
  555. if MatchOpType(taicpu(p),top_reg,top_reg) and
  556. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  557. MatchInstruction(hp1, [A_FSUB,A_FADD,A_FNEG,A_FMUL,A_FSQRT,A_FDIV,A_FABS], [PF_None]) and
  558. (MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) or
  559. ((taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^))
  560. ) and
  561. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  562. not(RegUsedBetween(taicpu(p).oper[0]^.reg,p,hp1)) then
  563. begin
  564. DebugMsg(SPeepholeOptimization + 'FMovFOp2FOp done', hp1);
  565. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
  566. if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
  567. taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
  568. if (taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^) then
  569. taicpu(hp1).oper[2]^.reg:=taicpu(p).oper[1]^.reg;
  570. RemoveCurrentP(p);
  571. Result:=true;
  572. exit;
  573. end;
  574. }
  575. end;
{ Pass-2 optimisation: merges two adjacent-offset LDR or STR instructions on
  the same base/index into a single LDP/STP.  Returns True when merged. }
function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
  var
    hp1, hp1_last: tai;
    ThisRegister: TRegister;
    OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
    TargetOpcode: TAsmOp;
  begin
    Result := False;
    ThisRegister := taicpu(p).oper[0]^.reg;
    case taicpu(p).opcode of
      A_LDR:
        TargetOpcode := A_LDP;
      A_STR:
        TargetOpcode := A_STP;
      else
        InternalError(2020081501);
    end;
    { reg appearing in ref invalidates these optimisations }
    if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
      begin
        { LDP/STP has a smaller permitted offset range than LDR/STR.
          TODO: For a group of out-of-range LDR/STR instructions, can
          we declare a temporary register equal to the offset base
          address, modify the STR instructions to use that register
          and then convert them to STP instructions? Note that STR
          generally takes 2 cycles (on top of the memory latency),
          while LDP/STP takes 3.
        }
        { Offset limits depend on register width (64-bit vs 32-bit). }
        if (getsubreg(ThisRegister) = R_SUBQ) then
          begin
            ValidOffset := 8;
            MinOffset := -512;
            MaxOffset := 504;
          end
        else
          begin
            ValidOffset := 4;
            MinOffset := -256;
            MaxOffset := 252;
          end;
        hp1_last := p;
        { Look for nearby LDR/STR instructions }
        if (taicpu(p).oppostfix = PF_NONE) and
          (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
          { If SkipGetNext is True, GetNextInstruction isn't called }
          while GetNextInstruction(hp1_last, hp1) do
            begin
              if (hp1.typ <> ait_instruction) then
                Break;
              if (taicpu(hp1).opcode = taicpu(p).opcode) then
                begin
                  if (taicpu(hp1).oppostfix = PF_NONE) and
                    { Registers need to be the same size }
                    (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                    (
                      (TargetOpcode = A_STP) or
                      { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                        though such an LDR pair should have been optimised
                        out by now. STP is okay }
                      (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                    ) and
                    (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                    (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                    (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                    { Make sure the address registers haven't changed }
                    not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                    (
                      (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                    ) and
                    { Don't need to check "RegInRef" because the base registers are identical,
                      and the first one was checked already. [Kit] }
                    (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                     ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                    begin
                      { Can we convert these two LDR/STR instructions into a
                        single LDP/STP? }
                      OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                      if (OffsetVal = ValidOffset) then
                        begin
                          if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                            begin
                              { Convert:
                                  LDR/STR reg0, [reg2, #ofs]
                                  ...
                                  LDR/STR reg1, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                To:
                                  LDP/STP reg0, reg1, [reg2, #ofs]
                              }
                              taicpu(p).opcode := TargetOpcode;
                              if TargetOpcode = A_STP then
                                DebugMsg(SPeepholeOptimization + 'StrStr2Stp', p)
                              else
                                DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp', p);
                              taicpu(p).ops := 3;
                              taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                              taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                              asml.Remove(hp1);
                              hp1.Free;
                              Result := True;
                              Exit;
                            end;
                        end
                      else if (OffsetVal = -ValidOffset) then
                        begin
                          if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                            begin
                              { Convert:
                                  LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                  ...
                                  LDR/STR reg1, [reg2, #ofs]
                                To:
                                  LDP/STP reg1, reg0, [reg2, #ofs]
                              }
                              taicpu(p).opcode := TargetOpcode;
                              if TargetOpcode = A_STP then
                                DebugMsg(SPeepholeOptimization + 'StrStr2Stp (reverse)', p)
                              else
                                DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp (reverse)', p);
                              taicpu(p).ops := 3;
                              taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                              taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                              taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                              asml.Remove(hp1);
                              hp1.Free;
                              Result := True;
                              Exit;
                            end;
                        end;
                    end;
                end
              else
                Break;
              { Don't continue looking for LDR/STR pairs if the address register
                gets modified }
              if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
                Break;
              hp1_last := hp1;
            end;
      end;
  end;
{ Post-pass optimisation: turns a single-bit test via and+cmp+b.eq/b.ne
  into a direct TBZ/TBNZ branch on that bit. }
function TCpuAsmOptimizer.OptPostAnd(var p: tai): Boolean;
  var
    hp1, hp2: tai;
    hp3: taicpu;
    bitval : cardinal;
  begin
    Result:=false;
    {
      and reg1,reg0,<const=power of 2>
      cmp reg1,#0
      <reg1 end of life>
      b.e/b.ne label
      into
      tb(n)z reg0,<power of 2>,label
    }
    { PopCnt=1 guarantees the and-mask is a power of two, i.e. one bit. }
    if MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
      (PopCnt(QWord(taicpu(p).oper[2]^.val))=1) and
      GetNextInstruction(p,hp1) and
      MatchInstruction(hp1,A_CMP,[PF_None]) and
      MatchOpType(taicpu(hp1),top_reg,top_const) and
      (taicpu(hp1).oper[1]^.val=0) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
      RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
      GetNextInstruction(hp1,hp2) and
      MatchInstruction(hp2,A_B,[PF_None]) and
      (taicpu(hp2).condition in [C_EQ,C_NE]) then
      begin
        { Bit index of the single set bit in the mask. }
        bitval:=BsfQWord(qword(taicpu(p).oper[2]^.val));
        case taicpu(hp2).condition of
          C_NE:
            hp3:=taicpu.op_reg_const_ref(A_TBNZ,taicpu(p).oper[1]^.reg,bitval,taicpu(hp2).oper[0]^.ref^);
          C_EQ:
            hp3:=taicpu.op_reg_const_ref(A_TBZ,taicpu(p).oper[1]^.reg,bitval,taicpu(hp2).oper[0]^.ref^);
          else
            Internalerror(2021100201);
        end;
        taicpu(hp3).fileinfo:=taicpu(hp1).fileinfo;
        asml.insertbefore(hp3, hp1);
        { Drop the and/cmp/b triple that the tbz/tbnz replaces. }
        RemoveInstruction(hp1);
        RemoveInstruction(hp2);
        RemoveCurrentP(p);
        DebugMsg(SPeepholeOptimization + 'AndCmpB.E/NE2Tbnz/Tbz done', p);
        Result:=true;
      end;
  end;
{ Post-pass optimisation: turns a compare against zero followed by a
  b.eq/b.ne into a single CBZ/CBNZ. }
function TCpuAsmOptimizer.OptPostCMP(var p : tai): boolean;
  var
    hp1,hp2: tai;
  begin
    Result:=false;
    {
      cmp reg0,#0
      b.e/b.ne label
      into
      cb(n)z reg0,label
    }
    if MatchOpType(taicpu(p),top_reg,top_const) and
      (taicpu(p).oper[1]^.val=0) and
      GetNextInstruction(p,hp1) and
      MatchInstruction(hp1,A_B,[PF_None]) and
      (taicpu(hp1).condition in [C_EQ,C_NE]) then
      begin
        case taicpu(hp1).condition of
          C_NE:
            hp2:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
          C_EQ:
            hp2:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
          else
            Internalerror(2019090801);
        end;
        taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
        asml.insertbefore(hp2, hp1);
        { Remove the cmp and the conditional branch; continue at the new
          cbz/cbnz instruction. }
        asml.remove(p);
        asml.remove(hp1);
        p.free;
        hp1.free;
        p:=hp2;
        DebugMsg(SPeepholeOptimization + 'CMPB.E/NE2CBNZ/CBZ done', p);
        Result:=true;
      end;
  end;
  798. function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
  799. begin
  800. result := false;
  801. if p.typ=ait_instruction then
  802. begin
  803. case taicpu(p).opcode of
  804. A_LDR:
  805. Result:=OptPass1LDR(p);
  806. A_STR:
  807. Result:=OptPass1STR(p);
  808. A_MOV:
  809. Result:=OptPass1Mov(p);
  810. A_STP:
  811. Result:=OptPass1STP(p);
  812. A_LSR,
  813. A_ROR,
  814. A_ASR,
  815. A_LSL:
  816. Result:=OptPass1Shift(p);
  817. A_AND:
  818. Result:=OptPass1And(p);
  819. A_CSEL,
  820. A_ADD,
  821. A_ADC,
  822. A_SUB,
  823. A_SBC,
  824. A_BIC,
  825. A_EOR,
  826. A_ORR,
  827. A_MUL:
  828. Result:=OptPass1Data(p);
  829. A_UXTB:
  830. Result:=OptPass1UXTB(p);
  831. A_UXTH:
  832. Result:=OptPass1UXTH(p);
  833. A_SXTB:
  834. Result:=OptPass1SXTB(p);
  835. A_SXTH:
  836. Result:=OptPass1SXTH(p);
  837. // A_VLDR,
  838. A_FMADD,
  839. A_FMSUB,
  840. A_FNMADD,
  841. A_FNMSUB,
  842. A_FNMUL,
  843. A_FADD,
  844. A_FMUL,
  845. A_FDIV,
  846. A_FSUB,
  847. A_FSQRT,
  848. A_FNEG,
  849. A_FCVT,
  850. A_FABS:
  851. Result:=OptPass1FData(p);
  852. A_FMOV:
  853. Result:=OptPass1FMov(p);
  854. else
  855. ;
  856. end;
  857. end;
  858. end;
  859. function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
  860. begin
  861. result := false;
  862. if p.typ=ait_instruction then
  863. begin
  864. case taicpu(p).opcode of
  865. A_LDR,
  866. A_STR:
  867. Result:=OptPass2LDRSTR(p);
  868. else
  869. ;
  870. end;
  871. end;
  872. end;
  873. function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
  874. begin
  875. result := false;
  876. if p.typ=ait_instruction then
  877. begin
  878. case taicpu(p).opcode of
  879. A_CMP:
  880. Result:=OptPostCMP(p);
  881. A_AND:
  882. Result:=OptPostAnd(p);
  883. else
  884. ;
  885. end;
  886. end;
  887. end;
begin
  { Register this class as the assembler optimizer for the current target. }
  casmoptimizer:=TCpuAsmOptimizer;
End.