aoptcpu.pas 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978
  1. {
  2. Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
  3. Development Team
  4. This unit implements the ARM64 optimizer object
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
  18. Unit aoptcpu;
  19. {$i fpcdefs.inc}
  20. {$ifdef EXTDEBUG}
  21. {$define DEBUG_AOPTCPU}
  22. {$endif EXTDEBUG}
  23. Interface
  24. uses
  25. globtype, globals,
  26. cutils,
  27. cgbase, cpubase, aasmtai, aasmcpu,
  28. aopt, aoptcpub, aoptarm;
Type
  { AArch64-specific peephole optimizer; inherits the optimisations that are
    shared by all ARM targets from TARMAsmOptimizer }
  TCpuAsmOptimizer = class(TARMAsmOptimizer)
    { uses the same constructor as TAopObj }
    { entry points called by the generic optimizer; they dispatch on the
      opcode of the current instruction p }
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    { data-flow queries used by the generic optimizer framework }
    function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
    function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
    { folds a following add/sub of the base register into a post-indexed ldr/str }
    function LookForPostindexedPattern(var p : tai) : boolean;
  public
    { With these routines, there's optimisation code that's general for all ARM platforms }
    function OptPass1LDR(var p: tai): Boolean; override;
    function OptPass1STR(var p: tai): Boolean; override;
  private
    { removes an fmov that merely copies the result register of p elsewhere }
    function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
    function OptPass1Shift(var p: tai): boolean;
    function OptPostCMP(var p: tai): boolean;
    function OptPostAnd(var p: tai): Boolean;
    function OptPass1Data(var p: tai): boolean;
    function OptPass1FData(var p: tai): Boolean;
    function OptPass1STP(var p: tai): boolean;
    function OptPass1Mov(var p: tai): boolean;
    function OptPass1FMov(var p: tai): Boolean;
    function OptPass2LDRSTR(var p: tai): boolean;
  End;
  54. Implementation
  55. uses
  56. aasmbase,
  57. aoptutils,
  58. cgutils,
  59. verbose;
{$ifdef DEBUG_AOPTCPU}
  const
    { prefix put in front of every peephole debug message emitted below }
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
  { Empty strings help the optimizer to remove string concatenations that won't
    ever appear to the user on release builds. [Kit] }
  const
    SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  69. function CanBeCond(p : tai) : boolean;
  70. begin
  71. result:=(p.typ=ait_instruction) and (taicpu(p).condition=C_None);
  72. end;
  73. function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  74. var
  75. p: taicpu;
  76. begin
  77. Result := false;
  78. if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
  79. exit;
  80. p := taicpu(hp);
  81. case p.opcode of
  82. { These operations do not write into a register at all
  83. LDR/STR with post/pre-indexed operations do not need special treatment
  84. because post-/preindexed does not mean that a register
  85. is loaded with a new value, it is only modified }
  86. A_STR, A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
  87. exit;
  88. else
  89. ;
  90. end;
  91. if p.ops=0 then
  92. exit;
  93. case p.oper[0]^.typ of
  94. top_reg:
  95. Result := SuperRegistersEqual(p.oper[0]^.reg,reg);
  96. top_ref:
  97. Result :=
  98. (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
  99. (taicpu(p).oper[0]^.ref^.base = reg);
  100. else
  101. ;
  102. end;
  103. end;
  104. function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
  105. var
  106. p: taicpu;
  107. i: longint;
  108. begin
  109. instructionLoadsFromReg := false;
  110. if not (assigned(hp) and (hp.typ = ait_instruction)) then
  111. exit;
  112. p:=taicpu(hp);
  113. i:=1;
  114. { Start on oper[0]? }
  115. if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
  116. i:=0;
  117. while(i<p.ops) do
  118. begin
  119. case p.oper[I]^.typ of
  120. top_reg:
  121. Result := (p.oper[I]^.reg = reg);
  122. top_ref:
  123. Result :=
  124. (p.oper[I]^.ref^.base = reg) or
  125. (p.oper[I]^.ref^.index = reg);
  126. else
  127. ;
  128. end;
  129. { Bailout if we found something }
  130. if Result then
  131. exit;
  132. Inc(I);
  133. end;
  134. end;
  135. {
  136. optimize
  137. ldr/str regX,[reg1]
  138. ...
  139. add/sub reg1,reg1,regY/const
  140. into
  141. ldr/str regX,[reg1], regY/const
  142. }
  143. function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
  144. var
  145. hp1 : tai;
  146. begin
  147. Result:=false;
  148. if (taicpu(p).oper[1]^.typ = top_ref) and
  149. (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
  150. (taicpu(p).oper[1]^.ref^.index=NR_NO) and
  151. (taicpu(p).oper[1]^.ref^.offset=0) and
  152. GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
  153. { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
  154. MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
  155. (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
  156. (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
  157. (
  158. { valid offset? }
  159. (taicpu(hp1).oper[2]^.typ=top_const) and
  160. (taicpu(hp1).oper[2]^.val>=-256) and
  161. (abs(taicpu(hp1).oper[2]^.val)<256)
  162. ) and
  163. { don't apply the optimization if the base register is loaded }
  164. (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
  165. not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
  166. not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
  167. begin
  168. if taicpu(p).opcode = A_LDR then
  169. DebugMsg(SPeepholeOptimization + 'LdrAdd/Sub2Ldr Postindex done', p)
  170. else
  171. DebugMsg(SPeepholeOptimization + 'StrAdd/Sub2Str Postindex done', p);
  172. taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
  173. if taicpu(hp1).opcode=A_ADD then
  174. taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
  175. else
  176. taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
  177. asml.Remove(hp1);
  178. hp1.Free;
  179. Result:=true;
  180. end;
  181. end;
{ If movp is an "fmov regN,regM" that only copies the result register of p
  into another MM register and that result dies immediately afterwards,
  rewrite p to target movp's destination directly and delete the fmov.
  "optimizer" is only used for the debug message.  Returns True on success. }
function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string):boolean;
  var
    alloc,
    dealloc : tai_regalloc;
    hp1 : tai;
  begin
    Result:=false;
    if ((MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
         ((getregtype(taicpu(movp).oper[0]^.reg)=R_MMREGISTER) { or (taicpu(p).opcode in [A_LDUR])})
        ) { or
        (((taicpu(p).oppostfix in [PF_F64F32,PF_F64S16,PF_F64S32,PF_F64U16,PF_F64U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFD)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F64])) or
        (((taicpu(p).oppostfix in [PF_F32F64,PF_F32S16,PF_F32S32,PF_F32U16,PF_F32U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFS)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F32])) }
       ) and
      (taicpu(movp).ops=2) and
      { the fmov must copy exactly the register p writes }
      MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
      { the destination register of the mov might not be used between p and movp }
      not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
      { Take care to only do this for instructions which REALLY load to the first register.
        Otherwise
          str reg0, [reg1]
          fmov reg2, reg0
        will be optimized to
          str reg2, [reg1]
      }
      RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
      begin
        { the copied register must die right after the fmov, otherwise it is
          still needed and the fmov is not superfluous }
        dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
        if assigned(dealloc) then
          begin
            DebugMsg(SPeepholeOptimization + optimizer+' removed superfluous vmov', movp);
            result:=true;
            { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
              and remove it if possible }
            asml.Remove(dealloc);
            alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                alloc.free;
                dealloc.free;
              end
            else
              { no matching allocation found: keep the deallocation, but move
                it up to directly after p }
              asml.InsertAfter(dealloc,p);
            { try to move the allocation of the target register }
            GetLastInstruction(movp,hp1);
            alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                asml.InsertBefore(alloc,p);
                { adjust used regs }
                IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
              end;
            { change
                vldr reg0,[reg1]
                vmov reg2,reg0
              into
                ldr reg2,[reg1]
              if reg2 is an int register
            if (taicpu(p).opcode=A_VLDR) and (getregtype(taicpu(movp).oper[0]^.reg)=R_INTREGISTER) then
              taicpu(p).opcode:=A_LDR;
            }
            { finally get rid of the mov: retarget p and delete movp }
            taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
            asml.remove(movp);
            movp.free;
          end;
      end;
  end;
  251. function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean;
  252. var
  253. hp1: tai;
  254. begin
  255. Result := False;
  256. if inherited OptPass1LDR(p) or
  257. LookForPostindexedPattern(p) then
  258. Exit(True)
  259. else if (taicpu(p).oppostfix in [PF_B,PF_SB,PF_H,PF_SH,PF_None]) and
  260. GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  261. RemoveSuperfluousMove(p, hp1, 'Ldr<Postfix>Mov2Ldr<Postfix>') then
  262. Exit(true);
  263. end;
  264. function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean;
  265. begin
  266. Result := False;
  267. if inherited OptPass1STR(p) or
  268. LookForPostindexedPattern(p) then
  269. Exit(True);
  270. end;
{ Pass-1 optimisations for LSL/LSR/ASR/ROR: folds an immediate shift into the
  shifter operand of a following data-processing instruction, or removes a
  redundant move of the shift result. }
function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
  var
    hp1,hp2: tai;
    I2, I: Integer;
    shifterop: tshifterop;
  begin
    Result:=false;
    { This folds shifterops into following instructions
      <shiftop> r0, r1, #imm
      <op> r2, r3, r0
      to
      <op> r2, r3, r1, <shiftop> #imm
    }
    { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
    if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                             A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                             A_SUB, A_TST], [PF_None]) and
      { the shift result must not be needed after hp1 }
      RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
      (taicpu(hp1).ops >= 2) and
      { Currently we can't fold into another shifterop }
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
      { SP does not work completely with shifted registers, as I didn't find the exact rules,
        we do not operate on SP }
      (taicpu(hp1).oper[0]^.reg<>NR_SP) and
      (taicpu(hp1).oper[1]^.reg<>NR_SP) and
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
      { reg1 might not be modified inbetween }
      not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
      (
        { Only ONE of the two src operands is allowed to match }
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
      ) and
      { for SUB, the last operand must match, there is no RSB on AArch64 }
      ((taicpu(hp1).opcode<>A_SUB) or
       MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
      begin
        { for the two operand instructions, start also at the second operand as they are not always commutative
          (depends on the flags tested later on) and thus the operands cannot be swapped }
        I2:=1;
        for I:=I2 to taicpu(hp1).ops-1 do
          if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
            begin
              { If the parameter matched on the second op from the RIGHT
                we have to switch the parameters, this will not happen for CMP
                where we're only evaluating the most right parameter
              }
              { translate the shift opcode of p into the shifter-operand mode }
              shifterop_reset(shifterop);
              case taicpu(p).opcode of
                A_LSL:
                  shifterop.shiftmode:=SM_LSL;
                A_ROR:
                  shifterop.shiftmode:=SM_ROR;
                A_LSR:
                  shifterop.shiftmode:=SM_LSR;
                A_ASR:
                  shifterop.shiftmode:=SM_ASR;
                else
                  InternalError(2019090401);
              end;
              shifterop.shiftimm:=taicpu(p).oper[2]^.val;
              { build the replacement instruction; when the match was NOT on
                the last operand, the register operands have to be swapped }
              if I <> taicpu(hp1).ops-1 then
                begin
                  if taicpu(hp1).ops = 3 then
                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                         taicpu(p).oper[1]^.reg, shifterop)
                  else
                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                         shifterop);
                end
              else
                if taicpu(hp1).ops = 3 then
                  hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                       taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                       taicpu(p).oper[1]^.reg,shifterop)
                else
                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                       taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                       shifterop);
              { Make sure the register used in the shifting is tracked all
                the way through, otherwise it may become deallocated while
                it's still live and cause incorrect optimisations later }
              if (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[1]^.reg) then
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  ALlocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, TmpUsedRegs);
                end;
              taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
              asml.insertbefore(hp2, hp1);
              RemoveInstruction(hp1);
              RemoveCurrentp(p);
              DebugMsg(SPeepholeOptimization + 'FoldShiftProcess done', hp2);
              Result:=true;
              break;
            end;
      end
    else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
      Result:=true;
  end;
  378. function TCpuAsmOptimizer.OptPass1Data(var p : tai): boolean;
  379. var
  380. hp1: tai;
  381. begin
  382. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  383. RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
  384. end;
  385. function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
  386. var
  387. hp1: tai;
  388. begin
  389. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  390. RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
  391. end;
{ Pass-1 optimisation for STP: turns a minimal prologue/call/epilogue/ret
  sequence into a tail call ("bl" becomes "b"). }
function TCpuAsmOptimizer.OptPass1STP(var p : tai): boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
    {
      change
        stp x29,x30,[sp, #-16]!
        mov x29,sp
        bl abc
        ldp x29,x30,[sp], #16
        ret
      into
        b abc
    }
    { the whole five-instruction sequence must match exactly }
    if MatchInstruction(p, A_STP, [C_None], [PF_None]) and
      { stp x29,x30,[sp, #-16]! }
      MatchOpType(taicpu(p),top_reg,top_reg,top_ref) and
      (taicpu(p).oper[0]^.reg = NR_X29) and
      (taicpu(p).oper[1]^.reg = NR_X30) and
      (taicpu(p).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[2]^.ref^.index=NR_NO) and
      (taicpu(p).oper[2]^.ref^.offset=-16) and
      (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
      { mov x29,sp }
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
      { bl abc }
      GetNextInstruction(hp1, hp2) and
      SkipEntryExitMarker(hp2, hp2) and
      MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
      (taicpu(hp2).oper[0]^.typ = top_ref) and
      { ldp x29,x30,[sp], #16 }
      GetNextInstruction(hp2, hp3) and
      SkipEntryExitMarker(hp3, hp3) and
      MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
      MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
      (taicpu(hp3).oper[0]^.reg = NR_X29) and
      (taicpu(hp3).oper[1]^.reg = NR_X30) and
      (taicpu(hp3).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp3).oper[2]^.ref^.index=NR_NO) and
      (taicpu(hp3).oper[2]^.ref^.offset=16) and
      (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
      { ret }
      GetNextInstruction(hp3, hp4) and
      MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
      (taicpu(hp4).ops = 0) then
      begin
        { drop everything except the call and turn it into a plain branch }
        asml.Remove(p);
        asml.Remove(hp1);
        asml.Remove(hp3);
        asml.Remove(hp4);
        taicpu(hp2).opcode:=A_B;
        p.free;
        hp1.free;
        hp3.free;
        hp4.free;
        p:=hp2;
        DebugMsg(SPeepholeOptimization + 'Bl2B done', p);
        Result:=true;
      end;
  end;
  452. function TCpuAsmOptimizer.OptPass1Mov(var p : tai): boolean;
  453. var
  454. hp1: tai;
  455. begin
  456. Result:=false;
  457. if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
  458. (taicpu(p).oppostfix=PF_None) then
  459. begin
  460. RemoveCurrentP(p);
  461. DebugMsg(SPeepholeOptimization + 'Mov2None done', p);
  462. Result:=true;
  463. end
  464. {
  465. optimize
  466. mov rX, yyyy
  467. ....
  468. }
  469. else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
  470. begin
  471. if RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
  472. Result:=true
  473. else if (taicpu(p).ops = 2) and
  474. (tai(hp1).typ = ait_instruction) and
  475. RedundantMovProcess(p,hp1) then
  476. Result:=true;
  477. end;
  478. end;
  479. function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
  480. var
  481. hp1: tai;
  482. alloc, dealloc: tai_regalloc;
  483. begin
  484. {
  485. change
  486. fmov reg0,reg1
  487. fmov reg1,reg0
  488. into
  489. fmov reg0,reg1
  490. }
  491. Result := False;
  492. while GetNextInstruction(p, hp1) and
  493. MatchInstruction(hp1, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
  494. MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
  495. MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) do
  496. begin
  497. asml.Remove(hp1);
  498. hp1.free;
  499. DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov 1 done', p);
  500. Result:=true;
  501. end;
  502. { change
  503. fmov reg0,const
  504. fmov reg1,reg0
  505. dealloc reg0
  506. into
  507. fmov reg1,const
  508. }
  509. if MatchOpType(taicpu(p),top_reg,top_realconst) and
  510. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  511. (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
  512. MatchInstruction(hp1,A_FMOV,[taicpu(p).condition],[taicpu(p).oppostfix]) and
  513. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  514. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^.reg) and
  515. (not RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1)) and
  516. assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next)))
  517. then
  518. begin
  519. DebugMsg('Peephole FMovFMov2FMov 2 done', p);
  520. taicpu(hp1).loadrealconst(1,taicpu(p).oper[1]^.val_real);
  521. alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.Previous));
  522. dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next));
  523. if assigned(alloc) and assigned(dealloc) then
  524. begin
  525. asml.Remove(alloc);
  526. alloc.Free;
  527. asml.Remove(dealloc);
  528. dealloc.Free;
  529. end;
  530. { p will be removed, update used register as we continue
  531. with the next instruction after p }
  532. result:=RemoveCurrentP(p);
  533. end;
  534. { not enabled as apparently not happening
  535. if MatchOpType(taicpu(p),top_reg,top_reg) and
  536. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  537. MatchInstruction(hp1, [A_FSUB,A_FADD,A_FNEG,A_FMUL,A_FSQRT,A_FDIV,A_FABS], [PF_None]) and
  538. (MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) or
  539. ((taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^))
  540. ) and
  541. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  542. not(RegUsedBetween(taicpu(p).oper[0]^.reg,p,hp1)) then
  543. begin
  544. DebugMsg(SPeepholeOptimization + 'FMovFOp2FOp done', hp1);
  545. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
  546. if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
  547. taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
  548. if (taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^) then
  549. taicpu(hp1).oper[2]^.reg:=taicpu(p).oper[1]^.reg;
  550. RemoveCurrentP(p);
  551. Result:=true;
  552. exit;
  553. end;
  554. }
  555. end;
{ Pass-2 optimisation for LDR/STR: merges two loads/stores of same-sized
  registers at adjacent offsets from the same base into a single LDP/STP. }
function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
  var
    hp1, hp1_last: tai;
    ThisRegister: TRegister;
    OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
    TargetOpcode: TAsmOp;
  begin
    Result := False;
    ThisRegister := taicpu(p).oper[0]^.reg;
    case taicpu(p).opcode of
      A_LDR:
        TargetOpcode := A_LDP;
      A_STR:
        TargetOpcode := A_STP;
      else
        InternalError(2020081501);
    end;
    { reg appearing in ref invalidates these optimisations }
    if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
      begin
        { LDP/STP has a smaller permitted offset range than LDR/STR.
          TODO: For a group of out-of-range LDR/STR instructions, can
          we declare a temporary register equal to the offset base
          address, modify the STR instructions to use that register
          and then convert them to STP instructions?  Note that STR
          generally takes 2 cycles (on top of the memory latency),
          while LDP/STP takes 3.
        }
        { offset limits depend on the register size: 64-bit registers need
          8-byte spacing, 32-bit registers 4-byte spacing }
        if (getsubreg(ThisRegister) = R_SUBQ) then
          begin
            ValidOffset := 8;
            MinOffset := -512;
            MaxOffset := 504;
          end
        else
          begin
            ValidOffset := 4;
            MinOffset := -256;
            MaxOffset := 252;
          end;
        hp1_last := p;
        { Look for nearby LDR/STR instructions }
        if (taicpu(p).oppostfix = PF_NONE) and
          (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
          { If SkipGetNext is True, GextNextInstruction isn't called }
          while GetNextInstruction(hp1_last, hp1) do
            begin
              if (hp1.typ <> ait_instruction) then
                Break;
              if (taicpu(hp1).opcode = taicpu(p).opcode) then
                begin
                  if (taicpu(hp1).oppostfix = PF_NONE) and
                    { Registers need to be the same size }
                    (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                    (
                      (TargetOpcode = A_STP) or
                      { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                        though such an LDR pair should have been optimised
                        out by now. STP is okay }
                      (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                    ) and
                    (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                    (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                    (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                    { Make sure the address registers haven't changed }
                    not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                    (
                      (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                    ) and
                    { Don't need to check "RegInRef" because the base registers are identical,
                      and the first one was checked already. [Kit] }
                    (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                     ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                    begin
                      { Can we convert these two LDR/STR instructions into a
                        single LDR/STP? }
                      OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                      if (OffsetVal = ValidOffset) then
                        begin
                          if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                            begin
                              { Convert:
                                  LDR/STR reg0, [reg2, #ofs]
                                  ...
                                  LDR/STR reg1. [reg2, #ofs + 8] // 4 if registers are 32-bit
                                To:
                                  LDP/STP reg0, reg1, [reg2, #ofs]
                              }
                              taicpu(p).opcode := TargetOpcode;
                              if TargetOpcode = A_STP then
                                DebugMsg(SPeepholeOptimization + 'StrStr2Stp', p)
                              else
                                DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp', p);
                              taicpu(p).ops := 3;
                              taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                              taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                              asml.Remove(hp1);
                              hp1.Free;
                              Result := True;
                              Exit;
                            end;
                        end
                      else if (OffsetVal = -ValidOffset) then
                        begin
                          if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                            begin
                              { Convert:
                                  LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                  ...
                                  LDR/STR reg1. [reg2, #ofs]
                                To:
                                  LDP/STP reg1, reg0, [reg2, #ofs]
                              }
                              taicpu(p).opcode := TargetOpcode;
                              if TargetOpcode = A_STP then
                                DebugMsg(SPeepholeOptimization + 'StrStr2Stp (reverse)', p)
                              else
                                DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp (reverse)', p);
                              taicpu(p).ops := 3;
                              taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                              taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                              taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                              asml.Remove(hp1);
                              hp1.Free;
                              Result := True;
                              Exit;
                            end;
                        end;
                    end;
                end
              else
                Break;
              { Don't continue looking for LDR/STR pairs if the address register
                gets modified }
              if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
                Break;
              hp1_last := hp1;
            end;
      end;
  end;
{ Post-peephole optimisation for AND: replaces a single-bit test followed by a
  conditional branch with TBZ/TBNZ. }
function TCpuAsmOptimizer.OptPostAnd(var p: tai): Boolean;
  var
    hp1, hp2: tai;
    hp3: taicpu;
    bitval : cardinal;
  begin
    Result:=false;
    {
      and reg1,reg0,<const=power of 2>
      cmp reg1,#0
      <reg1 end of life>
      b.e/b.ne label
      into
      tb(n)z reg0,<power of 2>,label
    }
    { PopCnt=1 ensures the and-mask has exactly one bit set }
    if MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
      (PopCnt(QWord(taicpu(p).oper[2]^.val))=1) and
      GetNextInstruction(p,hp1) and
      MatchInstruction(hp1,A_CMP,[PF_None]) and
      MatchOpType(taicpu(hp1),top_reg,top_const) and
      (taicpu(hp1).oper[1]^.val=0) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
      { the and result must not be needed after the compare }
      RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
      GetNextInstruction(hp1,hp2) and
      MatchInstruction(hp2,A_B,[PF_None]) and
      (taicpu(hp2).condition in [C_EQ,C_NE]) then
      begin
        { index of the single set bit in the mask }
        bitval:=BsfQWord(qword(taicpu(p).oper[2]^.val));
        case taicpu(hp2).condition of
          C_NE:
            hp3:=taicpu.op_reg_const_ref(A_TBNZ,taicpu(p).oper[1]^.reg,bitval,taicpu(hp2).oper[0]^.ref^);
          C_EQ:
            hp3:=taicpu.op_reg_const_ref(A_TBZ,taicpu(p).oper[1]^.reg,bitval,taicpu(hp2).oper[0]^.ref^);
          else
            Internalerror(2021100201);
        end;
        taicpu(hp3).fileinfo:=taicpu(hp1).fileinfo;
        asml.insertbefore(hp3, hp1);
        RemoveInstruction(hp1);
        RemoveInstruction(hp2);
        RemoveCurrentP(p);
        DebugMsg(SPeepholeOptimization + 'AndCmpB.E/NE2Tbnz/Tbz done', p);
        Result:=true;
      end;
  end;
  742. function TCpuAsmOptimizer.OptPostCMP(var p : tai): boolean;
  743. var
  744. hp1,hp2: tai;
  745. begin
  746. Result:=false;
  747. {
  748. cmp reg0,#0
  749. b.e/b.ne label
  750. into
  751. cb(n)z reg0,label
  752. }
  753. if MatchOpType(taicpu(p),top_reg,top_const) and
  754. (taicpu(p).oper[1]^.val=0) and
  755. GetNextInstruction(p,hp1) and
  756. MatchInstruction(hp1,A_B,[PF_None]) and
  757. (taicpu(hp1).condition in [C_EQ,C_NE]) then
  758. begin
  759. case taicpu(hp1).condition of
  760. C_NE:
  761. hp2:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
  762. C_EQ:
  763. hp2:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
  764. else
  765. Internalerror(2019090801);
  766. end;
  767. taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
  768. asml.insertbefore(hp2, hp1);
  769. asml.remove(p);
  770. asml.remove(hp1);
  771. p.free;
  772. hp1.free;
  773. p:=hp2;
  774. DebugMsg(SPeepholeOptimization + 'CMPB.E/NE2CBNZ/CBZ done', p);
  775. Result:=true;
  776. end;
  777. end;
  778. function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
  779. begin
  780. result := false;
  781. if p.typ=ait_instruction then
  782. begin
  783. case taicpu(p).opcode of
  784. A_LDR:
  785. Result:=OptPass1LDR(p);
  786. A_STR:
  787. Result:=OptPass1STR(p);
  788. A_MOV:
  789. Result:=OptPass1Mov(p);
  790. A_STP:
  791. Result:=OptPass1STP(p);
  792. A_LSR,
  793. A_ROR,
  794. A_ASR,
  795. A_LSL:
  796. Result:=OptPass1Shift(p);
  797. A_AND:
  798. Result:=OptPass1And(p);
  799. A_CSEL,
  800. A_ADD,
  801. A_ADC,
  802. A_SUB,
  803. A_SBC,
  804. A_BIC,
  805. A_EOR,
  806. A_ORR,
  807. A_MUL:
  808. Result:=OptPass1Data(p);
  809. A_UXTB:
  810. Result:=OptPass1UXTB(p);
  811. A_UXTH:
  812. Result:=OptPass1UXTH(p);
  813. A_SXTB:
  814. Result:=OptPass1SXTB(p);
  815. A_SXTH:
  816. Result:=OptPass1SXTH(p);
  817. // A_VLDR,
  818. A_FMADD,
  819. A_FMSUB,
  820. A_FNMADD,
  821. A_FNMSUB,
  822. A_FNMUL,
  823. A_FADD,
  824. A_FMUL,
  825. A_FDIV,
  826. A_FSUB,
  827. A_FSQRT,
  828. A_FNEG,
  829. A_FCVT,
  830. A_FABS:
  831. Result:=OptPass1FData(p);
  832. A_FMOV:
  833. Result:=OptPass1FMov(p);
  834. else
  835. ;
  836. end;
  837. end;
  838. end;
  839. function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
  840. begin
  841. result := false;
  842. if p.typ=ait_instruction then
  843. begin
  844. case taicpu(p).opcode of
  845. A_LDR,
  846. A_STR:
  847. Result:=OptPass2LDRSTR(p);
  848. else
  849. ;
  850. end;
  851. end;
  852. end;
  853. function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
  854. begin
  855. result := false;
  856. if p.typ=ait_instruction then
  857. begin
  858. case taicpu(p).opcode of
  859. A_CMP:
  860. Result:=OptPostCMP(p);
  861. A_AND:
  862. Result:=OptPostAnd(p);
  863. else
  864. ;
  865. end;
  866. end;
  867. end;
begin
  { register this class as the peephole optimizer for the current target }
  casmoptimizer:=TCpuAsmOptimizer;
End.