{ aoptcpu.pas }
  1. {
  2. Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
  3. Development Team
  4. This unit implements the ARM64 optimizer object
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
Unit aoptcpu;

{$i fpcdefs.inc}

{ $define DEBUG_AOPTCPU}

Interface

uses
  globtype, globals,
  cutils,
  cgbase, cpubase, aasmtai, aasmcpu, aopt, aoptcpub;

Type
  TCpuAsmOptimizer = class(TAsmOptimizer)
    { uses the same constructor as TAopObj }
    { first peephole optimization pass, dispatches per opcode }
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    { peephole optimizations run after register allocation }
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    { true if hp writes a completely new value into reg }
    function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
    { true if hp reads from reg (directly or through a reference) }
    function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
    { like GetNextInstruction, but at -O3 skips over instructions that do
      not involve reg; stops at non-instructions and calls/jumps }
    function GetNextInstructionUsingReg(Current : tai; out Next : tai; reg : TRegister) : Boolean;
    { folds a later add/sub of the base register into a postindexed ldr/str }
    function LookForPostindexedPattern(p : taicpu) : boolean;
    { inserts s as an assembler comment before p (DEBUG_AOPTCPU builds only) }
    procedure DebugMsg(const s : string; p : tai);
  private
    function OptPass1Shift(var p: tai): boolean;
    function OptPostCMP(var p: tai): boolean;
    function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
    function OptPass1Data(var p: tai): boolean;
  End;
  42. Implementation
  43. uses
  44. aasmbase,
  45. aoptutils,
  46. cgutils,
  47. verbose;
{$ifdef DEBUG_AOPTCPU}
  { Emits the peephole message s as an assembler comment before p so the
    applied optimizations can be inspected in the generated assembler file }
  procedure TCpuAsmOptimizer.DebugMsg(const s: string;p : tai);
    begin
      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
    end;
{$else DEBUG_AOPTCPU}
  { Release builds: inlined no-op, the message strings produce no output }
  procedure TCpuAsmOptimizer.DebugMsg(const s: string;p : tai);inline;
    begin
    end;
{$endif DEBUG_AOPTCPU}
  { Returns true when p is an instruction that does not yet carry a
    condition and could therefore be made conditional. }
  function CanBeCond(p : tai) : boolean;
    begin
      if p.typ<>ait_instruction then
        CanBeCond:=false
      else
        CanBeCond:=taicpu(p).condition=C_None;
    end;
  { Field-by-field comparison of two references.  References marked as
    volatile never compare equal, so accesses to them are never merged. }
  function RefsEqual(const r1, r2: treference): boolean;
    begin
      Result:=
        (r1.volatility=[]) and
        (r2.volatility=[]) and
        (r1.offset=r2.offset) and
        (r1.base=r2.base) and
        (r1.index=r2.index) and
        (r1.scalefactor=r2.scalefactor) and
        (r1.symbol=r2.symbol) and
        (r1.relsymbol=r2.relsymbol) and
        (r1.refaddr=r2.refaddr) and
        (r1.shiftimm=r2.shiftimm) and
        (r1.shiftmode=r2.shiftmode) and
        (r1.addressmode=r2.addressmode);
    end;
  { Matches an instruction whose opcode lies in op and whose postfix lies in
    postfix; an empty set acts as a wildcard for the respective field. }
  function MatchInstruction(const instr: tai; const op: TAsmOps; const postfix: TOpPostfixes): boolean;
    begin
      Result:=false;
      if instr.typ<>ait_instruction then
        exit;
      if (op<>[]) and not(taicpu(instr).opcode in op) then
        exit;
      Result:=(postfix=[]) or (taicpu(instr).oppostfix in postfix);
    end;
  { Single-opcode variant: matches exactly opcode op, with postfix in
    postfix (empty set = any postfix). }
  function MatchInstruction(const instr: tai; const op: TAsmOp; const postfix: TOpPostfixes): boolean;
    begin
      Result:=false;
      if (instr.typ<>ait_instruction) or (taicpu(instr).opcode<>op) then
        exit;
      Result:=(postfix=[]) or (taicpu(instr).oppostfix in postfix);
    end;
  { True when oper is a register operand denoting exactly reg }
  function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
    begin
      if oper.typ=top_reg then
        Result:=oper.reg=reg
      else
        Result:=false;
    end;
  { True when both operands have the same type and equal contents;
    references are compared via RefsEqual (volatile refs never match). }
  function MatchOperand(const oper1: TOper; const oper2: TOper): boolean; inline;
    begin
      Result:=false;
      if oper1.typ<>oper2.typ then
        exit;
      case oper1.typ of
        top_const:
          Result:=oper1.val=oper2.val;
        top_reg:
          Result:=oper1.reg=oper2.reg;
        top_conditioncode:
          Result:=oper1.cc=oper2.cc;
        top_realconst:
          Result:=oper1.val_real=oper2.val_real;
        top_ref:
          Result:=RefsEqual(oper1.ref^,oper2.ref^);
        else
          Result:=false;
      end;
    end;
  { Advances from Current to the next instruction involving reg.
    Below -O3 this degenerates to a single GetNextInstruction step; at -O3
    instructions not touching reg are skipped.  The scan also stops at
    non-instruction tai objects and at calls/jumps, because register
    contents cannot be tracked across them.
    NOTE: the order of the until-conditions is load-bearing — the typ check
    must precede the taicpu cast in is_calljmp. }
  function TCpuAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
    Out Next: tai; reg: TRegister): Boolean;
    begin
      Next:=Current;
      repeat
        Result:=GetNextInstruction(Next,Next);
      until not (Result) or
        not(cs_opt_level3 in current_settings.optimizerswitches) or
        (Next.typ<>ait_instruction) or
        RegInInstruction(reg,Next) or
        is_calljmp(taicpu(Next).opcode);
    end;
  { Returns true if instruction hp loads a completely new value into reg.
    Mere modification (e.g. pre-/postindexed base updates) does not count;
    this is what allows RemoveSuperfluousMove to retarget the destination. }
  function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
    var
      p: taicpu;
    begin
      p := taicpu(hp);
      Result := false;
      if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
        exit;
      case p.opcode of
        { These operands do not write into a register at all }
        A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
          exit;
        { Take care of post/preincremented store and loads, they will change their base register }
        A_STR, A_LDR:
          begin
            Result := false;
            { actually, this does not apply here because post-/preindexed does not mean that a register
              is loaded with a new value, it is only modified
              (taicpu(p).oper[1]^.typ=top_ref) and
              (taicpu(p).oper[1]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
              (taicpu(p).oper[1]^.ref^.base = reg);
            }
            { STR does not load into its first register }
            if p.opcode = A_STR then
              exit;
          end;
        else
          ;
      end;
      if Result then
        exit;
      { For the remaining opcodes the first operand is the destination:
        either a plain register, or a pre-/postindexed reference whose
        base register receives the updated address. }
      case p.oper[0]^.typ of
        top_reg:
          Result := (p.oper[0]^.reg = reg);
        top_ref:
          Result :=
            (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
            (taicpu(p).oper[0]^.ref^.base = reg);
        else
          ;
      end;
    end;
  { Returns true if instruction hp reads reg, either as a register operand
    or as the base/index register of a reference operand. }
  function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
    var
      p: taicpu;
      i: longint;
    begin
      instructionLoadsFromReg := false;
      if not (assigned(hp) and (hp.typ = ait_instruction)) then
        exit;
      p:=taicpu(hp);
      i:=1;
      { Start on oper[0]? Only when the destination operand is read as well
        (operand_read/operand_readwrite); write-only destinations are
        skipped. }
      if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
        i:=0;
      while(i<p.ops) do
        begin
          case p.oper[I]^.typ of
            top_reg:
              Result := (p.oper[I]^.reg = reg);
            top_ref:
              { base and index of a reference are always read, even for stores }
              Result :=
                (p.oper[I]^.ref^.base = reg) or
                (p.oper[I]^.ref^.index = reg);
            else
              ;
          end;
          { Bailout if we found something }
          if Result then
            exit;
          Inc(I);
        end;
    end;
  { Replaces
      <op> reg0,...
      mov  reg1,reg0
    by
      <op> reg1,...
    when reg0 dies after the mov.  p is the instruction writing reg0,
    movp the candidate mov, and optimizer names the calling peephole pass
    for the debug message.  Also repairs the register allocation
    information and strips pre-/postindexing from an LDR whose new
    destination equals its base register.  Returns true when movp was
    removed. }
  function TCpuAsmOptimizer.RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string):boolean;
    var
      alloc,
      dealloc : tai_regalloc;
      hp1 : tai;
    begin
      Result:=false;
      if MatchInstruction(movp, A_MOV, [PF_None]) and
        (taicpu(p).ops>=3) and
        { We can't optimize if there is a shiftop }
        (taicpu(movp).ops=2) and
        MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
        { don't mess with moves to fp }
        (taicpu(movp).oper[0]^.reg<>NR_FP) and
        { the destination register of the mov might not be used between p and movp }
        not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
        { Take care to only do this for instructions which REALLY load to the first register.
          Otherwise
            str reg0, [reg1]
            mov reg2, reg0
          will be optimized to
            str reg2, [reg1]
        }
        RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
        begin
          { only fold when the old destination is deallocated right after the
            mov, i.e. reg0 is really dead from here on }
          dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
          if assigned(dealloc) then
            begin
              DebugMsg('Peephole '+optimizer+' removed superfluous mov', movp);
              result:=true;
              { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
                and remove it if possible }
              asml.Remove(dealloc);
              alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
              if assigned(alloc) then
                begin
                  asml.Remove(alloc);
                  alloc.free;
                  dealloc.free;
                end
              else
                { no matching allocation found: keep the deallocation, but
                  move it up to directly after p }
                asml.InsertAfter(dealloc,p);
              { try to move the allocation of the target register }
              GetLastInstruction(movp,hp1);
              alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
              if assigned(alloc) then
                begin
                  asml.Remove(alloc);
                  asml.InsertBefore(alloc,p);
                  { adjust used regs }
                  IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
                end;
              { finally get rid of the mov }
              taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
              { Remove preindexing and postindexing for LDR in some cases.
                For example:
                  ldr reg2,[reg1, xxx]!
                  mov reg1,reg2
                must be translated to:
                  ldr reg1,[reg1, xxx]
                Preindexing must be removed there, since the same register is used as the base and as the target.
                Such case is not allowed for ARM CPU and produces crash. }
              if (taicpu(p).opcode = A_LDR) and (taicpu(p).oper[1]^.typ = top_ref)
                and (taicpu(movp).oper[0]^.reg = taicpu(p).oper[1]^.ref^.base)
              then
                taicpu(p).oper[1]^.ref^.addressmode:=AM_OFFSET;
              asml.remove(movp);
              movp.free;
            end;
        end;
    end;
  {
    optimize
      ldr/str regX,[reg1]
      ...
      add/sub reg1,reg1,const

    into

      ldr/str regX,[reg1], const

    Returns true and deletes the add/sub when the folding was done.
  }
  function TCpuAsmOptimizer.LookForPostindexedPattern(p: taicpu) : boolean;
    var
      hp1 : tai;
    begin
      Result:=false;
      if (p.oper[1]^.typ = top_ref) and
        (p.oper[1]^.ref^.addressmode=AM_OFFSET) and
        (p.oper[1]^.ref^.index=NR_NO) and
        (p.oper[1]^.ref^.offset=0) and
        GetNextInstructionUsingReg(p, hp1, p.oper[1]^.ref^.base) and
        { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
        MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
        (taicpu(hp1).oper[0]^.reg=p.oper[1]^.ref^.base) and
        (taicpu(hp1).oper[1]^.reg=p.oper[1]^.ref^.base) and
        (
          { valid offset? post-indexed immediates are 9 bit signed }
          (taicpu(hp1).oper[2]^.typ=top_const) and
          (taicpu(hp1).oper[2]^.val>=-256) and
          (abs(taicpu(hp1).oper[2]^.val)<256)
        ) and
        { don't apply the optimization if the base register is loaded }
        (getsupreg(p.oper[0]^.reg)<>getsupreg(p.oper[1]^.ref^.base)) and
        { the base register must not change between the memory access and the
          add/sub, otherwise folding the update would change the address.
          Note: oper[2] is guaranteed top_const above, so (unlike the ARM
          port) no RegModifiedBetween check on an offset register is needed
          here — reading oper[2]^.reg of a constant operand would access the
          wrong variant of the operand record. }
        not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) then
        begin
          DebugMsg('Peephole Str/LdrAdd/Sub2Str/Ldr Postindex done', p);
          p.oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
          if taicpu(hp1).opcode=A_ADD then
            p.oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
          else
            p.oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
          asml.Remove(hp1);
          hp1.Free;
          Result:=true;
        end;
    end;
  { Folds an immediate shift into a following data-processing instruction
    (shifted-register operand form), or removes a superfluous mov after a
    shift. }
  function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
    var
      hp1,hp2: tai;
      I2, I: Integer;
      shifterop: tshifterop;
    begin
      Result:=false;
      { This folds shifterops into following instructions
        <shiftop> r0, r1, #imm
        <op> r2, r3, r0
        to
        <op> r2, r3, r1, <shiftop> #imm
      }
      { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
      if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
        MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                               A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                               A_SUB, A_TST], [PF_None]) and
        { the shifted register must die in hp1, else it is still needed }
        RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
        (taicpu(hp1).ops >= 2) and
        { Currently we can't fold into another shifterop }
        (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
        { SP does not work completely with shifted registers, as I didn't find the exact rules,
          we do not operate on SP }
        (taicpu(hp1).oper[0]^.reg<>NR_SP) and
        (taicpu(hp1).oper[1]^.reg<>NR_SP) and
        (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
        { reg1 might not be modified inbetween }
        not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
        (
          { Only ONE of the two src operands is allowed to match }
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
        ) and
        { for SUB, the last operand must match, there is no RSB on AArch64 }
        ((taicpu(hp1).opcode<>A_SUB) or
         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
        begin
          { for the two operand instructions, start also at the second operand as they are not always commutative
            (depends on the flags tested later on) and thus the operands cannot be swapped }
          I2:=1;
          for I:=I2 to taicpu(hp1).ops-1 do
            if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
              begin
                { If the parameter matched on the second op from the RIGHT
                  we have to switch the parameters, this will not happen for CMP
                  where we're only evaluating the most right parameter
                }
                shifterop_reset(shifterop);
                { translate the shift opcode into the shifter-operand mode }
                case taicpu(p).opcode of
                  A_LSL:
                    shifterop.shiftmode:=SM_LSL;
                  A_ROR:
                    shifterop.shiftmode:=SM_ROR;
                  A_LSR:
                    shifterop.shiftmode:=SM_LSR;
                  A_ASR:
                    shifterop.shiftmode:=SM_ASR;
                  else
                    InternalError(2019090401);
                end;
                shifterop.shiftimm:=taicpu(p).oper[2]^.val;
                if I <> taicpu(hp1).ops-1 then
                  begin
                    { matched operand is not the last one: the other source
                      operand moves up and the shifted register goes last }
                    if taicpu(hp1).ops = 3 then
                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                        taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                        taicpu(p).oper[1]^.reg, shifterop)
                    else
                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                        taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                        shifterop);
                  end
                else
                  { matched operand already is the last one: keep operand
                    order, only replace it by the pre-shift source register }
                  if taicpu(hp1).ops = 3 then
                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                      taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                      taicpu(p).oper[1]^.reg,shifterop)
                  else
                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                      taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                      shifterop);
                { keep the source location of the replaced instruction }
                taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                asml.insertbefore(hp2, hp1);
                GetNextInstruction(p, hp2);
                asml.remove(p);
                asml.remove(hp1);
                p.free;
                hp1.free;
                p:=hp2;
                DebugMsg('Peephole FoldShiftProcess done', p);
                Result:=true;
                break;
              end;
        end
      else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
        Result:=true;
    end;
  { Pass-1 handler for plain data instructions: tries to fold away a
    following superfluous mov of the result register. }
  function TCpuAsmOptimizer.OptPass1Data(var p : tai): boolean;
    var
      hpNext: tai;
    begin
      Result:=
        GetNextInstructionUsingReg(p,hpNext,taicpu(p).oper[0]^.reg) and
        RemoveSuperfluousMove(p,hpNext,'DataMov2Data');
    end;
  { Fuses a compare-with-zero followed by a conditional branch into the
    single-instruction forms:
      cmp reg,#0 + b.ne lbl  ->  cbnz reg,lbl
      cmp reg,#0 + b.eq lbl  ->  cbz  reg,lbl }
  function TCpuAsmOptimizer.OptPostCMP(var p : tai): boolean;
    var
      hpBranch,hpNew: tai;
    begin
      Result:=false;
      { only handle "cmp reg,#0" directly followed by b.eq/b.ne }
      if not MatchOpType(taicpu(p),top_reg,top_const) or
        (taicpu(p).oper[1]^.val<>0) or
        not GetNextInstruction(p,hpBranch) or
        not MatchInstruction(hpBranch,A_B,[PF_None]) or
        not (taicpu(hpBranch).condition in [C_EQ,C_NE]) then
        exit;
      if taicpu(hpBranch).condition=C_NE then
        hpNew:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,
          taicpu(hpBranch).oper[0]^.ref^.symbol,taicpu(hpBranch).oper[0]^.ref^.offset)
      else if taicpu(hpBranch).condition=C_EQ then
        hpNew:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,
          taicpu(hpBranch).oper[0]^.ref^.symbol,taicpu(hpBranch).oper[0]^.ref^.offset)
      else
        Internalerror(2019090801);
      { keep the branch's source location on the fused instruction }
      taicpu(hpNew).fileinfo:=taicpu(hpBranch).fileinfo;
      asml.insertbefore(hpNew, hpBranch);
      asml.remove(p);
      asml.remove(hpBranch);
      p.free;
      hpBranch.free;
      p:=hpNew;
      DebugMsg('Peephole CMPB.E/NE2CBNZ/CBZ done', p);
      Result:=true;
    end;
  { First peephole pass: dispatches per opcode to the specific handlers }
  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
    begin
      Result:=false;
      if p.typ<>ait_instruction then
        exit;
      case taicpu(p).opcode of
        A_LDR,
        A_STR:
          Result:=LookForPostindexedPattern(taicpu(p));
        A_LSL,
        A_LSR,
        A_ASR,
        A_ROR:
          Result:=OptPass1Shift(p);
        A_ADD,
        A_ADC,
        A_SUB,
        A_SBC,
        A_AND,
        A_BIC,
        A_EOR,
        A_ORR,
        A_MUL:
          Result:=OptPass1Data(p);
        else
          ;
      end;
    end;
  { Post peephole pass: currently only fuses cmp-with-zero into cbz/cbnz }
  function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
    begin
      Result:=
        (p.typ=ait_instruction) and
        (taicpu(p).opcode=A_CMP) and
        OptPostCMP(p);
    end;
{ register this optimizer class so the code generator instantiates it }
begin
  casmoptimizer:=TCpuAsmOptimizer;
End.