{ aoptcpu.pas — AArch64 peephole optimizer unit of the Free Pascal compiler.
  (The original page banner and line-number gutter from the source listing
  were extraction artifacts and have been removed.) }
  1. {
  2. Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
  3. Development Team
  4. This unit implements the ARM64 optimizer object
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
  18. Unit aoptcpu;
  19. {$i fpcdefs.inc}
  20. {$ifdef EXTDEBUG}
  21. {$define DEBUG_AOPTCPU}
  22. {$endif EXTDEBUG}
  23. Interface
  24. uses
  25. globtype, globals,
  26. cutils,
  27. cgbase, cpubase, aasmtai, aasmcpu,
  28. aopt, aoptcpub, aoptarm;
Type
  { AArch64-specific peephole optimizer; extends the generic ARM/AArch64
    optimizer base class with AArch64-only instruction patterns. }
  TCpuAsmOptimizer = class(TARMAsmOptimizer)
    { uses the same constructor as TAopObj }
    { entry points for the optimizer passes; dispatch per opcode }
    function PrePeepHoleOptsCpu(var p: tai): boolean; override;
    function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
    function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    { true if instruction hp writes a new value into reg (not merely a
      pre-/post-indexed base-register update) }
    function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
    { true if instruction hp reads reg, directly or via a memory reference }
    function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
    { folds a following add/sub of the base register into a post-indexed
      ldr/str addressing mode }
    function LookForPostindexedPattern(var p : tai) : boolean;
  public
    { With these routines, there's optimisation code that's general for all ARM platforms }
    function OptPass1LDR(var p: tai): Boolean; override;
    function OptPass1STR(var p: tai): Boolean; override;
  private
    { removes an fmov that only copies the result of p to another register }
    function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
    { pass-1 handlers, one per opcode family }
    function OptPass1Shift(var p: tai): boolean;
    function OptPass1Data(var p: tai): boolean;
    function OptPass1FData(var p: tai): Boolean;
    function OptPass1STP(var p: tai): boolean;
    function OptPass1Mov(var p: tai): boolean;
    function OptPass1MOVZ(var p: tai): boolean;
    function OptPass1FMov(var p: tai): Boolean;
    function OptPass1B(var p: tai): boolean;
    function OptPass1SXTW(var p: tai): Boolean;
    { pass-2 and post-peephole handlers }
    function OptPass2LDRSTR(var p: tai): boolean;
    function PostPeepholeOptAND(var p: tai): Boolean;
    function PostPeepholeOptCMP(var p: tai): boolean;
    function PostPeepholeOptTST(var p: tai): Boolean;
  End;
  59. Implementation
  60. uses
  61. aasmbase,
  62. aoptutils,
  63. cgutils,
  64. verbose;
  65. {$ifdef DEBUG_AOPTCPU}
  66. const
  67. SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
  68. {$else DEBUG_AOPTCPU}
  69. { Empty strings help the optimizer to remove string concatenations that won't
  70. ever appear to the user on release builds. [Kit] }
  71. const
  72. SPeepholeOptimization = '';
  73. {$endif DEBUG_AOPTCPU}
  74. function CanBeCond(p : tai) : boolean;
  75. begin
  76. result:=(p.typ=ait_instruction) and (taicpu(p).condition=C_None);
  77. end;
function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  var
    p: taicpu;
  begin
    { Returns true if hp overwrites reg with a new value: either its first
      operand is a register superregister-equal to reg, or hp pre-/post-indexes
      a memory reference whose base is reg (which rewrites the base). }
    Result := false;
    if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
      exit;
    p := taicpu(hp);

    case p.opcode of
      { These operations do not write into a register at all.

        LDR/STR with post/pre-indexed operations do not need special treatment
        because post-/preindexed does not mean that a register
        is loaded with a new value, it is only modified. }
      A_STR, A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
        exit;
      else
        ;
    end;

    if p.ops=0 then
      exit;

    case p.oper[0]^.typ of
      top_reg:
        { compared at superregister level, so writing e.g. the 32-bit half
          counts as loading the full register with a new value }
        Result := SuperRegistersEqual(p.oper[0]^.reg,reg);
      top_ref:
        { pre-/post-indexing writes the updated address back into the base }
        Result :=
          (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
          (taicpu(p).oper[0]^.ref^.base = reg);
      else
        ;
    end;
  end;
function TCpuAsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
  var
    p: taicpu;
    i: longint;
  begin
    { Returns true if hp reads reg, either directly as a register operand or
      as the base/index register of a memory reference operand. }
    instructionLoadsFromReg := false;
    if not (assigned(hp) and (hp.typ = ait_instruction)) then
      exit;
    p:=taicpu(hp);

    i:=1;
    { Start on oper[0]?  Only when the first operand is actually read
      (normally it is the written destination). }
    if taicpu(hp).spilling_get_operation_type(0) in [operand_read, operand_readwrite] then
      i:=0;

    while(i<p.ops) do
      begin
        case p.oper[I]^.typ of
          top_reg:
            { NOTE(review): exact register comparison (includes size), unlike
              the SuperRegistersEqual test in RegLoadedWithNewValue — confirm
              this asymmetry is intentional }
            Result := (p.oper[I]^.reg = reg);
          top_ref:
            { a reference reads whatever registers form the address }
            Result :=
              (p.oper[I]^.ref^.base = reg) or
              (p.oper[I]^.ref^.index = reg);
          else
            ;
        end;
        { Bailout if we found something }
        if Result then
          exit;
        Inc(I);
      end;
  end;
  140. {
  141. optimize
  142. ldr/str regX,[reg1]
  143. ...
  144. add/sub reg1,reg1,regY/const
  145. into
  146. ldr/str regX,[reg1], regY/const
  147. }
  148. function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
  149. var
  150. hp1 : tai;
  151. begin
  152. Result:=false;
  153. if (taicpu(p).oper[1]^.typ = top_ref) and
  154. (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
  155. (taicpu(p).oper[1]^.ref^.index=NR_NO) and
  156. (taicpu(p).oper[1]^.ref^.offset=0) and
  157. GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
  158. { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
  159. MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
  160. (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
  161. (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
  162. (
  163. { valid offset? }
  164. (taicpu(hp1).oper[2]^.typ=top_const) and
  165. (taicpu(hp1).oper[2]^.val>=-256) and
  166. (abs(taicpu(hp1).oper[2]^.val)<256)
  167. ) and
  168. { don't apply the optimization if the base register is loaded }
  169. (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
  170. not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
  171. not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
  172. begin
  173. if taicpu(p).opcode = A_LDR then
  174. DebugMsg(SPeepholeOptimization + 'LdrAdd/Sub2Ldr Postindex done', p)
  175. else
  176. DebugMsg(SPeepholeOptimization + 'StrAdd/Sub2Str Postindex done', p);
  177. taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
  178. if taicpu(hp1).opcode=A_ADD then
  179. taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
  180. else
  181. taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
  182. asml.Remove(hp1);
  183. hp1.Free;
  184. Result:=true;
  185. end;
  186. end;
function TCpuAsmOptimizer.RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string):boolean;
  var
    alloc,
    dealloc : tai_regalloc;
    hp1 : tai;
  begin
    { If movp is "fmov regN,regM" where regM is the register p really writes,
      regN is unused between p and movp, and regM dies right after movp:
      retarget p to write regN directly and delete the fmov.  The calling
      rule's name is passed in "optimizer" for debug output. }
    Result:=false;
    if ((MatchInstruction(movp, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
         ((getregtype(taicpu(movp).oper[0]^.reg)=R_MMREGISTER) { or (taicpu(p).opcode in [A_LDUR])})
        ) { or
        (((taicpu(p).oppostfix in [PF_F64F32,PF_F64S16,PF_F64S32,PF_F64U16,PF_F64U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFD)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F64])) or
        (((taicpu(p).oppostfix in [PF_F32F64,PF_F32S16,PF_F32S32,PF_F32U16,PF_F32U32]) or (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBFS)) and MatchInstruction(movp, A_VMOV, [taicpu(p).condition], [PF_F32])) }
       ) and
       (taicpu(movp).ops=2) and
       MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
       { the destination register of the mov must not be used between p and movp }
       not(RegUsedBetween(taicpu(movp).oper[0]^.reg,p,movp)) and
       { Take care to only do this for instructions which REALLY load to the first register.
         Otherwise
           str reg0, [reg1]
           fmov reg2, reg0
         will be optimized to
           str reg2, [reg1]
       }
       RegLoadedWithNewValue(taicpu(p).oper[0]^.reg, p) then
      begin
        { only fire when the old destination register really dies after movp }
        dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
        if assigned(dealloc) then
          begin
            DebugMsg(SPeepholeOptimization + optimizer+' removed superfluous vmov', movp);
            result:=true;

            { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
              and remove it if possible }
            asml.Remove(dealloc);
            alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.previous));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                alloc.free;
                dealloc.free;
              end
            else
              { no matching allocation found: keep the deallocation marker,
                but move it up to directly after p }
              asml.InsertAfter(dealloc,p);

            { try to move the allocation of the target register }
            GetLastInstruction(movp,hp1);
            alloc:=FindRegAlloc(taicpu(movp).oper[0]^.reg,tai(hp1.Next));
            if assigned(alloc) then
              begin
                asml.Remove(alloc);
                asml.InsertBefore(alloc,p);
                { adjust used regs }
                IncludeRegInUsedRegs(taicpu(movp).oper[0]^.reg,UsedRegs);
              end;

            { change
                vldr reg0,[reg1]
                vmov reg2,reg0
              into
                ldr reg2,[reg1]
              if reg2 is an int register
            if (taicpu(p).opcode=A_VLDR) and (getregtype(taicpu(movp).oper[0]^.reg)=R_INTREGISTER) then
              taicpu(p).opcode:=A_LDR;
            }

            { finally get rid of the mov }
            taicpu(p).loadreg(0,taicpu(movp).oper[0]^.reg);
            asml.remove(movp);
            movp.free;
          end;
      end;
  end;
  256. function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean;
  257. var
  258. hp1: tai;
  259. begin
  260. Result := False;
  261. if inherited OptPass1LDR(p) or
  262. LookForPostindexedPattern(p) then
  263. Exit(True)
  264. else if (taicpu(p).oppostfix in [PF_B,PF_SB,PF_H,PF_SH,PF_None]) and
  265. GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  266. RemoveSuperfluousMove(p, hp1, 'Ldr<Postfix>Mov2Ldr<Postfix>') then
  267. Exit(true);
  268. end;
  269. function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean;
  270. begin
  271. Result := False;
  272. if inherited OptPass1STR(p) or
  273. LookForPostindexedPattern(p) then
  274. Exit(True);
  275. end;
function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
  var
    hp1,hp2: tai;
    I2, I: Integer;
    shifterop: tshifterop;
  begin
    Result:=false;
    { This folds shifterops into following instructions
        <shiftop> r0, r1, #imm
        <op> r2, r3, r0
      to
        <op> r2, r3, r1, <shiftop> #imm
    }
    { do not handle ROR yet, only part of the instructions below support ROR as shifter operand }
    if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
                             A_EON, A_EOR, A_NEG, A_ORN, A_ORR,
                             A_SUB, A_TST], [PF_None]) and
      { the shift result must die in hp1, otherwise it is still needed }
      RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
      (taicpu(hp1).ops >= 2) and
      { Currently we can't fold into another shifterop }
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
      { SP does not work completely with shifted registers, as I didn't find the exact rules,
        we do not operate on SP }
      (taicpu(hp1).oper[0]^.reg<>NR_SP) and
      (taicpu(hp1).oper[1]^.reg<>NR_SP) and
      (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
      { reg1 must not be modified inbetween }
      not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
      (
        { Only ONE of the two src operands is allowed to match }
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
      ) and
      { for SUB, the last operand must match, there is no RSB on AArch64 }
      ((taicpu(hp1).opcode<>A_SUB) or
       MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
      begin
        { for the two operand instructions, start also at the second operand as they are not always commutative
          (depends on the flags tested later on) and thus the operands cannot be swapped }
        I2:=1;
        for I:=I2 to taicpu(hp1).ops-1 do
          if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
            begin
              { If the parameter matched on the second op from the RIGHT
                we have to switch the parameters, this will not happen for CMP
                where we're only evaluating the most right parameter
              }
              shifterop_reset(shifterop);
              { translate the standalone shift opcode into a shifter mode }
              case taicpu(p).opcode of
                A_LSL:
                  shifterop.shiftmode:=SM_LSL;
                A_ROR:
                  shifterop.shiftmode:=SM_ROR;
                A_LSR:
                  shifterop.shiftmode:=SM_LSR;
                A_ASR:
                  shifterop.shiftmode:=SM_ASR;
                else
                  InternalError(2019090401);
              end;
              shifterop.shiftimm:=taicpu(p).oper[2]^.val;
              if I <> taicpu(hp1).ops-1 then
                begin
                  { matched the middle operand: build the replacement with the
                    operands swapped so the shifted register ends up last }
                  if taicpu(hp1).ops = 3 then
                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
                         taicpu(p).oper[1]^.reg, shifterop)
                  else
                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                         taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                         shifterop);
                end
              else
                { matched the last operand: keep the operand order }
                if taicpu(hp1).ops = 3 then
                  hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
                       taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
                       taicpu(p).oper[1]^.reg,shifterop)
                else
                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
                       taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                       shifterop);
              { Make sure the register used in the shifting is tracked all
                the way through, otherwise it may become deallocated while
                it's still live and cause incorrect optimisations later }
              if (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[1]^.reg) then
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  ALlocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, TmpUsedRegs);
                end;
              taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
              asml.insertbefore(hp2, hp1);
              RemoveInstruction(hp1);
              RemoveCurrentp(p);
              DebugMsg(SPeepholeOptimization + 'FoldShiftProcess done', hp2);
              Result:=true;
              break;
            end;
      end
    { otherwise try to drop a register copy following the shift }
    else if MatchInstruction(p,[A_LSL, A_LSR, A_ASR,A_ROR],[PF_None]) and
      GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
      RemoveSuperfluousMove(p, hp1, 'ShiftMov2Shift') then
      Result:=true;
  end;
  383. function TCpuAsmOptimizer.OptPass1Data(var p : tai): boolean;
  384. var
  385. hp1: tai;
  386. begin
  387. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  388. RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
  389. end;
  390. function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
  391. var
  392. hp1: tai;
  393. begin
  394. Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
  395. RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
  396. end;
function TCpuAsmOptimizer.OptPass1STP(var p : tai): boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
    {
      Collapse a frame setup around a single call into a tail call:

      change
            stp     x29,x30,[sp, #-16]!
            mov     x29,sp
            bl      abc
            ldp     x29,x30,[sp], #16
            ret
      into
            b       abc
    }
    { the stp must push exactly x29/x30 with pre-indexed [sp,#-16]! }
    if MatchInstruction(p, A_STP, [C_None], [PF_None]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_ref) and
      (taicpu(p).oper[0]^.reg = NR_X29) and
      (taicpu(p).oper[1]^.reg = NR_X30) and
      (taicpu(p).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[2]^.ref^.index=NR_NO) and
      (taicpu(p).oper[2]^.ref^.offset=-16) and
      (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
      { followed by the frame pointer setup "mov x29,sp" }
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
      { followed by a direct call }
      GetNextInstruction(hp1, hp2) and
      SkipEntryExitMarker(hp2, hp2) and
      MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
      (taicpu(hp2).oper[0]^.typ = top_ref) and
      { followed by the matching epilogue "ldp x29,x30,[sp],#16" }
      GetNextInstruction(hp2, hp3) and
      SkipEntryExitMarker(hp3, hp3) and
      MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
      MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
      (taicpu(hp3).oper[0]^.reg = NR_X29) and
      (taicpu(hp3).oper[1]^.reg = NR_X30) and
      (taicpu(hp3).oper[2]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp3).oper[2]^.ref^.index=NR_NO) and
      (taicpu(hp3).oper[2]^.ref^.offset=16) and
      (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
      { and finally a parameterless ret }
      GetNextInstruction(hp3, hp4) and
      MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
      (taicpu(hp4).ops = 0) then
      begin
        { drop prologue and epilogue, turn the call into a plain branch }
        asml.Remove(p);
        asml.Remove(hp1);
        asml.Remove(hp3);
        asml.Remove(hp4);
        taicpu(hp2).opcode:=A_B;
        p.free;
        hp1.free;
        hp3.free;
        hp4.free;
        p:=hp2;
        DebugMsg(SPeepholeOptimization + 'Bl2B done', p);
        Result:=true;
      end;
  end;
function TCpuAsmOptimizer.OptPass1Mov(var p : tai): boolean;
  var
    hp1: tai;
    so: tshifterop;
  begin
    Result:=false;
    { "mov reg,reg" with no postfix is a no-op: remove it }
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
      (taicpu(p).oppostfix=PF_None) then
      begin
        RemoveCurrentP(p);
        DebugMsg(SPeepholeOptimization + 'Mov2None done', p);
        Result:=true;
      end
    { fold a 32-bit mov whose (dying) destination feeds the 64-bit third
      operand of a following add/sub into a UXTW-extended operand:
        mov     wN,wM
        add/sub xD,xS,xN
      ->
        add/sub xD,xS,wM,uxtw }
    else if (taicpu(p).ops=2) and
      (getsubreg(taicpu(p).oper[0]^.reg)=R_SUBD) and
      GetNextInstruction(p, hp1) and
      { Faster to get it out of the way than go through MatchInstruction }
      (hp1.typ=ait_instruction) and
      (taicpu(hp1).ops=3) and
      MatchInstruction(hp1,[A_ADD,A_SUB],[taicpu(p).condition], [PF_None,PF_S]) and
      (getsubreg(taicpu(hp1).oper[2]^.reg)=R_SUBQ) and
      (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[2]^.reg)) and
      RegEndOfLife(taicpu(hp1).oper[2]^.reg,taicpu(hp1)) then
      begin
        DebugMsg(SPeepholeOptimization + 'MovOp2AddUtxw 1 done', p);
        shifterop_reset(so);
        so.shiftmode:=SM_UXTW;
        taicpu(hp1).ops:=4;
        taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
        taicpu(hp1).loadshifterop(3,so);
        RemoveCurrentP(p);
        Result:=true;
        exit;
      end
    {
      optimize
      mov rX, yyyy
      ....
    }
    else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
      begin
        if RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
          Result:=true
        else if (taicpu(p).ops = 2) and
          (tai(hp1).typ = ait_instruction) and
          RedundantMovProcess(p,hp1) then
          Result:=true
      end;
  end;
function TCpuAsmOptimizer.OptPass1MOVZ(var p: tai): boolean;
  var
    hp1: tai;
    ZeroReg: TRegister;
  begin
    { Pass-1 optimisations for MOVZ: rewrite "movz reg,#0" as a move from the
      zero register, and delete a MOVZ that is immediately superseded by
      another MOVZ to the same register. }
    Result := False;
    hp1 := nil;
    if (taicpu(p).oppostfix = PF_None) and (taicpu(p).condition = C_None) then
      begin
        if
          { Check next instruction first so hp1 gets set to something, then
            if it remains nil, we know for sure that there's no valid next
            instruction. }
          not GetNextInstruction(p, hp1) or
          { MOVZ and MOVK/MOVN instructions undergo macro-fusion, so leave a
            movz that starts such a pair untouched. }
          not MatchInstruction(hp1, [A_MOVK, A_MOVN], [C_None], [PF_None]) or
          (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[0]^.reg) then
          begin
            if (taicpu(p).oper[1]^.val = 0) then
              begin
                { Change;
                    movz reg,#0
                    (no movk or movn)
                  To:
                    mov  reg,xzr (or wzr)

                  Easier to perform other optimisations with registers
                }
                DebugMsg(SPeepholeOptimization + 'Movz0ToMovZeroReg', p);

                { Make sure the zero register is the correct size }
                ZeroReg := taicpu(p).oper[0]^.reg;
                setsupreg(ZeroReg, RS_XZR);

                taicpu(p).opcode := A_MOV;
                taicpu(p).loadreg(1, ZeroReg);
                Result := True;
                Exit;
              end;
          end;

        {
          remove the first Movz from
          movz reg,...
          movz reg,...
          (the second write supersedes the first)
        }
        if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
          MatchInstruction(hp1,A_MOVZ,[C_None],[PF_none]) and
          MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovzMovz2Movz', p);
            RemoveCurrentP(p);
            Result:=true;
            exit;
          end;
      end;
  end;
  559. function TCpuAsmOptimizer.OptPass1FMov(var p: tai): Boolean;
  560. var
  561. hp1: tai;
  562. alloc, dealloc: tai_regalloc;
  563. begin
  564. {
  565. change
  566. fmov reg0,reg1
  567. fmov reg1,reg0
  568. into
  569. fmov reg0,reg1
  570. }
  571. Result := False;
  572. while GetNextInstruction(p, hp1) and
  573. MatchInstruction(hp1, A_FMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
  574. MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
  575. MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) do
  576. begin
  577. asml.Remove(hp1);
  578. hp1.free;
  579. DebugMsg(SPeepholeOptimization + 'FMovFMov2FMov 1 done', p);
  580. Result:=true;
  581. end;
  582. { change
  583. fmov reg0,const
  584. fmov reg1,reg0
  585. dealloc reg0
  586. into
  587. fmov reg1,const
  588. }
  589. if MatchOpType(taicpu(p),top_reg,top_realconst) and
  590. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  591. (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
  592. MatchInstruction(hp1,A_FMOV,[taicpu(p).condition],[taicpu(p).oppostfix]) and
  593. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  594. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^.reg) and
  595. (not RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1)) and
  596. assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next)))
  597. then
  598. begin
  599. DebugMsg('Peephole FMovFMov2FMov 2 done', p);
  600. taicpu(hp1).loadrealconst(1,taicpu(p).oper[1]^.val_real);
  601. alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.Previous));
  602. dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next));
  603. if assigned(alloc) and assigned(dealloc) then
  604. begin
  605. asml.Remove(alloc);
  606. alloc.Free;
  607. asml.Remove(dealloc);
  608. dealloc.Free;
  609. end;
  610. { p will be removed, update used register as we continue
  611. with the next instruction after p }
  612. result:=RemoveCurrentP(p);
  613. end;
  614. { not enabled as apparently not happening
  615. if MatchOpType(taicpu(p),top_reg,top_reg) and
  616. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  617. MatchInstruction(hp1, [A_FSUB,A_FADD,A_FNEG,A_FMUL,A_FSQRT,A_FDIV,A_FABS], [PF_None]) and
  618. (MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) or
  619. ((taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^))
  620. ) and
  621. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  622. not(RegUsedBetween(taicpu(p).oper[0]^.reg,p,hp1)) then
  623. begin
  624. DebugMsg(SPeepholeOptimization + 'FMovFOp2FOp done', hp1);
  625. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
  626. if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
  627. taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
  628. if (taicpu(hp1).ops=3) and MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[2]^) then
  629. taicpu(hp1).oper[2]^.reg:=taicpu(p).oper[1]^.reg;
  630. RemoveCurrentP(p);
  631. Result:=true;
  632. exit;
  633. end;
  634. }
  635. end;
  636. function TCpuAsmOptimizer.OptPass1SXTW(var p : tai) : Boolean;
  637. var
  638. hp1: tai;
  639. GetNextInstructionUsingReg_hp1: Boolean;
  640. begin
  641. Result:=false;
  642. if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) then
  643. begin
  644. {
  645. change
  646. sxtw reg2,reg1
  647. str reg2,[...]
  648. dealloc reg2
  649. to
  650. str reg1,[...]
  651. }
  652. if MatchInstruction(p, taicpu(p).opcode, [C_None], [PF_None]) and
  653. (taicpu(p).ops=2) and
  654. MatchInstruction(hp1, A_STR, [C_None], [PF_None]) and
  655. (getsubreg(taicpu(hp1).oper[0]^.reg)=R_SUBD) and
  656. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  657. { the reference in strb might not use reg2 }
  658. not(RegInRef(taicpu(p).oper[0]^.reg,taicpu(hp1).oper[1]^.ref^)) and
  659. { reg1 might not be modified inbetween }
  660. not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
  661. begin
  662. DebugMsg('Peephole SXTHStr2Str done', p);
  663. taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
  664. result:=RemoveCurrentP(p);
  665. end
  666. {
  667. change
  668. sxtw reg2,reg1
  669. sxtw reg3,reg2
  670. dealloc reg2
  671. to
  672. sxtw reg3,reg1
  673. }
  674. else if MatchInstruction(p, A_SXTW, [C_None], [PF_None]) and
  675. (taicpu(p).ops=2) and
  676. MatchInstruction(hp1, A_SXTW, [C_None], [PF_None]) and
  677. (taicpu(hp1).ops=2) and
  678. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
  679. RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
  680. { reg1 might not be modified inbetween }
  681. not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
  682. begin
  683. DebugMsg('Peephole SxtwSxtw2Sxtw done', p);
  684. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
  685. taicpu(hp1).opcode:=A_SXTW;
  686. taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
  687. result:=RemoveCurrentP(p);
  688. end
  689. else if USxtOp2Op(p,hp1,SM_SXTW) then
  690. Result:=true
  691. else if RemoveSuperfluousMove(p, hp1, 'SxtwMov2Data') then
  692. Result:=true;
  693. end;
  694. end;
  { Pass-1 handler for conditional branches: collapses a branch-over-MOVZ
    "diamond" that materialises a 0/1 Boolean into a single CSET.
    Returns True when the transformation was applied (p then points at the
    new CSET instruction).
    Fix: the non-inverted "done" debug message used inconsistent casing
    ('B(c)Movz0bMovZ1') compared with its partial counterpart; both now use
    'B(c)Movz0BMovz1' so the log identifiers can be grepped uniformly. }
  function TCpuAsmOptimizer.OptPass1B(var p: tai): boolean;
    var
      hp1, hp2, hp3, hp4, hp5: tai;
      Invert: Boolean;
    begin
      Result := False;

      {
        convert
          b<c>  .L1
          movz  reg,#1
          b     .L2
        .L1
          movz  reg,#0  (or mov reg,xzr)
        .L2

        into
          cset  reg,<not(c)>

        Also do the same if the constants are reversed, instead converting it to:
          cset  reg,<c>
      }
      if (taicpu(p).condition <> C_None) and
        (taicpu(p).oper[0]^.typ = top_ref) and
        GetNextInstruction(p, hp1) and
        { Check individually instead of using MatchInstruction in order to save time }
        (hp1.typ = ait_instruction) and
        (taicpu(hp1).condition = C_None) and
        (taicpu(hp1).oppostfix = PF_None) and
        (taicpu(hp1).ops = 2) and
        (
          (
            (taicpu(hp1).opcode = A_MOVZ) and
            (taicpu(hp1).oper[1]^.val in [0, 1])
          ) or
          (
            (taicpu(hp1).opcode = A_MOV) and
            (getsupreg(taicpu(hp1).oper[1]^.reg) = RS_XZR)
          )
        ) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2, A_B, [PF_None]) and
        (taicpu(hp2).condition = C_None) and
        (taicpu(hp2).oper[0]^.typ = top_ref) and
        GetNextInstruction(hp2, hp3) and
        (hp3.typ = ait_label) and
        { The conditional branch must target the label between the two MOVs }
        (tasmlabel(taicpu(p).oper[0]^.ref^.symbol) = tai_label(hp3).labsym) and
        GetNextInstruction(hp3, hp4) and
        { As before, check individually instead of using MatchInstruction in order to save time }
        (hp4.typ = ait_instruction) and
        (taicpu(hp4).condition = C_None) and
        (taicpu(hp4).oppostfix = PF_None) and
        (taicpu(hp4).ops = 2) and
        { Both MOVs must write the same destination register }
        (taicpu(hp4).oper[0]^.reg = taicpu(hp1).oper[0]^.reg) and
        (
          (
            (taicpu(hp4).opcode = A_MOVZ) and
            (
              (
                { Check to confirm the following:
                  - First mov is either "movz reg,#0" or "mov reg,xzr"
                  - Second mov is "movz reg,#1"
                }
                (
                  (taicpu(hp1).oper[1]^.typ = top_reg) { Will be the zero register } or
                  (taicpu(hp1).oper[1]^.val = 0)
                ) and
                (taicpu(hp4).oper[1]^.val = 1)
              ) or
              (
                { Check to confirm the following:
                  - First mov is "movz reg,#1"
                  - Second mov is "movz reg,#0"
                }
                MatchOperand(taicpu(hp1).oper[1]^, 1) and
                (taicpu(hp4).oper[1]^.val = 0)
              )
            )
          ) or
          (
            { Check to confirm the following:
              - First mov is "movz reg,#1"
              - Second mov is "mov reg,xzr"
            }
            (taicpu(hp4).opcode = A_MOV) and
            (getsupreg(taicpu(hp4).oper[1]^.reg) = RS_XZR) and
            MatchOperand(taicpu(hp1).oper[1]^, 1)
          )
        ) and
        GetNextInstruction(hp4, hp5) and
        (hp5.typ = ait_label) and
        { The unconditional branch must target the label after the second MOV }
        (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol) = tai_label(hp5).labsym) then
        begin
          Invert := MatchOperand(taicpu(hp1).oper[1]^, 1); { if true, hp4 will be mov reg,0 in some form }
          if Invert then
            taicpu(p).condition := inverse_cond(taicpu(p).condition);

          tai_label(hp3).labsym.DecRefs;

          { If this isn't the only reference to the middle label, we can
            still make a saving - only that the first jump and everything
            that follows will remain. }
          if (tai_label(hp3).labsym.getrefs = 0) then
            begin
              if Invert then
                DebugMsg(SPeepholeOptimization + 'B(c)Movz1BMovz0 -> Cset(~c)',p)
              else
                DebugMsg(SPeepholeOptimization + 'B(c)Movz0BMovz1 -> Cset(c)',p);

              { remove jump, first label and second MOV (also catching any aligns) }
              repeat
                if not GetNextInstruction(hp2, hp3) then
                  InternalError(2022070801);
                RemoveInstruction(hp2);
                hp2 := hp3;
              until hp2 = hp5;

              { Don't decrement reference count before the removal loop
                above, otherwise GetNextInstruction won't stop on the
                label }
              tai_label(hp5).labsym.DecRefs;
            end
          else
            begin
              if Invert then
                DebugMsg(SPeepholeOptimization + 'B(c)Movz1BMovz0 -> Cset(~c) (partial)',p)
              else
                DebugMsg(SPeepholeOptimization + 'B(c)Movz0BMovz1 -> Cset(c) (partial)',p);
            end;

          { Re-use the first MOV's entry as the CSET and drop the branch }
          taicpu(hp1).opcode := A_CSET;
          taicpu(hp1).loadconditioncode(1, taicpu(p).condition);
          RemoveCurrentP(p, hp1);
          Result:=true;
          exit;
        end;
    end;
  { Pass-2 handler for LDR/STR: tries to fuse the instruction at p with a
    later LDR/STR of the same kind that accesses the adjacent memory slot,
    producing a single LDP/STP.  Returns True when the pair was fused
    (the partner instruction is removed and p becomes the LDP/STP). }
  function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
    var
      hp1, hp1_last: tai;
      ThisRegister: TRegister;
      OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
      TargetOpcode: TAsmOp;
    begin
      Result := False;
      ThisRegister := taicpu(p).oper[0]^.reg;

      case taicpu(p).opcode of
        A_LDR:
          TargetOpcode := A_LDP;
        A_STR:
          TargetOpcode := A_STP;
        else
          InternalError(2020081501);
      end;

      { reg appearing in ref invalidates these optimisations (an LDR that
        overwrites its own base/index register cannot be paired) }
      if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
        begin
          { LDP/STP has a smaller permitted offset range than LDR/STR.

            TODO: For a group of out-of-range LDR/STR instructions, can
            we declare a temporary register equal to the offset base
            address, modify the STR instructions to use that register
            and then convert them to STP instructions? Note that STR
            generally takes 2 cycles (on top of the memory latency),
            while LDP/STP takes 3.
          }
          if (getsubreg(ThisRegister) = R_SUBQ) then
            begin
              { 64-bit registers: the neighbouring element lies 8 bytes away }
              ValidOffset := 8;
              MinOffset := -512;
              MaxOffset := 504;
            end
          else
            begin
              { 32-bit registers: the neighbouring element lies 4 bytes away }
              ValidOffset := 4;
              MinOffset := -256;
              MaxOffset := 252;
            end;

          hp1_last := p;

          { Look for nearby LDR/STR instructions }
          if (taicpu(p).oppostfix = PF_NONE) and
            (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
            { Scan forward; hp1_last advances so the search can step over
              candidates that fail the detailed checks below }
            while GetNextInstruction(hp1_last, hp1) do
              begin
                if (hp1.typ <> ait_instruction) then
                  Break;

                if (taicpu(hp1).opcode = taicpu(p).opcode) then
                  begin
                    if (taicpu(hp1).oppostfix = PF_NONE) and
                      { Registers need to be the same size }
                      (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
                      (
                        (TargetOpcode = A_STP) or
                        { LDP x0, x0, [sp, #imm] is undefined behaviour, even
                          though such an LDR pair should have been optimised
                          out by now. STP is okay }
                        (ThisRegister <> taicpu(hp1).oper[0]^.reg)
                      ) and
                      (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
                      (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
                      (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
                      { Make sure the address registers haven't changed }
                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
                      (
                        (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
                        not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
                      ) and
                      { Don't need to check "RegInRef" because the base registers are identical,
                        and the first one was checked already. [Kit] }
                      (((TargetOpcode=A_LDP) and not RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) or
                       ((TargetOpcode=A_STP) and not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1))) then
                      begin
                        { Can we convert these two LDR/STR instructions into a
                          single LDR/STP? }
                        OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
                        if (OffsetVal = ValidOffset) then
                          begin
                            if (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs]
                                    ...
                                    LDR/STR reg1. [reg2, #ofs + 8] // 4 if registers are 32-bit
                                  To:
                                    LDP/STP reg0, reg1, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg(SPeepholeOptimization + 'StrStr2Stp', p)
                                else
                                  DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp', p);
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end
                        else if (OffsetVal = -ValidOffset) then
                          begin
                            if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
                              begin
                                { Convert:
                                    LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
                                    ...
                                    LDR/STR reg1. [reg2, #ofs]
                                  To:
                                    LDP/STP reg1, reg0, [reg2, #ofs]
                                }
                                taicpu(p).opcode := TargetOpcode;
                                if TargetOpcode = A_STP then
                                  DebugMsg(SPeepholeOptimization + 'StrStr2Stp (reverse)', p)
                                else
                                  DebugMsg(SPeepholeOptimization + 'LdrLdr2Ldp (reverse)', p);
                                taicpu(p).ops := 3;
                                taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
                                taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
                                taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
                                asml.Remove(hp1);
                                hp1.Free;
                                Result := True;
                                Exit;
                              end;
                          end;
                      end;
                  end
                else
                  Break;

                { Don't continue looking for LDR/STR pairs if the address register
                  gets modified }
                if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
                  Break;

                hp1_last := hp1;
              end;
        end;
    end;
  { Post-peephole handler for AND: folds a single-bit test sequence

        and  reg1,reg0,<const = power of 2>
        cmp  reg1,#0
        <reg1 end of life>
        b.e/b.ne label

    into a direct bit-test branch

        tb(n)z reg0,<bit index>,label

    Returns True when the fold was performed. }
  function TCpuAsmOptimizer.PostPeepholeOptAND(var p: tai): Boolean;
    var
      hpCmp, hpJmp: tai;
      hpTest: taicpu;
      BitIndex : cardinal;
    begin
      Result:=false;

      { The AND must mask exactly one bit with an immediate }
      if not (MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
              (PopCnt(QWord(taicpu(p).oper[2]^.val))=1)) then
        exit;

      { Next instruction: "cmp reg1,#0" on the AND's destination, and that
        register must die there }
      if not (GetNextInstruction(p,hpCmp) and
              MatchInstruction(hpCmp,A_CMP,[PF_None]) and
              MatchOpType(taicpu(hpCmp),top_reg,top_const) and
              (taicpu(hpCmp).oper[1]^.val=0) and
              MatchOperand(taicpu(p).oper[0]^,taicpu(hpCmp).oper[0]^) and
              RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpCmp))) then
        exit;

      { Followed by a b.eq or b.ne }
      if not (GetNextInstruction(hpCmp,hpJmp) and
              MatchInstruction(hpJmp,A_B,[PF_None]) and
              (taicpu(hpJmp).condition in [C_EQ,C_NE])) then
        exit;

      { The tested bit is the (only) set bit of the mask }
      BitIndex:=BsfQWord(qword(taicpu(p).oper[2]^.val));
      if taicpu(hpJmp).condition=C_NE then
        hpTest:=taicpu.op_reg_const_ref(A_TBNZ,taicpu(p).oper[1]^.reg,BitIndex,taicpu(hpJmp).oper[0]^.ref^)
      else
        hpTest:=taicpu.op_reg_const_ref(A_TBZ,taicpu(p).oper[1]^.reg,BitIndex,taicpu(hpJmp).oper[0]^.ref^);
      hpTest.fileinfo:=taicpu(hpCmp).fileinfo;
      asml.insertbefore(hpTest, hpCmp);
      RemoveInstruction(hpCmp);
      RemoveInstruction(hpJmp);
      RemoveCurrentP(p);
      DebugMsg(SPeepholeOptimization + 'AndCmpB.E/NE2Tbnz/Tbz done', p);
      Result:=true;
    end;
  { Post-peephole handler for CMP: folds

        cmp  reg0,#0
        b.e/b.ne  label

    into

        cb(n)z reg0,label

    Returns True when the fold was performed (p then points at the new
    CBZ/CBNZ instruction).
    Fix: use the optimizer helpers RemoveInstruction/RemoveCurrentP instead
    of raw asml.remove + free, for consistency with PostPeepholeOptAND and
    PostPeepholeOptTST and so the helpers' internal bookkeeping is kept. }
  function TCpuAsmOptimizer.PostPeepholeOptCMP(var p : tai): boolean;
    var
      hp1,hp2: tai;
    begin
      Result:=false;
      { SP is not a valid operand for CBZ/CBNZ, hence the exclusion }
      if MatchOpType(taicpu(p),top_reg,top_const) and
        (taicpu(p).oper[0]^.reg<>NR_SP) and
        (taicpu(p).oper[1]^.val=0) and
        GetNextInstruction(p,hp1) and
        MatchInstruction(hp1,A_B,[PF_None]) and
        (taicpu(hp1).condition in [C_EQ,C_NE]) then
        begin
          case taicpu(hp1).condition of
            C_NE:
              hp2:=taicpu.op_reg_sym_ofs(A_CBNZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
            C_EQ:
              hp2:=taicpu.op_reg_sym_ofs(A_CBZ,taicpu(p).oper[0]^.reg,taicpu(hp1).oper[0]^.ref^.symbol,taicpu(hp1).oper[0]^.ref^.offset);
            else
              Internalerror(2019090801);
          end;
          { Keep the source-position info of the branch being replaced }
          taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
          asml.insertbefore(hp2, hp1);
          RemoveInstruction(hp1);
          { Removes the CMP and leaves p pointing at the new CBZ/CBNZ }
          RemoveCurrentP(p, hp2);
          DebugMsg(SPeepholeOptimization + 'CMPB.E/NE2CBNZ/CBZ done', p);
          Result:=true;
        end;
    end;
  { Post-peephole handler for TST: folds a single-bit flag test

        tst  reg0,<const = power of 2>
        b.e/b.ne  label

    into a direct bit-test branch

        tb(n)z reg0,<bit index>,label

    Returns True when the fold was performed. }
  function TCpuAsmOptimizer.PostPeepholeOptTST(var p : tai): boolean;
    var
      hpJmp: tai;
      hpTest: taicpu;
      BitIndex : cardinal;
    begin
      Result:=false;

      { Require "tst reg,#mask" with a one-bit mask, immediately followed by
        a b.eq or b.ne }
      if not (MatchOpType(taicpu(p),top_reg,top_const) and
              (PopCnt(QWord(taicpu(p).oper[1]^.val))=1) and
              GetNextInstruction(p,hpJmp) and
              MatchInstruction(hpJmp,A_B,[C_EQ,C_NE],[PF_None])) then
        exit;

      { The tested bit is the (only) set bit of the mask }
      BitIndex:=BsfQWord(qword(taicpu(p).oper[1]^.val));
      if taicpu(hpJmp).condition=C_NE then
        hpTest:=taicpu.op_reg_const_ref(A_TBNZ,taicpu(p).oper[0]^.reg,BitIndex,taicpu(hpJmp).oper[0]^.ref^)
      else
        hpTest:=taicpu.op_reg_const_ref(A_TBZ,taicpu(p).oper[0]^.reg,BitIndex,taicpu(hpJmp).oper[0]^.ref^);
      hpTest.fileinfo:=taicpu(p).fileinfo;
      asml.insertafter(hpTest, p);
      RemoveInstruction(hpJmp);
      RemoveCurrentP(p, hpTest);
      DebugMsg(SPeepholeOptimization + 'TST; B(E/NE) -> TB(Z/NZ) done', p);
      Result:=true;
    end;
  { Pre-pass dispatcher: only the bit-field extract instructions
    (SBFX/UBFX) receive special treatment before pass 1. }
  function TCpuAsmOptimizer.PrePeepHoleOptsCpu(var p: tai): boolean;
    begin
      Result:=false;
      if (p.typ=ait_instruction) and
         ((taicpu(p).opcode=A_SBFX) or (taicpu(p).opcode=A_UBFX)) then
        Result:=OptPreSBFXUBFX(p);
    end;
  { Pass-1 dispatcher: routes each instruction opcode to its dedicated
    peephole handler.  Returns True if p was changed, in which case the
    framework revisits the instruction stream. }
  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
    begin
      result := false;
      if p.typ=ait_instruction then
        begin
          case taicpu(p).opcode of
            A_B:
              Result:=OptPass1B(p);
            A_LDR:
              Result:=OptPass1LDR(p);
            A_STR:
              Result:=OptPass1STR(p);
            A_MOV:
              Result:=OptPass1Mov(p);
            A_MOVZ:
              Result:=OptPass1MOVZ(p);
            A_STP:
              Result:=OptPass1STP(p);
            { All shift/rotate forms share one handler }
            A_LSR,
            A_ROR,
            A_ASR,
            A_LSL:
              Result:=OptPass1Shift(p);
            A_AND:
              Result:=OptPass1And(p);
            { Generic integer data-processing instructions }
            A_NEG,
            A_CSEL,
            A_ADD,
            A_ADC,
            A_SUB,
            A_SBC,
            A_BIC,
            A_EOR,
            A_ORR,
            A_MUL:
              Result:=OptPass1Data(p);
            { Zero-/sign-extensions each have a width-specific handler }
            A_UXTB:
              Result:=OptPass1UXTB(p);
            A_UXTH:
              Result:=OptPass1UXTH(p);
            A_SXTB:
              Result:=OptPass1SXTB(p);
            A_SXTH:
              Result:=OptPass1SXTH(p);
            A_SXTW:
              Result:=OptPass1SXTW(p);
            { Floating-point data-processing instructions }
            // A_VLDR,
            A_FMADD,
            A_FMSUB,
            A_FNMADD,
            A_FNMSUB,
            A_FNMUL,
            A_FADD,
            A_FMUL,
            A_FDIV,
            A_FSUB,
            A_FSQRT,
            A_FNEG,
            A_FCVT,
            A_FABS:
              Result:=OptPass1FData(p);
            A_FMOV:
              Result:=OptPass1FMov(p);
            else
              ;
          end;
        end;
    end;
  { Pass-2 dispatcher: hands the instruction at p to the matching
    second-pass optimisation routine.  Returns True if p was changed. }
  function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
    begin
      Result:=false;
      if p.typ<>ait_instruction then
        exit;
      if taicpu(p).opcode=A_AND then
        Result:=OptPass2AND(p)
      else if (taicpu(p).opcode=A_LDR) or (taicpu(p).opcode=A_STR) then
        Result:=OptPass2LDRSTR(p)
      else if taicpu(p).opcode=A_TST then
        Result:=OptPass2TST(p);
    end;
  { Post-pass dispatcher: routes CMP/AND/TST to the compare-to-branch
    folding routines that run after the main passes.  Returns True if p
    was changed. }
  function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
    begin
      Result:=false;
      if p.typ<>ait_instruction then
        exit;
      if taicpu(p).opcode=A_CMP then
        Result:=PostPeepholeOptCMP(p)
      else if taicpu(p).opcode=A_AND then
        Result:=PostPeepholeOptAND(p)
      else if taicpu(p).opcode=A_TST then
        Result:=PostPeepholeOptTST(p);
    end;
{ Unit initialisation: registers this AArch64 optimizer class in the global
  casmoptimizer variable.  NOTE(review): presumably the target-independent
  assembler-optimizer driver instantiates whatever class is stored here --
  confirm against the generic aopt unit. }
begin
  casmoptimizer:=TCpuAsmOptimizer;
End.