aoptcpu.pas 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. {
  2. Copyright (c) 1998-2002 by Jonas Maebe, member of the Free Pascal
  3. Development Team
  4. This unit implements the PowerPC optimizer object
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. ****************************************************************************
  17. }
  18. Unit aoptcpu;
  19. Interface
  20. {$i fpcdefs.inc}
  21. uses cpubase, aoptobj, aoptcpub, aopt, aasmtai,aasmdata, aasmcpu;
  Type
    { PowerPC assembler optimizer: overrides the CPU-specific peephole hooks
      inherited from TAsmOptimizer. }
    TCpuAsmOptimizer = class(TAsmOptimizer)
      { uses the same constructor as TAopObj }
      { First-pass peephole hook. p is the list entry under inspection; it may
        be replaced by another entry when the instruction at p is removed.
        Returns true when the hook reports a rewrite. }
      function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
      { Post-peephole hook. Replaces some rlwinm. forms by andi./andis. and
        folds a following "cmpwi reg,0" into the instruction at p.
        Returns true when a comparison was removed. }
      function PostPeepHoleOptsCpu(var p: tai): boolean; override;
    private
      { Helper of PeepHoleOptPass1Cpu for "cmp(l)wi reg,0 / mfcr / rlwinm"
        sequences (p, next1 and next2 respectively). Returns true when the
        sequence was rewritten. }
      function cmpi_mfcr_opt(p, next1, next2: taicpu): boolean;
    End;
  30. Implementation
  31. uses
  32. cutils, verbose, cgbase, cgcpu, cgobj;
  { Rewrites the sequence
        p:     cmpwi/cmplwi reg,0
        next1: mfcr   tmp
        next2: rlwinm tmp,tmp,<shift>,31,31
    (the caller has already verified this shape) into code that derives the
    same 0/1 value without going through the condition register.

    Only <shift> = 1 (signed compare only) and <shift> = 3 are handled.
    Returns true when the sequence was rewritten, false when it was left
    untouched. }
  function TCpuAsmOptimizer.cmpi_mfcr_opt(p, next1, next2: taicpu): boolean;
    var
      following, preceding: tai;
      negated, singlebit: boolean;

    { unlinks hp from the instruction list being optimized and destroys it }
    procedure discard(hp: tai);
      begin
        asml.remove(hp);
        hp.free;
      end;

    begin
      { is the extracted bit flipped afterwards by "xori tmp,tmp,1"? }
      negated := false;
      if getnextinstruction(next2,following) and
         (following.typ = ait_instruction) then
        negated :=
          (taicpu(following).opcode = A_XORI) and
          (taicpu(following).oper[0]^.reg = taicpu(following).oper[1]^.reg) and
          (taicpu(following).oper[0]^.reg = next2.oper[0]^.reg) and
          (taicpu(following).oper[2]^.val = 1);

      result := false;
      case next2.oper[2]^.val of
        1:
          // less than zero or greater/equal than zero (a trailing xori stays
          // in place in the latter case). Doesn't make sense for unsigned
          // comparisons, hence only cmpwi is accepted.
          if (p.opcode = A_CMPWI) then
            begin
              // the compare becomes "srwi tmp,reg,31"
              p.opcode := A_SRWI;
              p.ops := 3;
              // the source must be copied to slot 1 before slot 0 is overwritten
              p.loadreg(1,p.oper[0]^.reg);
              p.loadreg(0,next1.oper[0]^.reg);
              p.loadconst(2,31);
              discard(next1);
              discard(next2);
              result := true;
            end;
        {
          needs two registers to work with
        2:
          begin
            // greater or less/equal to zero
          end;
        }
        3:
          begin
            { was the compared register produced by an rlwinm whose mask
              begin and mask end are identical (a one-bit mask)? }
            singlebit :=
              getlastinstruction(p,preceding) and
              (preceding.typ = ait_instruction) and
              ((taicpu(preceding).opcode = A_RLWINM) or
               (taicpu(preceding).opcode = A_RLWINM_)) and
              (taicpu(preceding).oper[0]^.reg = p.oper[0]^.reg) and
              (taicpu(preceding).oper[3]^.val = taicpu(preceding).oper[4]^.val);
            if not singlebit then
              begin
                // equal/not equal to zero (a trailing xori stays in place in
                // the latter case; there's a more optimal sequence without
                // it, but it needs an extra register)
                p.opcode := A_CNTLZW;
                p.loadreg(1,p.oper[0]^.reg);
                p.loadreg(0,next1.oper[0]^.reg);
                // the mfcr becomes "srwi tmp,tmp,5"
                next1.ops := 3;
                next1.opcode := A_SRWI;
                next1.loadreg(1,next1.oper[0]^.reg);
                next1.loadconst(2,5);
                discard(next2);
              end
            else
              begin
                if (taicpu(preceding).oper[3]^.val = 31) then
                  begin
                    // the bit is already the lowest one: a plain move suffices
                    p.opcode := A_MR;
                    p.loadreg(1,p.oper[0]^.reg);
                    p.loadreg(0,next1.oper[0]^.reg);
                  end
                else
                  begin
                    // isolate the bit we need by rotating it to the lowest position
                    p.opcode := A_RLWINM;
                    p.ops := 5;
                    p.loadreg(1,p.oper[0]^.reg);
                    p.loadreg(0,next1.oper[0]^.reg);
                    p.loadconst(2,taicpu(preceding).oper[3]^.val + 1);
                    p.loadconst(3,31);
                    p.loadconst(4,31);
                  end;
                if negated then
                  begin
                    // the trailing xori already yields the wanted value:
                    // neither the mfcr nor the xori is needed any more
                    discard(next1);
                    discard(following);
                  end
                else
                  begin
                    // the mfcr becomes "xori tmp,tmp,1"
                    next1.ops := 3;
                    next1.opcode := A_XORI;
                    next1.loadreg(1,next1.oper[0]^.reg);
                    next1.loadconst(2,1);
                  end;
                discard(next2);
              end;
            result := true;
          end;
        else
          ;
      end;
    end;
  { Computes the 32 bit mask described by the mask-begin (l1) and mask-end (l2)
    operands of an rlwinm instruction, where bit 0 is the most significant bit.
    When l1 > l2 the complement of the bits strictly between l2 and l1 is
    returned (wrap-around mask). }
  function rlwinm2mask(l1,l2: longint): longint;
    var
      frombegin, afterend: cardinal;
    begin
      { every bit to the right of position l2 }
      afterend := cardinal(1) shl (31 - l2) - 1;
      // 1 shl 32 = 1 instead of 0 on x86, so l1 = 0 is handled separately
      if (l1 = 0) then
        result := longint(not(afterend))
      else
        begin
          { bit l1 and every bit to the right of it }
          frombegin := cardinal(1) shl (32 - l1) - 1;
          result := longint(frombegin) xor afterend;
        end;
      if (l1 > l2) then
        result := not(result);
    end;
  { First-pass PowerPC peephole optimizations on the list entry p.

    Handled patterns:
      - cmpwi/cmplwi reg,0 + mfcr + rlwinm  -> see cmpi_mfcr_opt
      - mr rA,rB followed by an instruction that overwrites rA: the mr is
        removed and the follower reads rB instead
      - slwi/srwi followed by rlwinm/slwi/srwi on the same register: the shift
        is turned into (or merged into) an rlwinm
      - rlwinm followed by rlwinm/slwi/srwi on the same register: merged into
        a single instruction where possible

    p may be replaced by another list entry when the instruction it pointed to
    is removed. Returns true when the instruction stream was changed. }
  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
    var
      next1, next2: tai;
      l1, l2, shlcount: longint;
    begin
      result := false;
      case p.typ of
        ait_instruction:
          begin
            case taicpu(p).opcode of
              A_CMPWI,
              A_CMPLWI:
                begin
                  if (taicpu(p).oper[1]^.typ = top_const) and
                     (taicpu(p).oper[1]^.val = 0) and
                     getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     (taicpu(next1).opcode = A_MFCR) and
                     getnextinstruction(next1,next2) and
                     { next2 can be any kind of list entry: make sure it is an
                       instruction before treating it as a taicpu }
                     (next2.typ = ait_instruction) and
                     (taicpu(next2).opcode = A_RLWINM) and
                     (taicpu(next2).oper[0]^.reg = taicpu(next2).oper[1]^.reg) and
                     (taicpu(next2).oper[0]^.reg = taicpu(next1).oper[0]^.reg) and
                     (taicpu(next2).oper[3]^.val = 31) and
                     (taicpu(next2).oper[4]^.val = 31) and
                     cmpi_mfcr_opt(taicpu(p),taicpu(next1),taicpu(next2)) then
                    result := true;
                end;
{ seems the register allocator doesn't generate superfluous fmr's }
{ A_FMR, }
              A_MR:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     (taicpu(next1).ops >= 1) and
                     { spilling_get_operation_type does not support lmw/stmw }
                     (taicpu(next1).opcode <> A_LMW) and
                     (taicpu(next1).opcode <> A_STMW) and
                     (taicpu(next1).spilling_get_operation_type(0) = operand_write) and
                     { the reg field is only meaningful for register operands }
                     (taicpu(next1).oper[0]^.typ = top_reg) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) then
                    begin
                      { next1 overwrites the target of the mr: let it read the
                        source of the mr directly and drop the mr }
                      for l1 := 1 to taicpu(next1).ops - 1 do
                        case taicpu(next1).oper[l1]^.typ of
                          top_reg:
                            if taicpu(next1).oper[l1]^.reg = taicpu(p).oper[0]^.reg then
                              taicpu(next1).loadreg(l1,taicpu(p).oper[1]^.reg);
                          top_ref:
                            begin
                              if taicpu(next1).oper[l1]^.ref^.base = taicpu(p).oper[0]^.reg then
                                taicpu(next1).oper[l1]^.ref^.base := taicpu(p).oper[1]^.reg;
                              if taicpu(next1).oper[l1]^.ref^.index = taicpu(p).oper[0]^.reg then
                                taicpu(next1).oper[l1]^.ref^.index := taicpu(p).oper[1]^.reg;
                            end;
                          else
                            ;
                        end;
                      asml.remove(p);
                      p.free;
                      p := next1;
                      result := true;
                    end;
                end;
              A_SLWI:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     ((taicpu(next1).opcode = A_RLWINM) or
                      (taicpu(next1).opcode = A_SLWI) or
                      (taicpu(next1).opcode = A_SRWI)) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
                     (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                    begin
                      { convert slwi to rlwinm and see if the rlwinm }
                      { optimization can do something with it        }
                      taicpu(p).opcode := A_RLWINM;
                      taicpu(p).ops := 5;
                      taicpu(p).loadconst(2,taicpu(p).oper[2]^.val);
                      taicpu(p).loadconst(3,0);
                      taicpu(p).loadconst(4,31-taicpu(p).oper[2]^.val);
                      result := true;
                    end;
                end;
              A_SRWI:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     ((taicpu(next1).opcode = A_SLWI) or
                      (taicpu(next1).opcode = A_RLWINM) or
                      (taicpu(next1).opcode = A_SRWI)) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
                     (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                    case taicpu(next1).opcode of
                      A_SLWI:
                        begin
                          { "srwi n" followed by "slwi m" is one rotate by
                            (m-n) plus a mask }
                          taicpu(p).opcode := A_RLWINM;
                          taicpu(p).ops := 5;
                          taicpu(p).loadconst(2,taicpu(next1).oper[2]^.val-taicpu(p).oper[2]^.val);
                          if (taicpu(p).oper[2]^.val < 0) then
                            begin
                              taicpu(p).loadconst(3,-taicpu(p).oper[2]^.val);
                              taicpu(p).loadconst(4,31-taicpu(next1).oper[2]^.val);
                              inc(taicpu(p).oper[2]^.val,32);
                            end
                          else
                            begin
                              taicpu(p).loadconst(3,0);
                              taicpu(p).loadconst(4,31-taicpu(next1).oper[2]^.val);
                            end;
                          asml.remove(next1);
                          next1.free;
                          result := true;
                        end;
                      A_RLWINM,
                      { a following srwi passed the test above as well and used
                        to end up in the internalerror below; the rlwinm case
                        of this routine knows how to merge it }
                      A_SRWI:
                        begin
                          { convert srwi to rlwinm and see if the rlwinm }
                          { optimization can do something with it        }
                          taicpu(p).opcode := A_RLWINM;
                          taicpu(p).ops := 5;
                          { operand 2 still holds the shift count here, so it
                            has to be rewritten last }
                          taicpu(p).loadconst(3,taicpu(p).oper[2]^.val);
                          taicpu(p).loadconst(4,31);
                          taicpu(p).loadconst(2,(32-taicpu(p).oper[2]^.val) and 31);
                          result := true;
                        end;
                      else
                        internalerror(2019050941);
                    end;
                end;
              A_RLWINM:
                begin
                  if getnextinstruction(p,next1) and
                     (next1.typ = ait_instruction) and
                     ((taicpu(next1).opcode = A_RLWINM) or
                      (taicpu(next1).opcode = A_SRWI) or
                      (taicpu(next1).opcode = A_SLWI)) and
                     (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
                     // both source and target of next1 must equal target of p
                     (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                    begin
                      { express next1 as "rotate left by shlcount, then and
                        with l2" }
                      case taicpu(next1).opcode of
                        A_RLWINM:
                          begin
                            shlcount := taicpu(next1).oper[2]^.val;
                            l2 := rlwinm2mask(taicpu(next1).oper[3]^.val,taicpu(next1).oper[4]^.val);
                          end;
                        A_SLWI:
                          begin
                            shlcount := taicpu(next1).oper[2]^.val;
                            l2 := (-1) shl shlcount;
                          end;
                        A_SRWI:
                          begin
                            shlcount := 32-taicpu(next1).oper[2]^.val;
                            l2 := (-1) shr taicpu(next1).oper[2]^.val;
                          end;
                        else
                          internalerror(2013113008);
                      end;
                      { mask of p as seen after the rotation of next1, combined
                        with the mask of next1 }
                      l1 := rlwinm2mask((taicpu(p).oper[3]^.val-shlcount) and 31,(taicpu(p).oper[4]^.val-shlcount) and 31);
                      l1 := l1 and l2;
                      case l1 of
                        -1:
                          begin
                            { no bit is masked out: only the combined rotation
                              remains }
                            taicpu(p).oper[2]^.val := (taicpu(p).oper[2]^.val + shlcount) and 31;
                            asml.remove(next1);
                            next1.free;
                            if (taicpu(p).oper[2]^.val = 0) then
                              begin
                                if (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
                                  begin
                                    { rotating a register onto itself by 0 bits
                                      does nothing at all }
                                    next1 := tai(p.next);
                                    asml.remove(p);
                                    p.free;
                                    p := next1;
                                  end
                                else
                                  begin
                                    { target and source differ: the value still
                                      has to be copied }
                                    taicpu(p).opcode := A_MR;
                                    taicpu(p).freeop(2);
                                    taicpu(p).freeop(3);
                                    taicpu(p).freeop(4);
                                    taicpu(p).ops := 2;
                                    taicpu(p).opercnt := 2;
                                  end;
                              end;
                            { next1 is gone in every case }
                            result := true;
                          end;
                        0:
                          begin
                            // masks have no bits in common
                            taicpu(p).opcode := A_LI;
                            taicpu(p).loadconst(1,0);
                            taicpu(p).freeop(2);
                            taicpu(p).freeop(3);
                            taicpu(p).freeop(4);
                            taicpu(p).ops := 2;
                            taicpu(p).opercnt := 2;
                            asml.remove(next1);
                            next1.free;
                            result := true;
                          end
                        else
                          begin
                            { merge only if the combined mask can be encoded as
                              a mask begin/end pair }
                            if tcgppc(cg).get_rlwi_const(l1,l1,l2) then
                              begin
                                taicpu(p).oper[2]^.val := (taicpu(p).oper[2]^.val + shlcount) and 31;
                                taicpu(p).oper[3]^.val := l1;
                                taicpu(p).oper[4]^.val := l2;
                                asml.remove(next1);
                                next1.free;
                                result := true;
                              end;
                          end;
                      end;
                    end;
                end;
              else
                ;
            end;
          end;
        else
          ;
      end;
    end;
  const
    { Lookup table used by changetomodifyflags: for the opcode used as index
      it holds the opcode that is substituted when a following "cmpwi reg,0"
      is folded into the instruction, or a_none when there is no substitute.
      The entries presumably follow the declaration order of tasmop, which is
      declared outside this unit -- verify against that declaration before
      adding or moving entries.
      NOTE(review): the run "a_mffs, a_mffs_, a_mfmsr, a_mfspr, a_mfsr,
      a_mfsrin, a_mftb, a_mtcrf" is the only place where the substitute is not
      an opcode ending in "_"; it looks like these entries were meant to be
      a_none -- confirm. }
    modifyflags: array[tasmop] of tasmop =
      (a_none, a_add_, a_add_, a_addo_, a_addo_, a_addc_, a_addc_, a_addco_, a_addco_,
      a_adde_, a_adde_, a_addeo_, a_addeo_, {a_addi could be addic_ if sure doesn't disturb carry} a_none, a_addic_, a_addic_, a_none,
      a_addme_, a_addme_, a_addmeo_, a_addmeo_, a_addze_, a_addze_, a_addzeo_,
      a_addzeo_, a_and_, a_and_, a_andc_, a_andc_, a_andi_, a_andis_, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_cntlzw_, a_cntlzw_, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_divw_, a_divw_, a_divwo_, a_divwo_,
      a_divwu_, a_divwu_, a_divwuo_, a_divwuo_, a_none, a_none, a_none, a_eqv_,
      a_eqv_, a_extsb_, a_extsb_, a_extsh_, a_extsh_, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_mffs, a_mffs_, a_mfmsr, a_mfspr, a_mfsr,
      a_mfsrin, a_mftb, a_mtcrf, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_mulhw_,
      a_mulhw_, a_mulhwu_, a_mulhwu_, a_none, a_mullw_, a_mullw_, a_mullwo_,
      a_mullwo_, a_nand_, a_nand_, a_neg_, a_neg_, a_nego_, a_nego_, a_nor_, a_nor_,
      a_or_, a_or_, a_orc_, a_orc_, a_none, a_none, a_none, a_rlwimi_, a_rlwimi_,
      a_rlwinm_, a_rlwinm_, a_rlwnm_, a_rlwnm_, a_none, a_slw_, a_slw_, a_sraw_, a_sraw_,
      a_srawi_, a_srawi_,a_srw_, a_srw_, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none, a_subf_, a_subf_, a_subfo_,
      a_subfo_, a_subfc_, a_subfc_, a_subfco_, a_subfco_, a_subfe_, a_subfe_,
      a_subfeo_, a_subfeo_, a_none, a_subfme_, a_subfme_, a_subfmeo_, a_subfmeo_,
      a_subfze_, a_subfze_, a_subfzeo_, a_subfzeo_, a_none, a_none, a_none,
      a_none, a_none, a_none, a_xor_, a_xor_, a_none, a_none,
      { simplified mnemonics }
      a_none, a_none, a_subic_, a_subic_, a_sub_, a_sub_, a_subo_, a_subo_,
      a_subc_, a_subc_, a_subco_, a_subco_, a_none, a_none, a_none, a_none,
      a_extlwi_, a_extlwi_, a_extrwi_, a_extrwi_, a_inslwi_, a_inslwi_, a_insrwi_,
      a_insrwi_, a_rotlwi_, a_rotlwi_, a_rotlw_, a_rotlw_, a_slwi_, a_slwi_,
      a_srwi_, a_srwi_, a_clrlwi_, a_clrlwi_, a_clrrwi_, a_clrrwi_, a_clrslwi_,
      a_clrslwi_, a_none, a_none, a_none, a_none, a_none, a_none, a_none,
      a_none, a_none {move to special purpose reg}, a_none {move from special purpose reg},
      a_none, a_none, a_none, a_none, a_mr_, a_mr_, a_not_, a_not_, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none,
      a_none, a_none, a_none, a_none, a_none);
  { Replaces the opcode of p by the substitute listed for it in modifyflags.
    Returns true when such a substitute exists (p was changed) and false when
    the table holds a_none for the opcode (p is left untouched). }
  function changetomodifyflags(p: taicpu): boolean;
    var
      substitute: tasmop;
    begin
      substitute := modifyflags[p.opcode];
      result := (substitute <> a_none);
      if result then
        p.opcode := substitute;
    end;
  { Post-peephole PowerPC optimizations on the list entry p.

    1. "rlwinm. rA,rS,0,MB,ME" whose mask lies completely in the upper or in
       the lower halfword is replaced by andis./andi. with the equivalent
       16 bit immediate.
    2. An instruction with register operands 0 and 1 that is followed by
       "cmpwi reg,0" on its operand 0 (comparison result going to cr0) is
       replaced by the substitute opcode from modifyflags, and the comparison
       is removed.

    Returns true when a comparison was removed by step 2; step 1 alone does not
    change the result. }
  function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
    var
      next1: tai;
    begin
      result := false;
      case p.typ of
        ait_instruction:
          begin
            case taicpu(p).opcode of
              A_RLWINM_:
                begin
                  // rlwinm_ is cracked on the G5, andi_/andis_ aren't
                  if (taicpu(p).oper[2]^.val = 0) and
                     { a wrap-around mask (MB > ME) covers bits of both
                       halfwords, even when MB and ME themselves lie in the
                       same one, so it cannot be expressed as a 16 bit
                       immediate }
                     (taicpu(p).oper[3]^.val <= taicpu(p).oper[4]^.val) then
                    begin
                      if (taicpu(p).oper[3]^.val < 16) and
                         (taicpu(p).oper[4]^.val < 16) then
                        begin
                          { mask within the upper halfword: the immediate is
                            the mask shifted down by 16 bits }
                          taicpu(p).opcode := A_ANDIS_;
                          taicpu(p).oper[2]^.val := word(
                            ((1 shl (16-taicpu(p).oper[3]^.val)) - 1) xor
                            ((1 shl (15-taicpu(p).oper[4]^.val)) - 1));
                          taicpu(p).freeop(3);
                          taicpu(p).freeop(4);
                          taicpu(p).ops := 3;
                          taicpu(p).opercnt := 3;
                        end
                      else if (taicpu(p).oper[3]^.val >= 16) and
                              (taicpu(p).oper[4]^.val >= 16) then
                        begin
                          { mask within the lower halfword }
                          taicpu(p).opcode := A_ANDI_;
                          taicpu(p).oper[2]^.val := word(rlwinm2mask(taicpu(p).oper[3]^.val,taicpu(p).oper[4]^.val));
                          taicpu(p).freeop(3);
                          taicpu(p).freeop(4);
                          taicpu(p).ops := 3;
                          taicpu(p).opercnt := 3;
                        end;
                    end;
                end;
              else
                ;
            end;

            // change "integer operation with destination reg" followed by a
            // comparison to zero of that reg, with a variant of that integer
            // operation which sets the flags (if it exists)
            if not(result) and
               (taicpu(p).ops >= 2) and
               (taicpu(p).oper[0]^.typ = top_reg) and
               (taicpu(p).oper[1]^.typ = top_reg) and
               getnextinstruction(p,next1) and
               (next1.typ = ait_instruction) and
               (taicpu(next1).opcode = A_CMPWI) and
               // make sure the result goes to cr0
               (((taicpu(next1).ops = 2) and
                 (taicpu(next1).oper[1]^.val = 0) and
                 (taicpu(next1).oper[0]^.reg = taicpu(p).oper[0]^.reg)) or
                ((taicpu(next1).ops = 3) and
                 (taicpu(next1).oper[2]^.val = 0) and
                 (taicpu(next1).oper[0]^.typ = top_reg) and
                 (getsupreg(taicpu(next1).oper[0]^.reg) = RS_CR0) and
                 (taicpu(next1).oper[1]^.reg = taicpu(p).oper[0]^.reg))) and
               changetomodifyflags(taicpu(p)) then
              begin
                asml.remove(next1);
                next1.free;
                result := true;
              end;
          end;
        else
          ;
      end;
    end;
  initialization
    { make TCpuAsmOptimizer the class stored in casmoptimizer (a variable
      declared outside this unit) }
    casmoptimizer := TCpuAsmOptimizer;
  end.