optloop.pas 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. {
  2. Loop optimization
  3. Copyright (c) 2005 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optloop;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node;
  22. function unroll_loop(node : tnode) : tnode;
  23. function OptimizeInductionVariables(node : tnode) : boolean;
  24. implementation
  25. uses
  26. cutils,cclasses,
  27. globtype,globals,constexp,
  28. symdef,symsym,
  29. defutil,
  30. cpuinfo,
  31. nutils,
  32. nadd,nbas,nflw,ncon,ninl,ncal,nld,nmem,ncnv,
  33. ncgmem,
  34. pass_1,
  35. optbase,optutils,
  36. procinfo;
  37. var
  38. nodecount : aword;
  39. function donodecount(var n: tnode; arg: pointer): foreachnoderesult;
  40. begin
  41. inc(nodecount);
  42. result:=fen_false;
  43. end;
  44. { rough estimation how large the tree "node" is }
  45. function countnodes(node : tnode) : aword;
  46. begin
  47. nodecount:=0;
  48. foreachnodestatic(node,@donodecount,nil);
  49. result:=nodecount;
  50. end;
  51. function number_unrolls(node : tnode) : cardinal;
  52. begin
  53. {$ifdef i386}
  54. { multiply by 2 for CPUs with a long pipeline }
  55. if current_settings.cputype in [cpu_Pentium4] then
  56. number_unrolls:=60 div countnodes(node)
  57. else
  58. {$endif i386}
  59. number_unrolls:=30 div countnodes(node);
  60. if number_unrolls=0 then
  61. number_unrolls:=1;
  62. end;
  63. function unroll_loop(node : tnode) : tnode;
  64. var
  65. unrolls,i : cardinal;
  66. counts : qword;
  67. unrollstatement,newforstatement : tstatementnode;
  68. unrollblock : tblocknode;
  69. begin
  70. result:=nil;
  71. if (cs_opt_size in current_settings.optimizerswitches) then
  72. exit;
  73. if not(node.nodetype in [forn]) then
  74. exit;
  75. unrolls:=number_unrolls(tfornode(node).t2);
  76. if unrolls>1 then
  77. begin
  78. { number of executions known? }
  79. if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
  80. begin
  81. if lnf_backward in tfornode(node).loopflags then
  82. counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
  83. else
  84. counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
  85. { don't unroll more than we need }
  86. if unrolls>counts then
  87. unrolls:=counts;
  88. { create block statement }
  89. unrollblock:=internalstatements(unrollstatement);
  90. { let's unroll (and rock of course) }
  91. for i:=1 to unrolls do
  92. begin
  93. { create and insert copy of the statement block }
  94. addstatement(unrollstatement,tfornode(node).t2.getcopy);
  95. { set and insert entry label? }
  96. if (counts mod unrolls<>0) and
  97. ((counts mod unrolls)=unrolls-i) then
  98. begin
  99. tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,tlabelsym.create('$optunrol'));
  100. addstatement(unrollstatement,tfornode(node).entrylabel);
  101. end;
  102. { for itself increases at the last iteration }
  103. if i<unrolls then
  104. begin
  105. { insert incr/decrementation of counter var }
  106. if lnf_backward in tfornode(node).loopflags then
  107. addstatement(unrollstatement,
  108. geninlinenode(in_dec_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)))
  109. else
  110. addstatement(unrollstatement,
  111. geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
  112. end;
  113. end;
  114. { can we get rid of the for statement? }
  115. if unrolls=counts then
  116. begin
  117. { create block statement }
  118. result:=internalstatements(newforstatement);
  119. { initial assignment }
  120. addstatement(newforstatement,cassignmentnode.create(
  121. tfornode(node).left.getcopy,tfornode(node).right.getcopy));
  122. addstatement(newforstatement,unrollblock);
  123. end;
  124. end
  125. else
  126. begin
  127. { unrolling is a little bit more tricky if we don't know the
  128. loop count at compile time, but the solution is to use a jump table
  129. which is indexed by "loop count mod unrolls" at run time and which
  130. jumps then at the appropriate place inside the loop. Because
  131. a module division is expensive, we can use only unroll counts dividable
  132. by 2 }
  133. case unrolls of
  134. 1..2:
  135. ;
  136. 3:
  137. unrolls:=2;
  138. 4..7:
  139. unrolls:=4;
  140. { unrolls>4 already make no sense imo, but who knows (FK) }
  141. 8..15:
  142. unrolls:=8;
  143. 16..31:
  144. unrolls:=16;
  145. 32..63:
  146. unrolls:=32;
  147. 64..$7fff:
  148. unrolls:=64;
  149. else
  150. exit;
  151. end;
  152. { we don't handle this yet }
  153. exit;
  154. end;
  155. if not(assigned(result)) then
  156. begin
  157. tfornode(node).t2.free;
  158. tfornode(node).t2:=unrollblock;
  159. end;
  160. end;
  161. end;
  162. var
  163. initcode,
  164. calccode,
  165. deletecode : tblocknode;
  166. initcodestatements,
  167. calccodestatements,
  168. deletecodestatements: tstatementnode;
  169. templist : tfplist;
  170. inductionexprs : tfplist;
  171. changedforloop,
  172. containsnestedforloop : boolean;
  173. function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
  174. begin
  175. result:=is_constnode(expr);
  176. case expr.nodetype of
  177. loadn:
  178. begin
  179. if (pi_dfaavailable in current_procinfo.flags) and
  180. assigned(loop.optinfo) and
  181. assigned(expr.optinfo) then
  182. { no aliasing? }
  183. result:=not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and
  184. { no definition in the loop? }
  185. not(DFASetIn(loop.optinfo^.defsum,expr.optinfo^.index));
  186. end;
  187. vecn:
  188. begin
  189. result:=((tvecnode(expr).left.nodetype=loadn) or is_loop_invariant(loop,tvecnode(expr).left)) and
  190. is_loop_invariant(loop,tvecnode(expr).right);
  191. end;
  192. typeconvn:
  193. result:=is_loop_invariant(loop,ttypeconvnode(expr).left);
  194. end;
  195. end;
  196. { checks if the strength of n can be recuded, arg is the tforloop being considered }
  197. function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
  198. function findpreviousstrengthreduction : boolean;
  199. var
  200. i : longint;
  201. begin
  202. result:=false;
  203. for i:=0 to inductionexprs.count-1 do
  204. begin
  205. { do we already maintain one expression? }
  206. if tnode(inductionexprs[i]).isequal(n) then
  207. begin
  208. n.free;
  209. case n.nodetype of
  210. muln:
  211. n:=ctemprefnode.create(ttempcreatenode(templist[i]));
  212. vecn:
  213. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(
  214. ttempcreatenode(templist[i]))),n.resultdef);
  215. else
  216. internalerror(200809211);
  217. end;
  218. result:=true;
  219. exit;
  220. end;
  221. end;
  222. end;
  223. procedure CreateNodes;
  224. begin
  225. if not assigned(initcode) then
  226. begin
  227. initcode:=internalstatements(initcodestatements);
  228. calccode:=internalstatements(calccodestatements);
  229. deletecode:=internalstatements(deletecodestatements);
  230. end;
  231. end;
  232. var
  233. tempnode : ttempcreatenode;
  234. dummy : longint;
  235. begin
  236. result:=fen_false;
  237. case n.nodetype of
  238. forn:
  239. { inform for loop search routine, that it needs to search more deeply }
  240. containsnestedforloop:=true;
  241. muln:
  242. begin
  243. if (taddnode(n).right.nodetype=loadn) and
  244. taddnode(n).right.isequal(tfornode(arg).left) and
  245. { plain read of the loop variable? }
  246. not(nf_write in taddnode(n).right.flags) and
  247. not(nf_modify in taddnode(n).right.flags) and
  248. is_loop_invariant(tfornode(arg),taddnode(n).left) and
  249. { for now, we can handle only constant lower borders }
  250. is_constnode(tfornode(arg).right) then
  251. taddnode(n).swapleftright;
  252. if (taddnode(n).left.nodetype=loadn) and
  253. taddnode(n).left.isequal(tfornode(arg).left) and
  254. { plain read of the loop variable? }
  255. not(nf_write in taddnode(n).left.flags) and
  256. not(nf_modify in taddnode(n).left.flags) and
  257. is_loop_invariant(tfornode(arg),taddnode(n).right) and
  258. { for now, we can handle only constant lower borders }
  259. is_constnode(tfornode(arg).right) then
  260. begin
  261. changedforloop:=true;
  262. { did we use the same expression before already? }
  263. if not(findpreviousstrengthreduction) then
  264. begin
  265. tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
  266. tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
  267. templist.Add(tempnode);
  268. inductionexprs.Add(n);
  269. CreateNodes;
  270. if lnf_backward in tfornode(arg).loopflags then
  271. addstatement(calccodestatements,
  272. geninlinenode(in_dec_x,false,
  273. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
  274. else
  275. addstatement(calccodestatements,
  276. geninlinenode(in_inc_x,false,
  277. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
  278. addstatement(initcodestatements,tempnode);
  279. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  280. caddnode.create(muln,
  281. caddnode.create(subn,tfornode(arg).right.getcopy,cordconstnode.create(1,tfornode(arg).right.resultdef,false)),
  282. taddnode(n).right.getcopy)
  283. )
  284. );
  285. { finally replace the node by a temp. ref }
  286. n:=ctemprefnode.create(tempnode);
  287. { ... and add a temp. release node }
  288. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  289. end;
  290. { set types }
  291. do_firstpass(n);
  292. result:=fen_norecurse_false;
  293. end;
  294. end;
  295. vecn:
  296. begin
  297. { is the index the counter variable? }
  298. if not(is_special_array(tvecnode(n).left.resultdef)) and
  299. (tvecnode(n).right.isequal(tfornode(arg).left) or
  300. { fpc usually creates a type cast to access an array }
  301. ((tvecnode(n).right.nodetype=typeconvn) and
  302. ttypeconvnode(tvecnode(n).right).left.isequal(tfornode(arg).left)
  303. )
  304. ) and
  305. { plain read of the loop variable? }
  306. not(nf_write in tvecnode(n).right.flags) and
  307. not(nf_modify in tvecnode(n).right.flags) and
  308. { direct array access? }
  309. ((tvecnode(n).left.nodetype=loadn) or
  310. { ... or loop invariant expression? }
  311. is_loop_invariant(tfornode(arg),tvecnode(n).left)) and
  312. { removing the multiplication is only worth the
  313. effort if it's not a simple shift }
  314. not(ispowerof2(tcgvecnode(n).get_mul_size,dummy)) then
  315. begin
  316. changedforloop:=true;
  317. { did we use the same expression before already? }
  318. if not(findpreviousstrengthreduction) then
  319. begin
  320. tempnode:=ctempcreatenode.create(voidpointertype,voidpointertype.size,tt_persistent,true);
  321. templist.Add(tempnode);
  322. inductionexprs.Add(n);
  323. CreateNodes;
  324. if lnf_backward in tfornode(arg).loopflags then
  325. addstatement(calccodestatements,
  326. geninlinenode(in_dec_x,false,
  327. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  328. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))))
  329. else
  330. addstatement(calccodestatements,
  331. geninlinenode(in_inc_x,false,
  332. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  333. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))));
  334. addstatement(initcodestatements,tempnode);
  335. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  336. caddrnode.create(
  337. cvecnode.create(tvecnode(n).left.getcopy,tfornode(arg).right.getcopy)
  338. )
  339. ));
  340. { finally replace the node by a temp. ref }
  341. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(tempnode)),n.resultdef);
  342. { ... and add a temp. release node }
  343. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  344. end;
  345. { set types }
  346. do_firstpass(n);
  347. result:=fen_norecurse_false;
  348. end;
  349. end;
  350. end;
  351. end;
  352. function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;
  353. var
  354. loopcode : tblocknode;
  355. loopcodestatements,
  356. newcodestatements : tstatementnode;
  357. fornode : tfornode;
  358. begin
  359. result:=nil;
  360. if node.nodetype<>forn then
  361. exit;
  362. templist:=TFPList.Create;
  363. inductionexprs:=TFPList.Create;
  364. initcode:=nil;
  365. calccode:=nil;
  366. deletecode:=nil;
  367. initcodestatements:=nil;
  368. calccodestatements:=nil;
  369. deletecodestatements:=nil;
  370. { find all expressions being candidates for strength reduction
  371. and replace them }
  372. foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
  373. { clue everything together }
  374. if assigned(initcode) then
  375. begin
  376. do_firstpass(tnode(initcode));
  377. do_firstpass(tnode(calccode));
  378. do_firstpass(tnode(deletecode));
  379. { create a new for node, the old one will be released by the compiler }
  380. with tfornode(node) do
  381. begin
  382. fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
  383. left:=nil;
  384. right:=nil;
  385. t1:=nil;
  386. t2:=nil;
  387. end;
  388. node:=fornode;
  389. loopcode:=internalstatements(loopcodestatements);
  390. addstatement(loopcodestatements,calccode);
  391. addstatement(loopcodestatements,tfornode(node).t2);
  392. tfornode(node).t2:=loopcode;
  393. do_firstpass(node);
  394. result:=internalstatements(newcodestatements);
  395. addstatement(newcodestatements,initcode);
  396. addstatement(newcodestatements,node);
  397. addstatement(newcodestatements,deletecode);
  398. end;
  399. templist.Free;
  400. inductionexprs.Free;
  401. end;
  402. function iterforloops(var n: tnode; arg: pointer): foreachnoderesult;
  403. var
  404. hp : tnode;
  405. begin
  406. Result:=fen_false;
  407. if n.nodetype=forn then
  408. begin
  409. { do we have DFA available? }
  410. if pi_dfaavailable in current_procinfo.flags then
  411. begin
  412. CalcDefSum(n);
  413. end;
  414. containsnestedforloop:=false;
  415. hp:=OptimizeInductionVariablesSingleForLoop(n);
  416. if assigned(hp) then
  417. begin
  418. n.Free;
  419. n:=hp;
  420. end;
  421. { can we avoid further searching? }
  422. if not(containsnestedforloop) then
  423. Result:=fen_norecurse_false;
  424. end;
  425. end;
  426. function OptimizeInductionVariables(node : tnode) : boolean;
  427. begin
  428. changedforloop:=false;
  429. foreachnodestatic(pm_postprocess,node,@iterforloops,nil);
  430. Result:=changedforloop;
  431. end;
  432. end.