optloop.pas 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. {
  2. Loop optimization
  3. Copyright (c) 2005 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optloop;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node;
  22. function unroll_loop(node : tnode) : tnode;
  23. function OptimizeInductionVariables(node : tnode) : boolean;
  24. implementation
  25. uses
  26. cclasses,
  27. globtype,globals,constexp,
  28. symdef,symsym,
  29. cpuinfo,
  30. nutils,
  31. nadd,nbas,nflw,ncon,ninl,ncal,nld,
  32. pass_1,
  33. optbase,optutils,
  34. procinfo;
  35. var
  36. nodecount : aword;
  37. function donodecount(var n: tnode; arg: pointer): foreachnoderesult;
  38. begin
  39. inc(nodecount);
  40. result:=fen_false;
  41. end;
  42. { rough estimation how large the tree "node" is }
  43. function countnodes(node : tnode) : aword;
  44. begin
  45. nodecount:=0;
  46. foreachnodestatic(node,@donodecount,nil);
  47. result:=nodecount;
  48. end;
  49. function number_unrolls(node : tnode) : cardinal;
  50. begin
  51. {$ifdef i386}
  52. { multiply by 2 for CPUs with a long pipeline }
  53. if current_settings.cputype in [cpu_Pentium4] then
  54. number_unrolls:=60 div countnodes(node)
  55. else
  56. {$endif i386}
  57. number_unrolls:=30 div countnodes(node);
  58. if number_unrolls=0 then
  59. number_unrolls:=1;
  60. end;
  61. function unroll_loop(node : tnode) : tnode;
  62. var
  63. unrolls,i : cardinal;
  64. counts : qword;
  65. unrollstatement,newforstatement : tstatementnode;
  66. unrollblock : tblocknode;
  67. begin
  68. result:=nil;
  69. if (cs_opt_size in current_settings.optimizerswitches) then
  70. exit;
  71. if not(node.nodetype in [forn]) then
  72. exit;
  73. unrolls:=number_unrolls(tfornode(node).t2);
  74. if unrolls>1 then
  75. begin
  76. { number of executions known? }
  77. if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
  78. begin
  79. if lnf_backward in tfornode(node).loopflags then
  80. counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
  81. else
  82. counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
  83. { don't unroll more than we need }
  84. if unrolls>counts then
  85. unrolls:=counts;
  86. { create block statement }
  87. unrollblock:=internalstatements(unrollstatement);
  88. { let's unroll (and rock of course) }
  89. for i:=1 to unrolls do
  90. begin
  91. { create and insert copy of the statement block }
  92. addstatement(unrollstatement,tfornode(node).t2.getcopy);
  93. { set and insert entry label? }
  94. if (counts mod unrolls<>0) and
  95. ((counts mod unrolls)=unrolls-i) then
  96. begin
  97. tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,tlabelsym.create('$optunrol'));
  98. addstatement(unrollstatement,tfornode(node).entrylabel);
  99. end;
  100. { for itself increases at the last iteration }
  101. if i<unrolls then
  102. begin
  103. { insert incrementation of counter var }
  104. addstatement(unrollstatement,
  105. geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
  106. end;
  107. end;
  108. { can we get rid of the for statement? }
  109. if unrolls=counts then
  110. begin
  111. { create block statement }
  112. result:=internalstatements(newforstatement);
  113. { initial assignment }
  114. addstatement(newforstatement,cassignmentnode.create(
  115. tfornode(node).left.getcopy,tfornode(node).right.getcopy));
  116. addstatement(newforstatement,unrollblock);
  117. end;
  118. end
  119. else
  120. begin
  121. { unrolling is a little bit more tricky if we don't know the
  122. loop count at compile time, but the solution is to use a jump table
  123. which is indexed by "loop count mod unrolls" at run time and which
  124. jumps then at the appropriate place inside the loop. Because
  125. a module division is expensive, we can use only unroll counts dividable
  126. by 2 }
  127. case unrolls of
  128. 1..2:
  129. ;
  130. 3:
  131. unrolls:=2;
  132. 4..7:
  133. unrolls:=4;
  134. { unrolls>4 already make no sense imo, but who knows (FK) }
  135. 8..15:
  136. unrolls:=8;
  137. 16..31:
  138. unrolls:=16;
  139. 32..63:
  140. unrolls:=32;
  141. 64..$7fff:
  142. unrolls:=64;
  143. else
  144. exit;
  145. end;
  146. { we don't handle this yet }
  147. exit;
  148. end;
  149. if not(assigned(result)) then
  150. begin
  151. tfornode(node).t2.free;
  152. tfornode(node).t2:=unrollblock;
  153. end;
  154. end;
  155. end;
  156. var
  157. initcode,
  158. calccode,
  159. deletecode : tblocknode;
  160. initcodestatements,
  161. calccodestatements,
  162. deletecodestatements: tstatementnode;
  163. templist : tfplist;
  164. inductionexprs : tfplist;
  165. changedforloop,
  166. containsnestedforloop : boolean;
  167. function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
  168. begin
  169. result:=is_constnode(expr);
  170. case expr.nodetype of
  171. loadn:
  172. begin
  173. if (pi_dfaavailable in current_procinfo.flags) and
  174. assigned(loop.optinfo) and
  175. assigned(expr.optinfo) then
  176. { no aliasing? }
  177. result:=not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and
  178. { no definition in the loop? }
  179. not(DFASetIn(loop.optinfo^.defsum,expr.optinfo^.index));
  180. end;
  181. end;
  182. end;
  183. { checks if the strength of n can be recuded, arg is the tforloop being considered }
  184. function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
  185. function findpreviousstrengthreduction : boolean;
  186. var
  187. i : longint;
  188. begin
  189. result:=false;
  190. for i:=0 to inductionexprs.count-1 do
  191. begin
  192. { do we already maintain one expression? }
  193. if tnode(inductionexprs[i]).isequal(n) then
  194. begin
  195. n.free;
  196. n:=ttemprefnode.create(ttempcreatenode(templist[i]));
  197. result:=true;
  198. exit;
  199. end;
  200. end;
  201. end;
  202. procedure CreateNodes;
  203. begin
  204. if not assigned(initcode) then
  205. begin
  206. initcode:=internalstatements(initcodestatements);
  207. calccode:=internalstatements(calccodestatements);
  208. deletecode:=internalstatements(deletecodestatements);
  209. end;
  210. end;
  211. var
  212. tempnode : ttempcreatenode;
  213. begin
  214. result:=fen_false;
  215. case n.nodetype of
  216. forn:
  217. { inform for loop search routine, that it needs to search more deeply }
  218. containsnestedforloop:=true;
  219. muln:
  220. begin
  221. if (taddnode(n).right.nodetype=loadn) and
  222. taddnode(n).right.isequal(tfornode(arg).left) and
  223. { plain read of the loop variable? }
  224. not(nf_write in taddnode(n).right.flags) and
  225. not(nf_modify in taddnode(n).right.flags) and
  226. is_loop_invariant(tfornode(arg),taddnode(n).left) and
  227. { for now, we can handle only constant lower borders }
  228. is_constnode(tfornode(arg).right) then
  229. taddnode(n).swapleftright;
  230. if (taddnode(n).left.nodetype=loadn) and
  231. taddnode(n).left.isequal(tfornode(arg).left) and
  232. { plain read of the loop variable? }
  233. not(nf_write in taddnode(n).left.flags) and
  234. not(nf_modify in taddnode(n).left.flags) and
  235. is_loop_invariant(tfornode(arg),taddnode(n).right) and
  236. { for now, we can handle only constant lower borders }
  237. is_constnode(tfornode(arg).right) then
  238. begin
  239. changedforloop:=true;
  240. { did we use the same expression before already? }
  241. if not(findpreviousstrengthreduction) then
  242. begin
  243. tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
  244. tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
  245. templist.Add(tempnode);
  246. inductionexprs.Add(n);
  247. CreateNodes;
  248. if lnf_backward in tfornode(arg).loopflags then
  249. addstatement(calccodestatements,
  250. geninlinenode(in_dec_x,false,
  251. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
  252. else
  253. addstatement(calccodestatements,
  254. geninlinenode(in_inc_x,false,
  255. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
  256. addstatement(initcodestatements,tempnode);
  257. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  258. caddnode.create(muln,
  259. caddnode.create(subn,tfornode(arg).right.getcopy,cordconstnode.create(1,tfornode(arg).right.resultdef,false)),
  260. taddnode(n).right.getcopy)
  261. )
  262. );
  263. { finally replace the node by a temp. ref }
  264. n:=ctemprefnode.create(tempnode);
  265. { ... and add a temp. release node }
  266. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  267. end;
  268. { set types }
  269. do_firstpass(n);
  270. result:=fen_norecurse_false;
  271. end;
  272. end;
  273. end;
  274. end;
  275. function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;
  276. var
  277. loopcode,
  278. newcode : tblocknode;
  279. loopcodestatements,
  280. newcodestatements : tstatementnode;
  281. fornode : tfornode;
  282. begin
  283. result:=nil;
  284. if node.nodetype<>forn then
  285. exit;
  286. templist:=TFPList.Create;
  287. inductionexprs:=TFPList.Create;
  288. initcode:=nil;
  289. calccode:=nil;
  290. deletecode:=nil;
  291. initcodestatements:=nil;
  292. calccodestatements:=nil;
  293. deletecodestatements:=nil;
  294. { find all expressions being candidates for strength reduction
  295. and replace them }
  296. foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
  297. { clue everything together }
  298. if assigned(initcode) then
  299. begin
  300. do_firstpass(initcode);
  301. do_firstpass(calccode);
  302. do_firstpass(deletecode);
  303. { create a new for node, the old one will be released by the compiler }
  304. with tfornode(node) do
  305. begin
  306. fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
  307. left:=nil;
  308. right:=nil;
  309. t1:=nil;
  310. t2:=nil;
  311. end;
  312. node:=fornode;
  313. loopcode:=internalstatements(loopcodestatements);
  314. addstatement(loopcodestatements,calccode);
  315. addstatement(loopcodestatements,tfornode(node).t2);
  316. tfornode(node).t2:=loopcode;
  317. do_firstpass(node);
  318. result:=internalstatements(newcodestatements);
  319. addstatement(newcodestatements,initcode);
  320. addstatement(newcodestatements,node);
  321. addstatement(newcodestatements,deletecode);
  322. end;
  323. templist.Free;
  324. inductionexprs.Free;
  325. end;
  326. function iterforloops(var n: tnode; arg: pointer): foreachnoderesult;
  327. var
  328. hp : tnode;
  329. begin
  330. Result:=fen_false;
  331. if n.nodetype=forn then
  332. begin
  333. { do we have DFA available? }
  334. if pi_dfaavailable in current_procinfo.flags then
  335. begin
  336. CalcDefSum(n);
  337. end;
  338. containsnestedforloop:=false;
  339. hp:=OptimizeInductionVariablesSingleForLoop(n);
  340. if assigned(hp) then
  341. begin
  342. n.Free;
  343. n:=hp;
  344. end;
  345. { can we avoid further searching? }
  346. if not(containsnestedforloop) then
  347. Result:=fen_norecurse_false;
  348. end;
  349. end;
  350. function OptimizeInductionVariables(node : tnode) : boolean;
  351. begin
  352. changedforloop:=false;
  353. foreachnodestatic(pm_postprocess,node,@iterforloops,nil);
  354. Result:=changedforloop;
  355. end;
  356. end.