optloop.pas 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. {
  2. Loop optimization
  3. Copyright (c) 2005 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optloop;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node;
  22. function unroll_loop(node : tnode) : tnode;
  23. function optimize_induction_variables(node : tnode) : tnode;
  24. implementation
  25. uses
  26. cclasses,
  27. globtype,globals,constexp,
  28. symdef,symsym,
  29. cpuinfo,
  30. nutils,
  31. nadd,nbas,nflw,ncon,ninl,ncal,nld;
  32. var
  33. nodecount : aword;
  34. function donodecount(var n: tnode; arg: pointer): foreachnoderesult;
  35. begin
  36. inc(nodecount);
  37. result:=fen_false;
  38. end;
  39. { rough estimation how large the tree "node" is }
  40. function countnodes(node : tnode) : aword;
  41. begin
  42. nodecount:=0;
  43. foreachnodestatic(node,@donodecount,nil);
  44. result:=nodecount;
  45. end;
  46. function number_unrolls(node : tnode) : cardinal;
  47. begin
  48. {$ifdef i386}
  49. { multiply by 2 for CPUs with a long pipeline }
  50. if current_settings.cputype in [cpu_Pentium4] then
  51. number_unrolls:=60 div countnodes(node)
  52. else
  53. {$endif i386}
  54. number_unrolls:=30 div countnodes(node);
  55. if number_unrolls=0 then
  56. number_unrolls:=1;
  57. end;
  58. function unroll_loop(node : tnode) : tnode;
  59. var
  60. unrolls,i : cardinal;
  61. counts : qword;
  62. unrollstatement,newforstatement : tstatementnode;
  63. unrollblock : tblocknode;
  64. begin
  65. result:=nil;
  66. if (cs_opt_size in current_settings.optimizerswitches) then
  67. exit;
  68. if not(node.nodetype in [forn]) then
  69. exit;
  70. unrolls:=number_unrolls(tfornode(node).t2);
  71. if unrolls>1 then
  72. begin
  73. { number of executions known? }
  74. if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
  75. begin
  76. if lnf_backward in tfornode(node).loopflags then
  77. counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
  78. else
  79. counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
  80. { don't unroll more than we need }
  81. if unrolls>counts then
  82. unrolls:=counts;
  83. { create block statement }
  84. unrollblock:=internalstatements(unrollstatement);
  85. { let's unroll (and rock of course) }
  86. for i:=1 to unrolls do
  87. begin
  88. { create and insert copy of the statement block }
  89. addstatement(unrollstatement,tfornode(node).t2.getcopy);
  90. { set and insert entry label? }
  91. if (counts mod unrolls<>0) and
  92. ((counts mod unrolls)=unrolls-i) then
  93. begin
  94. tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,tlabelsym.create('$optunrol'));
  95. addstatement(unrollstatement,tfornode(node).entrylabel);
  96. end;
  97. { for itself increases at the last iteration }
  98. if i<unrolls then
  99. begin
  100. { insert incrementation of counter var }
  101. addstatement(unrollstatement,
  102. geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
  103. end;
  104. end;
  105. { can we get rid of the for statement? }
  106. if unrolls=counts then
  107. begin
  108. { create block statement }
  109. result:=internalstatements(newforstatement);
  110. { initial assignment }
  111. addstatement(newforstatement,cassignmentnode.create(
  112. tfornode(node).left.getcopy,tfornode(node).right.getcopy));
  113. addstatement(newforstatement,unrollblock);
  114. end;
  115. end
  116. else
  117. begin
  118. { unrolling is a little bit more tricky if we don't know the
  119. loop count at compile time, but the solution is to use a jump table
  120. which is indexed by "loop count mod unrolls" at run time and which
  121. jumps then at the appropriate place inside the loop. Because
  122. a module division is expensive, we can use only unroll counts dividable
  123. by 2 }
  124. case unrolls of
  125. 1..2:
  126. ;
  127. 3:
  128. unrolls:=2;
  129. 4..7:
  130. unrolls:=4;
  131. { unrolls>4 already make no sense imo, but who knows (FK) }
  132. 8..15:
  133. unrolls:=8;
  134. 16..31:
  135. unrolls:=16;
  136. 32..63:
  137. unrolls:=32;
  138. 64..$7fff:
  139. unrolls:=64;
  140. else
  141. exit;
  142. end;
  143. { we don't handle this yet }
  144. exit;
  145. end;
  146. if not(assigned(result)) then
  147. begin
  148. tfornode(node).t2.free;
  149. tfornode(node).t2:=unrollblock;
  150. end;
  151. end;
  152. end;
  153. var
  154. initcode,
  155. calccode,
  156. deletecode : tblocknode;
  157. initcodestatements,
  158. calccodestatements,
  159. deletecodestatements: tstatementnode;
  160. templist : tfplist;
  161. inductionexprs : tfplist;
  162. function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
  163. begin
  164. result:=is_constnode(expr);
  165. end;
  166. { checks if the strength of n can be recuded, arg is the tforloop being considered }
  167. function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
  168. function findpreviousstrengthreduction : boolean;
  169. var
  170. i : longint;
  171. begin
  172. result:=false;
  173. for i:=0 to inductionexprs.count-1 do
  174. begin
  175. { do we already maintain one expression? }
  176. if tnode(inductionexprs[i]).isequal(n) then
  177. begin
  178. n.free;
  179. n:=ttemprefnode.create(ttempcreatenode(templist[i]));
  180. result:=true;
  181. exit;
  182. end;
  183. end;
  184. end;
  185. procedure CreateNodes;
  186. begin
  187. if not assigned(initcode) then
  188. begin
  189. initcode:=internalstatements(initcodestatements);
  190. calccode:=internalstatements(calccodestatements);
  191. deletecode:=internalstatements(deletecodestatements);
  192. end;
  193. end;
  194. var
  195. tempnode : ttempcreatenode;
  196. begin
  197. result:=fen_false;
  198. case n.nodetype of
  199. muln:
  200. begin
  201. if (taddnode(n).left.nodetype=loadn) and
  202. taddnode(n).left.isequal(tfornode(arg).left) and
  203. { plain read of the loop variable? }
  204. not(nf_write in taddnode(n).left.flags) and
  205. not(nf_modify in taddnode(n).left.flags) and
  206. is_loop_invariant(tfornode(arg),taddnode(n).right) and
  207. { for now, we can handle only constant lower borders }
  208. is_constnode(tfornode(arg).right) then
  209. begin
  210. { did we use the same expression before already? }
  211. if not(findpreviousstrengthreduction) then
  212. begin
  213. tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
  214. tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
  215. templist.Add(tempnode);
  216. inductionexprs.Add(n);
  217. CreateNodes;
  218. if lnf_backward in tfornode(arg).loopflags then
  219. addstatement(calccodestatements,
  220. geninlinenode(in_dec_x,false,
  221. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
  222. else
  223. addstatement(calccodestatements,
  224. geninlinenode(in_inc_x,false,
  225. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
  226. addstatement(initcodestatements,tempnode);
  227. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  228. caddnode.create(muln,
  229. caddnode.create(subn,tfornode(arg).right.getcopy,cordconstnode.create(1,tfornode(arg).right.resultdef,false)),
  230. taddnode(n).right.getcopy)
  231. )
  232. );
  233. { finally replace the node by a temp. ref }
  234. n:=ctemprefnode.create(tempnode);
  235. { ... and add a temp. release node }
  236. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  237. end;
  238. result:=fen_norecurse_false;
  239. end;
  240. end;
  241. end;
  242. end;
  243. function optimize_induction_variables(node : tnode) : tnode;
  244. var
  245. loopcode,
  246. newcode : tblocknode;
  247. loopcodestatements,
  248. newcodestatements : tstatementnode;
  249. fornode : tfornode;
  250. begin
  251. if node.nodetype<>forn then
  252. exit;
  253. templist:=TFPList.Create;
  254. inductionexprs:=TFPList.Create;
  255. result:=nil;
  256. initcode:=nil;
  257. calccode:=nil;
  258. deletecode:=nil;
  259. initcodestatements:=nil;
  260. calccodestatements:=nil;
  261. deletecodestatements:=nil;
  262. { find all expressions being candidates for strength reduction
  263. and replace them }
  264. foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
  265. { clue everything together }
  266. if assigned(initcode) then
  267. begin
  268. { create a new for node, the old one will be released by the compiler }
  269. with tfornode(node) do
  270. begin
  271. fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
  272. left:=nil;
  273. right:=nil;
  274. t1:=nil;
  275. t2:=nil;
  276. end;
  277. node:=fornode;
  278. loopcode:=internalstatements(loopcodestatements);
  279. addstatement(loopcodestatements,calccode);
  280. addstatement(loopcodestatements,tfornode(node).t2);
  281. tfornode(node).t2:=loopcode;
  282. result:=internalstatements(newcodestatements);
  283. addstatement(newcodestatements,initcode);
  284. addstatement(newcodestatements,node);
  285. addstatement(newcodestatements,deletecode);
  286. printnode(output,result);
  287. end;
  288. templist.Free;
  289. inductionexprs.Free;
  290. end;
  291. end.