optloop.pas 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. {
  2. Loop optimization
  3. Copyright (c) 2005 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optloop;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node;
  22. function unroll_loop(node : tnode) : tnode;
  23. function OptimizeInductionVariables(node : tnode) : boolean;
  24. implementation
  25. uses
  26. cutils,cclasses,
  27. globtype,globals,constexp,
  28. symdef,symsym,
  29. defutil,
  30. cpuinfo,
  31. nutils,
  32. nadd,nbas,nflw,ncon,ninl,ncal,nld,nmem,ncnv,
  33. ncgmem,
  34. pass_1,
  35. optbase,optutils,
  36. procinfo;
  37. function number_unrolls(node : tnode) : cardinal;
  38. begin
  39. {$ifdef i386}
  40. { multiply by 2 for CPUs with a long pipeline }
  41. if current_settings.cputype in [cpu_Pentium4] then
  42. number_unrolls:=60 div node_count(node)
  43. else
  44. {$endif i386}
  45. number_unrolls:=30 div node_count(node);
  46. if number_unrolls=0 then
  47. number_unrolls:=1;
  48. end;
  49. function unroll_loop(node : tnode) : tnode;
  50. var
  51. unrolls,i : cardinal;
  52. counts : qword;
  53. unrollstatement,newforstatement : tstatementnode;
  54. unrollblock : tblocknode;
  55. begin
  56. result:=nil;
  57. if (cs_opt_size in current_settings.optimizerswitches) then
  58. exit;
  59. if not(node.nodetype in [forn]) then
  60. exit;
  61. unrolls:=number_unrolls(tfornode(node).t2);
  62. if unrolls>1 then
  63. begin
  64. { number of executions known? }
  65. if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
  66. begin
  67. if lnf_backward in tfornode(node).loopflags then
  68. counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
  69. else
  70. counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
  71. { don't unroll more than we need }
  72. if unrolls>counts then
  73. unrolls:=counts;
  74. { create block statement }
  75. unrollblock:=internalstatements(unrollstatement);
  76. { let's unroll (and rock of course) }
  77. for i:=1 to unrolls do
  78. begin
  79. { create and insert copy of the statement block }
  80. addstatement(unrollstatement,tfornode(node).t2.getcopy);
  81. { set and insert entry label? }
  82. if (counts mod unrolls<>0) and
  83. ((counts mod unrolls)=unrolls-i) then
  84. begin
  85. tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,tlabelsym.create('$optunrol'));
  86. addstatement(unrollstatement,tfornode(node).entrylabel);
  87. end;
  88. { for itself increases at the last iteration }
  89. if i<unrolls then
  90. begin
  91. { insert incr/decrementation of counter var }
  92. if lnf_backward in tfornode(node).loopflags then
  93. addstatement(unrollstatement,
  94. geninlinenode(in_dec_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)))
  95. else
  96. addstatement(unrollstatement,
  97. geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
  98. end;
  99. end;
  100. { can we get rid of the for statement? }
  101. if unrolls=counts then
  102. begin
  103. { create block statement }
  104. result:=internalstatements(newforstatement);
  105. { initial assignment }
  106. addstatement(newforstatement,cassignmentnode.create(
  107. tfornode(node).left.getcopy,tfornode(node).right.getcopy));
  108. addstatement(newforstatement,unrollblock);
  109. end;
  110. end
  111. else
  112. begin
  113. { unrolling is a little bit more tricky if we don't know the
  114. loop count at compile time, but the solution is to use a jump table
  115. which is indexed by "loop count mod unrolls" at run time and which
  116. jumps then at the appropriate place inside the loop. Because
  117. a module division is expensive, we can use only unroll counts dividable
  118. by 2 }
  119. case unrolls of
  120. 1..2:
  121. ;
  122. 3:
  123. unrolls:=2;
  124. 4..7:
  125. unrolls:=4;
  126. { unrolls>4 already make no sense imo, but who knows (FK) }
  127. 8..15:
  128. unrolls:=8;
  129. 16..31:
  130. unrolls:=16;
  131. 32..63:
  132. unrolls:=32;
  133. 64..$7fff:
  134. unrolls:=64;
  135. else
  136. exit;
  137. end;
  138. { we don't handle this yet }
  139. exit;
  140. end;
  141. if not(assigned(result)) then
  142. begin
  143. tfornode(node).t2.free;
  144. tfornode(node).t2:=unrollblock;
  145. end;
  146. end;
  147. end;
  148. var
  149. initcode,
  150. calccode,
  151. deletecode : tblocknode;
  152. initcodestatements,
  153. calccodestatements,
  154. deletecodestatements: tstatementnode;
  155. templist : tfplist;
  156. inductionexprs : tfplist;
  157. changedforloop,
  158. containsnestedforloop : boolean;
  159. function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
  160. begin
  161. result:=is_constnode(expr);
  162. case expr.nodetype of
  163. loadn:
  164. begin
  165. if (pi_dfaavailable in current_procinfo.flags) and
  166. assigned(loop.optinfo) and
  167. assigned(expr.optinfo) then
  168. { no aliasing? }
  169. result:=not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and
  170. { no definition in the loop? }
  171. not(DFASetIn(loop.optinfo^.defsum,expr.optinfo^.index));
  172. end;
  173. vecn:
  174. begin
  175. result:=((tvecnode(expr).left.nodetype=loadn) or is_loop_invariant(loop,tvecnode(expr).left)) and
  176. is_loop_invariant(loop,tvecnode(expr).right);
  177. end;
  178. typeconvn:
  179. result:=is_loop_invariant(loop,ttypeconvnode(expr).left);
  180. end;
  181. end;
  182. { checks if the strength of n can be recuded, arg is the tforloop being considered }
  183. function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
  184. function findpreviousstrengthreduction : boolean;
  185. var
  186. i : longint;
  187. begin
  188. result:=false;
  189. for i:=0 to inductionexprs.count-1 do
  190. begin
  191. { do we already maintain one expression? }
  192. if tnode(inductionexprs[i]).isequal(n) then
  193. begin
  194. n.free;
  195. case n.nodetype of
  196. muln:
  197. n:=ctemprefnode.create(ttempcreatenode(templist[i]));
  198. vecn:
  199. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(
  200. ttempcreatenode(templist[i]))),n.resultdef);
  201. else
  202. internalerror(200809211);
  203. end;
  204. result:=true;
  205. exit;
  206. end;
  207. end;
  208. end;
  209. procedure CreateNodes;
  210. begin
  211. if not assigned(initcode) then
  212. begin
  213. initcode:=internalstatements(initcodestatements);
  214. calccode:=internalstatements(calccodestatements);
  215. deletecode:=internalstatements(deletecodestatements);
  216. end;
  217. end;
  218. var
  219. tempnode : ttempcreatenode;
  220. dummy : longint;
  221. begin
  222. result:=fen_false;
  223. case n.nodetype of
  224. forn:
  225. { inform for loop search routine, that it needs to search more deeply }
  226. containsnestedforloop:=true;
  227. muln:
  228. begin
  229. if (taddnode(n).right.nodetype=loadn) and
  230. taddnode(n).right.isequal(tfornode(arg).left) and
  231. { plain read of the loop variable? }
  232. not(nf_write in taddnode(n).right.flags) and
  233. not(nf_modify in taddnode(n).right.flags) and
  234. is_loop_invariant(tfornode(arg),taddnode(n).left) and
  235. { for now, we can handle only constant lower borders }
  236. is_constnode(tfornode(arg).right) then
  237. taddnode(n).swapleftright;
  238. if (taddnode(n).left.nodetype=loadn) and
  239. taddnode(n).left.isequal(tfornode(arg).left) and
  240. { plain read of the loop variable? }
  241. not(nf_write in taddnode(n).left.flags) and
  242. not(nf_modify in taddnode(n).left.flags) and
  243. is_loop_invariant(tfornode(arg),taddnode(n).right) and
  244. { for now, we can handle only constant lower borders }
  245. is_constnode(tfornode(arg).right) then
  246. begin
  247. changedforloop:=true;
  248. { did we use the same expression before already? }
  249. if not(findpreviousstrengthreduction) then
  250. begin
  251. tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
  252. tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
  253. templist.Add(tempnode);
  254. inductionexprs.Add(n);
  255. CreateNodes;
  256. if lnf_backward in tfornode(arg).loopflags then
  257. addstatement(calccodestatements,
  258. geninlinenode(in_dec_x,false,
  259. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
  260. else
  261. addstatement(calccodestatements,
  262. geninlinenode(in_inc_x,false,
  263. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
  264. addstatement(initcodestatements,tempnode);
  265. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  266. caddnode.create(muln,
  267. caddnode.create(subn,tfornode(arg).right.getcopy,cordconstnode.create(1,tfornode(arg).right.resultdef,false)),
  268. taddnode(n).right.getcopy)
  269. )
  270. );
  271. { finally replace the node by a temp. ref }
  272. n:=ctemprefnode.create(tempnode);
  273. { ... and add a temp. release node }
  274. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  275. end;
  276. { set types }
  277. do_firstpass(n);
  278. result:=fen_norecurse_false;
  279. end;
  280. end;
  281. vecn:
  282. begin
  283. { is the index the counter variable? }
  284. if not(is_special_array(tvecnode(n).left.resultdef)) and
  285. (tvecnode(n).right.isequal(tfornode(arg).left) or
  286. { fpc usually creates a type cast to access an array }
  287. ((tvecnode(n).right.nodetype=typeconvn) and
  288. ttypeconvnode(tvecnode(n).right).left.isequal(tfornode(arg).left)
  289. )
  290. ) and
  291. { plain read of the loop variable? }
  292. not(nf_write in tvecnode(n).right.flags) and
  293. not(nf_modify in tvecnode(n).right.flags) and
  294. { direct array access? }
  295. ((tvecnode(n).left.nodetype=loadn) or
  296. { ... or loop invariant expression? }
  297. is_loop_invariant(tfornode(arg),tvecnode(n).left)) and
  298. { removing the multiplication is only worth the
  299. effort if it's not a simple shift }
  300. not(ispowerof2(tcgvecnode(n).get_mul_size,dummy)) then
  301. begin
  302. changedforloop:=true;
  303. { did we use the same expression before already? }
  304. if not(findpreviousstrengthreduction) then
  305. begin
  306. tempnode:=ctempcreatenode.create(voidpointertype,voidpointertype.size,tt_persistent,true);
  307. templist.Add(tempnode);
  308. inductionexprs.Add(n);
  309. CreateNodes;
  310. if lnf_backward in tfornode(arg).loopflags then
  311. addstatement(calccodestatements,
  312. geninlinenode(in_dec_x,false,
  313. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  314. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))))
  315. else
  316. addstatement(calccodestatements,
  317. geninlinenode(in_inc_x,false,
  318. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  319. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))));
  320. addstatement(initcodestatements,tempnode);
  321. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  322. caddrnode.create(
  323. cvecnode.create(tvecnode(n).left.getcopy,tfornode(arg).right.getcopy)
  324. )
  325. ));
  326. { finally replace the node by a temp. ref }
  327. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(tempnode)),n.resultdef);
  328. { ... and add a temp. release node }
  329. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  330. end;
  331. { set types }
  332. do_firstpass(n);
  333. result:=fen_norecurse_false;
  334. end;
  335. end;
  336. end;
  337. end;
  338. function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;
  339. var
  340. loopcode : tblocknode;
  341. loopcodestatements,
  342. newcodestatements : tstatementnode;
  343. fornode : tfornode;
  344. begin
  345. result:=nil;
  346. if node.nodetype<>forn then
  347. exit;
  348. templist:=TFPList.Create;
  349. inductionexprs:=TFPList.Create;
  350. initcode:=nil;
  351. calccode:=nil;
  352. deletecode:=nil;
  353. initcodestatements:=nil;
  354. calccodestatements:=nil;
  355. deletecodestatements:=nil;
  356. { find all expressions being candidates for strength reduction
  357. and replace them }
  358. foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
  359. { clue everything together }
  360. if assigned(initcode) then
  361. begin
  362. do_firstpass(tnode(initcode));
  363. do_firstpass(tnode(calccode));
  364. do_firstpass(tnode(deletecode));
  365. { create a new for node, the old one will be released by the compiler }
  366. with tfornode(node) do
  367. begin
  368. fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
  369. left:=nil;
  370. right:=nil;
  371. t1:=nil;
  372. t2:=nil;
  373. end;
  374. node:=fornode;
  375. loopcode:=internalstatements(loopcodestatements);
  376. addstatement(loopcodestatements,calccode);
  377. addstatement(loopcodestatements,tfornode(node).t2);
  378. tfornode(node).t2:=loopcode;
  379. do_firstpass(node);
  380. result:=internalstatements(newcodestatements);
  381. addstatement(newcodestatements,initcode);
  382. addstatement(newcodestatements,node);
  383. addstatement(newcodestatements,deletecode);
  384. end;
  385. templist.Free;
  386. inductionexprs.Free;
  387. end;
  388. function iterforloops(var n: tnode; arg: pointer): foreachnoderesult;
  389. var
  390. hp : tnode;
  391. begin
  392. Result:=fen_false;
  393. if n.nodetype=forn then
  394. begin
  395. { do we have DFA available? }
  396. if pi_dfaavailable in current_procinfo.flags then
  397. begin
  398. CalcDefSum(n);
  399. end;
  400. containsnestedforloop:=false;
  401. hp:=OptimizeInductionVariablesSingleForLoop(n);
  402. if assigned(hp) then
  403. begin
  404. n.Free;
  405. n:=hp;
  406. end;
  407. { can we avoid further searching? }
  408. if not(containsnestedforloop) then
  409. Result:=fen_norecurse_false;
  410. end;
  411. end;
  412. function OptimizeInductionVariables(node : tnode) : boolean;
  413. begin
  414. changedforloop:=false;
  415. foreachnodestatic(pm_postprocess,node,@iterforloops,nil);
  416. Result:=changedforloop;
  417. end;
  418. end.