optloop.pas 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. {
  2. Loop optimization
  3. Copyright (c) 2005 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optloop;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node;
  22. function unroll_loop(node : tnode) : tnode;
  23. function OptimizeInductionVariables(node : tnode) : boolean;
  24. implementation
  25. uses
  26. cutils,cclasses,
  27. globtype,globals,constexp,
  28. symdef,symsym,
  29. defutil,
  30. cpuinfo,
  31. nutils,
  32. nadd,nbas,nflw,ncon,ninl,ncal,nld,nmem,ncnv,
  33. ncgmem,
  34. pass_1,
  35. optbase,optutils,
  36. procinfo;
  37. function number_unrolls(node : tnode) : cardinal;
  38. begin
  39. {$ifdef i386}
  40. { multiply by 2 for CPUs with a long pipeline }
  41. if current_settings.optimizecputype in [cpu_Pentium4] then
  42. number_unrolls:=60 div node_count(node)
  43. else
  44. {$endif i386}
  45. number_unrolls:=30 div node_count(node);
  46. if number_unrolls=0 then
  47. number_unrolls:=1;
  48. end;
  49. type
  50. treplaceinfo = record
  51. node : tnode;
  52. value : Tconstexprint;
  53. end;
  54. preplaceinfo = ^treplaceinfo;
  55. function checkbreakcontinue(var n:tnode; arg: pointer): foreachnoderesult;
  56. begin
  57. if n.nodetype in [breakn,continuen] then
  58. result:=fen_norecurse_true
  59. else
  60. result:=fen_false;
  61. end;
  62. function replaceloadnodes(var n: tnode; arg: pointer): foreachnoderesult;
  63. begin
  64. if ((n.nodetype=loadn) and (preplaceinfo(arg)^.node.nodetype=loadn) and
  65. (tloadnode(n).symtableentry=tloadnode(preplaceinfo(arg)^.node).symtableentry)) or
  66. ((n.nodetype=temprefn) and (preplaceinfo(arg)^.node.nodetype=temprefn) and
  67. (ttemprefnode(n).tempinfo=ttemprefnode(preplaceinfo(arg)^.node).tempinfo)) then
  68. begin
  69. if n.flags*[nf_modify,nf_write]<>[] then
  70. internalerror(2012090402);
  71. n.free;
  72. n:=cordconstnode.create(preplaceinfo(arg)^.value,preplaceinfo(arg)^.node.resultdef,false);
  73. end;
  74. result:=fen_false;
  75. end;
  76. function unroll_loop(node : tnode) : tnode;
  77. var
  78. unrolls,i : cardinal;
  79. counts : qword;
  80. unrollstatement,newforstatement : tstatementnode;
  81. unrollblock : tblocknode;
  82. getridoffor : boolean;
  83. replaceinfo : treplaceinfo;
  84. begin
  85. result:=nil;
  86. if (cs_opt_size in current_settings.optimizerswitches) then
  87. exit;
  88. if not(node.nodetype in [forn]) then
  89. exit;
  90. unrolls:=number_unrolls(tfornode(node).t2);
  91. if unrolls>1 then
  92. begin
  93. { number of executions known? }
  94. if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
  95. begin
  96. if lnf_backward in tfornode(node).loopflags then
  97. counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
  98. else
  99. counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
  100. { don't unroll more than we need,
  101. multiply unroll by two here because we can get rid
  102. of the counter variable completely and replace it by a constant
  103. if unrolls=counts }
  104. if unrolls*2>counts then
  105. unrolls:=counts;
  106. { create block statement }
  107. unrollblock:=internalstatements(unrollstatement);
  108. { can we get rid completly of the for ? }
  109. getridoffor:=(unrolls=counts) and not(foreachnodestatic(tfornode(node).t2,@checkbreakcontinue,nil));
  110. if getridoffor then
  111. begin
  112. if not(tfornode(node).left.nodetype in [temprefn,loadn]) then
  113. internalerror(2012090301);
  114. replaceinfo.node:=tfornode(node).left;
  115. replaceinfo.value:=tordconstnode(tfornode(node).right).value;
  116. end;
  117. { let's unroll (and rock of course) }
  118. for i:=1 to unrolls do
  119. begin
  120. { create and insert copy of the statement block }
  121. addstatement(unrollstatement,tfornode(node).t2.getcopy);
  122. { set and insert entry label? }
  123. if (counts mod unrolls<>0) and
  124. ((counts mod unrolls)=unrolls-i) then
  125. begin
  126. tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,tlabelsym.create('$optunrol'));
  127. addstatement(unrollstatement,tfornode(node).entrylabel);
  128. end;
  129. if getridoffor then
  130. begin
  131. foreachnodestatic(tnode(unrollstatement),@replaceloadnodes,@replaceinfo);
  132. if lnf_backward in tfornode(node).loopflags then
  133. replaceinfo.value:=replaceinfo.value-1
  134. else
  135. replaceinfo.value:=replaceinfo.value+1;
  136. end
  137. else
  138. begin
  139. { for itself increases at the last iteration }
  140. if i<unrolls then
  141. begin
  142. { insert incr/decrementation of counter var }
  143. if lnf_backward in tfornode(node).loopflags then
  144. addstatement(unrollstatement,
  145. geninlinenode(in_dec_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)))
  146. else
  147. addstatement(unrollstatement,
  148. geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
  149. end;
  150. end;
  151. end;
  152. { can we get rid of the for statement? }
  153. if getridoffor then
  154. begin
  155. { create block statement }
  156. result:=internalstatements(newforstatement);
  157. addstatement(newforstatement,unrollblock);
  158. end;
  159. end
  160. else
  161. begin
  162. { unrolling is a little bit more tricky if we don't know the
  163. loop count at compile time, but the solution is to use a jump table
  164. which is indexed by "loop count mod unrolls" at run time and which
  165. jumps then at the appropriate place inside the loop. Because
  166. a module division is expensive, we can use only unroll counts dividable
  167. by 2 }
  168. case unrolls of
  169. 1..2:
  170. ;
  171. 3:
  172. unrolls:=2;
  173. 4..7:
  174. unrolls:=4;
  175. { unrolls>4 already make no sense imo, but who knows (FK) }
  176. 8..15:
  177. unrolls:=8;
  178. 16..31:
  179. unrolls:=16;
  180. 32..63:
  181. unrolls:=32;
  182. 64..$7fff:
  183. unrolls:=64;
  184. else
  185. exit;
  186. end;
  187. { we don't handle this yet }
  188. exit;
  189. end;
  190. if not(assigned(result)) then
  191. begin
  192. tfornode(node).t2.free;
  193. tfornode(node).t2:=unrollblock;
  194. end;
  195. end;
  196. end;
  197. var
  198. initcode,
  199. calccode,
  200. deletecode : tblocknode;
  201. initcodestatements,
  202. calccodestatements,
  203. deletecodestatements: tstatementnode;
  204. templist : tfplist;
  205. inductionexprs : tfplist;
  206. changedforloop,
  207. containsnestedforloop : boolean;
  208. function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
  209. begin
  210. result:=is_constnode(expr);
  211. case expr.nodetype of
  212. loadn:
  213. begin
  214. if (pi_dfaavailable in current_procinfo.flags) and
  215. assigned(loop.optinfo) and
  216. assigned(expr.optinfo) then
  217. { no aliasing? }
  218. result:=not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and
  219. { no definition in the loop? }
  220. not(DFASetIn(loop.optinfo^.defsum,expr.optinfo^.index));
  221. end;
  222. vecn:
  223. begin
  224. result:=((tvecnode(expr).left.nodetype=loadn) or is_loop_invariant(loop,tvecnode(expr).left)) and
  225. is_loop_invariant(loop,tvecnode(expr).right);
  226. end;
  227. typeconvn:
  228. result:=is_loop_invariant(loop,ttypeconvnode(expr).left);
  229. end;
  230. end;
  231. { checks if the strength of n can be recuded, arg is the tforloop being considered }
  232. function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
  233. function findpreviousstrengthreduction : boolean;
  234. var
  235. i : longint;
  236. begin
  237. result:=false;
  238. for i:=0 to inductionexprs.count-1 do
  239. begin
  240. { do we already maintain one expression? }
  241. if tnode(inductionexprs[i]).isequal(n) then
  242. begin
  243. n.free;
  244. case n.nodetype of
  245. muln:
  246. n:=ctemprefnode.create(ttempcreatenode(templist[i]));
  247. vecn:
  248. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(
  249. ttempcreatenode(templist[i]))),n.resultdef);
  250. else
  251. internalerror(200809211);
  252. end;
  253. result:=true;
  254. exit;
  255. end;
  256. end;
  257. end;
  258. procedure CreateNodes;
  259. begin
  260. if not assigned(initcode) then
  261. begin
  262. initcode:=internalstatements(initcodestatements);
  263. calccode:=internalstatements(calccodestatements);
  264. deletecode:=internalstatements(deletecodestatements);
  265. end;
  266. end;
  267. var
  268. tempnode : ttempcreatenode;
  269. dummy : longint;
  270. begin
  271. result:=fen_false;
  272. case n.nodetype of
  273. forn:
  274. { inform for loop search routine, that it needs to search more deeply }
  275. containsnestedforloop:=true;
  276. muln:
  277. begin
  278. if (taddnode(n).right.nodetype=loadn) and
  279. taddnode(n).right.isequal(tfornode(arg).left) and
  280. { plain read of the loop variable? }
  281. not(nf_write in taddnode(n).right.flags) and
  282. not(nf_modify in taddnode(n).right.flags) and
  283. is_loop_invariant(tfornode(arg),taddnode(n).left) and
  284. { for now, we can handle only constant lower borders }
  285. is_constnode(tfornode(arg).right) then
  286. taddnode(n).swapleftright;
  287. if (taddnode(n).left.nodetype=loadn) and
  288. taddnode(n).left.isequal(tfornode(arg).left) and
  289. { plain read of the loop variable? }
  290. not(nf_write in taddnode(n).left.flags) and
  291. not(nf_modify in taddnode(n).left.flags) and
  292. is_loop_invariant(tfornode(arg),taddnode(n).right) and
  293. { for now, we can handle only constant lower borders }
  294. is_constnode(tfornode(arg).right) then
  295. begin
  296. changedforloop:=true;
  297. { did we use the same expression before already? }
  298. if not(findpreviousstrengthreduction) then
  299. begin
  300. tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
  301. tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
  302. templist.Add(tempnode);
  303. inductionexprs.Add(n);
  304. CreateNodes;
  305. if lnf_backward in tfornode(arg).loopflags then
  306. addstatement(calccodestatements,
  307. geninlinenode(in_dec_x,false,
  308. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
  309. else
  310. addstatement(calccodestatements,
  311. geninlinenode(in_inc_x,false,
  312. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
  313. addstatement(initcodestatements,tempnode);
  314. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  315. caddnode.create(muln,
  316. caddnode.create(subn,tfornode(arg).right.getcopy,cordconstnode.create(1,tfornode(arg).right.resultdef,false)),
  317. taddnode(n).right.getcopy)
  318. )
  319. );
  320. { finally replace the node by a temp. ref }
  321. n:=ctemprefnode.create(tempnode);
  322. { ... and add a temp. release node }
  323. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  324. end;
  325. { set types }
  326. do_firstpass(n);
  327. result:=fen_norecurse_false;
  328. end;
  329. end;
  330. vecn:
  331. begin
  332. { is the index the counter variable? }
  333. if not(is_special_array(tvecnode(n).left.resultdef)) and
  334. (tvecnode(n).right.isequal(tfornode(arg).left) or
  335. { fpc usually creates a type cast to access an array }
  336. ((tvecnode(n).right.nodetype=typeconvn) and
  337. ttypeconvnode(tvecnode(n).right).left.isequal(tfornode(arg).left)
  338. )
  339. ) and
  340. { plain read of the loop variable? }
  341. not(nf_write in tvecnode(n).right.flags) and
  342. not(nf_modify in tvecnode(n).right.flags) and
  343. { direct array access? }
  344. ((tvecnode(n).left.nodetype=loadn) or
  345. { ... or loop invariant expression? }
  346. is_loop_invariant(tfornode(arg),tvecnode(n).left)) and
  347. { removing the multiplication is only worth the
  348. effort if it's not a simple shift }
  349. not(ispowerof2(tcgvecnode(n).get_mul_size,dummy)) then
  350. begin
  351. changedforloop:=true;
  352. { did we use the same expression before already? }
  353. if not(findpreviousstrengthreduction) then
  354. begin
  355. tempnode:=ctempcreatenode.create(voidpointertype,voidpointertype.size,tt_persistent,true);
  356. templist.Add(tempnode);
  357. inductionexprs.Add(n);
  358. CreateNodes;
  359. if lnf_backward in tfornode(arg).loopflags then
  360. addstatement(calccodestatements,
  361. geninlinenode(in_dec_x,false,
  362. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  363. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))))
  364. else
  365. addstatement(calccodestatements,
  366. geninlinenode(in_inc_x,false,
  367. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  368. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))));
  369. addstatement(initcodestatements,tempnode);
  370. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  371. caddrnode.create(
  372. cvecnode.create(tvecnode(n).left.getcopy,tfornode(arg).right.getcopy)
  373. )
  374. ));
  375. { finally replace the node by a temp. ref }
  376. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(tempnode)),n.resultdef);
  377. { ... and add a temp. release node }
  378. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  379. end;
  380. { set types }
  381. do_firstpass(n);
  382. result:=fen_norecurse_false;
  383. end;
  384. end;
  385. end;
  386. end;
  387. function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;
  388. var
  389. loopcode : tblocknode;
  390. loopcodestatements,
  391. newcodestatements : tstatementnode;
  392. fornode : tfornode;
  393. begin
  394. result:=nil;
  395. if node.nodetype<>forn then
  396. exit;
  397. templist:=TFPList.Create;
  398. inductionexprs:=TFPList.Create;
  399. initcode:=nil;
  400. calccode:=nil;
  401. deletecode:=nil;
  402. initcodestatements:=nil;
  403. calccodestatements:=nil;
  404. deletecodestatements:=nil;
  405. { find all expressions being candidates for strength reduction
  406. and replace them }
  407. foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
  408. { clue everything together }
  409. if assigned(initcode) then
  410. begin
  411. do_firstpass(tnode(initcode));
  412. do_firstpass(tnode(calccode));
  413. do_firstpass(tnode(deletecode));
  414. { create a new for node, the old one will be released by the compiler }
  415. with tfornode(node) do
  416. begin
  417. fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
  418. left:=nil;
  419. right:=nil;
  420. t1:=nil;
  421. t2:=nil;
  422. end;
  423. node:=fornode;
  424. loopcode:=internalstatements(loopcodestatements);
  425. addstatement(loopcodestatements,calccode);
  426. addstatement(loopcodestatements,tfornode(node).t2);
  427. tfornode(node).t2:=loopcode;
  428. do_firstpass(node);
  429. result:=internalstatements(newcodestatements);
  430. addstatement(newcodestatements,initcode);
  431. addstatement(newcodestatements,node);
  432. addstatement(newcodestatements,deletecode);
  433. end;
  434. templist.Free;
  435. inductionexprs.Free;
  436. end;
  437. function iterforloops(var n: tnode; arg: pointer): foreachnoderesult;
  438. var
  439. hp : tnode;
  440. begin
  441. Result:=fen_false;
  442. if n.nodetype=forn then
  443. begin
  444. { do we have DFA available? }
  445. if pi_dfaavailable in current_procinfo.flags then
  446. begin
  447. CalcDefSum(n);
  448. end;
  449. containsnestedforloop:=false;
  450. hp:=OptimizeInductionVariablesSingleForLoop(n);
  451. if assigned(hp) then
  452. begin
  453. n.Free;
  454. n:=hp;
  455. end;
  456. { can we avoid further searching? }
  457. if not(containsnestedforloop) then
  458. Result:=fen_norecurse_false;
  459. end;
  460. end;
  461. function OptimizeInductionVariables(node : tnode) : boolean;
  462. begin
  463. changedforloop:=false;
  464. foreachnodestatic(pm_postprocess,node,@iterforloops,nil);
  465. Result:=changedforloop;
  466. end;
  467. end.