optloop.pas 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. {
  2. Loop optimization
  3. Copyright (c) 2005 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optloop;
  18. {$i fpcdefs.inc}
  19. { $define DEBUG_OPTSTRENGTH}
  20. interface
  21. uses
  22. node;
  23. function unroll_loop(node : tnode) : tnode;
  24. function OptimizeInductionVariables(node : tnode) : boolean;
  25. implementation
  26. uses
  27. cutils,cclasses,
  28. globtype,globals,constexp,
  29. verbose,
  30. symdef,symsym,
  31. defutil,
  32. cpuinfo,
  33. nutils,
  34. nadd,nbas,nflw,ncon,ninl,ncal,nld,nmem,ncnv,
  35. ncgmem,
  36. pass_1,
  37. optbase,optutils,
  38. procinfo;
  39. function number_unrolls(node : tnode) : cardinal;
  40. begin
  41. {$ifdef i386}
  42. { multiply by 2 for CPUs with a long pipeline }
  43. if current_settings.optimizecputype in [cpu_Pentium4] then
  44. number_unrolls:=60 div node_count(node)
  45. else
  46. {$endif i386}
  47. number_unrolls:=30 div node_count(node);
  48. if number_unrolls=0 then
  49. number_unrolls:=1;
  50. end;
  51. type
  52. treplaceinfo = record
  53. node : tnode;
  54. value : Tconstexprint;
  55. end;
  56. preplaceinfo = ^treplaceinfo;
  57. function checkbreakcontinue(var n:tnode; arg: pointer): foreachnoderesult;
  58. begin
  59. if n.nodetype in [breakn,continuen] then
  60. result:=fen_norecurse_true
  61. else
  62. result:=fen_false;
  63. end;
  64. function replaceloadnodes(var n: tnode; arg: pointer): foreachnoderesult;
  65. begin
  66. if ((n.nodetype=loadn) and (preplaceinfo(arg)^.node.nodetype=loadn) and
  67. (tloadnode(n).symtableentry=tloadnode(preplaceinfo(arg)^.node).symtableentry)) or
  68. ((n.nodetype=temprefn) and (preplaceinfo(arg)^.node.nodetype=temprefn) and
  69. (ttemprefnode(n).tempinfo=ttemprefnode(preplaceinfo(arg)^.node).tempinfo)) then
  70. begin
  71. if n.flags*[nf_modify,nf_write,nf_address_taken]<>[] then
  72. internalerror(2012090402);
  73. n.free;
  74. n:=cordconstnode.create(preplaceinfo(arg)^.value,preplaceinfo(arg)^.node.resultdef,false);
  75. end;
  76. result:=fen_false;
  77. end;
  78. function unroll_loop(node : tnode) : tnode;
  79. var
  80. unrolls,i : cardinal;
  81. counts : qword;
  82. unrollstatement,newforstatement : tstatementnode;
  83. unrollblock : tblocknode;
  84. getridoffor : boolean;
  85. replaceinfo : treplaceinfo;
  86. usesbreakcontinue : boolean;
  87. begin
  88. result:=nil;
  89. if (cs_opt_size in current_settings.optimizerswitches) then
  90. exit;
  91. if not(node.nodetype in [forn]) then
  92. exit;
  93. unrolls:=number_unrolls(tfornode(node).t2);
  94. if unrolls>1 then
  95. begin
  96. { number of executions known? }
  97. if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
  98. begin
  99. if lnf_backward in tfornode(node).loopflags then
  100. counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
  101. else
  102. counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
  103. usesbreakcontinue:=foreachnodestatic(tfornode(node).t2,@checkbreakcontinue,nil);
  104. { don't unroll more than we need,
  105. multiply unroll by two here because we can get rid
  106. of the counter variable completely and replace it by a constant
  107. if unrolls=counts }
  108. if unrolls*2>counts then
  109. unrolls:=counts;
  110. { create block statement }
  111. unrollblock:=internalstatements(unrollstatement);
  112. { can we get rid completly of the for ? }
  113. getridoffor:=(unrolls=counts) and not(usesbreakcontinue);
  114. if getridoffor then
  115. begin
  116. if not(tfornode(node).left.nodetype in [temprefn,loadn]) then
  117. internalerror(2012090301);
  118. replaceinfo.node:=tfornode(node).left;
  119. replaceinfo.value:=tordconstnode(tfornode(node).right).value;
  120. end;
  121. { let's unroll (and rock of course) }
  122. for i:=1 to unrolls do
  123. begin
  124. { create and insert copy of the statement block }
  125. addstatement(unrollstatement,tfornode(node).t2.getcopy);
  126. { set and insert entry label? }
  127. if (counts mod unrolls<>0) and
  128. ((counts mod unrolls)=unrolls-i) then
  129. begin
  130. tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,clabelsym.create('$optunrol'));
  131. addstatement(unrollstatement,tfornode(node).entrylabel);
  132. end;
  133. if getridoffor then
  134. begin
  135. foreachnodestatic(tnode(unrollstatement),@replaceloadnodes,@replaceinfo);
  136. if lnf_backward in tfornode(node).loopflags then
  137. replaceinfo.value:=replaceinfo.value-1
  138. else
  139. replaceinfo.value:=replaceinfo.value+1;
  140. end
  141. else
  142. begin
  143. { for itself increases at the last iteration }
  144. if i<unrolls then
  145. begin
  146. { insert incr/decrementation of counter var }
  147. if lnf_backward in tfornode(node).loopflags then
  148. addstatement(unrollstatement,
  149. geninlinenode(in_dec_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)))
  150. else
  151. addstatement(unrollstatement,
  152. geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
  153. end;
  154. end;
  155. end;
  156. { can we get rid of the for statement? }
  157. if getridoffor then
  158. begin
  159. { create block statement }
  160. result:=internalstatements(newforstatement);
  161. addstatement(newforstatement,unrollblock);
  162. end;
  163. end
  164. else
  165. begin
  166. { unrolling is a little bit more tricky if we don't know the
  167. loop count at compile time, but the solution is to use a jump table
  168. which is indexed by "loop count mod unrolls" at run time and which
  169. jumps then at the appropriate place inside the loop. Because
  170. a module division is expensive, we can use only unroll counts dividable
  171. by 2 }
  172. case unrolls of
  173. 1..2:
  174. ;
  175. 3:
  176. unrolls:=2;
  177. 4..7:
  178. unrolls:=4;
  179. { unrolls>4 already make no sense imo, but who knows (FK) }
  180. 8..15:
  181. unrolls:=8;
  182. 16..31:
  183. unrolls:=16;
  184. 32..63:
  185. unrolls:=32;
  186. 64..$7fff:
  187. unrolls:=64;
  188. else
  189. exit;
  190. end;
  191. { we don't handle this yet }
  192. exit;
  193. end;
  194. if not(assigned(result)) then
  195. begin
  196. tfornode(node).t2.free;
  197. tfornode(node).t2:=unrollblock;
  198. end;
  199. end;
  200. end;
  201. var
  202. initcode,
  203. calccode,
  204. deletecode : tblocknode;
  205. initcodestatements,
  206. calccodestatements,
  207. deletecodestatements: tstatementnode;
  208. templist : tfplist;
  209. inductionexprs : tfplist;
  210. changedforloop,
  211. containsnestedforloop : boolean;
  212. function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
  213. begin
  214. result:=is_constnode(expr);
  215. case expr.nodetype of
  216. loadn:
  217. begin
  218. if (pi_dfaavailable in current_procinfo.flags) and
  219. assigned(loop.optinfo) and
  220. assigned(expr.optinfo) then
  221. { no aliasing? }
  222. result:=not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and
  223. { no definition in the loop? }
  224. not(DFASetIn(loop.optinfo^.defsum,expr.optinfo^.index));
  225. end;
  226. vecn:
  227. begin
  228. result:=((tvecnode(expr).left.nodetype=loadn) or is_loop_invariant(loop,tvecnode(expr).left)) and
  229. is_loop_invariant(loop,tvecnode(expr).right);
  230. end;
  231. typeconvn:
  232. result:=is_loop_invariant(loop,ttypeconvnode(expr).left);
  233. end;
  234. end;
  235. { checks if the strength of n can be recuded, arg is the tforloop being considered }
  236. function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
  237. function findpreviousstrengthreduction : boolean;
  238. var
  239. i : longint;
  240. begin
  241. result:=false;
  242. for i:=0 to inductionexprs.count-1 do
  243. begin
  244. { do we already maintain one expression? }
  245. if tnode(inductionexprs[i]).isequal(n) then
  246. begin
  247. n.free;
  248. case n.nodetype of
  249. muln:
  250. n:=ctemprefnode.create(ttempcreatenode(templist[i]));
  251. vecn:
  252. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(
  253. ttempcreatenode(templist[i]))),n.resultdef);
  254. else
  255. internalerror(200809211);
  256. end;
  257. result:=true;
  258. exit;
  259. end;
  260. end;
  261. end;
  262. procedure CreateNodes;
  263. begin
  264. if not assigned(initcode) then
  265. begin
  266. initcode:=internalstatements(initcodestatements);
  267. calccode:=internalstatements(calccodestatements);
  268. deletecode:=internalstatements(deletecodestatements);
  269. end;
  270. end;
  271. var
  272. tempnode : ttempcreatenode;
  273. dummy : longint;
  274. begin
  275. result:=fen_false;
  276. case n.nodetype of
  277. forn:
  278. { inform for loop search routine, that it needs to search more deeply }
  279. containsnestedforloop:=true;
  280. muln:
  281. begin
  282. if (taddnode(n).right.nodetype=loadn) and
  283. taddnode(n).right.isequal(tfornode(arg).left) and
  284. { plain read of the loop variable? }
  285. not(nf_write in taddnode(n).right.flags) and
  286. not(nf_modify in taddnode(n).right.flags) and
  287. is_loop_invariant(tfornode(arg),taddnode(n).left) and
  288. { for now, we can handle only constant lower borders }
  289. is_constnode(tfornode(arg).right) then
  290. taddnode(n).swapleftright;
  291. if (taddnode(n).left.nodetype=loadn) and
  292. taddnode(n).left.isequal(tfornode(arg).left) and
  293. { plain read of the loop variable? }
  294. not(nf_write in taddnode(n).left.flags) and
  295. not(nf_modify in taddnode(n).left.flags) and
  296. is_loop_invariant(tfornode(arg),taddnode(n).right) and
  297. { for now, we can handle only constant lower borders }
  298. is_constnode(tfornode(arg).right) then
  299. begin
  300. changedforloop:=true;
  301. { did we use the same expression before already? }
  302. if not(findpreviousstrengthreduction) then
  303. begin
  304. tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
  305. tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
  306. templist.Add(tempnode);
  307. inductionexprs.Add(n);
  308. CreateNodes;
  309. if lnf_backward in tfornode(arg).loopflags then
  310. addstatement(calccodestatements,
  311. geninlinenode(in_dec_x,false,
  312. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
  313. else
  314. addstatement(calccodestatements,
  315. geninlinenode(in_inc_x,false,
  316. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
  317. addstatement(initcodestatements,tempnode);
  318. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  319. caddnode.create(muln,
  320. caddnode.create(subn,tfornode(arg).right.getcopy,cordconstnode.create(1,tfornode(arg).right.resultdef,false)),
  321. taddnode(n).right.getcopy)
  322. )
  323. );
  324. { finally replace the node by a temp. ref }
  325. n:=ctemprefnode.create(tempnode);
  326. { ... and add a temp. release node }
  327. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  328. end;
  329. { set types }
  330. do_firstpass(n);
  331. result:=fen_norecurse_false;
  332. end;
  333. end;
  334. vecn:
  335. begin
  336. { is the index the counter variable? }
  337. if not(is_special_array(tvecnode(n).left.resultdef)) and
  338. (tvecnode(n).right.isequal(tfornode(arg).left) or
  339. { fpc usually creates a type cast to access an array }
  340. ((tvecnode(n).right.nodetype=typeconvn) and
  341. ttypeconvnode(tvecnode(n).right).left.isequal(tfornode(arg).left)
  342. )
  343. ) and
  344. { plain read of the loop variable? }
  345. not(nf_write in tvecnode(n).right.flags) and
  346. not(nf_modify in tvecnode(n).right.flags) and
  347. { direct array access? }
  348. ((tvecnode(n).left.nodetype=loadn) or
  349. { ... or loop invariant expression? }
  350. is_loop_invariant(tfornode(arg),tvecnode(n).right)) and
  351. { removing the multiplication is only worth the
  352. effort if it's not a simple shift }
  353. not(ispowerof2(tcgvecnode(n).get_mul_size,dummy)) then
  354. begin
  355. changedforloop:=true;
  356. { did we use the same expression before already? }
  357. if not(findpreviousstrengthreduction) then
  358. begin
  359. {$ifdef DEBUG_OPTSTRENGTH}
  360. writeln('**********************************************************************************');
  361. writeln('Found expression for strength reduction: ');
  362. printnode(n);
  363. writeln('**********************************************************************************');
  364. {$endif DEBUG_OPTSTRENGTH}
  365. tempnode:=ctempcreatenode.create(voidpointertype,voidpointertype.size,tt_persistent,true);
  366. templist.Add(tempnode);
  367. inductionexprs.Add(n);
  368. CreateNodes;
  369. if lnf_backward in tfornode(arg).loopflags then
  370. addstatement(calccodestatements,
  371. geninlinenode(in_dec_x,false,
  372. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  373. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))))
  374. else
  375. addstatement(calccodestatements,
  376. geninlinenode(in_inc_x,false,
  377. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  378. cordconstnode.create(tcgvecnode(n).get_mul_size,tfornode(arg).right.resultdef,false),nil))));
  379. addstatement(initcodestatements,tempnode);
  380. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  381. caddrnode.create(
  382. cvecnode.create(tvecnode(n).left.getcopy,tfornode(arg).right.getcopy)
  383. )
  384. ));
  385. { finally replace the node by a temp. ref }
  386. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(tempnode)),n.resultdef);
  387. { ... and add a temp. release node }
  388. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  389. end;
  390. { set types }
  391. do_firstpass(n);
  392. result:=fen_norecurse_false;
  393. end;
  394. end;
  395. end;
  396. end;
  397. function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;
  398. var
  399. loopcode : tblocknode;
  400. loopcodestatements,
  401. newcodestatements : tstatementnode;
  402. fornode : tfornode;
  403. begin
  404. result:=nil;
  405. if node.nodetype<>forn then
  406. exit;
  407. templist:=TFPList.Create;
  408. inductionexprs:=TFPList.Create;
  409. initcode:=nil;
  410. calccode:=nil;
  411. deletecode:=nil;
  412. initcodestatements:=nil;
  413. calccodestatements:=nil;
  414. deletecodestatements:=nil;
  415. { find all expressions being candidates for strength reduction
  416. and replace them }
  417. foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
  418. { clue everything together }
  419. if assigned(initcode) then
  420. begin
  421. do_firstpass(tnode(initcode));
  422. do_firstpass(tnode(calccode));
  423. do_firstpass(tnode(deletecode));
  424. { create a new for node, the old one will be released by the compiler }
  425. with tfornode(node) do
  426. begin
  427. fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
  428. left:=nil;
  429. right:=nil;
  430. t1:=nil;
  431. t2:=nil;
  432. end;
  433. node:=fornode;
  434. loopcode:=internalstatements(loopcodestatements);
  435. addstatement(loopcodestatements,tfornode(node).t2);
  436. tfornode(node).t2:=loopcode;
  437. do_firstpass(node);
  438. addstatement(loopcodestatements,calccode);
  439. result:=internalstatements(newcodestatements);
  440. addstatement(newcodestatements,initcode);
  441. addstatement(newcodestatements,node);
  442. addstatement(newcodestatements,deletecode);
  443. end;
  444. templist.Free;
  445. inductionexprs.Free;
  446. end;
  447. function iterforloops(var n: tnode; arg: pointer): foreachnoderesult;
  448. var
  449. hp : tnode;
  450. begin
  451. Result:=fen_false;
  452. if n.nodetype=forn then
  453. begin
  454. { do we have DFA available? }
  455. if pi_dfaavailable in current_procinfo.flags then
  456. begin
  457. CalcDefSum(tfornode(n).t2);
  458. end;
  459. containsnestedforloop:=false;
  460. hp:=OptimizeInductionVariablesSingleForLoop(n);
  461. if assigned(hp) then
  462. begin
  463. n.Free;
  464. n:=hp;
  465. end;
  466. { can we avoid further searching? }
  467. if not(containsnestedforloop) then
  468. Result:=fen_norecurse_false;
  469. end;
  470. end;
  471. function OptimizeInductionVariables(node : tnode) : boolean;
  472. begin
  473. changedforloop:=false;
  474. foreachnodestatic(pm_postprocess,node,@iterforloops,nil);
  475. Result:=changedforloop;
  476. end;
  477. end.