optloop.pas 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. {
  2. Loop optimization
  3. Copyright (c) 2005 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optloop;
  18. {$i fpcdefs.inc}
  19. { $define DEBUG_OPTSTRENGTH}
  20. interface
  21. uses
  22. node;
  23. function unroll_loop(node : tnode) : tnode;
  24. function OptimizeInductionVariables(node : tnode) : boolean;
  25. implementation
  26. uses
  27. cutils,cclasses,compinnr,
  28. globtype,globals,constexp,
  29. verbose,
  30. symdef,symsym,
  31. defutil,
  32. cpuinfo,
  33. nutils,
  34. nadd,nbas,nflw,ncon,ninl,ncal,nld,nmem,ncnv,
  35. ncgmem,
  36. pass_1,
  37. optbase,optutils,
  38. procinfo;
  39. function number_unrolls(node : tnode) : cardinal;
  40. begin
  41. { calculate how often a loop shall be unrolled.
  42. The term (60*ord(node_count_weighted(node)<15)) is used to get small loops unrolled more often as
  43. the counter management takes more time in this case. }
  44. {$ifdef i386}
  45. { multiply by 2 for CPUs with a long pipeline }
  46. if current_settings.optimizecputype in [cpu_Pentium4] then
  47. number_unrolls:=trunc(round((60+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)))
  48. else
  49. {$endif i386}
  50. number_unrolls:=trunc(round((30+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)));
  51. if number_unrolls=0 then
  52. number_unrolls:=1;
  53. end;
  54. type
  55. treplaceinfo = record
  56. node : tnode;
  57. value : Tconstexprint;
  58. end;
  59. preplaceinfo = ^treplaceinfo;
  60. function checkcontrollflowstatements(var n:tnode; arg: pointer): foreachnoderesult;
  61. begin
  62. if n.nodetype in [breakn,continuen,goton,labeln,exitn,raisen] then
  63. result:=fen_norecurse_true
  64. else
  65. result:=fen_false;
  66. end;
  67. function replaceloadnodes(var n: tnode; arg: pointer): foreachnoderesult;
  68. begin
  69. if n.isequal(preplaceinfo(arg)^.node) then
  70. begin
  71. if n.flags*[nf_modify,nf_write,nf_address_taken]<>[] then
  72. internalerror(2012090402);
  73. n.free;
  74. n:=cordconstnode.create(preplaceinfo(arg)^.value,preplaceinfo(arg)^.node.resultdef,false);
  75. do_firstpass(n);
  76. end;
  77. result:=fen_false;
  78. end;
  79. function unroll_loop(node : tnode) : tnode;
  80. var
  81. unrolls,i : cardinal;
  82. counts : qword;
  83. unrollstatement,newforstatement : tstatementnode;
  84. unrollblock : tblocknode;
  85. getridoffor : boolean;
  86. replaceinfo : treplaceinfo;
  87. hascontrollflowstatements : boolean;
  88. begin
  89. result:=nil;
  90. if (cs_opt_size in current_settings.optimizerswitches) then
  91. exit;
  92. if not(node.nodetype in [forn]) then
  93. exit;
  94. unrolls:=number_unrolls(tfornode(node).t2);
  95. if (unrolls>1) and
  96. ((tfornode(node).left.nodetype<>loadn) or
  97. { the address of the counter variable might be taken if it is passed by constref to a
  98. subroutine, so really check if it is not taken }
  99. ((tfornode(node).left.nodetype=loadn) and (tloadnode(tfornode(node).left).symtableentry is tabstractvarsym) and
  100. not(tabstractvarsym(tloadnode(tfornode(node).left).symtableentry).addr_taken) and
  101. not(tabstractvarsym(tloadnode(tfornode(node).left).symtableentry).different_scope))
  102. ) then
  103. begin
  104. { number of executions known? }
  105. if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
  106. begin
  107. if lnf_backward in tfornode(node).loopflags then
  108. counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
  109. else
  110. counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
  111. hascontrollflowstatements:=foreachnodestatic(tfornode(node).t2,@checkcontrollflowstatements,nil);
  112. { don't unroll more than we need,
  113. multiply unroll by two here because we can get rid
  114. of the counter variable completely and replace it by a constant
  115. if unrolls=counts }
  116. if unrolls*2>=counts then
  117. unrolls:=counts;
  118. { create block statement }
  119. unrollblock:=internalstatements(unrollstatement);
  120. { can we get rid completly of the for ? }
  121. getridoffor:=(unrolls=counts) and not(hascontrollflowstatements) and
  122. { TP/Macpas allows assignments to the for-variables, so we cannot get rid of the for }
  123. ([m_tp7,m_mac]*current_settings.modeswitches=[]);
  124. if getridoffor then
  125. begin
  126. replaceinfo.node:=tfornode(node).left;
  127. replaceinfo.value:=tordconstnode(tfornode(node).right).value;
  128. end
  129. else
  130. { we consider currently unrolling not beneficial, if we cannot get rid of the for completely, this
  131. might change if a more sophisticated heuristics is used (FK) }
  132. exit;
  133. { let's unroll (and rock of course) }
  134. for i:=1 to unrolls do
  135. begin
  136. { create and insert copy of the statement block }
  137. addstatement(unrollstatement,tfornode(node).t2.getcopy);
  138. { set and insert entry label? }
  139. if (counts mod unrolls<>0) and
  140. ((counts mod unrolls)=unrolls-i) then
  141. begin
  142. tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,clabelsym.create('$optunrol'));
  143. addstatement(unrollstatement,tfornode(node).entrylabel);
  144. end;
  145. if getridoffor then
  146. begin
  147. foreachnodestatic(tnode(unrollstatement),@replaceloadnodes,@replaceinfo);
  148. if lnf_backward in tfornode(node).loopflags then
  149. replaceinfo.value:=replaceinfo.value-1
  150. else
  151. replaceinfo.value:=replaceinfo.value+1;
  152. end
  153. else
  154. begin
  155. { for itself increases at the last iteration }
  156. if i<unrolls then
  157. begin
  158. { insert incr/decrementation of counter var }
  159. if lnf_backward in tfornode(node).loopflags then
  160. addstatement(unrollstatement,
  161. geninlinenode(in_dec_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)))
  162. else
  163. addstatement(unrollstatement,
  164. geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
  165. end;
  166. end;
  167. end;
  168. { can we get rid of the for statement? }
  169. if getridoffor then
  170. begin
  171. { create block statement }
  172. result:=internalstatements(newforstatement);
  173. addstatement(newforstatement,unrollblock);
  174. doinlinesimplify(result);
  175. end;
  176. end
  177. else
  178. begin
  179. { unrolling is a little bit more tricky if we don't know the
  180. loop count at compile time, but the solution is to use a jump table
  181. which is indexed by "loop count mod unrolls" at run time and which
  182. jumps then at the appropriate place inside the loop. Because
  183. a module division is expensive, we can use only unroll counts dividable
  184. by 2 }
  185. case unrolls of
  186. 1..2:
  187. ;
  188. 3:
  189. unrolls:=2;
  190. 4..7:
  191. unrolls:=4;
  192. { unrolls>4 already make no sense imo, but who knows (FK) }
  193. 8..15:
  194. unrolls:=8;
  195. 16..31:
  196. unrolls:=16;
  197. 32..63:
  198. unrolls:=32;
  199. 64..$7fff:
  200. unrolls:=64;
  201. else
  202. exit;
  203. end;
  204. { we don't handle this yet }
  205. exit;
  206. end;
  207. if not(assigned(result)) then
  208. begin
  209. tfornode(node).t2.free;
  210. tfornode(node).t2:=unrollblock;
  211. end;
  212. end;
  213. end;
  214. var
  215. initcode,
  216. calccode,
  217. deletecode : tblocknode;
  218. initcodestatements,
  219. calccodestatements,
  220. deletecodestatements: tstatementnode;
  221. templist : tfplist;
  222. inductionexprs : tfplist;
  223. changedforloop,
  224. containsnestedforloop : boolean;
  225. function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
  226. begin
  227. result:=is_constnode(expr);
  228. case expr.nodetype of
  229. loadn:
  230. begin
  231. if (pi_dfaavailable in current_procinfo.flags) and
  232. assigned(loop.optinfo) and
  233. assigned(expr.optinfo) then
  234. { no aliasing? }
  235. result:=not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and
  236. { no definition in the loop? }
  237. not(DFASetIn(tfornode(loop).t2.optinfo^.defsum,expr.optinfo^.index));
  238. end;
  239. vecn:
  240. begin
  241. result:=((tvecnode(expr).left.nodetype=loadn) or is_loop_invariant(loop,tvecnode(expr).left)) and
  242. is_loop_invariant(loop,tvecnode(expr).right);
  243. end;
  244. typeconvn:
  245. result:=is_loop_invariant(loop,ttypeconvnode(expr).left);
  246. addn,subn:
  247. result:=is_loop_invariant(loop,taddnode(expr).left) and is_loop_invariant(loop,taddnode(expr).right);
  248. else
  249. ;
  250. end;
  251. end;
  252. { checks if the strength of n can be recuded, arg is the tforloop being considered }
  253. function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
  254. function findpreviousstrengthreduction : boolean;
  255. var
  256. i : longint;
  257. hp : tnode;
  258. begin
  259. result:=false;
  260. for i:=0 to inductionexprs.count-1 do
  261. begin
  262. { do we already maintain one expression? }
  263. if tnode(inductionexprs[i]).isequal(n) then
  264. begin
  265. case n.nodetype of
  266. muln:
  267. hp:=ctemprefnode.create(ttempcreatenode(templist[i]));
  268. vecn:
  269. hp:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(
  270. ttempcreatenode(templist[i]))),n.resultdef);
  271. else
  272. internalerror(200809211);
  273. end;
  274. n.free;
  275. n:=hp;
  276. result:=true;
  277. exit;
  278. end;
  279. end;
  280. end;
  281. procedure CreateNodes;
  282. begin
  283. if not assigned(initcode) then
  284. begin
  285. initcode:=internalstatements(initcodestatements);
  286. calccode:=internalstatements(calccodestatements);
  287. deletecode:=internalstatements(deletecodestatements);
  288. end;
  289. end;
  290. var
  291. tempnode : ttempcreatenode;
  292. dummy : longint;
  293. begin
  294. result:=fen_false;
  295. case n.nodetype of
  296. forn:
  297. { inform for loop search routine, that it needs to search more deeply }
  298. containsnestedforloop:=true;
  299. muln:
  300. begin
  301. if (taddnode(n).right.nodetype=loadn) and
  302. taddnode(n).right.isequal(tfornode(arg).left) and
  303. { plain read of the loop variable? }
  304. not(nf_write in taddnode(n).right.flags) and
  305. not(nf_modify in taddnode(n).right.flags) and
  306. is_loop_invariant(tfornode(arg),taddnode(n).left) and
  307. { for now, we can handle only constant lower borders }
  308. is_constnode(tfornode(arg).right) then
  309. taddnode(n).swapleftright;
  310. if (taddnode(n).left.nodetype=loadn) and
  311. taddnode(n).left.isequal(tfornode(arg).left) and
  312. { plain read of the loop variable? }
  313. not(nf_write in taddnode(n).left.flags) and
  314. not(nf_modify in taddnode(n).left.flags) and
  315. is_loop_invariant(tfornode(arg),taddnode(n).right) and
  316. { for now, we can handle only constant lower borders }
  317. is_constnode(tfornode(arg).right) then
  318. begin
  319. changedforloop:=true;
  320. { did we use the same expression before already? }
  321. if not(findpreviousstrengthreduction) then
  322. begin
  323. tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
  324. tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
  325. templist.Add(tempnode);
  326. inductionexprs.Add(n);
  327. CreateNodes;
  328. if lnf_backward in tfornode(arg).loopflags then
  329. addstatement(calccodestatements,
  330. geninlinenode(in_dec_x,false,
  331. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
  332. else
  333. addstatement(calccodestatements,
  334. geninlinenode(in_inc_x,false,
  335. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
  336. addstatement(initcodestatements,tempnode);
  337. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  338. caddnode.create(muln,tfornode(arg).right.getcopy,
  339. taddnode(n).right.getcopy)
  340. )
  341. );
  342. { finally replace the node by a temp. ref }
  343. n:=ctemprefnode.create(tempnode);
  344. { ... and add a temp. release node }
  345. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  346. end;
  347. { set types }
  348. do_firstpass(n);
  349. result:=fen_norecurse_false;
  350. end;
  351. end;
  352. vecn:
  353. begin
  354. { is the index the counter variable? }
  355. if not(is_special_array(tvecnode(n).left.resultdef)) and
  356. not(is_packed_array(tvecnode(n).left.resultdef)) and
  357. (tvecnode(n).right.isequal(tfornode(arg).left) or
  358. { fpc usually creates a type cast to access an array }
  359. ((tvecnode(n).right.nodetype=typeconvn) and
  360. ttypeconvnode(tvecnode(n).right).left.isequal(tfornode(arg).left)
  361. )
  362. ) and
  363. { plain read of the loop variable? }
  364. not(nf_write in tvecnode(n).right.flags) and
  365. not(nf_modify in tvecnode(n).right.flags) and
  366. { direct array access? }
  367. ((tvecnode(n).left.nodetype=loadn) or
  368. { ... or loop invariant expression? }
  369. is_loop_invariant(tfornode(arg),tvecnode(n).right)) and
  370. { removing the multiplication is only worth the
  371. effort if it's not a simple shift }
  372. not(ispowerof2(tcgvecnode(n).get_mul_size,dummy)) then
  373. begin
  374. changedforloop:=true;
  375. { did we use the same expression before already? }
  376. if not(findpreviousstrengthreduction) then
  377. begin
  378. {$ifdef DEBUG_OPTSTRENGTH}
  379. writeln('**********************************************************************************');
  380. writeln('Found expression for strength reduction: ');
  381. printnode(n);
  382. writeln('**********************************************************************************');
  383. {$endif DEBUG_OPTSTRENGTH}
  384. tempnode:=ctempcreatenode.create(voidpointertype,voidpointertype.size,tt_persistent,true);
  385. templist.Add(tempnode);
  386. inductionexprs.Add(n);
  387. CreateNodes;
  388. if lnf_backward in tfornode(arg).loopflags then
  389. addstatement(calccodestatements,
  390. cinlinenode.createintern(in_dec_x,false,
  391. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  392. cordconstnode.create(tcgvecnode(n).get_mul_size,sizeuinttype,false),nil))))
  393. else
  394. addstatement(calccodestatements,
  395. cinlinenode.createintern(in_inc_x,false,
  396. ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
  397. cordconstnode.create(tcgvecnode(n).get_mul_size,sizeuinttype,false),nil))));
  398. addstatement(initcodestatements,tempnode);
  399. addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
  400. caddrnode.create(
  401. cvecnode.create(tvecnode(n).left.getcopy,tfornode(arg).right.getcopy)
  402. )
  403. ));
  404. { finally replace the node by a temp. ref }
  405. n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(tempnode)),n.resultdef);
  406. { ... and add a temp. release node }
  407. addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
  408. end;
  409. { set types }
  410. do_firstpass(n);
  411. result:=fen_norecurse_false;
  412. end;
  413. end;
  414. else
  415. ;
  416. end;
  417. end;
  418. function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;
  419. var
  420. loopcode : tblocknode;
  421. loopcodestatements,
  422. newcodestatements : tstatementnode;
  423. fornode : tfornode;
  424. begin
  425. result:=nil;
  426. if node.nodetype<>forn then
  427. exit;
  428. templist:=TFPList.Create;
  429. inductionexprs:=TFPList.Create;
  430. initcode:=nil;
  431. calccode:=nil;
  432. deletecode:=nil;
  433. initcodestatements:=nil;
  434. calccodestatements:=nil;
  435. deletecodestatements:=nil;
  436. { find all expressions being candidates for strength reduction
  437. and replace them }
  438. foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
  439. { clue everything together }
  440. if assigned(initcode) then
  441. begin
  442. do_firstpass(tnode(initcode));
  443. do_firstpass(tnode(calccode));
  444. do_firstpass(tnode(deletecode));
  445. { create a new for node, the old one will be released by the compiler }
  446. with tfornode(node) do
  447. begin
  448. fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
  449. left:=nil;
  450. right:=nil;
  451. t1:=nil;
  452. t2:=nil;
  453. end;
  454. node:=fornode;
  455. loopcode:=internalstatements(loopcodestatements);
  456. addstatement(loopcodestatements,tfornode(node).t2);
  457. tfornode(node).t2:=loopcode;
  458. do_firstpass(node);
  459. addstatement(loopcodestatements,calccode);
  460. result:=internalstatements(newcodestatements);
  461. addstatement(newcodestatements,initcode);
  462. initcode:=nil;
  463. addstatement(newcodestatements,node);
  464. addstatement(newcodestatements,deletecode);
  465. end;
  466. templist.Free;
  467. inductionexprs.Free;
  468. end;
  469. function iterforloops(var n: tnode; arg: pointer): foreachnoderesult;
  470. var
  471. hp : tnode;
  472. begin
  473. Result:=fen_false;
  474. if n.nodetype=forn then
  475. begin
  476. { do we have DFA available? }
  477. if pi_dfaavailable in current_procinfo.flags then
  478. begin
  479. CalcDefSum(tfornode(n).t2);
  480. end;
  481. containsnestedforloop:=false;
  482. hp:=OptimizeInductionVariablesSingleForLoop(n);
  483. if assigned(hp) then
  484. begin
  485. n.Free;
  486. n:=hp;
  487. end;
  488. { can we avoid further searching? }
  489. if not(containsnestedforloop) then
  490. Result:=fen_norecurse_false;
  491. end;
  492. end;
  493. function OptimizeInductionVariables(node : tnode) : boolean;
  494. begin
  495. changedforloop:=false;
  496. foreachnodestatic(pm_postprocess,node,@iterforloops,nil);
  497. Result:=changedforloop;
  498. end;
  499. end.