123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558 |
- {
- Loop optimization
- Copyright (c) 2005 by Florian Klaempfl
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- ****************************************************************************
- }
- unit optloop;
- {$i fpcdefs.inc}
- { $define DEBUG_OPTSTRENGTH}
- interface
- uses
- node;
- function unroll_loop(node : tnode) : tnode;
- function OptimizeInductionVariables(node : tnode) : boolean;
- implementation
- uses
- cutils,cclasses,compinnr,
- globtype,globals,constexp,
- verbose,
- symdef,symsym,
- defutil,
- cpuinfo,
- nutils,
- nadd,nbas,nflw,ncon,ninl,ncal,nld,nmem,ncnv,
- ncgmem,
- pass_1,
- optbase,optutils,
- procinfo;
- function number_unrolls(node : tnode) : cardinal;
- begin
- { calculate how often a loop shall be unrolled.
- The term (60*ord(node_count_weighted(node)<15)) is used to get small loops unrolled more often as
- the counter management takes more time in this case. }
- {$ifdef i386}
- { multiply by 2 for CPUs with a long pipeline }
- if current_settings.optimizecputype in [cpu_Pentium4] then
- number_unrolls:=trunc(round((60+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)))
- else
- {$endif i386}
- number_unrolls:=trunc(round((30+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)));
- if number_unrolls=0 then
- number_unrolls:=1;
- end;
- type
- treplaceinfo = record
- node : tnode;
- value : Tconstexprint;
- end;
- preplaceinfo = ^treplaceinfo;
- function checkcontrollflowstatements(var n:tnode; arg: pointer): foreachnoderesult;
- begin
- if n.nodetype in [breakn,continuen,goton,labeln,exitn,raisen] then
- result:=fen_norecurse_true
- else
- result:=fen_false;
- end;
- function replaceloadnodes(var n: tnode; arg: pointer): foreachnoderesult;
- begin
- if n.isequal(preplaceinfo(arg)^.node) then
- begin
- if n.flags*[nf_modify,nf_write,nf_address_taken]<>[] then
- internalerror(2012090402);
- n.free;
- n:=cordconstnode.create(preplaceinfo(arg)^.value,preplaceinfo(arg)^.node.resultdef,false);
- do_firstpass(n);
- end;
- result:=fen_false;
- end;
- function unroll_loop(node : tnode) : tnode;
- var
- unrolls,i : cardinal;
- counts : qword;
- unrollstatement,newforstatement : tstatementnode;
- unrollblock : tblocknode;
- getridoffor : boolean;
- replaceinfo : treplaceinfo;
- hascontrollflowstatements : boolean;
- begin
- result:=nil;
- if (cs_opt_size in current_settings.optimizerswitches) then
- exit;
- if not(node.nodetype in [forn]) then
- exit;
- unrolls:=number_unrolls(tfornode(node).t2);
- if (unrolls>1) and
- ((tfornode(node).left.nodetype<>loadn) or
- { the address of the counter variable might be taken if it is passed by constref to a
- subroutine, so really check if it is not taken }
- ((tfornode(node).left.nodetype=loadn) and (tloadnode(tfornode(node).left).symtableentry is tabstractvarsym) and
- not(tabstractvarsym(tloadnode(tfornode(node).left).symtableentry).addr_taken) and
- not(tabstractvarsym(tloadnode(tfornode(node).left).symtableentry).different_scope))
- ) then
- begin
- { number of executions known? }
- if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then
- begin
- if lnf_backward in tfornode(node).loopflags then
- counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1
- else
- counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;
- hascontrollflowstatements:=foreachnodestatic(tfornode(node).t2,@checkcontrollflowstatements,nil);
- { don't unroll more than we need,
- multiply unroll by two here because we can get rid
- of the counter variable completely and replace it by a constant
- if unrolls=counts }
- if unrolls*2>=counts then
- unrolls:=counts;
- { create block statement }
- unrollblock:=internalstatements(unrollstatement);
- { can we get rid completly of the for ? }
- getridoffor:=(unrolls=counts) and not(hascontrollflowstatements) and
- { TP/Macpas allows assignments to the for-variables, so we cannot get rid of the for }
- ([m_tp7,m_mac]*current_settings.modeswitches=[]);
- if getridoffor then
- begin
- replaceinfo.node:=tfornode(node).left;
- replaceinfo.value:=tordconstnode(tfornode(node).right).value;
- end
- else
- { we consider currently unrolling not beneficial, if we cannot get rid of the for completely, this
- might change if a more sophisticated heuristics is used (FK) }
- exit;
- { let's unroll (and rock of course) }
- for i:=1 to unrolls do
- begin
- { create and insert copy of the statement block }
- addstatement(unrollstatement,tfornode(node).t2.getcopy);
- { set and insert entry label? }
- if (counts mod unrolls<>0) and
- ((counts mod unrolls)=unrolls-i) then
- begin
- tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,clabelsym.create('$optunrol'));
- addstatement(unrollstatement,tfornode(node).entrylabel);
- end;
- if getridoffor then
- begin
- foreachnodestatic(tnode(unrollstatement),@replaceloadnodes,@replaceinfo);
- if lnf_backward in tfornode(node).loopflags then
- replaceinfo.value:=replaceinfo.value-1
- else
- replaceinfo.value:=replaceinfo.value+1;
- end
- else
- begin
- { for itself increases at the last iteration }
- if i<unrolls then
- begin
- { insert incr/decrementation of counter var }
- if lnf_backward in tfornode(node).loopflags then
- addstatement(unrollstatement,
- geninlinenode(in_dec_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)))
- else
- addstatement(unrollstatement,
- geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));
- end;
- end;
- end;
- { can we get rid of the for statement? }
- if getridoffor then
- begin
- { create block statement }
- result:=internalstatements(newforstatement);
- addstatement(newforstatement,unrollblock);
- doinlinesimplify(result);
- end;
- end
- else
- begin
- { unrolling is a little bit more tricky if we don't know the
- loop count at compile time, but the solution is to use a jump table
- which is indexed by "loop count mod unrolls" at run time and which
- jumps then at the appropriate place inside the loop. Because
- a module division is expensive, we can use only unroll counts dividable
- by 2 }
- case unrolls of
- 1..2:
- ;
- 3:
- unrolls:=2;
- 4..7:
- unrolls:=4;
- { unrolls>4 already make no sense imo, but who knows (FK) }
- 8..15:
- unrolls:=8;
- 16..31:
- unrolls:=16;
- 32..63:
- unrolls:=32;
- 64..$7fff:
- unrolls:=64;
- else
- exit;
- end;
- { we don't handle this yet }
- exit;
- end;
- if not(assigned(result)) then
- begin
- tfornode(node).t2.free;
- tfornode(node).t2:=unrollblock;
- end;
- end;
- end;
- var
- initcode,
- calccode,
- deletecode : tblocknode;
- initcodestatements,
- calccodestatements,
- deletecodestatements: tstatementnode;
- templist : tfplist;
- inductionexprs : tfplist;
- changedforloop,
- containsnestedforloop : boolean;
- function is_loop_invariant(loop : tnode;expr : tnode) : boolean;
- begin
- result:=is_constnode(expr);
- case expr.nodetype of
- loadn:
- begin
- if (pi_dfaavailable in current_procinfo.flags) and
- assigned(loop.optinfo) and
- assigned(expr.optinfo) then
- { no aliasing? }
- result:=not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and
- { no definition in the loop? }
- not(DFASetIn(tfornode(loop).t2.optinfo^.defsum,expr.optinfo^.index));
- end;
- vecn:
- begin
- result:=((tvecnode(expr).left.nodetype=loadn) or is_loop_invariant(loop,tvecnode(expr).left)) and
- is_loop_invariant(loop,tvecnode(expr).right);
- end;
- typeconvn:
- result:=is_loop_invariant(loop,ttypeconvnode(expr).left);
- else
- ;
- end;
- end;
- { checks if the strength of n can be recuded, arg is the tforloop being considered }
- function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;
- function findpreviousstrengthreduction : boolean;
- var
- i : longint;
- hp : tnode;
- begin
- result:=false;
- for i:=0 to inductionexprs.count-1 do
- begin
- { do we already maintain one expression? }
- if tnode(inductionexprs[i]).isequal(n) then
- begin
- case n.nodetype of
- muln:
- hp:=ctemprefnode.create(ttempcreatenode(templist[i]));
- vecn:
- hp:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(
- ttempcreatenode(templist[i]))),n.resultdef);
- else
- internalerror(200809211);
- end;
- n.free;
- n:=hp;
- result:=true;
- exit;
- end;
- end;
- end;
- procedure CreateNodes;
- begin
- if not assigned(initcode) then
- begin
- initcode:=internalstatements(initcodestatements);
- calccode:=internalstatements(calccodestatements);
- deletecode:=internalstatements(deletecodestatements);
- end;
- end;
- var
- tempnode : ttempcreatenode;
- dummy : longint;
- begin
- result:=fen_false;
- case n.nodetype of
- forn:
- { inform for loop search routine, that it needs to search more deeply }
- containsnestedforloop:=true;
- muln:
- begin
- if (taddnode(n).right.nodetype=loadn) and
- taddnode(n).right.isequal(tfornode(arg).left) and
- { plain read of the loop variable? }
- not(nf_write in taddnode(n).right.flags) and
- not(nf_modify in taddnode(n).right.flags) and
- is_loop_invariant(tfornode(arg),taddnode(n).left) and
- { for now, we can handle only constant lower borders }
- is_constnode(tfornode(arg).right) then
- taddnode(n).swapleftright;
- if (taddnode(n).left.nodetype=loadn) and
- taddnode(n).left.isequal(tfornode(arg).left) and
- { plain read of the loop variable? }
- not(nf_write in taddnode(n).left.flags) and
- not(nf_modify in taddnode(n).left.flags) and
- is_loop_invariant(tfornode(arg),taddnode(n).right) and
- { for now, we can handle only constant lower borders }
- is_constnode(tfornode(arg).right) then
- begin
- changedforloop:=true;
- { did we use the same expression before already? }
- if not(findpreviousstrengthreduction) then
- begin
- tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,
- tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);
- templist.Add(tempnode);
- inductionexprs.Add(n);
- CreateNodes;
- if lnf_backward in tfornode(arg).loopflags then
- addstatement(calccodestatements,
- geninlinenode(in_dec_x,false,
- ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))
- else
- addstatement(calccodestatements,
- geninlinenode(in_inc_x,false,
- ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));
- addstatement(initcodestatements,tempnode);
- addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
- caddnode.create(muln,tfornode(arg).right.getcopy,
- taddnode(n).right.getcopy)
- )
- );
- { finally replace the node by a temp. ref }
- n:=ctemprefnode.create(tempnode);
- { ... and add a temp. release node }
- addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
- end;
- { set types }
- do_firstpass(n);
- result:=fen_norecurse_false;
- end;
- end;
- vecn:
- begin
- { is the index the counter variable? }
- if not(is_special_array(tvecnode(n).left.resultdef)) and
- not(is_packed_array(tvecnode(n).left.resultdef)) and
- (tvecnode(n).right.isequal(tfornode(arg).left) or
- { fpc usually creates a type cast to access an array }
- ((tvecnode(n).right.nodetype=typeconvn) and
- ttypeconvnode(tvecnode(n).right).left.isequal(tfornode(arg).left)
- )
- ) and
- { plain read of the loop variable? }
- not(nf_write in tvecnode(n).right.flags) and
- not(nf_modify in tvecnode(n).right.flags) and
- { direct array access? }
- ((tvecnode(n).left.nodetype=loadn) or
- { ... or loop invariant expression? }
- is_loop_invariant(tfornode(arg),tvecnode(n).right)) and
- { removing the multiplication is only worth the
- effort if it's not a simple shift }
- not(ispowerof2(tcgvecnode(n).get_mul_size,dummy)) then
- begin
- changedforloop:=true;
- { did we use the same expression before already? }
- if not(findpreviousstrengthreduction) then
- begin
- {$ifdef DEBUG_OPTSTRENGTH}
- writeln('**********************************************************************************');
- writeln('Found expression for strength reduction: ');
- printnode(n);
- writeln('**********************************************************************************');
- {$endif DEBUG_OPTSTRENGTH}
- tempnode:=ctempcreatenode.create(voidpointertype,voidpointertype.size,tt_persistent,true);
- templist.Add(tempnode);
- inductionexprs.Add(n);
- CreateNodes;
- if lnf_backward in tfornode(arg).loopflags then
- addstatement(calccodestatements,
- cinlinenode.createintern(in_dec_x,false,
- ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
- cordconstnode.create(tcgvecnode(n).get_mul_size,sizeuinttype,false),nil))))
- else
- addstatement(calccodestatements,
- cinlinenode.createintern(in_inc_x,false,
- ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(
- cordconstnode.create(tcgvecnode(n).get_mul_size,sizeuinttype,false),nil))));
- addstatement(initcodestatements,tempnode);
- addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),
- caddrnode.create(
- cvecnode.create(tvecnode(n).left.getcopy,tfornode(arg).right.getcopy)
- )
- ));
- { finally replace the node by a temp. ref }
- n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(tempnode)),n.resultdef);
- { ... and add a temp. release node }
- addstatement(deletecodestatements,ctempdeletenode.create(tempnode));
- end;
- { set types }
- do_firstpass(n);
- result:=fen_norecurse_false;
- end;
- end;
- else
- ;
- end;
- end;
- function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;
- var
- loopcode : tblocknode;
- loopcodestatements,
- newcodestatements : tstatementnode;
- fornode : tfornode;
- begin
- result:=nil;
- if node.nodetype<>forn then
- exit;
- templist:=TFPList.Create;
- inductionexprs:=TFPList.Create;
- initcode:=nil;
- calccode:=nil;
- deletecode:=nil;
- initcodestatements:=nil;
- calccodestatements:=nil;
- deletecodestatements:=nil;
- { find all expressions being candidates for strength reduction
- and replace them }
- foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);
- { clue everything together }
- if assigned(initcode) then
- begin
- do_firstpass(tnode(initcode));
- do_firstpass(tnode(calccode));
- do_firstpass(tnode(deletecode));
- { create a new for node, the old one will be released by the compiler }
- with tfornode(node) do
- begin
- fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);
- left:=nil;
- right:=nil;
- t1:=nil;
- t2:=nil;
- end;
- node:=fornode;
- loopcode:=internalstatements(loopcodestatements);
- addstatement(loopcodestatements,tfornode(node).t2);
- tfornode(node).t2:=loopcode;
- do_firstpass(node);
- addstatement(loopcodestatements,calccode);
- result:=internalstatements(newcodestatements);
- addstatement(newcodestatements,initcode);
- initcode:=nil;
- addstatement(newcodestatements,node);
- addstatement(newcodestatements,deletecode);
- end;
- templist.Free;
- inductionexprs.Free;
- end;
- function iterforloops(var n: tnode; arg: pointer): foreachnoderesult;
- var
- hp : tnode;
- begin
- Result:=fen_false;
- if n.nodetype=forn then
- begin
- { do we have DFA available? }
- if pi_dfaavailable in current_procinfo.flags then
- begin
- CalcDefSum(tfornode(n).t2);
- end;
- containsnestedforloop:=false;
- hp:=OptimizeInductionVariablesSingleForLoop(n);
- if assigned(hp) then
- begin
- n.Free;
- n:=hp;
- end;
- { can we avoid further searching? }
- if not(containsnestedforloop) then
- Result:=fen_norecurse_false;
- end;
- end;
- function OptimizeInductionVariables(node : tnode) : boolean;
- begin
- changedforloop:=false;
- foreachnodestatic(pm_postprocess,node,@iterforloops,nil);
- Result:=changedforloop;
- end;
- end.
|