| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702 | {    Loop optimization    Copyright (c) 2005 by Florian Klaempfl    This program is free software; you can redistribute it and/or modify    it under the terms of the GNU General Public License as published by    the Free Software Foundation; either version 2 of the License, or    (at your option) any later version.    This program is distributed in the hope that it will be useful,    but WITHOUT ANY WARRANTY; without even the implied warranty of    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    GNU General Public License for more details.    You should have received a copy of the GNU General Public License    along with this program; if not, write to the Free Software    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ****************************************************************************}unit optloop;{$i fpcdefs.inc}{ $define DEBUG_OPTSTRENGTH}{ $define DEBUG_OPTFORLOOP}  interface    uses      node;    function unroll_loop(node : tnode) : tnode;    function OptimizeInductionVariables(node : tnode) : boolean;    function OptimizeForLoop(var node : tnode) : boolean;  implementation    uses      cutils,cclasses,compinnr,      globtype,globals,constexp,      verbose,      symdef,symsym,      defutil,      cpuinfo,      nutils,      nadd,nbas,nflw,ncon,ninl,ncal,nld,nmem,ncnv,      ncgmem,      pass_1,      optbase,optutils,      procinfo;    function number_unrolls(node : tnode) : cardinal;      begin        { calculate how often a loop shall be unrolled.          The term (60*ord(node_count_weighted(node)<15)) is used to get small loops  unrolled more often as          the counter management takes more time in this case. }{$ifdef i386}        { multiply by 2 for CPUs with a long pipeline }        if current_settings.optimizecputype in [cpu_Pentium4] then          number_unrolls:=trunc(round((60+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)))        else{$endif i386}          number_unrolls:=trunc(round((30+(60*ord(node_count_weighted(node)<15)))/max(node_count_weighted(node),1)));        if number_unrolls=0 then          number_unrolls:=1;      end;    type      treplaceinfo = record        node : tnode;        value : Tconstexprint;      end;      preplaceinfo = ^treplaceinfo;    function checkcontrollflowstatements(var n:tnode; arg: pointer): foreachnoderesult;      begin        if n.nodetype in [breakn,continuen,goton,labeln,exitn,raisen] then          result:=fen_norecurse_true        else          result:=fen_false;      end;    function replaceloadnodes(var n: tnode; arg: pointer): foreachnoderesult;      begin        if n.isequal(preplaceinfo(arg)^.node) then          begin            if n.flags*[nf_modify,nf_write,nf_address_taken]<>[] then              internalerror(2012090402);            n.free;            n:=cordconstnode.create(preplaceinfo(arg)^.value,preplaceinfo(arg)^.node.resultdef,false);            do_firstpass(n);          end;        result:=fen_false;      end;    function unroll_loop(node : tnode) : tnode;      var        unrolls,i : cardinal;        counts : qword;        unrollstatement,newforstatement : tstatementnode;        unrollblock : tblocknode;        getridoffor : boolean;        replaceinfo : treplaceinfo;        hascontrollflowstatements : boolean;      begin        result:=nil;        if (cs_opt_size in current_settings.optimizerswitches) then          exit;        if ErrorCount<>0 then          exit;        if not(node.nodetype in [forn]) then          exit;        unrolls:=number_unrolls(tfornode(node).t2);        if (unrolls>1) and          ((tfornode(node).left.nodetype<>loadn) or           { the address of the counter variable might be taken if it is passed by constref to a             subroutine, so really check if it is not taken }           ((tfornode(node).left.nodetype=loadn) and (tloadnode(tfornode(node).left).symtableentry is tabstractvarsym) and            not(tabstractvarsym(tloadnode(tfornode(node).left).symtableentry).addr_taken) and            not(tabstractvarsym(tloadnode(tfornode(node).left).symtableentry).different_scope))           ) then          begin            { number of executions known? }            if (tfornode(node).right.nodetype=ordconstn) and (tfornode(node).t1.nodetype=ordconstn) then              begin                if lnf_backward in tfornode(node).loopflags then                  counts:=tordconstnode(tfornode(node).right).value-tordconstnode(tfornode(node).t1).value+1                else                  counts:=tordconstnode(tfornode(node).t1).value-tordconstnode(tfornode(node).right).value+1;                hascontrollflowstatements:=foreachnodestatic(tfornode(node).t2,@checkcontrollflowstatements,nil);                { don't unroll more than we need,                  multiply unroll by two here because we can get rid                  of the counter variable completely and replace it by a constant                  if unrolls=counts }                if unrolls*2>=counts then                  unrolls:=counts;                { create block statement }                unrollblock:=internalstatements(unrollstatement);                { can we get rid completly of the for ? }                getridoffor:=(unrolls=counts) and not(hascontrollflowstatements) and                  { TP/Macpas allows assignments to the for-variables, so we cannot get rid of the for }                  ([m_tp7,m_mac]*current_settings.modeswitches=[]);                if getridoffor then                  begin                    replaceinfo.node:=tfornode(node).left;                    replaceinfo.value:=tordconstnode(tfornode(node).right).value;                  end                else                  { we consider currently unrolling not beneficial, if we cannot get rid of the for completely, this                    might change if a more sophisticated heuristics is used (FK) }                  exit;                { let's unroll (and rock of course) }                for i:=1 to unrolls do                  begin                    { create and insert copy of the statement block }                    addstatement(unrollstatement,tfornode(node).t2.getcopy);                    { set and insert entry label? }                    if (counts mod unrolls<>0) and                      ((counts mod unrolls)=unrolls-i) then                      begin                        tfornode(node).entrylabel:=clabelnode.create(cnothingnode.create,clabelsym.create('$optunrol'));                        addstatement(unrollstatement,tfornode(node).entrylabel);                      end;                    if getridoffor then                      begin                        foreachnodestatic(tnode(unrollstatement),@replaceloadnodes,@replaceinfo);                        if lnf_backward in tfornode(node).loopflags then                          replaceinfo.value:=replaceinfo.value-1                        else                          replaceinfo.value:=replaceinfo.value+1;                      end                    else                      begin                        { for itself increases at the last iteration }                        if i<unrolls then                          begin                            { insert incr/decrementation of counter var }                            if lnf_backward in tfornode(node).loopflags then                              addstatement(unrollstatement,                                geninlinenode(in_dec_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)))                            else                              addstatement(unrollstatement,                                geninlinenode(in_inc_x,false,ccallparanode.create(tfornode(node).left.getcopy,nil)));                          end;                       end;                  end;                { can we get rid of the for statement? }                if getridoffor then                  begin                    { create block statement }                    result:=internalstatements(newforstatement);                    addstatement(newforstatement,unrollblock);                    doinlinesimplify(result);                  end;              end            else              begin                { unrolling is a little bit more tricky if we don't know the                  loop count at compile time, but the solution is to use a jump table                  which is indexed by "loop count mod unrolls" at run time and which                  jumps then at the appropriate place inside the loop. Because                  a module division is expensive, we can use only unroll counts dividable                  by 2 }                case unrolls of                  1..2:                    ;                  3:                    unrolls:=2;                  4..7:                    unrolls:=4;                  { unrolls>4 already make no sense imo, but who knows (FK) }                  8..15:                    unrolls:=8;                  16..31:                    unrolls:=16;                  32..63:                    unrolls:=32;                  64..$7fff:                    unrolls:=64;                  else                    exit;                end;                { we don't handle this yet }                exit;              end;            if not(assigned(result)) then              begin                tfornode(node).t2.free;                tfornode(node).t2:=unrollblock;              end;          end;      end;    var      initcode,      calccode,      deletecode : tblocknode;      initcodestatements,      calccodestatements,      deletecodestatements: tstatementnode;      templist : tfplist;      inductionexprs : tfplist;      changedforloop,      containsnestedforloop,      docalcatend: boolean;    function checkcontinue(var n:tnode; arg: pointer): foreachnoderesult;      begin        if n.nodetype=continuen then          result:=fen_norecurse_true        else          result:=fen_false;      end;    function is_loop_invariant(loop : tnode;expr : tnode) : boolean;      begin        result:=is_constnode(expr);        case expr.nodetype of          loadn:            begin              if (pi_dfaavailable in current_procinfo.flags) and                assigned(loop.optinfo) and                assigned(expr.optinfo) and                not(expr.isequal(tfornode(loop).left)) then                { no aliasing? }                result:=(([nf_write,nf_modify]*expr.flags)=[]) and not(tabstractvarsym(tloadnode(expr).symtableentry).addr_taken) and                { no definition in the loop? }                  not(DFASetIn(tfornode(loop).t2.optinfo^.defsum,expr.optinfo^.index));            end;          vecn:            begin              result:=((tvecnode(expr).left.nodetype=loadn) or is_loop_invariant(loop,tvecnode(expr).left)) and                is_loop_invariant(loop,tvecnode(expr).right);            end;          typeconvn:            result:=is_loop_invariant(loop,ttypeconvnode(expr).left);          addn,subn:            result:=is_loop_invariant(loop,taddnode(expr).left) and is_loop_invariant(loop,taddnode(expr).right);          else            ;        end;      end;    { checks if the strength of n can be recuded, arg is the tforloop being considered }    function dostrengthreductiontest(var n: tnode; arg: pointer): foreachnoderesult;      function findpreviousstrengthreduction : boolean;        var          i : longint;          hp : tnode;        begin          result:=false;          for i:=0 to inductionexprs.count-1 do            begin              { do we already maintain one expression? }              if tnode(inductionexprs[i]).isequal(n) then                begin                  case n.nodetype of                    muln:                      hp:=ctemprefnode.create(ttempcreatenode(templist[i]));                    vecn:                      hp:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(                        ttempcreatenode(templist[i]))),n.resultdef);                    else                      internalerror(200809211);                  end;                  n.free;                  n:=hp;                  result:=true;                  exit;                end;            end;        end;      procedure CreateNodes;        begin          if not assigned(initcode) then            begin              initcode:=internalstatements(initcodestatements);              calccode:=internalstatements(calccodestatements);              deletecode:=internalstatements(deletecodestatements);            end;        end;      procedure CheckCalcAtEnd;        begin          if not assigned(initcode) then            docalcatend:=not(assigned(tfornode(arg).entrylabel)) and              not(foreachnodestatic(tfornode(arg).t2,@checkcontinue,nil));        end;      var        tempnode,startvaltemp : ttempcreatenode;        dummy : longint;        nn : tnode;        nt : tnodetype;        nflags : tnodeflags;      begin        result:=fen_false;        nflags:=n.flags;        case n.nodetype of          forn:            { inform for loop search routine, that it needs to search more deeply }            containsnestedforloop:=true;          muln:            begin              if (taddnode(n).right.nodetype=loadn) and                taddnode(n).right.isequal(tfornode(arg).left) and                { plain read of the loop variable? }                not(nf_write in taddnode(n).right.flags) and                not(nf_modify in taddnode(n).right.flags) and                is_loop_invariant(tfornode(arg),taddnode(n).left) then                taddnode(n).swapleftright;              if (taddnode(n).left.nodetype=loadn) and                taddnode(n).left.isequal(tfornode(arg).left) and                { plain read of the loop variable? }                not(nf_write in taddnode(n).left.flags) and                not(nf_modify in taddnode(n).left.flags) and                is_loop_invariant(tfornode(arg),taddnode(n).right) then                begin                  changedforloop:=true;                  { did we use the same expression before already? }                  if not(findpreviousstrengthreduction) then                    begin{$ifdef DEBUG_OPTSTRENGTH}                      writeln('**********************************************************************************');                      writeln(parser_current_file, ': Found expression for strength reduction (MUL): ');                      printnode(n);                      writeln('**********************************************************************************');{$endif DEBUG_OPTSTRENGTH}                      CheckCalcAtEnd;                      tempnode:=ctempcreatenode.create(n.resultdef,n.resultdef.size,tt_persistent,                        tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable);                      templist.Add(tempnode);                      inductionexprs.Add(n);                      CreateNodes;                      if lnf_backward in tfornode(arg).loopflags then                        addstatement(calccodestatements,                          geninlinenode(in_dec_x,false,                          ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))))                      else                        addstatement(calccodestatements,                          geninlinenode(in_inc_x,false,                          ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(taddnode(n).right.getcopy,nil))));                      addstatement(initcodestatements,tempnode);                      nn:=tfornode(arg).right.getcopy;                      { If the calculation is not performed at the end                        it is needed to adjust the starting value }                      if not docalcatend then                        begin                          if lnf_backward in tfornode(arg).loopflags then                            nt:=addn                          else                            nt:=subn;                          nn:=caddnode.create_internal(nt,nn,                             cordconstnode.create(1,nn.resultdef,false));                        end;                      addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),                          caddnode.create(muln,nn,                            taddnode(n).right.getcopy)                          )                        );                      { finally replace the node by a temp. ref }                      n:=ctemprefnode.create(tempnode);                      { ... and add a temp. release node }                      addstatement(deletecodestatements,ctempdeletenode.create(tempnode));                    end;                  { set types }                  do_firstpass(n);                  result:=fen_norecurse_false;                end;            end;          vecn:            begin              { is the index the counter variable? }              if not(is_special_array(tvecnode(n).left.resultdef)) and                not(is_packed_array(tvecnode(n).left.resultdef)) and                (tvecnode(n).right.isequal(tfornode(arg).left) or                 { fpc usually creates a type cast to access an array }                 ((tvecnode(n).right.nodetype=typeconvn) and                  ttypeconvnode(tvecnode(n).right).left.isequal(tfornode(arg).left)                 )                ) and                { plain read of the loop variable? }                not(nf_write in tvecnode(n).right.flags) and                not(nf_modify in tvecnode(n).right.flags) and                { direct array access? }                ((tvecnode(n).left.nodetype=loadn) or                { ... or loop invariant expression? }                is_loop_invariant(tfornode(arg),tvecnode(n).right)){$if not (defined(cpu16bitalu) or defined(cpu8bitalu))}                { removing the multiplication is only worth the                  effort if it's not a simple shift }                and not(ispowerof2(tcgvecnode(n).get_mul_size,dummy)){$endif}                then                begin                  changedforloop:=true;                  { did we use the same expression before already? }                  if not(findpreviousstrengthreduction) then                    begin{$ifdef DEBUG_OPTSTRENGTH}                      writeln('**********************************************************************************');                      writeln(parser_current_file,': Found expression for strength reduction (VEC): ');                      printnode(n);                      writeln('**********************************************************************************');{$endif DEBUG_OPTSTRENGTH}                      CheckCalcAtEnd;                      tempnode:=ctempcreatenode.create(voidpointertype,voidpointertype.size,tt_persistent,true);                      templist.Add(tempnode);                      inductionexprs.Add(n);                      CreateNodes;                      if lnf_backward in tfornode(arg).loopflags then                        addstatement(calccodestatements,                          cinlinenode.createintern(in_dec_x,false,                          ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(                          cordconstnode.create(tcgvecnode(n).get_mul_size,sizeuinttype,false),nil))))                      else                        addstatement(calccodestatements,                          cinlinenode.createintern(in_inc_x,false,                          ccallparanode.create(ctemprefnode.create(tempnode),ccallparanode.create(                          cordconstnode.create(tcgvecnode(n).get_mul_size,sizeuinttype,false),nil))));                      addstatement(initcodestatements,tempnode);                      startvaltemp:=maybereplacewithtemp(tfornode(arg).right,initcode,initcodestatements,tfornode(arg).right.resultdef.size,true);                      nn:=caddrnode.create(                          cvecnode.create(tvecnode(n).left.getcopy,tfornode(arg).right.getcopy)                        );                      { If the calculation is not performed at the end                        it is needed to adjust the starting value }                      if not docalcatend then                        begin                          if lnf_backward in tfornode(arg).loopflags then                            nt:=addn                          else                            nt:=subn;                          nn:=caddnode.create_internal(nt,                             ctypeconvnode.create_internal(nn,voidpointertype),                             cordconstnode.create(tcgvecnode(n).get_mul_size,sizeuinttype,false));                        end;                      addstatement(initcodestatements,cassignmentnode.create(ctemprefnode.create(tempnode),nn));                      { finally replace the node by a temp. ref }                      n:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(tempnode)),n.resultdef);                      { ... and add a temp. release node }                      if startvaltemp<>nil then                        addstatement(deletecodestatements,ctempdeletenode.create(startvaltemp));                      addstatement(deletecodestatements,ctempdeletenode.create(tempnode));                    end;                  { Copy the nf_write,nf_modify flags to the new deref node of the temp.                    Othewise assignments to vector elements will be removed. }                  if nflags*[nf_write,nf_modify]<>[] then                    begin                      if (n.nodetype<>typeconvn) or (ttypeconvnode(n).left.nodetype<>derefn) then                        internalerror(2021091501);                      ttypeconvnode(n).left.flags:=ttypeconvnode(n).left.flags+nflags*[nf_write,nf_modify];                    end;                  { set types }                  do_firstpass(n);                  result:=fen_norecurse_false;                end;            end;          else            ;        end;      end;    function OptimizeInductionVariablesSingleForLoop(node : tnode) : tnode;      var        loopcode : tblocknode;        loopcodestatements,        newcodestatements : tstatementnode;        fornode : tfornode;      begin        result:=nil;        if node.nodetype<>forn then          exit;        templist:=TFPList.Create;        inductionexprs:=TFPList.Create;        initcode:=nil;        calccode:=nil;        deletecode:=nil;        initcodestatements:=nil;        calccodestatements:=nil;        deletecodestatements:=nil;        docalcatend:=false;        { find all expressions being candidates for strength reduction          and replace them }        foreachnodestatic(pm_postprocess,node,@dostrengthreductiontest,node);        { clue everything together }        if assigned(initcode) then          begin            do_firstpass(tnode(initcode));            do_firstpass(tnode(calccode));            do_firstpass(tnode(deletecode));            { create a new for node, the old one will be released by the compiler }            with tfornode(node) do              begin                fornode:=cfornode.create(left,right,t1,t2,lnf_backward in loopflags);                left:=nil;                right:=nil;                t1:=nil;                t2:=nil;              end;            node:=fornode;            loopcode:=internalstatements(loopcodestatements);            if not docalcatend then              addstatement(loopcodestatements,calccode);            addstatement(loopcodestatements,tfornode(node).t2);            if docalcatend then              addstatement(loopcodestatements,calccode);            tfornode(node).t2:=loopcode;            do_firstpass(node);            result:=internalstatements(newcodestatements);            addstatement(newcodestatements,initcode);            initcode:=nil;            addstatement(newcodestatements,node);            addstatement(newcodestatements,deletecode);          end;        templist.Free;        inductionexprs.Free;      end;    function OptimizeInductionVariables_iterforloops(var n: tnode; arg: pointer): foreachnoderesult;      var        hp : tnode;      begin        Result:=fen_false;        if n.nodetype=forn then          begin            { do we have DFA available? }            if pi_dfaavailable in current_procinfo.flags then              begin                CalcDefSum(tfornode(n).t2);              end;            containsnestedforloop:=false;            hp:=OptimizeInductionVariablesSingleForLoop(n);            if assigned(hp) then              begin                n.Free;                n:=hp;              end;            { can we avoid further searching? }            if not(containsnestedforloop) then              Result:=fen_norecurse_false;          end;      end;    function OptimizeInductionVariables(node : tnode) : boolean;      begin        Result:=false;        if not(pi_dfaavailable in current_procinfo.flags) then          exit;        changedforloop:=false;        foreachnodestatic(pm_postprocess,node,@OptimizeInductionVariables_iterforloops,nil);        Result:=changedforloop;      end;    function OptimizeForLoop_iterforloops(var n: tnode; arg: pointer): foreachnoderesult;      begin        Result:=fen_false;        if (n.nodetype=forn) and          not(lnf_backward in tfornode(n).loopflags) and          (lnf_dont_mind_loopvar_on_exit in tfornode(n).loopflags) and          is_constintnode(tfornode(n).right) and          { this is not strictly necessary, but we do it for now }          is_constnode(tfornode(n).t1) and          (([cs_check_overflow,cs_check_range]*n.localswitches)=[]) and          (([cs_check_overflow,cs_check_range]*tfornode(n).left.localswitches)=[]) and          ((tfornode(n).left.nodetype=loadn) and (tloadnode(tfornode(n).left).symtableentry is tabstractvarsym) and            not(tabstractvarsym(tloadnode(tfornode(n).left).symtableentry).addr_taken) and            not(tabstractvarsym(tloadnode(tfornode(n).left).symtableentry).different_scope)) then          begin            { do we have DFA available? }            if pi_dfaavailable in current_procinfo.flags then              begin                CalcUseSum(tfornode(n).t2);                CalcDefSum(tfornode(n).t2);              end            else              Internalerror(2017122801);            if not(assigned(tfornode(n).left.optinfo)) then              exit;            if not(DFASetIn(tfornode(n).t2.optinfo^.usesum,tfornode(n).left.optinfo^.index)) and              not(DFASetIn(tfornode(n).t2.optinfo^.defsum,tfornode(n).left.optinfo^.index))  then              begin                { convert the loop from i:=a to b into i:=b-a+1 to 1 as this simplifies the                  abort condition }{$ifdef DEBUG_OPTFORLOOP}                writeln('**********************************************************************************');                writeln('Found loop for reverting: ');                printnode(n);                writeln('**********************************************************************************');{$endif DEBUG_OPTFORLOOP}                include(tfornode(n).loopflags,lnf_backward);                tfornode(n).right:=caddnode.create_internal(addn,caddnode.create_internal(subn,tfornode(n).t1,tfornode(n).right),                  cordconstnode.create(1,tfornode(n).left.resultdef,false));                tfornode(n).t1:=cordconstnode.create(1,tfornode(n).left.resultdef,false);                include(tfornode(n).loopflags,lnf_counter_not_used);                exclude(n.flags,nf_pass1_done);                do_firstpass(n);{$ifdef DEBUG_OPTFORLOOP}                writeln('Loop reverted: ');                printnode(n);                writeln('**********************************************************************************');{$endif DEBUG_OPTFORLOOP}                changedforloop:=true;              end;          end;      end;    function OptimizeForLoop(var node : tnode) : boolean;      begin        Result:=false;        if not(pi_dfaavailable in current_procinfo.flags) then          exit;        changedforloop:=false;        foreachnodestatic(pm_postprocess,node,@OptimizeForLoop_iterforloops,nil);        Result:=changedforloop;      end;end.
 |