123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- {
- Loop unrolling
- Copyright (c) 2005 by Florian Klaempfl
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- ****************************************************************************
- }
- unit optunrol;
- {$i fpcdefs.inc}
- interface
- uses
- node;
- function unroll_loop(node : tnode) : tnode;
- implementation
uses
  globtype,globals,
  cpuinfo,
  nutils,
  nbas,nflw,ncon,ninl,ncal,nld;
var
  { running total shared by donodecount (which bumps it) and countnodes
    (which resets and reads it) }
  nodecount : aint;

{ Callback handed to foreachnodestatic: adds one to nodecount for every
  call and always answers fen_false. Neither "n" nor "arg" is looked at. }
function donodecount(var n: tnode; arg: pointer): foreachnoderesult;
  begin
    donodecount:=fen_false;
    nodecount:=nodecount+1;
  end;
{ Rough size estimate for the tree "node": the number of times the tree
  walker invokes donodecount on it. }
function countnodes(node : tnode) : aint;
  begin
    { start from a clean counter; donodecount increments it once per call }
    nodecount:=0;
    foreachnodestatic(node,@donodecount,nil);
    countnodes:=nodecount;
  end;
{ Proposes how often the loop body "node" should be replicated.

  The answer is a fixed node budget divided by the estimated size of the
  body (see countnodes), so small bodies are unrolled more often than
  large ones. The result is never smaller than 1; 1 means "do not unroll". }
function number_unrolls(node : tnode) : integer;
  const
    { node budget used by default }
    default_budget = 30;
    { doubled budget for CPUs with a long pipeline }
    long_pipeline_budget = 60;
  var
    size : aint;
  begin
    size:=countnodes(node);
    { if the walk counted nothing, the divisions below would raise a
      division by zero inside the compiler; report "no unrolling" instead }
    if size<1 then
      begin
        result:=1;
        exit;
      end;
{$ifdef i386}
    { multiply by 2 for CPUs with a long pipeline }
    if current_settings.cputype in [cpu_Pentium4] then
      result:=long_pipeline_budget div size
    else
{$endif i386}
      result:=default_budget div size;
    { a body larger than the budget still gets its single, original copy }
    if result=0 then
      result:=1;
  end;
{ Tree-walker callback: sets the boolean that "arg" points to when the
  visited node is a break or continue statement. Always answers fen_false. }
function findbreakcontinue(var n: tnode; arg: pointer): foreachnoderesult;
  begin
    if n.nodetype in [breakn,continuen] then
      pboolean(arg)^:=true;
    result:=fen_false;
  end;

{ Tries to unroll the for loop "node".

  Returns nil when "node" was left alone or was modified in place (its
  body t2 replaced by a block holding several copies of the old body).
  Returns a new statement block when the whole loop could be replaced by
  straight-line code; "node" itself is not freed in that case.

  Nothing is done when
    - the code is optimized for size,
    - "node" is not a for loop,
    - the body is too large to be worth more than one copy,
    - the loop bounds are not both ordinal constants (not implemented yet),
    - the loop executes zero times,
    - the body contains a break or continue statement. }
function unroll_loop(node : tnode) : tnode;
  var
    loop : tfornode;
    unrolls,i : integer;
    { signed on purpose: a loop whose start lies beyond its end yields a
      value below 1 here, which must not wrap around to a huge count }
    counts : int64;
    backward,
    fullunroll,
    usesbreakcontinue : boolean;
    unrollstatement : tstatementnode;
    unrollblock : tblocknode;
  begin
    result:=nil;
    if cs_opt_size in current_settings.optimizerswitches then
      exit;
    if node.nodetype<>forn then
      exit;
    loop:=tfornode(node);

    unrolls:=number_unrolls(loop.t2);
    if unrolls<=1 then
      exit;

    { number of executions unknown? }
    if (loop.right.nodetype<>ordconstn) or (loop.t1.nodetype<>ordconstn) then
      begin
        { unrolling is a little bit more tricky if we don't know the
          loop count at compile time, but the solution is to use a jump table
          which is indexed by "loop count mod unrolls" at run time and which
          then jumps to the appropriate place inside the loop. Because
          a modulo division is expensive, we only use unroll counts that
          are powers of two }
        case unrolls of
          1..2:
            ;
          3:
            unrolls:=2;
          4..7:
            unrolls:=4;
          { unrolls>4 already make no sense imo, but who knows (FK) }
          8..15:
            unrolls:=8;
          16..31:
            unrolls:=16;
          32..63:
            unrolls:=32;
          64..$7fff:
            unrolls:=64;
          else
            exit;
        end;
        { we don't handle this yet; the rounded value above is unused so far }
        exit;
      end;

    { "for left:=right to/downto t1": right is the start value, t1 the end }
    backward:=lnf_backward in loop.loopflags;
    if backward then
      counts:=tordconstnode(loop.right).value-tordconstnode(loop.t1).value+1
    else
      counts:=tordconstnode(loop.t1).value-tordconstnode(loop.right).value+1;

    { the loop never runs, so there is nothing to gain from unrolling it }
    if counts<1 then
      exit;

    { break and continue refer to the enclosing for statement: replicating
      them would make continue skip the remaining copies, and break would
      lose its loop once the for is removed. Leave such loops untouched. }
    usesbreakcontinue:=false;
    foreachnodestatic(loop.t2,@findbreakcontinue,@usesbreakcontinue);
    if usesbreakcontinue then
      exit;

    { don't unroll more than we need }
    if unrolls>counts then
      unrolls:=counts;
    { can we get rid of the for statement? }
    fullunroll:=unrolls=counts;

    { create block statement }
    unrollblock:=internalstatements(unrollstatement);

    { NOTE(review): when the for node is dropped, nothing else in the new
      block gives the counter its start value, so assign it explicitly.
      This assumes the for node performs that assignment itself during
      code generation - confirm against the for node implementation. }
    if fullunroll then
      addstatement(unrollstatement,
        cassignmentnode.create(loop.left.getcopy,loop.right.getcopy));

    { let's unroll (and rock of course) }
    for i:=1 to unrolls do
      begin
        { create and insert copy of the statement block }
        addstatement(unrollstatement,loop.t2.getcopy);

        { set and insert entry label? It goes behind the copy after which
          exactly "counts mod unrolls" copies remain in the block. }
        if (counts mod unrolls<>0) and
           ((counts mod unrolls)=unrolls-i) then
          begin
            loop.entrylabel:=clabelnode.create(cnothingnode.create);
            addstatement(unrollstatement,loop.entrylabel);
          end;

        { the for statement itself steps the counter after the last copy }
        if i<unrolls then
          begin
            { step the counter in the direction of the loop: downto loops
              count downwards, so they need dec instead of inc }
            if backward then
              addstatement(unrollstatement,
                geninlinenode(in_dec_x,false,ccallparanode.create(loop.left.getcopy,nil)))
            else
              addstatement(unrollstatement,
                geninlinenode(in_inc_x,false,ccallparanode.create(loop.left.getcopy,nil)));
          end;
      end;

    if fullunroll then
      { the block replaces the whole for statement }
      result:=unrollblock
    else
      begin
        { keep the for statement, but let it run the unrolled body }
        loop.t2.free;
        loop.t2:=unrollblock;
      end;
  end;
- end.
|