|
@@ -123,7 +123,7 @@ unit rgobj;
|
|
|
end;
|
|
|
|
|
|
Tmovelist=record
|
|
|
- count:cardinal;
|
|
|
+ count,sorted_until:cardinal;
|
|
|
data:array[0..$ffff] of Tlinkedlistitem;
|
|
|
end;
|
|
|
Pmovelist=^Tmovelist;
|
|
@@ -311,6 +311,42 @@ implementation
|
|
|
globals,verbose,tgobj,procinfo;
|
|
|
|
|
|
|
|
|
+ procedure sort_movelist(ml:Pmovelist);
|
|
|
+
|
|
|
+ {Shell sort of ml^.data[0..count-1] by pointer value. The order itself is
|
|
|
+ meaningless, but a sorted list lets Trgobj.combine binary search it.}
|
|
|
+
|
|
|
+ var h,i,p:word;
|
|
|
+ t:Tlinkedlistitem;
|
|
|
+
|
|
|
+ begin
|
|
|
+ with ml^ do
|
|
|
+ begin
|
|
|
+ if count<2 then {0 or 1 items: nothing to sort, sorted_until is left untouched}
|
|
|
+ exit;
|
|
|
+ p:=1; {p is the gap; start at the largest power of two below count}
|
|
|
+ while 2*p<count do
|
|
|
+ p:=2*p;
|
|
|
+ while p<>0 do
|
|
|
+ begin
|
|
|
+ for h:=p to count-1 do
|
|
|
+ begin
|
|
|
+ i:=h;
|
|
|
+ t:=data[i]; {item being inserted into its chain of stride p}
|
|
|
+ repeat
|
|
|
+ if ptrint(data[i-p])<=ptrint(t) then {items are compared as signed pointer values}
|
|
|
+ break;
|
|
|
+ data[i]:=data[i-p]; {shift the larger item one gap up}
|
|
|
+ dec(i,p);
|
|
|
+ until i<p;
|
|
|
+ data[i]:=t;
|
|
|
+ end;
|
|
|
+ p:=p shr 1; {halve the gap; the last pass (p=1) is a plain insertion sort}
|
|
|
+ end;
|
|
|
+ sorted_until:=count-1; {index of the last sorted item: the whole list is sorted now}
|
|
|
+ end;
|
|
|
+ end;
|
|
|
+
|
|
|
{******************************************************************************
|
|
|
tinterferencebitmap
|
|
|
******************************************************************************}
|
|
@@ -664,11 +700,12 @@ implementation
|
|
|
begin
|
|
|
getmem(movelist,64);
|
|
|
movelist^.count:=0;
|
|
|
+ movelist^.sorted_until:=0;
|
|
|
end
|
|
|
else
|
|
|
begin
|
|
|
cursize:=memsize(movelist);
|
|
|
- if (4*(movelist^.count+1)=cursize) then
|
|
|
+ if (4*(movelist^.count+2)=cursize) then
|
|
|
reallocmem(movelist,cursize*2);
|
|
|
end;
|
|
|
movelist^.data[movelist^.count]:=data;
|
|
@@ -742,9 +779,9 @@ implementation
|
|
|
registers in it cause. This allows simplify to execute in
|
|
|
constant time.}
|
|
|
|
|
|
- var p,h,i,j,leni,lenj:word;
|
|
|
+ var p,h,i,leni,lent:word;
|
|
|
t:Tsuperregister;
|
|
|
- adji,adjj:Psuperregisterworklist;
|
|
|
+ adji,adjt:Psuperregisterworklist;
|
|
|
|
|
|
begin
|
|
|
with simplifyworklist do
|
|
@@ -756,30 +793,25 @@ implementation
|
|
|
p:=2*p;
|
|
|
while p<>0 do
|
|
|
begin
|
|
|
- for h:=0 to length-p-1 do
|
|
|
+ for h:=p to length-1 do
|
|
|
begin
|
|
|
i:=h;
|
|
|
+ t:=buf^[i];
|
|
|
+ adjt:=reginfo[buf^[i]].adjlist;
|
|
|
+ lent:=0;
|
|
|
+ if adjt<>nil then
|
|
|
+ lent:=adjt^.length;
|
|
|
repeat
|
|
|
- j:=i+p;
|
|
|
- adji:=reginfo[buf^[i]].adjlist;
|
|
|
- adjj:=reginfo[buf^[j]].adjlist;
|
|
|
- if adji=nil then
|
|
|
- leni:=0
|
|
|
- else
|
|
|
+ adji:=reginfo[buf^[i-p]].adjlist;
|
|
|
+ leni:=0;
|
|
|
+ if adji<>nil then
|
|
|
leni:=adji^.length;
|
|
|
- if adjj=nil then
|
|
|
- lenj:=0
|
|
|
- else
|
|
|
- lenj:=adjj^.length;
|
|
|
- if lenj>=leni then
|
|
|
- break;
|
|
|
- t:=buf^[i];
|
|
|
- buf^[i]:=buf^[j];
|
|
|
- buf^[j]:=t;
|
|
|
- if i<p then
|
|
|
+ if leni<=lent then
|
|
|
break;
|
|
|
+ buf^[i]:=buf^[i-p];
|
|
|
dec(i,p)
|
|
|
- until false;
|
|
|
+ until i<p;
|
|
|
+ buf^[i]:=t;
|
|
|
end;
|
|
|
p:=p shr 1;
|
|
|
end;
|
|
@@ -1009,12 +1041,9 @@ implementation
|
|
|
procedure trgobj.combine(u,v:Tsuperregister);
|
|
|
|
|
|
var adj : Psuperregisterworklist;
|
|
|
- i : word;
|
|
|
+ i,n,p,q:cardinal;
|
|
|
t : tsuperregister;
|
|
|
- n,o : cardinal;
|
|
|
- decrement : boolean;
|
|
|
-{ moves:Tsuperregisterset;}
|
|
|
- vm:Pmovelist;
|
|
|
+ searched:Tlinkedlistitem;
|
|
|
|
|
|
label l1;
|
|
|
|
|
@@ -1028,50 +1057,53 @@ implementation
|
|
|
{Combine both movelists. Since the movelists are sets, only add
|
|
|
elements that are not already present. The movelists cannot be
|
|
|
empty by definition; nodes are only coalesced if there is a move
|
|
|
- between them.}
|
|
|
-
|
|
|
-{ Nice attempt; it didn't work.
|
|
|
- supregset_reset(moves,false);
|
|
|
- supregset_include(moves,u);
|
|
|
+ between them. To prevent quadratic time blowup (movelists of
|
|
|
+ especially machine registers can get very large because of moves
|
|
|
+ generated during calls) we need to go into disgusting complexity.
|
|
|
+
|
|
|
+ (See webtbs/tw2242 for an example that stresses this.)
|
|
|
+
|
|
|
+ We want to sort the movelist to be able to search logarithmically.
|
|
|
+ Unfortunately, sorting the movelist every time before searching
|
|
|
+ is counter-productive, since the movelist usually grows by a few
|
|
|
+ items at a time. Therefore, we split the movelist into a sorted
|
|
|
+ and an unsorted part and search through both. If the unsorted part
|
|
|
+ becomes too large, we sort.}
|
|
|
+
|
|
|
+ {We have to weigh the cost of sorting the list against the cost of
|
|
|
+ searching the unsorted part. I use a factor of 8 here; if the
|
|
|
+ number of items is less than 8 times the number of unsorted items,
|
|
|
+ we'll sort the list.}
|
|
|
with reginfo[u].movelist^ do
|
|
|
- for n:=0 to count-1 do
|
|
|
- begin
|
|
|
- if Tmoveins(data[n]).x=u then
|
|
|
- supregset_include(moves,Tmoveins(data[n]).y)
|
|
|
- else
|
|
|
- supregset_include(moves,Tmoveins(data[n]).x)
|
|
|
- end;
|
|
|
- with reginfo[v].movelist^ do
|
|
|
- for n:=0 to count-1 do
|
|
|
- begin
|
|
|
- if Tmoveins(data[n]).x=v then
|
|
|
- begin
|
|
|
- if supregset_in(moves,Tmoveins(data[n]).y) then
|
|
|
- add_to_movelist(u,data[n]);
|
|
|
- end
|
|
|
- else
|
|
|
- begin
|
|
|
- if supregset_in(moves,Tmoveins(data[n]).x) then
|
|
|
- add_to_movelist(u,data[n]);
|
|
|
+ if count<8*(count-sorted_until) then
|
|
|
+ sort_movelist(reginfo[u].movelist);
|
|
|
+ for n:=0 to reginfo[v].movelist^.count-1 do
|
|
|
+ begin
|
|
|
+ {Binary search data[0..sorted_until] of u's list; the loop ends with p=q on the only slot that can hold searched.}
|
|
|
+ searched:=reginfo[v].movelist^.data[n];
|
|
|
+ p:=0;
|
|
|
+ q:=reginfo[u].movelist^.sorted_until;
|
|
|
+ i:=0;
|
|
|
+ if q<>0 then
|
|
|
+ repeat
|
|
|
+ i:=(p+q) shr 1;
|
|
|
+ if ptrint(searched)>ptrint(reginfo[u].movelist^.data[i]) then
|
|
|
+ p:=i+1
|
|
|
+ else
|
|
|
+ q:=i;
|
|
|
+ until p=q;
|
|
|
+ with reginfo[u].movelist^ do
|
|
|
+ if searched<>data[p] then {test slot p, not i: i is one below p when the last step was p:=i+1}
|
|
|
+ begin
|
|
|
+ {Not in the sorted part; linear search the unsorted part of the list.}
|
|
|
+ for i:=sorted_until+1 to count-1 do
|
|
|
+ if searched=data[i] then
|
|
|
+ goto l1;
|
|
|
+ {Not found in either part -> add, so the movelist stays a set}
|
|
|
+ add_to_movelist(u,searched);
|
|
|
+ l1:
|
|
|
+ end;
|
|
|
- end;}
|
|
|
-
|
|
|
- {This loop is a performance bottleneck for large procedures and therefore
|
|
|
- optimized by hand as much as possible. This is because machine registers
|
|
|
- generally collect large movelists (for example around procedure calls data
|
|
|
- is moved into machine registers). The loop below is unfortunately quadratic,
|
|
|
- and guess what this means when a procedure has collected several thousand
|
|
|
- moves.... Test webtbs/tw2242 is a good example to illustrate this.}
|
|
|
- vm:=reginfo[v].movelist;
|
|
|
- for n:=0 to vm^.count-1 do
|
|
|
- with reginfo[u].movelist^ do
|
|
|
- begin
|
|
|
- for o:=0 to count-1 do
|
|
|
- if data[o]=vm^.data[n] then
|
|
|
- goto l1; {Continue outer loop.}
|
|
|
- add_to_movelist(u,vm^.data[n]);
|
|
|
- l1:
|
|
|
- end;
|
|
|
+ end;
|
|
|
|
|
|
enable_moves(v);
|
|
|
|
|
@@ -1080,26 +1112,27 @@ implementation
|
|
|
for i:=1 to adj^.length do
|
|
|
begin
|
|
|
t:=adj^.buf^[i-1];
|
|
|
- if not(ri_coalesced in reginfo[t].flags) then
|
|
|
- begin
|
|
|
- {t has a connection to v. Since we are adding v to u, we
|
|
|
- need to connect t to u. However, beware if t was already
|
|
|
- connected to u...}
|
|
|
- if (ibitmap[t,u]) and not (ri_selected in reginfo[t].flags) then
|
|
|
- {... because in that case, we are actually removing an edge
|
|
|
- and the degree of t decreases.}
|
|
|
- decrement_degree(t)
|
|
|
- else
|
|
|
- begin
|
|
|
- add_edge(t,u);
|
|
|
- {We have added an edge to t and u. So their degree increases.
|
|
|
- However, v is added to u. That means its neighbours will
|
|
|
- no longer point to v, but to u instead. Therefore, only the
|
|
|
- degree of u increases.}
|
|
|
- if (u>=first_imaginary) and not (ri_selected in reginfo[t].flags) then
|
|
|
- inc(reginfo[u].degree);
|
|
|
- end;
|
|
|
- end;
|
|
|
+ with reginfo[t] do
|
|
|
+ if not(ri_coalesced in flags) then
|
|
|
+ begin
|
|
|
+ {t has a connection to v. Since we are adding v to u, we
|
|
|
+ need to connect t to u. However, beware if t was already
|
|
|
+ connected to u...}
|
|
|
+ if (ibitmap[t,u]) and not (ri_selected in flags) then
|
|
|
+ {... because in that case, we are actually removing an edge
|
|
|
+ and the degree of t decreases.}
|
|
|
+ decrement_degree(t)
|
|
|
+ else
|
|
|
+ begin
|
|
|
+ add_edge(t,u);
|
|
|
+ {We have added an edge to t and u. So their degree increases.
|
|
|
+ However, v is added to u. That means its neighbours will
|
|
|
+ no longer point to v, but to u instead. Therefore, only the
|
|
|
+ degree of u increases.}
|
|
|
+ if (u>=first_imaginary) and not (ri_selected in flags) then
|
|
|
+ inc(reginfo[u].degree);
|
|
|
+ end;
|
|
|
+ end;
|
|
|
end;
|
|
|
if (reginfo[u].degree>=usable_registers_cnt) and freezeworklist.delete(u) then
|
|
|
spillworklist.add(u);
|
|
@@ -1968,7 +2001,10 @@ implementation
|
|
|
end.
|
|
|
{
|
|
|
$Log$
|
|
|
- Revision 1.118 2004-02-07 23:28:34 daniel
|
|
|
+ Revision 1.119 2004-02-08 14:26:28 daniel
|
|
|
+ * Register allocator speed boost
|
|
|
+
|
|
|
+ Revision 1.118 2004/02/07 23:28:34 daniel
|
|
|
* Take advantage of our new with statement optimization
|
|
|
|
|
|
Revision 1.117 2004/02/06 13:34:46 daniel
|