ソースを参照

* Register allocator speed boost

daniel 21 年 前
コミット
9868155d53
1 ファイル変更126 行追加90 行削除
  1. 126 90
      compiler/rgobj.pas

+ 126 - 90
compiler/rgobj.pas

@@ -123,7 +123,7 @@ unit rgobj;
       end;
 
       Tmovelist=record
-        count:cardinal;
+        count,sorted_until:cardinal; {count: number of entries in use in data; sorted_until: last index of the sorted prefix that Trgobj.combine binary-searches (sort_movelist sets it to count-1)}
         data:array[0..$ffff] of Tlinkedlistitem; {storage is obtained with getmem/reallocmem, so the declared bound is nominal}
       end;
       Pmovelist=^Tmovelist;
@@ -311,6 +311,42 @@ implementation
        globals,verbose,tgobj,procinfo;
 
 
+    procedure sort_movelist(ml:Pmovelist);
+
+    {Sorts ml^.data[0..count-1] in place into ascending order of the entries'
+     pointer values (compared as ptrint) with a shell sort, then marks the
+     whole list as sorted by setting sorted_until to count-1. A list with
+     fewer than two entries is returned untouched, sorted_until included.
+
+     Ok, sorting pointers is silly, but it does the job to make Trgobj.combine
+     faster.}
+
+    {The indices are cardinal, like Tmovelist.count. With word indices the
+     initial gap computation wraps around to 0 as soon as count exceeds
+     65536, after which the gap loop never terminates.}
+    var h,i,p:cardinal;
+        t:Tlinkedlistitem;
+
+    begin
+      with ml^ do
+        begin
+          if count<2 then
+            exit;
+          {Start with the largest power of two that is smaller than count.}
+          p:=1;
+          while 2*p<count do
+            p:=2*p;
+          {Gapped insertion sort for gap p, then halve the gap down to 1.}
+          while p<>0 do
+            begin
+              for h:=p to count-1 do
+                begin
+                  i:=h;
+                  t:=data[i];
+                  {Shift larger predecessors (in steps of p) up to make
+                   room for t. The comparison must stay a ptrint one,
+                   since Trgobj.combine binary-searches with the same
+                   ordering.}
+                  repeat
+                    if ptrint(data[i-p])<=ptrint(t) then
+                      break;
+                    data[i]:=data[i-p];
+                    dec(i,p);
+                  until i<p;
+                  data[i]:=t;
+                end;
+              p:=p shr 1;
+            end;
+          {Everything up to and including the last entry is now sorted.}
+          sorted_until:=count-1;
+        end;
+    end;
+
 {******************************************************************************
                               tinterferencebitmap
 ******************************************************************************}
@@ -664,11 +700,12 @@ implementation
             begin
               getmem(movelist,64);
               movelist^.count:=0;
+              movelist^.sorted_until:=0;
             end
           else
             begin
               cursize:=memsize(movelist);
-              if (4*(movelist^.count+1)=cursize) then
+              if (4*(movelist^.count+2)=cursize) then
                 reallocmem(movelist,cursize*2);
             end;
           movelist^.data[movelist^.count]:=data;
@@ -742,9 +779,9 @@ implementation
      registers in it cause. This allows simplify to execute in
      constant time.}
 
-    var p,h,i,j,leni,lenj:word;
+    var p,h,i,leni,lent:word;
         t:Tsuperregister;
-        adji,adjj:Psuperregisterworklist;
+        adji,adjt:Psuperregisterworklist;
 
     begin
       with simplifyworklist do
@@ -756,30 +793,25 @@ implementation
             p:=2*p;
           while p<>0 do
             begin
-              for h:=0 to length-p-1 do
+              for h:=p to length-1 do
                 begin
                   i:=h;
+                  t:=buf^[i];
+                  adjt:=reginfo[buf^[i]].adjlist;
+                  lent:=0;
+                  if adjt<>nil then
+                    lent:=adjt^.length;
                   repeat
-                    j:=i+p;
-                    adji:=reginfo[buf^[i]].adjlist;
-                    adjj:=reginfo[buf^[j]].adjlist;
-                    if adji=nil then
-                      leni:=0
-                    else
+                    adji:=reginfo[buf^[i-p]].adjlist;
+                    leni:=0;
+                    if adji<>nil then
                       leni:=adji^.length;
-                    if adjj=nil then
-                      lenj:=0
-                    else
-                      lenj:=adjj^.length;
-                    if lenj>=leni then
-                      break;
-                    t:=buf^[i];
-                    buf^[i]:=buf^[j];
-                    buf^[j]:=t;
-                    if i<p then
+                    if leni<=lent then
                       break;
+                    buf^[i]:=buf^[i-p];
                     dec(i,p)
-                  until false;
+                  until i<p;
+                  buf^[i]:=t;
                 end;
               p:=p shr 1;
             end;
@@ -1009,12 +1041,9 @@ implementation
     procedure trgobj.combine(u,v:Tsuperregister);
 
     var adj : Psuperregisterworklist;
-        i : word;
+        i,n,p,q:cardinal;
         t : tsuperregister;
-        n,o : cardinal;
-        decrement : boolean;
-{	moves:Tsuperregisterset;}
-        vm:Pmovelist;
+        searched:Tlinkedlistitem;
 
     label l1;
 
@@ -1028,50 +1057,53 @@ implementation
       {Combine both movelists. Since the movelists are sets, only add
        elements that are not already present. The movelists cannot be
        empty by definition; nodes are only coalesced if there is a move
-       between them.}
-
-{     Nice attempt; it didn't work.
-      supregset_reset(moves,false);
-      supregset_include(moves,u);
+       between them. To prevent quadratic time blowup (movelists of
+       especially machine registers can get very large because of moves
+       generated during calls) we need to go into disgusting complexity.
+
+       (See webtbs/tw2242 for an example that stresses this.)
+
+       We want to sort the movelist to be able to search logarithmically.
+       Unfortunately, sorting the movelist every time before searching
+       is counter-productive, since the movelist usually grows with a few
+       items at a time. Therefore, we split the movelist into a sorted
+       and an unsorted part and search through both. If the unsorted part
+       becomes too large, we sort.}
+
+      {We have to weigh the cost of sorting the list against the cost of
+       searching the unsorted part. I use a factor of 8 here; if the
+       number of items is less than 8 times the number of unsorted items,
+       we'll sort the list.}
       with reginfo[u].movelist^ do
-        for n:=0 to count-1 do
-	  begin
-	    if Tmoveins(data[n]).x=u then
-              supregset_include(moves,Tmoveins(data[n]).y)
-	    else
-              supregset_include(moves,Tmoveins(data[n]).x)
-          end;
-      with reginfo[v].movelist^ do
-        for n:=0 to count-1 do
-	  begin
-	    if Tmoveins(data[n]).x=v then
-	      begin
-	        if supregset_in(moves,Tmoveins(data[n]).y) then
-        	  add_to_movelist(u,data[n]);
-              end
-	    else
-	      begin
-	        if supregset_in(moves,Tmoveins(data[n]).x) then
-        	  add_to_movelist(u,data[n]);
+        if count<8*(count-sorted_until) then
+          sort_movelist(reginfo[u].movelist);
+      for n:=0 to reginfo[v].movelist^.count-1 do
+        begin
+          {Binary search the sorted part of the list.}
+          searched:=reginfo[v].movelist^.data[n];
+          p:=0;
+          q:=reginfo[u].movelist^.sorted_until;
+          i:=0;
+          if q<>0 then 
+            repeat
+              i:=(p+q) shr 1;
+              if ptrint(searched)>ptrint(reginfo[u].movelist^.data[i]) then
+                p:=i+1
+              else
+                q:=i;
+            until p=q;
+          with reginfo[u].movelist^ do
+            if searched<>data[i] then
+              begin
+                {Linear search the unsorted part of the list.}
+                for i:=sorted_until+1 to count-1 do
+                  if searched=data[i] then
+                    goto l1;
+                {Not found -> add}
+                add_to_movelist(u,searched);
+              l1:
               end;
-	  end;}
-
-      {This loop is a performance bottleneck for large procedures and therefore
-       optimized by hand as much as possible. This is because machine registers
-       generally collect large movelists (for example around procedure calls data
-       is moved into machine registers). The loop below is unfortunately quadratic,
-       and guess what this means when a procedure has collected several thousand
-       moves.... Test webtbs/tw2242 is a good example to illustrate this.}
-      vm:=reginfo[v].movelist;
-      for n:=0 to vm^.count-1 do
-        with reginfo[u].movelist^ do
-          begin
-            for o:=0 to count-1 do
-              if data[o]=vm^.data[n] then
-                goto l1; {Continue outer loop.}
-            add_to_movelist(u,vm^.data[n]);
-          l1:
-          end;
+        end;
 
       enable_moves(v);
 
@@ -1080,26 +1112,27 @@ implementation
         for i:=1 to adj^.length do
           begin
             t:=adj^.buf^[i-1];
-            if not(ri_coalesced in reginfo[t].flags) then
-              begin
-                {t has a connection to v. Since we are adding v to u, we
-                 need to connect t to u. However, beware if t was already
-                 connected to u...}
-                if (ibitmap[t,u]) and not (ri_selected in reginfo[t].flags) then
-                  {... because in that case, we are actually removing an edge
-                   and the degree of t decreases.}
-                  decrement_degree(t)
-                else
-                  begin
-                    add_edge(t,u);
-                    {We have added an edge to t and u. So their degree increases.
-                     However, v is added to u. That means its neighbours will
-                     no longer point to v, but to u instead. Therefore, only the
-                     degree of u increases.}
-                    if (u>=first_imaginary) and not (ri_selected in reginfo[t].flags) then
-                      inc(reginfo[u].degree);
-                  end;
-              end;
+            with reginfo[t] do
+              if not(ri_coalesced in flags) then
+                begin
+                  {t has a connection to v. Since we are adding v to u, we
+                   need to connect t to u. However, beware if t was already
+                   connected to u...}
+                  if (ibitmap[t,u]) and not (ri_selected in flags) then
+                    {... because in that case, we are actually removing an edge
+                     and the degree of t decreases.}
+                    decrement_degree(t)
+                  else
+                    begin
+                      add_edge(t,u);
+                      {We have added an edge to t and u. So their degree increases.
+                       However, v is added to u. That means its neighbours will
+                       no longer point to v, but to u instead. Therefore, only the
+                       degree of u increases.}
+                      if (u>=first_imaginary) and not (ri_selected in flags) then
+                        inc(reginfo[u].degree);
+                    end;
+                end;
           end;
       if (reginfo[u].degree>=usable_registers_cnt) and freezeworklist.delete(u) then
         spillworklist.add(u);
@@ -1968,7 +2001,10 @@ implementation
 end.
 {
   $Log$
-  Revision 1.118  2004-02-07 23:28:34  daniel
+  Revision 1.119  2004-02-08 14:26:28  daniel
+    * Register allocator speed boost
+
+  Revision 1.118  2004/02/07 23:28:34  daniel
     * Take advantage of our new with statement optimization
 
   Revision 1.117  2004/02/06 13:34:46  daniel