فهرست منبع

+ more sophisticated code to optimize multiplications on arm
+ the multiplication optimization needs a popcnt function
+ simple test

git-svn-id: trunk@22299 -

florian 13 سال پیش
والد
کامیت
2f8027c63f
4فایلهای تغییر یافته به همراه168 افزوده شده و 1 حذف شده
  1. 1 0
      .gitattributes
  2. 102 1
      compiler/arm/cgcpu.pas
  3. 51 0
      compiler/cutils.pas
  4. 14 0
      tests/test/tmul1.pp

+ 1 - 0
.gitattributes

@@ -10918,6 +10918,7 @@ tests/test/tmsg2.pp svneol=native#text/plain
 tests/test/tmsg3.pp svneol=native#text/plain
 tests/test/tmsg4.pp svneol=native#text/plain
 tests/test/tmt1.pp svneol=native#text/plain
+tests/test/tmul1.pp svneol=native#text/pascal
 tests/test/tnoext1.pp svneol=native#text/plain
 tests/test/tnoext2.pp svneol=native#text/plain
 tests/test/tnoext3.pp svneol=native#text/plain

+ 102 - 1
compiler/arm/cgcpu.pas

@@ -114,7 +114,10 @@ unit cgcpu;
         procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src,dst: tregister;shuffle : pmmshuffle); override;
         { Transform unsupported methods into Internal errors }
         procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; size: TCGSize; src, dst: TRegister); override;
-      private
+
+        { try to generate optimized 32 Bit multiplication, returns true if successful generated }
+        function try_optimized_mul32_const_reg_reg(list: TAsmList; a: tcgint; src, dst: tregister) : boolean;
+
         { clear out potential overflow bits from 8 or 16 bit operations  }
         { the upper 24/16 bits of a register after an operation          }
         procedure maybeadjustresult(list: TAsmList; op: TOpCg; size: tcgsize; dst: tregister);
@@ -653,6 +656,100 @@ unit cgcpu;
         end
       end;
 
+
+    function tcgarm.try_optimized_mul32_const_reg_reg(list: TAsmList; a: tcgint; src, dst: tregister) : boolean;
+      var
+        multiplier : dword;
+        power : longint;
+        shifterop : tshifterop;
+        bitsset : byte;
+        negative : boolean;
+        first : boolean;
+        cycles : byte;
+      begin
+        result:=true;
+        cycles:=0;
+        negative:=a<0;
+        shifterop.rs:=NR_NO;
+        shifterop.shiftmode:=SM_LSL;
+        if negative then
+          inc(cycles);
+        multiplier:=dword(abs(a));
+        bitsset:=popcnt(multiplier and $fffffffe);
+
+        { most simple cases }
+        if a=1 then
+          a_load_reg_reg(list,OS_32,OS_32,src,dst)
+        else if a=0 then
+          a_load_const_reg(list,OS_32,0,dst)
+        else if a=-1 then
+          a_op_reg_reg(list,OP_NEG,OS_32,src,dst)
+        { add up ?
+
+          basically, one add is needed for each bit being set in the constant factor
+          however, the least significant bit is for free, it can be hidden in the initial
+          instruction
+        }
+        else if (bitsset+cycles<=3) and
+          (bitsset>popcnt(dword(nextpowerof2(multiplier,power)-multiplier) and $fffffffe)) then
+          begin
+            first:=true;
+            while multiplier<>0 do
+              begin
+                shifterop.shiftimm:=BsrDWord(multiplier);
+                if odd(multiplier) then
+                  begin
+                    list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,dst,src,src,shifterop));
+                    dec(multiplier);
+                  end
+                else
+                  if first then
+                    list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,src,shifterop))
+                  else
+                    list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,dst,dst,src,shifterop));
+                first:=false;
+                dec(multiplier,1 shl shifterop.shiftimm);
+              end;
+            if negative then
+              list.concat(taicpu.op_reg_reg_const(A_RSB,dst,dst,0));
+          end
+        { subtract from the next greater power of two? }
+        else if popcnt(dword(nextpowerof2(multiplier,power)-multiplier) and $fffffffe)+cycles<=3 then
+          begin
+            first:=true;
+            while multiplier<>0 do
+              begin
+                if first then
+                  begin
+                    multiplier:=(1 shl power)-multiplier;
+                    shifterop.shiftimm:=power;
+                  end
+                else
+                  shifterop.shiftimm:=BsrDWord(multiplier);
+
+                if odd(multiplier) then
+                  begin
+                    list.concat(taicpu.op_reg_reg_reg_shifterop(A_RSB,dst,src,src,shifterop));
+                    dec(multiplier);
+                  end
+                else
+                  if first then
+                    list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,src,shifterop))
+                  else
+                    begin
+                      list.concat(taicpu.op_reg_reg_reg_shifterop(A_SUB,dst,dst,src,shifterop));
+                      dec(multiplier,1 shl shifterop.shiftimm);
+                    end;
+                first:=false;
+              end;
+            if negative then
+              list.concat(taicpu.op_reg_reg_const(A_RSB,dst,dst,0));
+          end
+        else
+          result:=false;
+      end;
+
+
     procedure tcgarm.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: TOpCg; size: tcgsize; a: tcgint; src, dst: tregister;setflags : boolean;var ovloc : tlocation);
       var
         shift : byte;
@@ -765,6 +862,10 @@ unit cgcpu;
                 so.shiftimm:=l1;
                 list.concat(taicpu.op_reg_reg_reg_shifterop(A_RSB,dst,src,src,so));
               end
+            else if (op in [OP_MUL,OP_IMUL]) and not(cgsetflags or setflags) and try_optimized_mul32_const_reg_reg(list,a,src,dst) then
+              begin
+                { nothing to do on success }
+              end
             { x := y and 0; just clears a register, this sometimes gets generated on 64bit ops.
               Just using mov x, #0 might allow some easier optimizations down the line. }
             else if (op = OP_AND) and (dword(a)=0) then

+ 51 - 0
compiler/cutils.pas

@@ -90,6 +90,13 @@ interface
     }
     function ispowerof2(value : int64;out power : longint) : boolean;
     function nextpowerof2(value : int64; out power: longint) : int64;
+{$ifdef VER2_6}  { only 2.7.1+ has a popcnt function in the system unit }
+    function PopCnt(AValue : Byte): Byte;
+    function PopCnt(AValue : Word): Word;
+    function PopCnt(AValue : DWord): DWord;
+    function PopCnt(Const AValue : QWord): QWord;
+{$endif VER2_6}
+
     function backspace_quote(const s:string;const qchars:Tcharset):string;
     function octal_quote(const s:string;const qchars:Tcharset):string;
 
@@ -845,6 +852,50 @@ implementation
           end;
       end;
 
+{$ifdef VER2_6}
+    const
+      PopCntData : array[0..15] of byte = (0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4);
+
+    function PopCnt(AValue : Byte): Byte;
+      var
+        i : SizeInt;
+      begin
+        Result:=PopCntData[AValue and $f]+PopCntData[(AValue shr 4) and $f];
+      end;
+
+
+    function PopCnt(AValue : Word): Word;
+      var
+        i : SizeInt;
+      begin
+        Result:=0;
+        for i:=0 to 3 do
+          begin
+            inc(Result,PopCntData[AValue and $f]);
+            AValue:=AValue shr 4;
+          end;
+      end;
+
+
+    function PopCnt(AValue : DWord): DWord;
+      var
+        i : SizeInt;
+      begin
+        Result:=0;
+        for i:=0 to 7 do
+          begin
+            inc(Result,PopCntData[AValue and $f]);
+            AValue:=AValue shr 4;
+          end;
+      end;
+
+
+    function PopCnt(Const AValue : QWord): QWord;
+      begin
+        Result:=PopCnt(lo(AValue))+PopCnt(hi(AValue))
+      end;
+ {$endif VER2_6}
+
 
     function backspace_quote(const s:string;const qchars:Tcharset):string;
 

+ 14 - 0
tests/test/tmul1.pp

@@ -0,0 +1,14 @@
+var
+  i : longint;
+
+begin
+  i:=5;
+  i:=i*10;
+  i:=i*62;
+  i:=i*-10;
+  i:=i*-62;
+  i:=i*87;
+  if i<>167214000 then
+    halt(1);
+  writeln('ok');
+end.