
+ patch by J. Gareth Moreton: SHL-centric peephole optimisations, resolves #37389

git-svn-id: trunk@45811 -
florian 5 years ago
Parent
Commit
09125e834f
1 changed file with 161 additions and 5 deletions:
  1. compiler/x86/aoptx86.pas (161 additions, 5 deletions)

+ 161 - 5
compiler/x86/aoptx86.pas

@@ -3396,10 +3396,15 @@ unit aoptx86;
         TmpBool1,TmpBool2 : Boolean;
         tmpref : treference;
         hp1,hp2: tai;
+        mask:    tcgint;
       begin
         Result:=false;
-        if MatchOpType(taicpu(p),top_const,top_reg) and
-           (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
+
+        { All these optimisations work on "shl/sal const,%reg" }
+        if not MatchOpType(taicpu(p),top_const,top_reg) then
+          Exit;
+
+        if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
            (taicpu(p).oper[0]^.val <= 3) then
           { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
           begin
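
Aside (not part of the patch): the shift constant is capped at 3 here because lea can only scale its index register by 1, 2, 4 or 8, i.e. by 1 shl c for c up to 3, and the rewrite relies on (v shl c) + d being the same as the lea address d + v * (1 shl c). A minimal Free Pascal sketch of that identity, with a hypothetical standalone program name:

    program ShlLeaCheck;
    { Hypothetical standalone check, not part of aoptx86.pas:
      "shl c, %reg; add d, %reg" can only become a single lea while
      1 shl c is a valid lea scale factor (1, 2, 4 or 8), i.e. c <= 3. }
    var
      c: Integer;
      v, d: Int64;
    begin
      v := 12345;
      d := 678;
      for c := 0 to 3 do
        { the lea computes d + v * scale with scale = 1 shl c }
        if (v shl c) + d <> d + v * (Int64(1) shl c) then
          WriteLn('mismatch at shift count ', c);
      WriteLn('ok');
    end.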
@@ -3511,8 +3516,7 @@ unit aoptx86;
               end;
           end
 {$ifndef x86_64}
-        else if (current_settings.optimizecputype < cpu_Pentium2) and
-          MatchOpType(taicpu(p),top_const,top_reg) then
+        else if (current_settings.optimizecputype < cpu_Pentium2) then
           begin
             { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
              but faster on a 486, and pairable in both U and V pipes on the Pentium
@@ -3540,7 +3544,130 @@ unit aoptx86;
              end;
           end
 {$endif x86_64}
-          ;
+        else if
+          GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
+          (
+            (
+              MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
+              SetAndTest(hp1, hp2)
+{$ifdef x86_64}
+            ) or
+            (
+              MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
+              GetNextInstruction(hp1, hp2) and
+              MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
+              MatchOpType(taicpu(hp2), top_reg, top_reg) and
+              (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
+{$endif x86_64}
+            )
+          ) and
+          (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
+          begin
+            { Change:
+                shl x, %reg1
+                mov -(1<<x), %reg2
+                and %reg2, %reg1
+
+              Or:
+                shl x, %reg1
+                and -(1<<x), %reg1
+
+              To just:
+                shl x, %reg1
+
+              Since the and operation only zeroes bits that are already zero from the shl operation
+            }
+            case taicpu(p).oper[0]^.val of
+               8:
+                 mask:=$FFFFFFFFFFFFFF00;
+               16:
+                 mask:=$FFFFFFFFFFFF0000;
+               32:
+                 mask:=$FFFFFFFF00000000;
+               63:
+                 { Constant pre-calculated to prevent overflow errors with Int64 }
+                 mask:=$8000000000000000;
+               else
+                 begin
+                   if taicpu(p).oper[0]^.val >= 64 then
+                     { Shouldn't happen realistically, since the register
+                       is guaranteed to be set to zero at this point }
+                     mask := 0
+                   else
+                     mask := -(Int64(1) shl taicpu(p).oper[0]^.val);
+                 end;
+            end;
+
+            if taicpu(hp1).oper[0]^.val = mask then
+              begin
+                { Everything checks out, perform the optimisation, as long as
+                  the FLAGS register isn't being used }
+                TransferUsedRegs(TmpUsedRegs);
+                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+
+{$ifdef x86_64}
+                if (hp1 <> hp2) then
+                  begin
+                    { "shl/mov/and" version }
+                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
+
+                    { Don't do the optimisation if the FLAGS register is in use }
+                    if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
+                      begin
+                        DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
+                        { Don't remove the 'mov' instruction if its register is used elsewhere }
+                        if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
+                          begin
+                            asml.Remove(hp1);
+                            hp1.Free;
+                            Result := True;
+                          end;
+
+                        { Note: Result is only set to True above, when the 'mov' instruction was removed }
+                        asml.Remove(hp2);
+                        hp2.Free;
+                      end;
+                  end
+                else
+{$endif x86_64}
+                  begin
+                    { "shl/and" version }
+
+                    { Don't do the optimisation if the FLAGS register is in use }
+                    if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
+                      begin
+                        DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
+                        asml.Remove(hp1);
+                        hp1.Free;
+                        Result := True;
+                      end;
+                  end;
+
+                Exit;
+              end
+            else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
+              begin
+                { Even if the mask doesn't allow for its removal, we might be
+                  able to optimise the mask for the "shl/and" version, which
+                  may permit other peephole optimisations }
+{$ifdef DEBUG_AOPTCPU}
+                mask := taicpu(hp1).oper[0]^.val and mask;
+                if taicpu(hp1).oper[0]^.val <> mask then
+                  begin
+                    DebugMsg(
+                      SPeepholeOptimization +
+                      'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
+                      ' to $' + debug_tostr(mask) +
+                      ' based on previous instruction (ShlAnd2ShlAnd)', hp1);
+                    taicpu(hp1).oper[0]^.val := mask;
+                  end;
+{$else DEBUG_AOPTCPU}
+                { If debugging is off, just set the operand even if it's the same }
+                taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
+{$endif DEBUG_AOPTCPU}
+              end;
+
+          end;
       end;
 
 
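Aside (not part of the patch): a minimal Free Pascal sketch of the reasoning behind ShlAnd2Shl, with a hypothetical standalone program name. After "shl x, %reg" the low x bits of the register are zero, and -(1 shl x) is exactly the mask whose low x bits are clear and whose remaining bits are set, so the following and can never clear anything further:

    program ShlAndCheck;
    { Hypothetical standalone check, not part of aoptx86.pas. }
    var
      x: Integer;
      v, mask: Int64;
    begin
      v := $123456789ABCDEF;
      for x := 1 to 62 do
        begin
          { -(1 shl x) has the low x bits clear and every higher bit set }
          mask := -(Int64(1) shl x);
          if ((v shl x) and mask) <> (v shl x) then
            WriteLn('mismatch at shift count ', x);
        end;
      WriteLn('ok');
    end.
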
@@ -5357,6 +5484,35 @@ unit aoptx86;
                 hp1.Free;
               end;
           end
+        else if reg_and_hp1_is_instr and
+          (taicpu(p).oper[0]^.typ = top_reg) and
+          (
+            (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
+          ) and
+          (taicpu(hp1).oper[0]^.typ = top_const) and
+          SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
+          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
+          { Minimum shift value allowed is the bit difference between the sizes }
+          (taicpu(hp1).oper[0]^.val >=
+            { Multiply by 8 because tcgsize2size returns bytes, not bits }
+            8 * (
+              tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
+              tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
+            )
+          ) then
+          begin
+            { For:
+                movsx/movzx %reg1,%reg1 (same register, just different sizes)
+                shl/sal     ##,   %reg1
+
+              Remove the movsx/movzx instruction if the shift overwrites the
+              extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax)
+            }
+            DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
+            RemoveCurrentP(p, hp1);
+            Result := True;
+            Exit;
+          end
         else if taicpu(p).opcode=A_MOVZX then
           begin
             { removes superfluous And's after movzx's }