Browse Source

* x86: Introduced TrySwapMovOp method, and redesigned TrySwapMovCmp
to use it while also trying to move one more instruction back

J. Gareth "Curious Kit" Moreton 3 years ago
parent
commit
5f3749dc49
1 changed file with 129 additions and 37 deletions
  1. 129 37
      compiler/x86/aoptx86.pas

+ 129 - 37
compiler/x86/aoptx86.pas

@@ -211,6 +211,7 @@ unit aoptx86;
         procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
         procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
 
 
         function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
         function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
+        function TrySwapMovOp(var p, hp1: tai): Boolean;
         function TrySwapMovCmp(var p, hp1: tai): Boolean;
         function TrySwapMovCmp(var p, hp1: tai): Boolean;
 
 
         { Processor-dependent reference optimisation }
         { Processor-dependent reference optimisation }
@@ -8453,10 +8454,10 @@ unit aoptx86;
             Break;
             Break;
 
 
           case taicpu(hp2).opcode of
           case taicpu(hp2).opcode of
-            A_MOVSS:
+            A_MOVSD:
               begin
               begin
                 if taicpu(hp2).ops = 0 then
                 if taicpu(hp2).ops = 0 then
-                  { Wrong MOVSS }
+                  { Wrong MOVSD }
                   Break;
                   Break;
                 Inc(Count);
                 Inc(Count);
                 if Count >= 5 then
                 if Count >= 5 then
@@ -8475,7 +8476,7 @@ unit aoptx86;
             A_MOVZX,
             A_MOVZX,
             A_MOVAPS,
             A_MOVAPS,
             A_MOVUPS,
             A_MOVUPS,
-            A_MOVSD,
+            A_MOVSS,
             A_MOVAPD,
             A_MOVAPD,
             A_MOVUPD,
             A_MOVUPD,
             A_MOVDQA,
             A_MOVDQA,
@@ -8626,41 +8627,38 @@ unit aoptx86;
     end;
     end;
 
 
 
 
-  function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
+  const
+    WriteOp: array[0..3] of set of TInsChange = (
+      [Ch_Wop1, Ch_RWop1, Ch_Mop1],
+      [Ch_Wop2, Ch_RWop2, Ch_Mop2],
+      [Ch_Wop3, Ch_RWop3, Ch_Mop3],
+      [Ch_Wop4, Ch_RWop4, Ch_Mop4]);
+
+    RegWriteFlags: array[0..7] of set of TInsChange = (
+      { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
+      [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
+      [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
+      [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
+      [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
+      [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
+      [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
+      [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
+      [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
+
+
+  function TX86AsmOptimizer.TrySwapMovOp(var p, hp1: tai): Boolean;
     var
     var
       hp2: tai;
       hp2: tai;
       X: Integer;
       X: Integer;
-    const
-      WriteOp: array[0..3] of set of TInsChange = (
-        [Ch_Wop1, Ch_RWop1, Ch_Mop1],
-        [Ch_Wop2, Ch_RWop2, Ch_Mop2],
-        [Ch_Wop3, Ch_RWop3, Ch_Mop3],
-        [Ch_Wop4, Ch_RWop4, Ch_Mop4]);
-
-      RegWriteFlags: array[0..7] of set of TInsChange = (
-        { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
-        [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
-        [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
-        [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
-        [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
-        [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
-        [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
-        [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
-        [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
-
     begin
     begin
       { If we have something like:
       { If we have something like:
-          cmp ###,%reg1
-          mov 0,%reg2
+          op  ###,###
+          mov ###,###
 
 
-        And no modified registers are shared, move the instruction to before
-        the comparison as this means it can be optimised without worrying
-        about the FLAGS register. (CMP/MOV is generated by
-        "J(c)Mov1JmpMov0 -> Set(~c)", among other things).
+        Try to move the MOV instruction to before OP as long as OP and MOV don't
+        interfere with regard to what they write to.
 
 
-        As long as the second instruction doesn't use the flags or one of the
-        registers used by CMP or TEST (also check any references that use the
-        registers), then it can be moved prior to the comparison.
+        NOTE: p must be a 2-operand instruction
       }
       }
 
 
       Result := False;
       Result := False;
@@ -8672,12 +8670,12 @@ unit aoptx86;
       { NOP is a pipeline fence, likely marking the beginning of the function
       { NOP is a pipeline fence, likely marking the beginning of the function
         epilogue, so drop out.  Similarly, drop out if POP or RET are
         epilogue, so drop out.  Similarly, drop out if POP or RET are
         encountered }
         encountered }
-      if MatchInstruction(hp1, A_NOP, A_POP, []) then
+      if MatchInstruction(hp1, A_NOP, A_POP, A_RET, []) then
         Exit;
         Exit;
 
 
-      if (taicpu(hp1).opcode = A_MOVSS) and
+      if (taicpu(hp1).opcode = A_MOVSD) and
         (taicpu(hp1).ops = 0) then
         (taicpu(hp1).ops = 0) then
-        { Wrong MOVSS }
+        { Wrong MOVSD }
         Exit;
         Exit;
 
 
       { Check for writes to specific registers first }
       { Check for writes to specific registers first }
@@ -8705,6 +8703,25 @@ unit aoptx86;
             Exit;
             Exit;
         end;
         end;
 
 
+      { Check p to make sure it doesn't write to something that affects hp1 }
+
+      { Check for writes to specific registers first }
+      { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
+      for X := 0 to 7 do
+        if (RegWriteFlags[X] * InsProp[taicpu(p).opcode].Ch <> [])
+          and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), hp1) then
+          Exit;
+
+      for X := 0 to taicpu(p).ops - 1 do
+        begin
+          { Check to see if this operand writes to something }
+          if ((WriteOp[X] * InsProp[taicpu(p).opcode].Ch) <> []) and
+            { And matches something in hp1 }
+            (taicpu(p).oper[X]^.typ = top_reg) and
+            RegInInstruction(taicpu(p).oper[X]^.reg, hp1) then
+            Exit;
+        end;
+
       { The instruction can be safely moved }
       { The instruction can be safely moved }
       asml.Remove(hp1);
       asml.Remove(hp1);
 
 
@@ -8712,6 +8729,17 @@ unit aoptx86;
         can be optimised into "xor %reg,%reg" later }
         can be optimised into "xor %reg,%reg" later }
       if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
       if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
         asml.InsertBefore(hp1, hp2)
         asml.InsertBefore(hp1, hp2)
+
+        { Failing that, try to insert after the last instruction where the
+          FLAGS register is not yet in use }
+      else if GetLastInstruction(p, hp2) and
+        (
+          (hp2.typ <> ait_instruction) or
+          { Don't insert after an instruction that uses the flags when p doesn't use them }
+          RegInInstruction(NR_DEFAULTFLAGS, p) or
+          not RegInInstruction(NR_DEFAULTFLAGS, hp2)
+        ) then
+        asml.InsertAfter(hp1, hp2)
       else
       else
         { Note, if p.Previous is nil (even if it should logically never be the
         { Note, if p.Previous is nil (even if it should logically never be the
           case), FindRegAllocBackward immediately exits with False and so we
           case), FindRegAllocBackward immediately exits with False and so we
@@ -8721,26 +8749,90 @@ unit aoptx86;
 
 
       DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);
       DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);
 
 
+      { We can't trust UsedRegs because we're looking backwards, although we
+        know the registers are allocated after p at the very least, so manually
+        create tai_regalloc objects if needed }
       for X := 0 to taicpu(hp1).ops - 1 do
       for X := 0 to taicpu(hp1).ops - 1 do
         case taicpu(hp1).oper[X]^.typ of
         case taicpu(hp1).oper[X]^.typ of
           top_reg:
           top_reg:
-            AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
+            begin
+              asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.reg, nil), hp1);
+              IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.reg, UsedRegs);
+              AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
+            end;
           top_ref:
           top_ref:
             begin
             begin
               if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
               if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
-                AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
+                begin
+                  asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.base, nil), hp1);
+                  IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.base, UsedRegs);
+                  AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
+                end;
               if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
               if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
-                AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
+                begin
+                  asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.index, nil), hp1);
+                  IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.index, UsedRegs);
+                  AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
+                end;
             end;
             end;
           else
           else
             ;
             ;
         end;
         end;
 
 
+      Result := True;
+    end;
+
+
+  function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
+    var
+      hp2: tai;
+      X: Integer;
+    begin
+      { If we have something like:
+          cmp ###,%reg1
+          mov 0,%reg2
+
+        And no modified registers are shared, move the instruction to before
+        the comparison as this means it can be optimised without worrying
+        about the FLAGS register. (CMP/MOV is generated by
+        "J(c)Mov1JmpMov0 -> Set(~c)", among other things).
+
+        As long as the second instruction doesn't use the flags or one of the
+        registers used by CMP or TEST (also check any references that use the
+        registers), then it can be moved prior to the comparison.
+      }
+
+      Result := False;
+      if not TrySwapMovOp(p, hp1) then
+        Exit;
+
       if taicpu(hp1).opcode = A_LEA then
       if taicpu(hp1).opcode = A_LEA then
         { The flags will be overwritten by the CMP/TEST instruction }
         { The flags will be overwritten by the CMP/TEST instruction }
         ConvertLEA(taicpu(hp1));
         ConvertLEA(taicpu(hp1));
 
 
       Result := True;
       Result := True;
+
+      { Can we move it one further back? }
+      if GetLastInstruction(hp1, hp2) and (hp2.typ = ait_instruction) and
+        { Check to see if CMP/TEST is a comparison against zero }
+        (
+          (
+            (taicpu(p).opcode = A_CMP) and
+            MatchOperand(taicpu(p).oper[0]^, 0)
+          ) or
+          (
+            (taicpu(p).opcode = A_TEST) and
+            (
+              OpsEqual(taicpu(p).oper[0]^, taicpu(p).oper[1]^) or
+              MatchOperand(taicpu(p).oper[0]^, -1)
+            )
+          )
+        ) and
+        { These instructions set the zero flag if the result is zero }
+        MatchInstruction(hp2, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) and
+        OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) then
+          { Looks like we can - if successful, this benefits PostPeepholeOptTestOr }
+          TrySwapMovOp(hp2, hp1);
     end;
     end;