Browse Source

* x86: Introduced TrySwapMovOp method, and redesigned TrySwapMovCmp
to use it while also trying to move one more instruction back

J. Gareth "Curious Kit" Moreton 3 years ago
parent
commit
5f3749dc49
1 changed file with 129 additions and 37 deletions
  1. 129 37
      compiler/x86/aoptx86.pas

+ 129 - 37
compiler/x86/aoptx86.pas

@@ -211,6 +211,7 @@ unit aoptx86;
         procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
 
         function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
+        function TrySwapMovOp(var p, hp1: tai): Boolean;
         function TrySwapMovCmp(var p, hp1: tai): Boolean;
 
         { Processor-dependent reference optimisation }
@@ -8453,10 +8454,10 @@ unit aoptx86;
             Break;
 
           case taicpu(hp2).opcode of
-            A_MOVSS:
+            A_MOVSD:
               begin
                 if taicpu(hp2).ops = 0 then
-                  { Wrong MOVSS }
+                  { Wrong MOVSD }
                   Break;
                 Inc(Count);
                 if Count >= 5 then
@@ -8475,7 +8476,7 @@ unit aoptx86;
             A_MOVZX,
             A_MOVAPS,
             A_MOVUPS,
-            A_MOVSD,
+            A_MOVSS,
             A_MOVAPD,
             A_MOVUPD,
             A_MOVDQA,
@@ -8626,41 +8627,38 @@ unit aoptx86;
     end;
 
 
-  function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
+  const
+    WriteOp: array[0..3] of set of TInsChange = ( { Indexed by 0-based operand
+      number: entry N lists the change flags that TrySwapMovOp intersects with
+      InsProp[opcode].Ch to decide whether operand N is written to }
+      [Ch_Wop1, Ch_RWop1, Ch_Mop1],
+      [Ch_Wop2, Ch_RWop2, Ch_Mop2],
+      [Ch_Wop3, Ch_RWop3, Ch_Mop3],
+      [Ch_Wop4, Ch_RWop4, Ch_Mop4]);
+
+    RegWriteFlags: array[0..7] of set of TInsChange = (
+      { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP.
+        TrySwapMovOp casts the array index to TSuperRegister when building the
+        register to look for, so entry N must describe writes to integer
+        super-register N.  The 64-bit (R?X) flags are only part of each set
+        when x86_64 is defined }
+      [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
+      [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
+      [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
+      [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
+      [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
+      [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
+      [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
+      [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
+
+
+  function TX86AsmOptimizer.TrySwapMovOp(var p, hp1: tai): Boolean;
     var
       hp2: tai;
       X: Integer;
-    const
-      WriteOp: array[0..3] of set of TInsChange = (
-        [Ch_Wop1, Ch_RWop1, Ch_Mop1],
-        [Ch_Wop2, Ch_RWop2, Ch_Mop2],
-        [Ch_Wop3, Ch_RWop3, Ch_Mop3],
-        [Ch_Wop4, Ch_RWop4, Ch_Mop4]);
-
-      RegWriteFlags: array[0..7] of set of TInsChange = (
-        { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
-        [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
-        [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
-        [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
-        [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
-        [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
-        [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
-        [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
-        [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
-
     begin
       { If we have something like:
-          cmp ###,%reg1
-          mov 0,%reg2
+          op  ###,###
+          mov ###,###
 
-        And no modified registers are shared, move the instruction to before
-        the comparison as this means it can be optimised without worrying
-        about the FLAGS register. (CMP/MOV is generated by
-        "J(c)Mov1JmpMov0 -> Set(~c)", among other things).
+        Try to move the MOV instruction to before OP as long as OP and MOV don't
+        interfere in regards to what they write to.
 
-        As long as the second instruction doesn't use the flags or one of the
-        registers used by CMP or TEST (also check any references that use the
-        registers), then it can be moved prior to the comparison.
+        NOTE: p must be a 2-operand instruction
       }
 
       Result := False;
@@ -8672,12 +8670,12 @@ unit aoptx86;
       { NOP is a pipeline fence, likely marking the beginning of the function
         epilogue, so drop out.  Similarly, drop out if POP or RET are
         encountered }
-      if MatchInstruction(hp1, A_NOP, A_POP, []) then
+      if MatchInstruction(hp1, A_NOP, A_POP, A_RET, []) then
         Exit;
 
-      if (taicpu(hp1).opcode = A_MOVSS) and
+      if (taicpu(hp1).opcode = A_MOVSD) and
         (taicpu(hp1).ops = 0) then
-        { Wrong MOVSS }
+        { Wrong MOVSD }
         Exit;
 
       { Check for writes to specific registers first }
@@ -8705,6 +8703,25 @@ unit aoptx86;
             Exit;
         end;
 
+      { Check p to make sure it doesn't write to something that affects hp1 }
+
+      { Check for writes to specific registers first }
+      { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
+      for X := 0 to 7 do
+        if (RegWriteFlags[X] * InsProp[taicpu(p).opcode].Ch <> [])
+          and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), hp1) then
+          Exit;
+
+      for X := 0 to taicpu(p).ops - 1 do
+        begin
+          { Check to see if this operand writes to something }
+          if ((WriteOp[X] * InsProp[taicpu(p).opcode].Ch) <> []) and
+            { And matches something in hp1 }
+            (taicpu(p).oper[X]^.typ = top_reg) and
+            RegInInstruction(taicpu(p).oper[X]^.reg, hp1) then
+            Exit;
+        end;
+
       { The instruction can be safely moved }
       asml.Remove(hp1);
 
@@ -8712,6 +8729,17 @@ unit aoptx86;
         can be optimised into "xor %reg,%reg" later }
       if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
         asml.InsertBefore(hp1, hp2)
+
+        { Failing that, try to insert after the previous instruction, as long
+          as the FLAGS register is not yet in use there }
+      else if GetLastInstruction(p, hp2) and
+        (
+          (hp2.typ <> ait_instruction) or
+          { Don't insert after an instruction that uses the flags when p doesn't use them }
+          RegInInstruction(NR_DEFAULTFLAGS, p) or
+          not RegInInstruction(NR_DEFAULTFLAGS, hp2)
+        ) then
+        asml.InsertAfter(hp1, hp2)
       else
         { Note, if p.Previous is nil (even if it should logically never be the
           case), FindRegAllocBackward immediately exits with False and so we
@@ -8721,26 +8749,90 @@ unit aoptx86;
 
       DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);
 
+      { We can't trust UsedRegs because we're looking backwards, although we
+        know the registers are allocated after p at the very least, so manually
+        create tai_regalloc objects if needed }
       for X := 0 to taicpu(hp1).ops - 1 do
         case taicpu(hp1).oper[X]^.typ of
           top_reg:
-            AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
+            begin
+              asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.reg, nil), hp1);
+              IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.reg, UsedRegs);
+              AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
+            end;
           top_ref:
             begin
               if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
-                AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
+                begin
+                  asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.base, nil), hp1);
+                  IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.base, UsedRegs);
+                  AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
+                end;
               if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
-                AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
+                begin
+                  asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.index, nil), hp1);
+                  IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.index, UsedRegs);
+                  AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
+                end;
             end;
           else
             ;
         end;
 
+      Result := True;
+    end;
+
+
+  function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
+    var
+      hp2: tai;
+      X: Integer; { NOTE(review): X is not referenced anywhere in this body }
+    begin
+      { If we have something like:
+          cmp ###,%reg1
+          mov 0,%reg2
+
+        And no modified registers are shared, move the instruction to before
+        the comparison as this means it can be optimised without worrying
+        about the FLAGS register. (CMP/MOV is generated by
+        "J(c)Mov1JmpMov0 -> Set(~c)", among other things).
+
+        As long as the second instruction doesn't use the flags or one of the
+        registers used by CMP or TEST (also check any references that use the
+        registers), then it can be moved prior to the comparison.
+
+        Parameters:
+          p   - the comparison instruction (CMP or TEST)
+          hp1 - the instruction following p that is a candidate for moving
+
+        The actual safety checks and the relocation of hp1 are delegated to
+        TrySwapMovOp.  Returns True if that first move succeeded and False
+        otherwise; the outcome of the optional second move further back (see
+        the end of this routine) does not affect the result.
+      }
+
+      Result := False;
+      if not TrySwapMovOp(p, hp1) then
+        Exit;
+
       if taicpu(hp1).opcode = A_LEA then
         { The flags will be overwritten by the CMP/TEST instruction }
         ConvertLEA(taicpu(hp1));
 
       Result := True;
+
+      { Can we move it one further back?  hp1 has already been relocated by
+        TrySwapMovOp, so hp2 is whatever instruction now precedes it.  If p
+        compares its second operand against zero and hp2 is an arithmetic or
+        bitwise instruction with that same second operand, try to hoist hp1
+        above hp2 as well, so that hp1 no longer sits between hp2 and p }
+      if GetLastInstruction(hp1, hp2) and (hp2.typ = ait_instruction) and
+        { Check to see if CMP/TEST is a comparison against zero }
+        (
+          (
+            (taicpu(p).opcode = A_CMP) and
+            MatchOperand(taicpu(p).oper[0]^, 0)
+          ) or
+          (
+            (taicpu(p).opcode = A_TEST) and
+            (
+              OpsEqual(taicpu(p).oper[0]^, taicpu(p).oper[1]^) or
+              MatchOperand(taicpu(p).oper[0]^, -1)
+            )
+          )
+        ) and
+        { These instructions set the zero flag if the result is zero }
+        MatchInstruction(hp2, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) and
+        OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) then
+          { Looks like we can - if successful, this benefits PostPeepholeOptTestOr.
+            The return value of this second attempt is not checked; Result
+            stays True either way }
+          TrySwapMovOp(hp2, hp1);
     end;