Browse Source

* use add/sub instead of inc/dec on modern CPUs when optimizing for speed

git-svn-id: trunk@25057 -
florian 12 years ago
parent
commit
507edb16de
3 changed files with 49 additions and 16 deletions
  1. 6 3
      compiler/i386/popt386.pas
  2. 41 12
      compiler/x86/cgx86.pas
  3. 2 1
      compiler/x86/nx86add.pas

+ 6 - 3
compiler/i386/popt386.pas

@@ -41,7 +41,8 @@ uses
 {$ifdef finaldestdebug}
   cobjects,
 {$endif finaldestdebug}
-  cpuinfo,cpubase,cgutils,daopt386;
+  cpuinfo,cpubase,cgutils,daopt386,
+  cgx86;
 
 
 function isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
@@ -960,13 +961,13 @@ begin
                             if (base = taicpu(p).oper[1]^.reg) then
                               begin
                                 l := offset;
-                                if (l=1) then
+                                if (l=1) and UseIncDec then
                                   begin
                                     taicpu(p).opcode := A_INC;
                                     taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                                     taicpu(p).ops := 1
                                   end
-                                else if (l=-1) then
+                                else if (l=-1) and UseIncDec then
                                   begin
                                     taicpu(p).opcode := A_DEC;
                                     taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
@@ -2121,6 +2122,8 @@ begin
               end;
             case taicpu(p).opcode Of
               A_CALL:
+                { don't do this on modern CPUs, this really hurts them due to
+                  broken call/ret pairing }
                 if (current_settings.optimizecputype < cpu_Pentium2) and
                    not(cs_create_pic in current_settings.moduleswitches) and
                    GetNextInstruction(p, hp1) and

+ 41 - 12
compiler/x86/cgx86.pas

@@ -167,6 +167,8 @@ unit cgx86;
 
     function UseAVX: boolean;
 
+    function UseIncDec: boolean;
+
   implementation
 
     uses
@@ -180,6 +182,21 @@ unit cgx86;
         Result:=current_settings.fputype in fpu_avx_instructionsets;
       end;
 
+
+    { Returns true when the code generator should emit inc/dec for adding or
+      subtracting 1, and false when it should emit add/sub with an immediate
+      of 1 instead.
+
+      Modern CPUs prefer add/sub over inc/dec: add/sub write all arithmetic
+      flags and thereby break the dependency on the previous flags value,
+      whereas inc/dec leave the carry flag untouched. inc/dec is still chosen
+      when optimizing for size, or when tuning for the old CPUs listed below.
+
+      inc/dec are available on every x86 CPU, so this is purely a tuning
+      decision; it is therefore keyed on optimizecputype (the CPU to optimize
+      for) and not on cputype (the instruction set that may be used) }
+    function UseIncDec: boolean;
+      begin
+{$if defined(x86_64)}
+        Result:=cs_opt_size in current_settings.optimizerswitches;
+{$elseif defined(i386)}
+        Result:=(cs_opt_size in current_settings.optimizerswitches) or (current_settings.optimizecputype in [cpu_386]);
+{$elseif defined(i8086)}
+        Result:=(cs_opt_size in current_settings.optimizerswitches) or (current_settings.optimizecputype in [cpu_8086..cpu_386]);
+{$endif}
+      end;
+
+
     const
       TOpCG2AsmOp: Array[topcg] of TAsmOp = (A_NONE,A_MOV,A_ADD,A_AND,A_DIV,
                             A_IDIV,A_IMUL,A_MUL,A_NEG,A_NOT,A_OR,
@@ -1596,11 +1613,14 @@ unit cgx86;
           OP_ADD, OP_AND, OP_OR, OP_SUB, OP_XOR:
             if not(cs_check_overflow in current_settings.localswitches) and
                (a = 1) and
-               (op in [OP_ADD,OP_SUB]) then
-              if op = OP_ADD then
-                list.concat(taicpu.op_reg(A_INC,TCgSize2OpSize[size],reg))
-              else
-                list.concat(taicpu.op_reg(A_DEC,TCgSize2OpSize[size],reg))
+               (op in [OP_ADD,OP_SUB]) and
+               UseIncDec then
+               begin
+                 if op = OP_ADD then
+                   list.concat(taicpu.op_reg(A_INC,TCgSize2OpSize[size],reg))
+                 else
+                   list.concat(taicpu.op_reg(A_DEC,TCgSize2OpSize[size],reg))
+               end
             else if (a = 0) then
               if (op <> OP_AND) then
                 exit
@@ -1727,11 +1747,14 @@ unit cgx86;
           OP_ADD, OP_AND, OP_OR, OP_SUB, OP_XOR:
             if not(cs_check_overflow in current_settings.localswitches) and
                (a = 1) and
-               (op in [OP_ADD,OP_SUB]) then
-              if op = OP_ADD then
-                list.concat(taicpu.op_ref(A_INC,TCgSize2OpSize[size],tmpref))
-              else
-                list.concat(taicpu.op_ref(A_DEC,TCgSize2OpSize[size],tmpref))
+               (op in [OP_ADD,OP_SUB]) and
+               UseIncDec then
+               begin
+                 if op = OP_ADD then
+                   list.concat(taicpu.op_ref(A_INC,TCgSize2OpSize[size],tmpref))
+                 else
+                   list.concat(taicpu.op_ref(A_DEC,TCgSize2OpSize[size],tmpref))
+               end
             else if (a = 0) then
               if (op <> OP_AND) then
                 exit
@@ -2371,7 +2394,10 @@ unit cgx86;
                     a_label(list,again);
                     decrease_sp(winstackpagesize-4);
                     list.concat(Taicpu.op_reg(A_PUSH,S_L,NR_EAX));
-                    list.concat(Taicpu.op_reg(A_DEC,S_L,NR_EDI));
+                    if UseIncDec then
+                      list.concat(Taicpu.op_reg(A_DEC,S_L,NR_EDI))
+                    else
+                      list.concat(Taicpu.op_const_reg(A_SUB,S_L,1,NR_EDI));
                     a_jmp_cond(list,OC_NE,again);
                     decrease_sp(localsize mod winstackpagesize-4);
                     reference_reset_base(href,NR_ESP,localsize-4,4);
@@ -2409,7 +2435,10 @@ unit cgx86;
                     decrease_sp(winstackpagesize);
                     reference_reset_base(href,NR_RSP,0,4);
                     list.concat(Taicpu.op_reg_ref(A_MOV,S_L,NR_EAX,href));
-                    list.concat(Taicpu.op_reg(A_DEC,S_Q,NR_R10));
+                    if UseIncDec then
+                      list.concat(Taicpu.op_reg(A_DEC,S_Q,NR_R10))
+                    else
+                      list.concat(Taicpu.op_const_reg(A_SUB,S_Q,1,NR_R10));
                     a_jmp_cond(list,OC_NE,again);
                     decrease_sp(localsize mod winstackpagesize);
                     ungetcpuregister(list,NR_R10);

+ 2 - 1
compiler/x86/nx86add.pas

@@ -143,7 +143,8 @@ unit nx86add;
                  if (op=A_SUB) and
                     (right.location.loc=LOC_CONSTANT) and
                     (right.location.value=1) and
-                    not(cs_check_overflow in current_settings.localswitches) then
+                    not(cs_check_overflow in current_settings.localswitches) and
+                    UseIncDec then
                   begin
                     emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
                   end