Browse Source

+ automatically insert fma inlines into floating point code if possible and fastmath is activated

git-svn-id: trunk@28382 -
florian 11 years ago
parent
commit
499dd078e3
2 changed files with 167 additions and 12 deletions
  1. 145 12
      compiler/nadd.pas
  2. 22 0
      compiler/x86/nx86add.pas

+ 145 - 12
compiler/nadd.pas

@@ -73,6 +73,10 @@ interface
           { full 64 bit multiplies.                                }
           { full 64 bit multiplies.                                }
           function use_generic_mul64bit: boolean; virtual;
           function use_generic_mul64bit: boolean; virtual;
 
 
+          { shall be overriden if the target cpu supports
+            an fma instruction
+          }
+          function use_fma : boolean; virtual;
           { This routine calls internal runtime library helpers
           { This routine calls internal runtime library helpers
             for all floating point arithmetic in the case
             for all floating point arithmetic in the case
             where the emulation switches is on. Otherwise
             where the emulation switches is on. Otherwise
@@ -80,18 +84,22 @@ interface
             the code generation phase.
             the code generation phase.
           }
           }
           function first_addfloat : tnode; virtual;
           function first_addfloat : tnode; virtual;
-         private
-           { checks whether a muln can be calculated as a 32bit }
-           { * 32bit -> 64 bit                                  }
-           function try_make_mul32to64: boolean;
-           { Match against the ranges, i.e.:
-             var a:1..10;
-             begin
-               if a>0 then
-                 ...
-             always evaluates to true. (DM)
-           }
-           function cmp_of_disjunct_ranges(var res : boolean) : boolean;
+       private
+          { checks whether a muln can be calculated as a 32bit }
+          { * 32bit -> 64 bit                                  }
+          function try_make_mul32to64: boolean;
+
+          { Match against the ranges, i.e.:
+            var a:1..10;
+            begin
+              if a>0 then
+                ...
+            always evaluates to true. (DM)
+          }
+          function cmp_of_disjunct_ranges(var res : boolean) : boolean;
+
+          { tries to replace the current node by a fma node }
+          function try_fma(ld,rd : tdef) : tnode;
        end;
        end;
        taddnodeclass = class of taddnode;
        taddnodeclass = class of taddnode;
 
 
@@ -2612,6 +2620,127 @@ implementation
       end;
       end;
 
 
 
 
+    function taddnode.use_fma : boolean;
+      begin
+        result:=false;
+      end;
+
+
+    function taddnode.try_fma(ld,rd : tdef) : tnode;
+      var
+        inlinennr : Integer;
+      begin
+        result:=nil;
+        if (cs_opt_fastmath in current_settings.optimizerswitches) and
+          use_fma and
+          (nodetype in [addn,subn]) and
+          (rd.typ=floatdef) and (ld.typ=floatdef) and
+          (is_single(rd) or is_double(rd)) and
+          equal_defs(rd,ld) and
+          { transforming a*b+c into fma(a,b,c) makes only sense if c can be
+            calculated easily. Consider a*b+c*d which results in
+
+            fmul
+            fmul
+            fadd
+
+            and in
+
+            fmul
+            fma
+
+            when using the fma optimization. On a super scalar architecture, the first instruction
+            sequence requires clock_cycles(fmul)+clock_cycles(fadd) clock cycles because the fmuls can be executed in parallel.
+            The second sequence requires clock_cycles(fmul)+clock_cycles(fma) because the fma has to wait for the
+            result of the fmul. Since typically clock_cycles(fma)>clock_cycles(fadd) applies, the first sequence is better.
+          }
+          (((left.nodetype=muln) and (node_complexity(right)<3)) or
+           ((right.nodetype=muln) and (node_complexity(left)<3)) or
+           ((left.nodetype=inlinen) and
+            (tinlinenode(left).inlinenumber=in_sqr_real) and
+             (node_complexity(right)<3)) or
+           ((right.nodetype=inlinen) and
+            (tinlinenode(right).inlinenumber=in_sqr_real) and
+            (node_complexity(left)<3))
+          ) then
+          begin
+            case tfloatdef(ld).floattype of
+              s32real:
+               inlinennr:=in_fma_single;
+              s64real:
+               inlinennr:=in_fma_double;
+              s80real:
+               inlinennr:=in_fma_extended;
+              s128real:
+               inlinennr:=in_fma_float128;
+              else
+                internalerror(2014042601);
+            end;
+            if left.nodetype=muln then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right),
+                    ccallparanode.create(taddnode(left).right,
+                    ccallparanode.create(taddnode(left).left,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right,
+                    ccallparanode.create(taddnode(left).right,
+                    ccallparanode.create(taddnode(left).left,nil
+                    ))));
+                right:=nil;
+                taddnode(left).right:=nil;
+                taddnode(left).left:=nil;
+              end
+            else if right.nodetype=muln then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(cunaryminusnode.create(taddnode(right).right),
+                    ccallparanode.create(taddnode(right).left,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(taddnode(right).right,
+                    ccallparanode.create(taddnode(right).left,nil
+                    ))));
+                left:=nil;
+                taddnode(right).right:=nil;
+                taddnode(right).left:=nil;
+              end
+            else if (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right),
+                    ccallparanode.create(tinlinenode(left).left.getcopy,
+                    ccallparanode.create(tinlinenode(left).left.getcopy,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right,
+                    ccallparanode.create(tinlinenode(left).left.getcopy,
+                    ccallparanode.create(tinlinenode(left).left.getcopy,nil
+                    ))));
+                right:=nil;
+              end
+            { we get here only if right is a sqr node }
+            else if (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(cunaryminusnode.create(tinlinenode(right).left.getcopy),
+                    ccallparanode.create(tinlinenode(right).left.getcopy,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(tinlinenode(right).left.getcopy,
+                    ccallparanode.create(tinlinenode(right).left.getcopy,nil
+                    ))));
+                left:=nil;
+              end;
+          end;
+      end;
+
+
     function taddnode.first_add64bitint: tnode;
     function taddnode.first_add64bitint: tnode;
       var
       var
         procname: string[31];
         procname: string[31];
@@ -3109,6 +3238,10 @@ implementation
                 expectloc:=LOC_FPUREGISTER
                 expectloc:=LOC_FPUREGISTER
               else
               else
                 expectloc:=LOC_FLAGS;
                 expectloc:=LOC_FLAGS;
+
+              result:=try_fma(ld,rd);
+              if assigned(result) then
+                exit;
             end
             end
 
 
          { pointer comperation and subtraction }
          { pointer comperation and subtraction }

+ 22 - 0
compiler/x86/nx86add.pas

@@ -47,6 +47,7 @@ unit nx86add;
         procedure second_addfloatsse;
         procedure second_addfloatsse;
         procedure second_addfloatavx;
         procedure second_addfloatavx;
       public
       public
+        function use_fma : boolean;override;
         procedure second_addfloat;override;
         procedure second_addfloat;override;
 {$ifndef i8086}
 {$ifndef i8086}
         procedure second_addsmallset;override;
         procedure second_addsmallset;override;
@@ -273,6 +274,15 @@ unit nx86add;
     procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
     procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
       begin
       begin
         refnode:=nil;
         refnode:=nil;
+
+        { later on, no mm registers are allowed, so transfer everything to memory here
+          below it is loaded into an fpu register if neede }
+        if left.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
+          hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+
+        if right.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
+          hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+
         case ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER) of
         case ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER) of
           0:
           0:
             begin
             begin
@@ -1072,6 +1082,18 @@ unit nx86add;
       end;
       end;
 
 
 
 
+    function tx86addnode.use_fma : boolean;
+      begin
+{$ifndef i8086}
+        { test if the result stays in an xmm register, fiddeling with fpu registers and fma makes no sense }
+        Result:=use_vectorfpu(resultdef) and
+          ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[]);
+{$else i8086}
+        Result:=inherited use_fma;
+{$endif i8086}
+      end;
+
+
     procedure tx86addnode.second_cmpfloatvector;
     procedure tx86addnode.second_cmpfloatvector;
       var
       var
         op : tasmop;
         op : tasmop;