Răsfoiți Sursa

* make use of mulps/mulpd and haddps/haddpd/hsubpd/hsubps to optimize x*x+y*y and x*x-y*y where x and y might be single or double

git-svn-id: trunk@18790 -
florian 14 ani în urmă
părinte
comite
46cc0209de
3 a modificat fișierele cu 156 adăugiri și 2 ștergeri
  1. 1 0
      .gitattributes
  2. 66 2
      compiler/x86/nx86add.pas
  3. 89 0
      tests/test/tshuffle1.pp

+ 1 - 0
.gitattributes

@@ -10448,6 +10448,7 @@ tests/test/tset5a.pp svneol=native#text/plain
 tests/test/tset6.pp svneol=native#text/plain
 tests/test/tset7.pp svneol=native#text/plain
 tests/test/tsetsize.pp svneol=native#text/plain
+tests/test/tshuffle1.pp svneol=native#text/pascal
 tests/test/tstack.pp svneol=native#text/plain
 tests/test/tstatic1.pp svneol=native#text/pascal
 tests/test/tstatic2.pp svneol=native#text/pascal

+ 66 - 2
compiler/x86/nx86add.pas

@@ -66,7 +66,7 @@ unit nx86add;
       symconst,symdef,
       cgobj,cgx86,cga,cgutils,
       paramgr,tgobj,ncgutil,
-      ncon,nset,
+      ncon,nset,ninl,
       defutil;
 
 
@@ -660,7 +660,28 @@ unit nx86add;
     procedure tx86addnode.second_addfloatsse;
       var
         op : topcg;
+        sqr_sum : boolean;
+        tmp : tnode;
       begin
+        sqr_sum:=false;
+        if (current_settings.fputype>=fpu_sse3) and
+           use_vectorfpu(resultdef) and
+           (nodetype in [addn,subn]) and
+          (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
+          (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
+          begin
+            sqr_sum:=true;
+            tmp:=tinlinenode(left).left;
+            tinlinenode(left).left:=nil;
+            left.free;
+            left:=tmp;
+
+            tmp:=tinlinenode(right).left;
+            tinlinenode(right).left:=nil;
+            right.free;
+            right:=tmp;
+          end;
+
         pass_left_right;
         check_left_and_right_fpureg(false);
 
@@ -687,8 +708,51 @@ unit nx86add;
         end;
 
         location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+
+        if sqr_sum then
+          begin
+            if nf_swapped in flags then
+              swapleftright;
+
+            location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+            location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+            location:=left.location;
+            if is_double(resultdef) then
+              begin
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
+                  subn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
+                  else
+                    internalerror(201108162);
+                end;
+              end
+            else
+              begin
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
+                { ensure that bits 64..127 contain valid values }
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
+                { the left operand is now in bits 0..31 and the right operand in bits 32..63, duplicated in bits 64..127 }
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
+                    end;
+                  subn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
+                    end;
+                  else
+                    internalerror(201108163);
+                end;
+              end
+          end
         { we can use only right as left operand if the operation is commutative }
-        if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
+        else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
           begin
             location.register:=right.location.register;
             { force floating point reg. location to be written to memory,

+ 89 - 0
tests/test/tshuffle1.pp

@@ -0,0 +1,89 @@
+{ %cpu=i386,x86_64 }
+{ %opt=-Cfsse3 -O3 }
+{$mode objfpc}
+uses
+  cpu;
+
+function test_double : longint; { returns 0 if every double-precision check passes, 1 otherwise }
+  var
+    f,f1,f2 : double;
+    i : longint;
+  begin
+    result:=0;
+    f1:=1;
+    f2:=2;
+    f:=f1*f1+f2*f2; { sum of squares: 1*1+2*2 must give 5 }
+    if f<>5 then
+      result:=1;
+    f:=f1*f1-f2*f2; { difference of squares: 1*1-2*2 must give -3 }
+    if f<>-3 then
+      result:=1;
+    { fool ssa: presumably repeating the same checks inside a loop keeps the optimizer from folding the expressions - TODO confirm }
+    for i:=1 to 3 do
+      begin
+        f:=f1*f1+f2*f2;
+        if f<>5 then
+          result:=1;
+        f:=f1*f1-f2*f2;
+        if f<>-3 then
+          result:=1;
+      end;
+  end;
+
+
+function test_single : longint; { returns 0 if every single-precision check passes, 1 otherwise }
+  var
+    f,f1,f2 : single;
+    i : longint;
+  begin
+    result:=0;
+    f1:=1;
+    f2:=2;
+    f:=f1*f1+f2*f2; { sum of squares: 1*1+2*2 must give 5 }
+    if f<>5 then
+      result:=1;
+    f:=f1*f1-f2*f2; { difference of squares: 1*1-2*2 must give -3 }
+    if f<>-3 then
+      result:=1;
+    { fool ssa: presumably repeating the same checks inside a loop keeps the optimizer from folding the expressions - TODO confirm }
+    for i:=1 to 3 do
+      begin
+        f:=f1*f1+f2*f2;
+        if f<>5 then
+          result:=1;
+        f:=f1*f1-f2*f2;
+        if f<>-3 then
+          result:=1;
+      end;
+  end;
+
+var
+  f,f1,f2 : double; { program-level variables: the same checks as in test_double, but on globals }
+  i : longint;
+begin
+  if not(is_sse3_cpu) then { exit with code 0 without testing when is_sse3_cpu is false }
+    halt(0);
+  f1:=1;
+  f2:=2;
+  f:=f1*f1+f2*f2; { sum of squares: 1*1+2*2 must give 5 }
+  if f<>5 then
+    halt(1);
+  f:=f1*f1-f2*f2; { difference of squares: 1*1-2*2 must give -3 }
+  if f<>-3 then
+    halt(1);
+  { fool ssa: presumably repeating the same checks inside a loop keeps the optimizer from folding the expressions - TODO confirm }
+  for i:=1 to 3 do
+    begin
+      f:=f1*f1+f2*f2;
+      if f<>5 then
+        halt(1);
+      f:=f1*f1-f2*f2;
+      if f<>-3 then
+        halt(1);
+    end;
+  if test_double<>0 then { any failed check in the local-variable tests ends the program with exit code 1 }
+    halt(1);
+  if test_single<>0 then
+    halt(1);
+  writeln('ok');
+end.