
* improve copy_avx gencopy mode

git-svn-id: trunk@45486 -
florian 5 years ago
parent commit 88c98a5741
1 file changed: 31 additions, 47 deletions
  1. compiler/x86/cgx86.pas (+31, -47)
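
The rewritten copy_avx branch in this commit replaces the old fixed sequence of up to three 16-byte VMOVUPS register pairs plus an optional VMOVSD with three chunk loops: 32-byte YMM copies while both operands are known to be 32-byte aligned, then 16-byte copies, then one 8-byte tail. Stores are gathered on a separate hlist and concatenated after the loads, so every load is still emitted before any store. Below is a minimal Free Pascal sketch of the chunking policy only; CopyAvxLike and its parameters are hypothetical names, and plain Move calls stand in for the vector loads and stores the code generator actually emits.

program copyavx_sketch;

{ Sketch of the chunk-size selection in the new copy_avx mode.
  CopyAvxLike is hypothetical; the compiler emits VMOVDQA/VMOVUPS/VMOVSD
  instructions where this calls Move, and defers the stores onto hlist
  instead of performing them inline. }
procedure CopyAvxLike(src,dst: PByte; len,srcalign,dstalign: PtrUInt);
begin
  { 32-byte YMM-sized chunks, only while both operands are known to be
    32-byte aligned, mirroring (srcref.alignment>=32) and (dstref.alignment>=32) }
  while (len>=32) and (srcalign>=32) and (dstalign>=32) do
    begin
      Move(src^,dst^,32);
      inc(src,32);
      inc(dst,32);
      dec(len,32);
    end;
  { 16-byte XMM-sized chunks; unaligned access is allowed here }
  while len>=16 do
    begin
      Move(src^,dst^,16);
      inc(src,16);
      inc(dst,16);
      dec(len,16);
    end;
  { one 8-byte tail, as in the OS_M64 branch; any remainder below 8 bytes
    is assumed to be handled by the caller's copy-mode selection }
  if len>=8 then
    begin
      Move(src^,dst^,8);
      inc(src,8);
      inc(dst,8);
      dec(len,8);
    end;
end;

var
  a,b: array[0..63] of byte;
  i: integer;
begin
  for i:=0 to 63 do
    a[i]:=i;
  CopyAvxLike(@a[0],@b[0],56,32,32);
  for i:=0 to 55 do
    if a[i]<>b[i] then
      halt(1);
  writeln('ok');
end.

Deferring the stores onto hlist appears to preserve the old code's ordering guarantee that all loads are emitted before all stores (each iteration loads into a fresh virtual register), which stays safe when source and destination overlap inside the copied block, while the loop form lifts the old ceiling of four hard-coded registers.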

compiler/x86/cgx86.pas  + 31 - 47

@@ -1554,7 +1554,7 @@ unit cgx86;
                      if GetRefAlignment(tmpref) = 16 then
                        op := A_VMOVDQA
                      else
-                       op := A_VMOVDQU
+                       op := A_VMOVDQU;
                    end
                  else
                    begin
@@ -1570,7 +1570,7 @@ unit cgx86;
                      if GetRefAlignment(tmpref) = 32 then
                        op := A_VMOVDQA
                      else
-                       op := A_VMOVDQU
+                       op := A_VMOVDQU;
                    end
                  else
                    { SSE doesn't support 256-bit vectors }
@@ -1582,7 +1582,7 @@ unit cgx86;
                      if GetRefAlignment(tmpref) = 64 then
                        op := A_VMOVDQA
                      else
-                       op := A_VMOVDQU
+                       op := A_VMOVDQU;
                    end
                  else
                    { SSE doesn't support 512-bit vectors }
@@ -1648,13 +1648,13 @@ unit cgx86;
                    if GetRefAlignment(tmpref) = 16 then
                      op := A_VMOVDQA
                    else
-                     op := A_VMOVDQU
+                     op := A_VMOVDQU;
                  end else
                  begin
                    if GetRefAlignment(tmpref) = 16 then
                      op := A_MOVDQA
                    else
-                     op := A_MOVDQU
+                     op := A_MOVDQU;
                  end;
                OS_M256:
                  { Use XMM integer transfer }
@@ -1663,7 +1663,7 @@ unit cgx86;
                    if GetRefAlignment(tmpref) = 32 then
                      op := A_VMOVDQA
                    else
-                     op := A_VMOVDQU
+                     op := A_VMOVDQU;
                  end else
                    { SSE doesn't support 256-bit vectors }
                    InternalError(2018012942);
@@ -1674,7 +1674,7 @@ unit cgx86;
                    if GetRefAlignment(tmpref) = 64 then
                      op := A_VMOVDQA
                    else
-                     op := A_VMOVDQU
+                     op := A_VMOVDQU;
                  end else
                    { SSE doesn't support 512-bit vectors }
                    InternalError(2018012945);
@@ -2703,6 +2703,7 @@ unit cgx86;
         cgsize:Tcgsize;
         cm:copymode;
         saved_ds,saved_es: Boolean;
+        hlist: TAsmList;
 
     begin
       srcref:=source;
@@ -2926,54 +2927,37 @@ unit cgx86;
 
         copy_avx:
           begin
-            r0:=NR_NO;
-            r1:=NR_NO;
-            r2:=NR_NO;
-            r3:=NR_NO;
-            if len>=16 then
-              begin
-                r0:=getmmregister(list,OS_M128);
-                { we want to force the use of vmovups, so do not use a_loadmm_ref_reg }
-                list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,srcref,r0));
-                inc(srcref.offset,16);
-              end;
-            if len>=32 then
+            hlist:=TAsmList.create;
+            while (len>=32) and (srcref.alignment>=32) and (dstref.alignment>=32) do
               begin
-                r1:=getmmregister(list,OS_M128);
-                list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,srcref,r1));
-                inc(srcref.offset,16);
+                r0:=getmmregister(list,OS_M256);
+                a_loadmm_ref_reg(list,OS_M256,OS_M256,srcref,r0,nil);
+                a_loadmm_reg_ref(hlist,OS_M256,OS_M256,r0,dstref,nil);
+                inc(srcref.offset,32);
+                inc(dstref.offset,32);
+                dec(len,32);
+                Include(current_procinfo.flags,pi_uses_ymm);
               end;
-            if len>=48 then
+            while len>=16 do
               begin
-                r2:=getmmregister(list,OS_M128);
-                list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,srcref,r2));
+                r0:=getmmregister(list,OS_M128);
+                a_loadmm_ref_reg(list,OS_M128,OS_M128,srcref,r0,nil);
+                a_loadmm_reg_ref(hlist,OS_M128,OS_M128,r0,dstref,nil);
                 inc(srcref.offset,16);
-              end;
-            if (len=8) or (len=24) or (len=40) then
-              begin
-                r3:=getmmregister(list,OS_M64);
-                list.concat(taicpu.op_ref_reg(A_VMOVSD,S_NO,srcref,r3));
-              end;
-
-            if len>=16 then
-              begin
-                list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r0,dstref));
                 inc(dstref.offset,16);
+                dec(len,16);
               end;
-            if len>=32 then
+            if len>=8 then
               begin
-                list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r1,dstref));
-                inc(dstref.offset,16);
-              end;
-            if len>=48 then
-              begin
-                list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r2,dstref));
-                inc(dstref.offset,16);
-              end;
-            if (len=8) or (len=24) or (len=40) then
-              begin
-                list.concat(taicpu.op_reg_ref(A_VMOVSD,S_NO,r3,dstref));
+                r0:=getmmregister(list,OS_M64);
+                a_loadmm_ref_reg(list,OS_M64,OS_M64,srcref,r0,nil);
+                a_loadmm_reg_ref(hlist,OS_M64,OS_M64,r0,dstref,nil);
+                inc(srcref.offset,8);
+                inc(dstref.offset,8);
+                dec(len,8);
               end;
+            list.concatList(hlist);
+            hlist.free;
           end
         else {copy_string, should be a good fallback in case of unhandled}
           begin