|
@@ -2718,7 +2718,8 @@ unit cgx86;
|
|
|
push_segment_size = S_W;
|
|
|
{$endif}
|
|
|
|
|
|
- type copymode=(copy_move,copy_mmx,copy_string,copy_mm,copy_avx);
|
|
|
+ type
|
|
|
+ copymode=(copy_move,copy_mmx,copy_string,copy_mm,copy_avx,copy_avx512);
|
|
|
|
|
|
var srcref,dstref,tmpref:Treference;
|
|
|
r,r0,r1,r2,r3:Tregister;
|
|
@@ -2777,11 +2778,15 @@ unit cgx86;
|
|
|
if cs_opt_size in current_settings.optimizerswitches then
|
|
|
helpsize:=2*sizeof(aword);
|
|
|
{$ifndef i8086}
|
|
|
+ if (FPUX86_HAS_AVX512F in fpu_capabilities[current_settings.fputype]) and
|
|
|
+ ((len mod 8)=0) and (len<=128) then
|
|
|
+ cm:=copy_avx512
|
|
|
+ else
|
|
|
{ avx helps only to reduce size, using it in general does at least not help on
|
|
|
an i7-4770
|
|
|
- but using the xmm registers reduces register pressure(FK) }
|
|
|
+ but using the xmm registers reduces register pressure (FK) }
|
|
|
if (FPUX86_HAS_AVXUNIT in fpu_capabilities[current_settings.fputype]) and
|
|
|
- ({$ifdef i386}(len=8) or{$endif i386}(len=16) or (len=24) or (len=32) or (len=40) or (len=48)) then
|
|
|
+ ((len mod 8)=0) and (len<=48) {$ifndef i386}and (len<>8){$endif i386} then
|
|
|
cm:=copy_avx
|
|
|
else
|
|
|
{ I'am not sure what CPUs would benefit from using sse instructions for moves
|
|
@@ -2949,9 +2954,21 @@ unit cgx86;
|
|
|
end;
|
|
|
end;
|
|
|
|
|
|
+ copy_avx512,
|
|
|
copy_avx:
|
|
|
begin
|
|
|
hlist:=TAsmList.create;
|
|
|
+ if cm=copy_avx512 then
|
|
|
+ while len>=64 do
|
|
|
+ begin
|
|
|
+ r0:=getmmregister(list,OS_M512);
|
|
|
+ a_loadmm_ref_reg(list,OS_M512,OS_M512,srcref,r0,nil);
|
|
|
+ a_loadmm_reg_ref(hlist,OS_M512,OS_M512,r0,dstref,nil);
|
|
|
+ inc(srcref.offset,64);
|
|
|
+ inc(dstref.offset,64);
|
|
|
+ dec(len,64);
|
|
|
+ Include(current_procinfo.flags,pi_uses_ymm);
|
|
|
+ end;
|
|
|
while len>=32 do
|
|
|
begin
|
|
|
r0:=getmmregister(list,OS_M256);
|