
REP STOS branch for x64 Fill* (only for System V ABI for now).
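
In outline, the fill code now chooses between three strategies, driven by the fill size and by the ERMSB capability bit (CPUID leaf 7, subleaf 0, EBX bit 9) that fpc_cpuinit caches in fast_large_repmovstosb. The sketch below is a plain-Pascal rendering of that decision only, not RTL code; TFillStrategy, ChooseFillStrategy and StrategyName are hypothetical names, while ErmsThreshold, NtThreshold and fast_large_repmovstosb mirror identifiers from the patch. On targets without use_fast_repmovstos (currently Win64), the REP STOS branch is compiled out and only the NtThreshold test remains.

program FillDispatchSketch;
{ Hedged sketch of the dispatch performed by the inline assembler in the diff. }
type
  TFillStrategy = (fsSSE64ByteLoop,      { .L64x_Body }
                   fsRepStosQ,           { .LRepStos }
                   fsNonTemporalStores); { .L64xNT_Body }
const
  ErmsThreshold = 1536;       { from here on, prefer REP STOSQ if ERMSB is present }
  NtThreshold   = 512 * 1024; { from here on, prefer non-temporal stores }
  StrategyName: array[TFillStrategy] of string =
    ('.L64x_Body', '.LRepStos', '.L64xNT_Body');
  fast_large_repmovstosb: boolean = true; { in the RTL: CPUID(7).EBX bit 9 }

function ChooseFillStrategy(byteCount: SizeUint): TFillStrategy;
begin
  if (byteCount >= ErmsThreshold) and fast_large_repmovstosb then
    ChooseFillStrategy := fsRepStosQ
  else if byteCount >= NtThreshold then
    ChooseFillStrategy := fsNonTemporalStores
  else
    ChooseFillStrategy := fsSSE64ByteLoop;
end;

begin
  writeln(StrategyName[ChooseFillStrategy(1024)]);     { .L64x_Body }
  writeln(StrategyName[ChooseFillStrategy(4096)]);     { .LRepStos }
  writeln(StrategyName[ChooseFillStrategy(1 shl 20)]); { .LRepStos, or .L64xNT_Body without ERMSB }
end.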

Rika Ichinose · 1 year ago · commit 1ec0326995
1 changed file with 59 additions and 10 deletions

rtl/x86_64/x86_64.inc  +59 −10

@@ -21,6 +21,15 @@
                                Primitives
 ****************************************************************************}
 
+{$ifndef win64}
+  {$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
+{$endif}
+
+{$ifdef use_fast_repmovstos}
+var
+  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
+{$endif}
+
 {$define FPC_SYSTEM_HAS_SPTR}
 Function Sptr : Pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
 asm
@@ -297,6 +306,11 @@ procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
   rdx = byte count
   xmm0 = pattern for unaligned writes
   xmm1 = pattern for aligned writes }
+const
+{$ifdef use_fast_repmovstos}
+  ErmsThreshold = 1536;
+{$endif}
+  NtThreshold = 512 * 1024;
 asm
     { x can start and end misaligned on the vector boundary:
 
@@ -326,8 +340,13 @@ asm
     jle    .LFourAlignedTailWrites
 
     add    $48, %rcx
-    cmp    $0x80000, %rdx
+{$ifdef use_fast_repmovstos}
+    cmp    $ErmsThreshold, %rdx
+    jae    .LRepStos
+{$else}
+    cmp    $NtThreshold, %rdx
     jae    .L64xNT_Body
+{$endif}
 
 .balign 16
 .L64x_Body:
@@ -346,8 +365,37 @@ asm
     movdqa %xmm1, 32(%rax) { T2 }
 .LOneAlignedTailWrite:
     movdqa %xmm1, 48(%rax) { T1 }
-    movdqu %xmm0, 49(%r8) { UT }
+    movdqu %xmm0, 65-16(%r8) { UT }
+    ret
+
+{$ifdef use_fast_repmovstos}
+.LRepStos:
+{$ifdef FPC_PIC}
+    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r9
+    cmpb   $1, (%r9)
+{$else FPC_PIC}
+    cmpb   $1, fast_large_repmovstosb(%rip)
+{$endif FPC_PIC}
+    jne    .LRepStosIsNotBetter
+{$ifdef win64}
+    push   %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
+{$endif}
+    mov    %rcx, %rdi { rdi = REP STOS destination. }
+    lea    65-16+8-1(%r8), %rcx
+    sub    %rdi, %rcx
+    shr    $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
+    movq   %xmm1, %rax { recover pattern for aligned writes back to GPR :) }
+    rep stosq
+    movdqu %xmm0, 65-16(%r8) { UT }
+{$ifdef win64}
+    pop    %rdi
+{$endif}
     ret
+{$endif}
+
+.LRepStosIsNotBetter:
+    cmp    $NtThreshold, %rdx
+    jb     .L64x_Body
 
 .balign 16
 .L64xNT_Body:
@@ -1452,7 +1500,7 @@ const
 {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
 procedure fpc_cpuinit;
   var
-    _eax,_ebx,cpuid1_ecx : dword;
+    _eax,cpuid7_ebx,cpuid1_ecx : dword;
   begin
     { don't let libraries influence the FPU cw set by the host program }
     if IsLibrary then
@@ -1473,7 +1521,14 @@ procedure fpc_cpuinit;
           xorl %ecx,%ecx
           cpuid
           movl %ecx,cpuid1_ecx
+          movl $7,%eax
+          xorl %ecx,%ecx
+          cpuid
+          movl %ebx,cpuid7_ebx
         end;
+{$ifdef use_fast_repmovstos}
+        fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
+{$endif}
         { XGETBV support? }
         if (cpuid1_ecx and $8000000)<>0 then
           begin
@@ -1485,13 +1540,7 @@ procedure fpc_cpuinit;
             if (_eax and 6)=6 then
               begin
                 has_avx_support:=(cpuid1_ecx and $10000000)<>0;
-                asm
-                  movl $7,%eax
-                  xorl %ecx,%ecx
-                  cpuid
-                  movl %ebx,_ebx
-                end;
-                has_avx2_support:=(_ebx and $20)<>0;
+                has_avx2_support:=(cpuid7_ebx and $20)<>0;
               end;
           end;
       end;
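
For reference on the new .LRepStos path: with the direction flag clear, "rep stosq" stores RAX to [RDI], RCX times, advancing RDI by 8 bytes per iteration. The patch points RDI at the REP STOS destination, computes RCX as the number of whole qwords up to the final unaligned store (the lea/sub/shr $3 sequence, per its own comments), and moves the 8-byte pattern from %xmm1 back into RAX with movq; the trailing "movdqu %xmm0, 65-16(%r8)" unaligned-tail write then covers the remaining bytes. A rough Pascal model of what the rep stosq store loop itself does (hypothetical helper name, illustrative only, ignoring the alignment handling done in the assembler):

{ Dest plays the role of RDI, Count of RCX and Pattern of RAX. }
procedure RepStosQModel(Dest: PQWord; Count: SizeUint; Pattern: QWord);
var
  i: SizeUint;
begin
  for i := 1 to Count do
  begin
    Dest^ := Pattern; { store 8 bytes }
    Inc(Dest);        { advance by SizeOf(QWord), as RDI does }
  end;
end;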