
REP MOVSB branch for x64 Move.

Rika Ichinose, 1 year ago
commit 1cda7d8e36
1 changed file with 152 additions and 14 deletions
      rtl/x86_64/x86_64.inc

+ 152 - 14
rtl/x86_64/x86_64.inc

@@ -21,11 +21,12 @@
                                Primitives
 ****************************************************************************}
 
+{$define move_use_fast_repmovstos}
 {$ifndef win64}
-  {$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
+  {$define fillxxxx_use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
 {$endif}
 
-{$ifdef use_fast_repmovstos}
+{$if defined(move_use_fast_repmovstos) or defined(fillxxxx_use_fast_repmovstos)}
 var
   fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
 {$endif}
@@ -91,13 +92,25 @@ procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];
 const
   NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
   PrefetchDistance = 512;
+{$ifdef move_use_fast_repmovstos}
+  ErmsThreshold = 1536;
+{$endif}
 asm
-{$ifndef win64}
+{$if not defined(win64)}
     mov    %rdx, %r8
     mov    %rsi, %rdx
     mov    %rdi, %rcx
-{$endif win64}
+{$elseif defined(move_use_fast_repmovstos)}
+    push   %rsi
+.seh_pushreg %rsi
+    push   %rdi
+.seh_pushreg %rdi
+.seh_endprologue
+{$endif}
 
+{$ifdef move_use_fast_repmovstos}
+.LRe:
+{$endif}
     cmp    $3, %r8
     jle    .L3OrLess
     cmp    $8, %r8
@@ -110,6 +123,10 @@ asm
     jg     .L33OrMore
     movups %xmm4, (%rdx)         { 17–32 bytes }
     movups %xmm5, -16(%rdx,%r8)
+{$if defined(win64) and defined(move_use_fast_repmovstos)}
+    pop    %rdi
+    pop    %rsi
+{$endif}
     ret
 
     .balign 16
@@ -123,6 +140,10 @@ asm
 .LOne:
     mov    %al, (%rdx)
 .LZero:
+{$if defined(win64) and defined(move_use_fast_repmovstos)}
+    pop    %rdi
+    pop    %rsi
+{$endif}
     ret
 
 .L4to8:
@@ -130,6 +151,10 @@ asm
     mov    -4(%rcx,%r8), %r9d
     mov    %eax, (%rdx)
     mov    %r9d, -4(%rdx,%r8)
+{$if defined(win64) and defined(move_use_fast_repmovstos)}
+    pop    %rdi
+    pop    %rsi
+{$endif}
     ret
 
 .L9to16:
@@ -138,8 +163,16 @@ asm
     mov    %rax, (%rdx)
     mov    %r9, -8(%rdx,%r8)
 .Lquit:
+{$if defined(win64) and defined(move_use_fast_repmovstos)}
+    pop    %rdi
+    pop    %rsi
+{$endif}
     ret
-    .byte  102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+{$if defined(win64) and defined(move_use_fast_repmovstos)}
+    .byte  102,102,102,102,102,144
+{$else}
+    .byte  102,102,102,102,102,102,102,102,102,102,102,144
+{$endif}                         { Turns .balign 16 before .Lloop32f into a no-op. }
 
 .L33OrMore:
     movups -32(%rcx,%r8), %xmm3  { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
@@ -162,8 +195,15 @@ asm
 .LRestAfterNTf:
     sub    $32, %r8              { During the N× loop, r8 is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
     jbe    .LPost32f
+{$ifdef move_use_fast_repmovstos}
+    cmp    $ErmsThreshold-32, %r8
+    jae    .LRepMovsOrNtF
+.LRepMovsIsNotBetterF:
+{$else}
     cmp    $NtThreshold-32, %r8
     jae    .Lntf                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
+.LNtIsNotBetterF:
+{$endif}
 
     .balign 16                   { no-op }
 .Lloop32f:
@@ -179,12 +219,40 @@ asm
     movups %xmm3, (%rdx, %r8)
     movups %xmm5, 16(%rdx,%r8)   { Write first and last 16 bytes after everything else. }
     movups %xmm4, (%r9)          { Important for <16-byte step between src and dest. }
+{$if defined(win64) and defined(move_use_fast_repmovstos)}
+    pop    %rdi
+    pop    %rsi
+{$endif}
     ret
 
+{$ifdef move_use_fast_repmovstos}
+.LRepMovsOrNtF:
+    cmp    $NtThreshold-32, %r8
+    jae    .Lntf
+.LNtIsNotBetterF:
+{$ifdef FPC_PIC}
+    movq   fast_large_repmovstosb@GOTPCREL(%rip), %rax
+    cmpb   $1, (%rax)
+{$else FPC_PIC}
+    cmpb   $1, fast_large_repmovstosb(%rip)
+{$endif FPC_PIC}
+    jne    .LRepMovsIsNotBetterF
+    lea    (%rcx,%rdx), %rsi
+    mov    %rdx, %rdi
+    lea    32(%r8), %rcx
+    rep movsb
+    movdqu %xmm4, (%r9) { last 16 aren't required }
+{$ifdef win64}
+    pop    %rdi
+    pop    %rsi
+{$endif}
+    ret
+{$endif move_use_fast_repmovstos}
+
     .balign 16
 .Lntf:
     cmp    $NtThreshold, %rcx    { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
-    jb     .Lloop32f             { (this check is performed here to not stand in the way of smaller counts) }
+    jb     .LNtIsNotBetterF      { (this check is performed here to not stand in the way of smaller counts) }
     sub    $PrefetchDistance+32, %r8 { r8 = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
 
     .balign 16                   { no-op }
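
For orientation, the forward path above boils down to a small decision tree: counts of 32 bytes or less take the scalar/SSE tail branches, counts below ErmsThreshold use the 32-byte SSE loop, counts of NtThreshold and up with a large enough distance between source and destination use non-temporal stores, and everything in between uses REP MOVSB when the CPU reports ERMSB. Below is a minimal Free Pascal sketch of that selection, ignoring the 16-byte alignment adjustments; the constants come from the diff, while ChooseForwardStrategy, its parameters and the demo values are illustrative only and not part of the patch.

{$mode objfpc}
program MoveStrategySketch;

const
  ErmsThreshold = 1536;
  NtThreshold   = 256 * 1024;

var
  fast_large_repmovstosb: boolean = true; { CPUID(7).ebx[9], set in fpc_cpuinit }

function ChooseForwardStrategy(count, distance: SizeUInt): string;
begin
  if count <= 32 then
    result := 'scalar/SSE tail writes'
  else if count < ErmsThreshold then
    result := '32-byte SSE loop (.Lloop32f)'
  else if (count >= NtThreshold) and (distance >= NtThreshold) then
    result := 'non-temporal stores (.Lntf)'
  else if fast_large_repmovstosb then
    result := 'REP MOVSB (.LRepMovsOrNtF)'
  else
    result := '32-byte SSE loop (.Lloop32f)';
end;

begin
  writeln(ChooseForwardStrategy(24, 1024));                    { scalar/SSE tail writes }
  writeln(ChooseForwardStrategy(4096, 1024 * 1024));           { REP MOVSB }
  writeln(ChooseForwardStrategy(512 * 1024, 8 * 1024 * 1024)); { non-temporal stores }
end.

In other words, the new ErmsThreshold tier means REP MOVSB is only attempted between ErmsThreshold and NtThreshold, and only when the ERMSB feature bit is set.
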
@@ -219,8 +287,15 @@ asm
 .LRestAfterNTb:
     sub    $32, %r8
     jbe    .LPost32b
-    cmp    $NtThreshold-32, %r8
-    jae    .Lntb
+{$ifdef move_use_fast_repmovstos}
+    cmp    $ErmsThreshold-32, %r8
+    jae    .LRepMovsOrNtB
+.LRepMovsIsNotBetterB:
+{$else}
+    cmp    $NtThreshold-32, %r8  { this limit must be processor-specific (1/2 L2 cache size) }
+    jae    .Lntb                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
+.LNtIsNotBetterB:
+{$endif}
 
     .balign 16                   { no-op }
 .Lloop32b:
@@ -237,12 +312,75 @@ asm
     movups %xmm3, -16(%rdx)
     movups %xmm4, -32(%rdx)
     movups %xmm5, -16(%r9)
+{$if defined(win64) and defined(move_use_fast_repmovstos)}
+    pop    %rdi
+    pop    %rsi
+{$endif}
     ret
 
+{$ifdef move_use_fast_repmovstos}
+.LRepMovsOrNtB:
+    cmp    $NtThreshold-32, %r8
+    jae    .Lntb
+.LNtIsNotBetterB:
+    {         dst = 3
+              v
+      Move(abcdefghijXXX, count=10)
+           ^
+           src = 0
+
+         = abcABCDEFGHIJ
+
+      can be moved right to left in non-overlapping groups of “dst - src”:
+
+      abcdefghijHIJ
+             ^^^
+
+      abcdefgEFGhij
+          ^^^
+
+      abcdBCDefghij
+       ^^^
+
+      abcAbcdefghij <- tail is handled by restarting the Move with corresponding count instead, as it can have 0 to dst - src - 1 bytes.
+      ^
+
+      Only REP MOVs with DF=0 are fast with ERMS, in case you’re wondering why not just use DF=1. }
+
+    cmp    $-ErmsThreshold, %rcx
+    jnbe   .LRepMovsIsNotBetterB { Unfortunately this branch can’t benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
+{$ifdef FPC_PIC}
+    movq   fast_large_repmovstosb@GOTPCREL(%rip), %rax
+    cmpb   $1, (%rax)
+{$else FPC_PIC}
+    cmpb   $1, fast_large_repmovstosb(%rip)
+{$endif FPC_PIC}
+    jne    .LRepMovsIsNotBetterB
+    movups %xmm5, -16(%r9)       { Write last 16 bytes right away. Unlike in the SSE branch, it is safe here because step is >= 16... assuming ErmsThreshold >= 16 :). }
+    mov    %rcx, %rax            { rax = src - dst = -step; as rcx will be required for rep movsb. }
+    add    $32, %r8              { r8 = remaining }
+    add    %rax, %r8             { remaining -= step }
+    jnc    .LRepMovsTailB        { CF=0 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
+.LRepMovsNextPieceB:             { ^ Because of aligning dest, step > remaining is not guaranteed even for the first time, i.e. the ranges may have stopped overlapping by here. }
+    add    %rax, %rdx            { dst -= step }
+    lea    (%rax,%rdx), %rsi     { rsi = src = rep movsb source }
+    mov    %rdx, %rdi            { rdi = dst = rep movsb dest }
+    mov    %rax, %rcx
+    neg    %rcx                  { rcx = step = rep movsb count }
+    rep movsb
+    add    %rax, %r8             { remaining -= step }
+    jc     .LRepMovsNextPieceB
+.LRepMovsTailB:
+    sub    %rax, %r8             { r8 = remaining }
+    sub    %r8, %rdx             { rdx = dest }
+    lea    (%rax,%rdx), %rcx     { rcx = src }
+    jmp    .LRe                  { Remaining piece ("a" in the example above). }
+{$endif}
+
     .balign 16
 .Lntb:
     cmp    $-NtThreshold,%rcx
-    jnb    .Lloop32b
+    jnb    .LNtIsNotBetterB
     sub    $PrefetchDistance+32, %r8
 
     .balign 16                   { no-op }
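
The chunking scheme described in the comment block above can also be written out in plain Pascal. This is a sketch under the comment's assumptions (destination above source, ranges overlapping); MoveBackwardByChunks is a hypothetical helper, not RTL code, and it stands in for the asm, which performs each chunk with a forward REP MOVSB (DF=0) and handles the final partial piece by jumping back to .LRe.

{$mode objfpc}{$pointermath on}
program BackwardChunkSketch;

{ Hypothetical helper mirroring the comment above. Requires dst > src, i.e.
  step = dst - src > 0. Chunks are taken from the end; each chunk's source
  bytes are still untouched when read, so a plain forward copy is safe. }
procedure MoveBackwardByChunks(src, dst: PByte; count: SizeInt);
var
  step: SizeInt;
begin
  step := dst - src;
  while count >= step do
  begin
    Move(src[count - step], dst[count - step], step); { one forward chunk, like REP MOVSB with DF=0 }
    dec(count, step);
  end;
  if count > 0 then
    Move(src[0], dst[0], count); { 0..step-1 leftover bytes ("a" in the example); the asm restarts at .LRe for this }
end;

var
  buf: array[0..12] of char = 'abcdefghijXXX';
begin
  MoveBackwardByChunks(PByte(@buf[0]), PByte(@buf[3]), 10); { same effect as Move(buf[0], buf[3], 10) }
  writeln(buf); { prints abcabcdefghij }
end.
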
@@ -277,7 +415,7 @@ procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
   xmm0 = pattern for ALIGNED writes
   First and last 16 bytes are written. }
 const
-{$ifdef use_fast_repmovstos}
+{$ifdef fillxxxx_use_fast_repmovstos}
   ErmsThreshold = 1536;
 {$endif}
   NtThreshold = 4 * 1024 * 1024;
@@ -311,7 +449,7 @@ asm
     jle    .LFourAlignedTailWrites
 
     add    $48, %rcx { rcx = H3. }
-{$ifdef use_fast_repmovstos}
+{$ifdef fillxxxx_use_fast_repmovstos}
     cmp    $ErmsThreshold-64, %rax { Need to write aligned byte count − 32 bytes already written. rax = aligned byte count − 96, so compare rax + 64 to ErmsThreshold, or rax to ErmsThreshold − 64. }
     jae    .LRepStos
 {$else}
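
As a concrete check of the arithmetic in the comment above: with an aligned byte count of 2048, rax holds 2048 - 96 = 1952 and there are 2048 - 32 = 2016 bytes left to write; 1952 >= 1472 (ErmsThreshold - 64) holds exactly when 2016 >= 1536 (ErmsThreshold), so .LRepStos is taken.
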
@@ -338,7 +476,7 @@ asm
     movdqa %xmm0, 48(%rdx) { T1 }
     ret
 
-{$ifdef use_fast_repmovstos}
+{$ifdef fillxxxx_use_fast_repmovstos}
 .LRepStos:
 {$ifdef FPC_PIC}
     movq   fast_large_repmovstosb@GOTPCREL(%rip), %r8
@@ -348,7 +486,7 @@ asm
 {$endif FPC_PIC}
     jne    .LRepStosIsNotBetter
 {$ifdef win64}
-    push   %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
+    push   %rdi { For tests on Windows; however this is SEH incompliant so the entire fillxxxx_use_fast_repmovstos branch is disabled by default! }
 {$endif}
     mov    %rcx, %rdi { rdi = REP STOS destination. }
     lea    64(%rax), %rcx
@@ -1653,7 +1791,7 @@ procedure fpc_cpuinit;
           cpuid
           movl %ebx,cpuid7_ebx
         end ['eax', 'ebx', 'ecx', 'edx'];
-{$ifdef use_fast_repmovstos}
+{$if defined(move_use_fast_repmovstos) or defined(fillxxxx_use_fast_repmovstos)}
         fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
 {$endif}
         { XGETBV support? }
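
The hunk above is where fast_large_repmovstosb gets its value: CPUID leaf 7, EBX bit 9 (ERMSB). Below is a self-contained sketch of the same check, assuming the CPU supports CPUID leaf 7 (a real check would first compare against the maximum basic leaf); HasERMSB is illustrative and not an RTL routine.

{$mode objfpc}
{$asmmode att}
program ErmsbCheck;

function HasERMSB: boolean;
var
  ebx7: longword;
begin
  asm
    movl  $7, %eax        { leaf 7 }
    xorl  %ecx, %ecx      { subleaf 0 }
    cpuid
    movl  %ebx, ebx7
  end ['eax', 'ebx', 'ecx', 'edx'];
  result := (ebx7 and (1 shl 9)) <> 0; { same test used for fast_large_repmovstosb above }
end;

begin
  writeln('ERMSB (fast_large_repmovstosb): ', HasERMSB);
end.
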