
Move the ERMS branch into a separate function instead of checking fast_large_repmovstosb at runtime.
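In effect, the hot Move path no longer tests fast_large_repmovstosb on every call; the flag is consulted once when the Move implementation is selected, as the last hunk below shows. A rough Pascal sketch of that selection shape (the function name, the Valgrind predicate and the IA32 fallback are assumptions for illustration; the two middle branches follow the diff):

{ Hypothetical selector mirroring the last hunk of this commit.
  Only the fast_large_repmovstosb / has_sse_support branches are verbatim;
  the rest is assumed surrounding context. }
function PickMove8OrMore: CodePointer;
begin
  if MustUseValgrindVariant then       { assumed predicate }
    result := @Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
  else if fast_large_repmovstosb then  { ERMS CPU: the REP MOVSB-aware routine, chosen once at startup }
    result := @Move_8OrMore_SSE_ERMS
  else if has_sse_support then         { plain SSE routine, now free of per-call flag checks }
    result := @Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
  else
    result := @Move_8OrMore_IA32;      { assumed generic fallback }
end;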

Rika Ichinose, 1 year ago
commit 8310b169b7
1 changed file with 113 additions and 98 deletions

rtl/i386/fastmove.inc  +113 −98

@@ -236,20 +236,24 @@ asm
 end;
 
 {$ifndef FASTMOVE_DISABLE_SSE}
+label
+  Move_8OrMore_SSE_9to16, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
+
+const
+  Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
+
 procedure Move_8OrMore_SSE; assembler; nostackframe;
 { eax = source, edx = dest, ecx = count (ecx >= 8).
   If FPC_PIC: ebx pushed. }
 const
-  ErmsThreshold = 1536;
-  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
   PrefetchDistance = 512;
 asm
     cmp    $16, %ecx
-    jle    .L9to16
-    movups (%eax), %xmm4         { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
+    jle    Move_8OrMore_SSE_9to16
+    movups (%eax), %xmm4         { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and 17–32 branch. }
     movups -16(%eax,%ecx), %xmm5
     cmp    $32, %ecx
-    jg     .L33OrMore
+    jg     Move_8OrMore_SSE_33OrMore
     movups %xmm4, (%edx)         { 17–32 bytes }
     movups %xmm5, -16(%edx,%ecx)
 {$ifdef FPC_PIC}
@@ -257,7 +261,7 @@ asm
 {$endif}
     ret
 
-.L9to16:
+Move_8OrMore_SSE_9to16:
     movlps (%eax), %xmm0
     movlps -8(%eax,%ecx), %xmm1
     movlps %xmm0, (%edx)
@@ -267,9 +271,9 @@ asm
     pop    %ebx
 {$endif}
     ret
-    .byte  {$ifndef FPC_PIC}102,{$endif}144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte  {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
 
-.L33OrMore:
+Move_8OrMore_SSE_33OrMore:
     sub    %edx, %eax            { eax = src - dest }
     jz     .Lquit                { exit if src=dest }
 {$ifndef FPC_PIC}
@@ -291,12 +295,9 @@ asm
 .LRestAfterNTf:
     sub    $32, %ecx             { During the N× loop, ecx is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
     jbe    .LPost32f
-    cmp    $NtThreshold-32, %ecx
+    cmp    $Move_NtThreshold-32, %ecx
     jae    .Lntf                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
 .LNtIsNotBetterF:
-    cmp    $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT. }
-    jae    .LRepMovsF
-.LRepMovsIsNotBetterF:
     test   $15, %eax
     jz     .Lalignedloop32f
 
@@ -342,32 +343,8 @@ asm
     pop    %ebx
     ret
 
-.LRepMovsF:
-{$ifdef FPC_PIC}
-    push   %ebx
-    call   fpc_geteipasebx
-    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
-    movl   fast_large_repmovstosb@GOT(%ebx), %ebx
-    cmpb   $1, (%ebx)
-    pop    %ebx
-{$else FPC_PIC}
-    cmpb   $1, fast_large_repmovstosb
-{$endif FPC_PIC}
-    jne    .LRepMovsIsNotBetterF
-    push   %esi
-    push   %edi
-    lea    (%eax,%edx), %esi
-    mov    %edx, %edi
-    add    $32, %ecx
-    rep movsb
-    movups %xmm4, (%ebx)         { last 16 aren't required }
-    pop    %edi
-    pop    %esi
-    pop    %ebx
-    ret
-
 .Lntf:
-    cmp    $NtThreshold, %eax    { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
+    cmp    $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
     jb     .LNtIsNotBetterF      { (this check is performed here to not stand in the way of smaller counts) }
     sub    $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
     test   $15, %eax
@@ -412,6 +389,13 @@ asm
     jmp    .LRestAfterNTf
     .byte  102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
 
+Move_8OrMore_SSE_CancelERMSBackwards:
+    { Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren’t read. }
+    add    %eax, %edx
+    movups (%eax), %xmm4
+    movups -16(%eax,%ecx), %xmm5
+    sub    %edx, %eax
+
 { backwards move }
 .Lback:
     lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 16 bytes }
@@ -423,11 +407,8 @@ asm
 .LRestAfterNTb:
     sub    $32, %ecx
     jbe    .LPost32b
-    cmp    $NtThreshold-32, %ecx
+    cmp    $Move_NtThreshold-32, %ecx
     jae    .Lntb
-.LNtIsNotBetterB:
-    cmp    $ErmsThreshold-32, %ecx
-    jae    .LRepMovsB
 
     .balign 16                   { no-op }
 .Lloop32b:
@@ -451,7 +432,75 @@ asm
     pop    %ebx
     ret
 
-.LRepMovsB:
+.Lntb:
+    cmp    $-Move_NtThreshold, %eax
+    jnb    .Lloop32b
+    sub    $PrefetchDistance+32, %ecx
+
+    .balign 16
+.Lntloop64b:
+    prefetchnta -PrefetchDistance(%eax,%edx,1)
+    sub    $64, %edx
+    movups 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    movups 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movups 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movups (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    sub    $64, %ecx
+    jae    .Lntloop64b
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTb
+end;
+
+procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
+{ eax = source, edx = dest, ecx = count (ecx >= 8).
+  If FPC_PIC: ebx pushed. }
+const
+    ErmsThreshold = 1536;
+asm
+    cmp    $16, %ecx
+    jle    Move_8OrMore_SSE_9to16
+    cmp    $ErmsThreshold, %ecx
+    jae    .LRepMovs
+.LCancelRepMovs:
+    movups (%eax), %xmm4         { Same as in Move_8OrMore_SSE. }
+    movups -16(%eax,%ecx), %xmm5
+    cmp    $32, %ecx
+    jg     Move_8OrMore_SSE_33OrMore
+    movups %xmm4, (%edx)         { 17–32 bytes }
+    movups %xmm5, -16(%edx,%ecx)
+{$ifdef FPC_PIC}
+    pop    %ebx
+{$endif}
+    ret
+
+.LRepMovs:
+    cmp    $Move_NtThreshold+16, %ecx
+    jae    .LCancelRepMovs       { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT done by Move_8OrMore_SSE. }
+    sub    %eax, %edx            { edx = dest - src }
+    jz     .Lquit                { exit if src=dest }
+    cmp    %edx, %ecx            { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
+    ja     .Lback
+
+    push   %esi
+    push   %edi
+    mov    %eax, %esi
+    lea    (%edx,%eax), %edi
+    rep movsb
+    pop    %edi
+    pop    %esi
+.Lquit:
+{$ifdef FPC_PIC}
+    pop    %ebx
+{$endif}
+    ret
+
+.Lback:
     {         dst = 3
               v
       Move(abcdefghijXXX, count=10)
@@ -475,68 +524,32 @@ asm
       ^
 
       Only REP MOVs with DF=0 are fast with ERMS, in case you’re wondering why not just use DF=1. }
-
-    cmp    $-ErmsThreshold, %eax
-    jnbe   .Lloop32b             { Unfortunately this branch can’t benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
-{$ifdef FPC_PIC}
+{$ifndef FPC_PIC}
     push   %ebx
-    call   fpc_geteipasebx
-    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
-    movl   fast_large_repmovstosb@GOT(%ebx), %ebx
-    cmpb   $1, (%ebx)
-    pop    %ebx
-{$else FPC_PIC}
-    cmpb   $1, fast_large_repmovstosb
-{$endif FPC_PIC}
-    jne    .Lloop32b
-    movups %xmm5, -15(%ebx)      { Write last 16 bytes right away, freeing up ebx. Unlike .LFirstAndLast16f, it is safe here because step is >= 16... assuming ErmsThreshold >= 16 :). }
-    lea    32(%ecx), %ebx        { ebx = remaining }
-    add    %eax, %ebx            { eax = src - dst = -step; remaining -= step. }
-    jnc    .LRepMovsTailB        { CF=0 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
-    push   %esi                  { ^ Because of aligning dest, step > remaining is not guaranteed even for the first time, i.e. the ranges may have stopped overlapping by here. }
+{$endif}
+    cmp    $ErmsThreshold, %edx
+    jb     Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can’t benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
+
+    mov    %ecx, %ebx            { ebx = remaining }
+    sub    %edx, %ebx            { edx = dst - src = step; remaining -= step. }
+    add    %ecx, %eax
+    push   %esi
     push   %edi
-.LRepMovsNextPieceB:
-    add    %eax, %edx            { dst -= step }
-    lea    (%eax,%edx), %esi     { esi = src = rep movsb source }
-    mov    %edx, %edi            { edi = dst = rep movsb dest }
-    mov    %eax, %ecx
-    neg    %ecx                  { ecx = step = rep movsb count }
+.LRepMovsNextPieceB:             { At least 1 iteration is always performed. }
+    mov    %eax, %edi            { edi = src before subtracting step = dst = rep movsb dest }
+    sub    %edx, %eax            { src -= step }
+    mov    %eax, %esi            { esi = src = rep movsb source }
+    mov    %edx, %ecx            { ecx = step = rep movsb count }
     rep movsb
-    add    %eax, %ebx            { remaining -= step }
-    jc     .LRepMovsNextPieceB
+    sub    %edx, %ebx            { remaining -= step }
+    jnc    .LRepMovsNextPieceB   { CF=1 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
     pop    %edi
     pop    %esi
-.LRepMovsTailB:
-    sub    %eax, %ebx            { ebx = remaining }
-    mov    %ebx, %ecx            { ecx = remaining }
-    sub    %ebx, %edx            { edx = dest }
-    lea    (%eax,%edx), %eax     { eax = src }
+    lea    (%edx,%ebx), %ecx     { ecx = remaining }
+    sub    %ecx, %eax            { eax = src }
+    add    %eax, %edx            { edx = dest }
     pop    %ebx
-    jmp    Move                  { Remaining piece ("a" in the example above). Can save a bit of jumps by doing first something like: if ecx >= 16, jump directly to "movups -16(%eax,%ecx), %xmm5". }
-
-.Lntb:
-    cmp    $-NtThreshold, %eax
-    jnb    .LNtIsNotBetterB
-    sub    $PrefetchDistance+32, %ecx
-
-    .balign 16
-.Lntloop64b:
-    prefetchnta -PrefetchDistance(%eax,%edx,1)
-    sub    $64, %edx
-    movups 48(%eax,%edx,1), %xmm0
-    movntps %xmm0, 48(%edx)
-    movups 32(%eax,%edx,1), %xmm0
-    movntps %xmm0, 32(%edx)
-    movups 16(%eax,%edx,1), %xmm0
-    movntps %xmm0, 16(%edx)
-    movups (%eax,%edx,1), %xmm0
-    movntps %xmm0, (%edx)
-    sub    $64, %ecx
-    jae    .Lntloop64b
-
-    sfence
-    add    $PrefetchDistance+64, %ecx
-    jmp    .LRestAfterNTb
+    jmp    Move                  { Remaining piece ("a" in the example above). Can save a bit of jumps by doing first something like: if ecx >= 16, jump directly to .LCancelRepMovs. }
 end;
 {$endif ndef FASTMOVE_DISABLE_SSE}
 
@@ -558,6 +571,8 @@ begin
 {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
     result:=@Move_8OrMore_Valgrind
 {$ifndef FASTMOVE_DISABLE_SSE}
+  else if fast_large_repmovstosb then
+    result:=@Move_8OrMore_SSE_ERMS
   else if has_sse_support then
     result:=@Move_8OrMore_SSE
 {$endif ndef FASTMOVE_DISABLE_SSE}
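
For reference, the .Lback path of Move_8OrMore_SSE_ERMS above copies an overlapping dst > src move from the end in pieces of step = dst − src bytes, because only DF=0 REP MOVSB is fast under ERMS; each piece's source and destination are adjacent and never overlap, and the leftover head piece is handed back to Move. A minimal Pascal sketch of that idea (procedure name and pointer-math setup are assumptions, not part of the patch):

{$POINTERMATH ON}
{ Sketch of the piecewise-backward strategy behind .LRepMovsNextPieceB.
  Precondition: dst > src and count > dst - src (overlapping backward move),
  so step > 0 and at least one iteration runs. Pieces are taken from the end,
  so each destination range has already been read as an earlier piece's source. }
procedure MoveBackwardInForwardPieces(src, dst: PByte; count: SizeUint);
var
  step, remaining: SizeUint;
begin
  step := SizeUint(dst - src);
  remaining := count;
  while remaining >= step do
  begin
    dec(remaining, step);
    { This piece's source [src+remaining, +step) and destination
      [dst+remaining, +step) are adjacent and never overlap, so a plain
      forward copy (rep movsb in the real code) is safe. }
    Move((src + remaining)^, (dst + remaining)^, step);
  end;
  if remaining > 0 then
    Move(src^, dst^, remaining);  { remaining head piece ("a" in the diagram above) }
end;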