@@ -236,20 +236,24 @@ asm
end;

{$ifndef FASTMOVE_DISABLE_SSE}
+label
+ Move_8OrMore_SSE_9to16, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
+
+const
+ Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
+
procedure Move_8OrMore_SSE; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed. }
const
- ErmsThreshold = 1536;
- NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
PrefetchDistance = 512;
asm
cmp $16, %ecx
- jle .L9to16
- movups (%eax), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
+ jle Move_8OrMore_SSE_9to16
+ movups (%eax), %xmm4 { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and 17–32 branch. }
movups -16(%eax,%ecx), %xmm5
cmp $32, %ecx
- jg .L33OrMore
+ jg Move_8OrMore_SSE_33OrMore
movups %xmm4, (%edx) { 17–32 bytes }
movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
@@ -257,7 +261,7 @@ asm
{$endif}
ret

-.L9to16:
+Move_8OrMore_SSE_9to16:
movlps (%eax), %xmm0
movlps -8(%eax,%ecx), %xmm1
movlps %xmm0, (%edx)
@@ -267,9 +271,9 @@ asm
pop %ebx
{$endif}
ret
- .byte {$ifndef FPC_PIC}102,{$endif}144 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

-.L33OrMore:
+Move_8OrMore_SSE_33OrMore:
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
{$ifndef FPC_PIC}
@@ -291,12 +295,9 @@ asm
.LRestAfterNTf:
sub $32, %ecx { During the N× loop, ecx is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
jbe .LPost32f
- cmp $NtThreshold-32, %ecx
+ cmp $Move_NtThreshold-32, %ecx
jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
.LNtIsNotBetterF:
- cmp $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT. }
- jae .LRepMovsF
-.LRepMovsIsNotBetterF:
test $15, %eax
jz .Lalignedloop32f

@@ -342,32 +343,8 @@ asm
pop %ebx
ret

-.LRepMovsF:
-{$ifdef FPC_PIC}
- push %ebx
- call fpc_geteipasebx
- addl $_GLOBAL_OFFSET_TABLE_, %ebx
- movl fast_large_repmovstosb@GOT(%ebx), %ebx
- cmpb $1, (%ebx)
- pop %ebx
-{$else FPC_PIC}
- cmpb $1, fast_large_repmovstosb
-{$endif FPC_PIC}
- jne .LRepMovsIsNotBetterF
- push %esi
- push %edi
- lea (%eax,%edx), %esi
- mov %edx, %edi
- add $32, %ecx
- rep movsb
- movups %xmm4, (%ebx) { last 16 aren't required }
- pop %edi
- pop %esi
- pop %ebx
- ret
-
.Lntf:
- cmp $NtThreshold, %eax { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
+ cmp $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
jb .LNtIsNotBetterF { (this check is performed here to not stand in the way of smaller counts) }
sub $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
test $15, %eax
@@ -412,6 +389,13 @@ asm
jmp .LRestAfterNTf
.byte 102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

+Move_8OrMore_SSE_CancelERMSBackwards:
+ { Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren’t read. }
+ add %eax, %edx
+ movups (%eax), %xmm4
+ movups -16(%eax,%ecx), %xmm5
+ sub %edx, %eax
+
{ backwards move }
.Lback:
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
@@ -423,11 +407,8 @@ asm
.LRestAfterNTb:
sub $32, %ecx
jbe .LPost32b
- cmp $NtThreshold-32, %ecx
+ cmp $Move_NtThreshold-32, %ecx
jae .Lntb
-.LNtIsNotBetterB:
- cmp $ErmsThreshold-32, %ecx
- jae .LRepMovsB

.balign 16 { no-op }
.Lloop32b:
@@ -451,7 +432,75 @@ asm
pop %ebx
ret

-.LRepMovsB:
+.Lntb:
+ cmp $-Move_NtThreshold, %eax
+ jnb .Lloop32b
+ sub $PrefetchDistance+32, %ecx
+
+ .balign 16
+.Lntloop64b:
+ prefetchnta -PrefetchDistance(%eax,%edx,1)
+ sub $64, %edx
+ movups 48(%eax,%edx,1), %xmm0
+ movntps %xmm0, 48(%edx)
+ movups 32(%eax,%edx,1), %xmm0
+ movntps %xmm0, 32(%edx)
+ movups 16(%eax,%edx,1), %xmm0
+ movntps %xmm0, 16(%edx)
+ movups (%eax,%edx,1), %xmm0
+ movntps %xmm0, (%edx)
+ sub $64, %ecx
+ jae .Lntloop64b
+
+ sfence
+ add $PrefetchDistance+64, %ecx
+ jmp .LRestAfterNTb
+end;
+
+procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
+{ eax = source, edx = dest, ecx = count (ecx >= 8).
+ If FPC_PIC: ebx pushed. }
+const
+ ErmsThreshold = 1536;
+asm
+ cmp $16, %ecx
+ jle Move_8OrMore_SSE_9to16
+ cmp $ErmsThreshold, %ecx
+ jae .LRepMovs
+.LCancelRepMovs:
+ movups (%eax), %xmm4 { Same as in Move_8OrMore_SSE. }
+ movups -16(%eax,%ecx), %xmm5
+ cmp $32, %ecx
+ jg Move_8OrMore_SSE_33OrMore
+ movups %xmm4, (%edx) { 17–32 bytes }
+ movups %xmm5, -16(%edx,%ecx)
+{$ifdef FPC_PIC}
+ pop %ebx
+{$endif}
+ ret
+
+.LRepMovs:
+ cmp $Move_NtThreshold+16, %ecx
+ jae .LCancelRepMovs { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT done by Move_8OrMore_SSE. }
+ sub %eax, %edx { edx = dest - src }
+ jz .Lquit { exit if src=dest }
+ cmp %edx, %ecx { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
+ ja .Lback
+
+ push %esi
+ push %edi
+ mov %eax, %esi
+ lea (%edx,%eax), %edi
+ rep movsb
+ pop %edi
+ pop %esi
+.Lquit:
+{$ifdef FPC_PIC}
+ pop %ebx
+{$endif}
+ ret
+
+.Lback:
{ dst = 3
v
Move(abcdefghijXXX, count=10)
@@ -475,68 +524,32 @@ asm
^

Only REP MOVs with DF=0 are fast with ERMS, in case you’re wondering why not just use DF=1. }
-
- cmp $-ErmsThreshold, %eax
- jnbe .Lloop32b { Unfortunately this branch can’t benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
-{$ifdef FPC_PIC}
+{$ifndef FPC_PIC}
push %ebx
- call fpc_geteipasebx
- addl $_GLOBAL_OFFSET_TABLE_, %ebx
- movl fast_large_repmovstosb@GOT(%ebx), %ebx
- cmpb $1, (%ebx)
- pop %ebx
-{$else FPC_PIC}
- cmpb $1, fast_large_repmovstosb
-{$endif FPC_PIC}
- jne .Lloop32b
- movups %xmm5, -15(%ebx) { Write last 16 bytes right away, freeing up ebx. Unlike .LFirstAndLast16f, it is safe here because step is >= 16... assuming ErmsThreshold >= 16 :). }
- lea 32(%ecx), %ebx { ebx = remaining }
- add %eax, %ebx { eax = src - dst = -step; remaining -= step. }
- jnc .LRepMovsTailB { CF=0 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
- push %esi { ^ Because of aligning dest, step > remaining is not guaranteed even for the first time, i.e. the ranges may have stopped overlapping by here. }
+{$endif}
+ cmp $ErmsThreshold, %edx
+ jb Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can’t benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
+
+ mov %ecx, %ebx { ebx = remaining }
+ sub %edx, %ebx { edx = dst - src = step; remaining -= step. }
+ add %ecx, %eax
+ push %esi
push %edi
-.LRepMovsNextPieceB:
- add %eax, %edx { dst -= step }
- lea (%eax,%edx), %esi { esi = src = rep movsb source }
- mov %edx, %edi { edi = dst = rep movsb dest }
- mov %eax, %ecx
- neg %ecx { ecx = step = rep movsb count }
+.LRepMovsNextPieceB: { At least 1 iteration is always performed. }
+ mov %eax, %edi { edi = src before subtracting step = dst = rep movsb dest }
+ sub %edx, %eax { src -= step }
+ mov %eax, %esi { esi = src = rep movsb source }
+ mov %edx, %ecx { ecx = step = rep movsb count }
rep movsb
- add %eax, %ebx { remaining -= step }
- jc .LRepMovsNextPieceB
+ sub %edx, %ebx { remaining -= step }
+ jnc .LRepMovsNextPieceB { CF=1 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
pop %edi
pop %esi
-.LRepMovsTailB:
- sub %eax, %ebx { ebx = remaining }
- mov %ebx, %ecx { ecx = remaining }
- sub %ebx, %edx { edx = dest }
- lea (%eax,%edx), %eax { eax = src }
+ lea (%edx,%ebx), %ecx { ecx = remaining }
+ sub %ecx, %eax { eax = src }
+ add %eax, %edx { edx = dest }
pop %ebx
- jmp Move { Remaining piece ("a" in the example above). Can save a bit of jumps by doing first something like: if ecx >= 16, jump directly to "movups -16(%eax,%ecx), %xmm5". }
-
-.Lntb:
- cmp $-NtThreshold, %eax
- jnb .LNtIsNotBetterB
- sub $PrefetchDistance+32, %ecx
-
- .balign 16
-.Lntloop64b:
- prefetchnta -PrefetchDistance(%eax,%edx,1)
- sub $64, %edx
- movups 48(%eax,%edx,1), %xmm0
- movntps %xmm0, 48(%edx)
- movups 32(%eax,%edx,1), %xmm0
- movntps %xmm0, 32(%edx)
- movups 16(%eax,%edx,1), %xmm0
- movntps %xmm0, 16(%edx)
- movups (%eax,%edx,1), %xmm0
- movntps %xmm0, (%edx)
- sub $64, %ecx
- jae .Lntloop64b
-
- sfence
- add $PrefetchDistance+64, %ecx
- jmp .LRestAfterNTb
+ jmp Move { Remaining piece ("a" in the example above). Can save a bit of jumps by doing first something like: if ecx >= 16, jump directly to .LCancelRepMovs. }
end;
{$endif ndef FASTMOVE_DISABLE_SSE}

@@ -558,6 +571,8 @@ begin
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
+ else if fast_large_repmovstosb then
+ result:=@Move_8OrMore_SSE_ERMS
else if has_sse_support then
result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
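
Editor's note, not part of the patch: the least obvious piece above is the rewritten .Lback path of Move_8OrMore_SSE_ERMS, which turns an overlapping backwards move into non-overlapping forward "rep movsb" pieces of step = dest - src bytes each, copied from the top of the buffer down, with the leftover tail handed back to Move. Below is a minimal Pascal sketch of that splitting scheme only; the name PiecewiseBackwardsMove and the plain-Pascal body are hypothetical, and it assumes what the assembly guarantees at that point: dest > src, the regions overlap (count > step), and step > 0.

{ Hypothetical sketch of the splitting scheme, not the actual routine. }
procedure PiecewiseBackwardsMove(srcP, dstP: Pointer; count: PtrUInt);
var
  step, remaining, s: PtrUInt;
begin
  step := PtrUInt(dstP) - PtrUInt(srcP); { distance between the regions = piece size }
  remaining := count;
  s := PtrUInt(srcP) + count;            { start just past the top of the source }
  while remaining >= step do             { overlap (count > step) guarantees at least one piece }
  begin
    Dec(s, step);
    Move(PByte(s)^, PByte(s + step)^, step); { a piece never overlaps its own destination }
    Dec(remaining, step);
  end;
  if remaining > 0 then                  { tail shorter than one piece ("a" in the diagram comment) }
    Move(PByte(s - remaining)^, PByte(s - remaining + step)^, remaining);
end;

Because each piece is exactly step bytes and is written just above the bytes that still need to be read, every piece can be copied forwards, which is the only direction ERMS accelerates; that is also why the new code bails out to Move_8OrMore_SSE_CancelERMSBackwards when step is below ErmsThreshold and the per-piece rep movsb overhead would dominate.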