@@ -258,16 +258,16 @@ asm
ret

.L9to16:
- movq (%eax), %xmm0
- movq -8(%eax,%ecx), %xmm1
- movq %xmm0, (%edx)
- movq %xmm1, -8(%edx,%ecx)
+ movlps (%eax), %xmm0
+ movlps -8(%eax,%ecx), %xmm1
+ movlps %xmm0, (%edx)
+ movlps %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
- .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte {$ifndef FPC_PIC}102,{$endif}144 { Turns .balign 16 before .Lloop32f into a no-op. }

.L33OrMore:
sub %edx, %eax { eax = src - dest }
@@ -277,7 +277,7 @@ asm
{$endif}
jnb .LForward { src>dest => forward move }

- mov %ecx, %ebx
+ lea -1(%ecx), %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }

@@ -293,10 +293,10 @@ asm
jbe .LPost32f
cmp $NtThreshold-32, %ecx
jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
-.LNtIsNotBetter:
+.LNtIsNotBetterF:
cmp $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT. }
jae .LRepMovsF
-.LRepMovsIsNotBetter:
+.LRepMovsIsNotBetterF:
test $15, %eax
jz .Lalignedloop32f

@@ -353,7 +353,7 @@ asm
{$else FPC_PIC}
cmpb $1, fast_large_repmovstosb
{$endif FPC_PIC}
- jne .LRepMovsIsNotBetter
+ jne .LRepMovsIsNotBetterF
push %esi
push %edi
lea (%eax,%edx), %esi
@@ -368,7 +368,7 @@ asm

.Lntf:
cmp $NtThreshold, %eax { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
- jb .LNtIsNotBetter { (this check is performed here to not stand in the way of smaller counts) }
+ jb .LNtIsNotBetterF { (this check is performed here to not stand in the way of smaller counts) }
sub $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
test $15, %eax
jz .Lalignedntloop64f
@@ -410,7 +410,7 @@ asm
sfence
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTf
- .byte 102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+ .byte 102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

{ backwards move }
.Lback:
@@ -425,6 +425,9 @@ asm
jbe .LPost32b
cmp $NtThreshold-32, %ecx
jae .Lntb
+.LNtIsNotBetterB:
+ cmp $ErmsThreshold-32, %ecx
+ jae .LRepMovsB

.balign 16 { no-op }
.Lloop32b:
@@ -448,9 +451,72 @@ asm
pop %ebx
ret

+.LRepMovsB:
+ { dst = 3
+           v
+   Move(abcdefghijXXX, count=10)
+        ^
+        src = 0
+
+   = abcABCDEFGHIJ
+
+   can be moved right to left in non-overlapping groups of "dst - src":
+
+   abcdefghijHIJ
+             ^^^
+
+   abcdefgEFGhij
+          ^^^
+
+   abcdBCDefghij
+       ^^^
+
+   abcAbcdefghij <- tail is handled by restarting the Move with corresponding count instead, as it can have 0 to dst - src - 1 bytes.
+      ^
+
+   Only REP MOVs with DF=0 are fast with ERMS, in case you're wondering why not just use DF=1. }
+
+ cmp $-ErmsThreshold, %eax
+ jnbe .Lloop32b { Unfortunately this branch can't benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
+{$ifdef FPC_PIC}
+ push %ebx
+ call fpc_geteipasebx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ movl fast_large_repmovstosb@GOT(%ebx), %ebx
+ cmpb $1, (%ebx)
+ pop %ebx
+{$else FPC_PIC}
+ cmpb $1, fast_large_repmovstosb
+{$endif FPC_PIC}
+ jne .Lloop32b
+ movups %xmm5, -15(%ebx) { Write last 16 bytes right away, freeing up ebx. Unlike .LFirstAndLast16f, it is safe here because step is >= 16... assuming ErmsThreshold >= 16 :). }
+ lea 32(%ecx), %ebx { ebx = remaining }
+ add %eax, %ebx { eax = src - dst = -step; remaining -= step. }
+ jnc .LRepMovsTailB { CF=0 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
+ push %esi { ^ Because of aligning dest, step > remaining is not guaranteed even for the first time, i.e. the ranges may have stopped overlapping by here. }
+ push %edi
+.LRepMovsNextPieceB:
+ add %eax, %edx { dst -= step }
+ lea (%eax,%edx), %esi { esi = src = rep movsb source }
+ mov %edx, %edi { edi = dst = rep movsb dest }
+ mov %eax, %ecx
+ neg %ecx { ecx = step = rep movsb count }
+ rep movsb
+ add %eax, %ebx { remaining -= step }
+ jc .LRepMovsNextPieceB
+ pop %edi
+ pop %esi
+.LRepMovsTailB:
+ sub %eax, %ebx { ebx = remaining }
+ mov %ebx, %ecx { ecx = remaining }
+ sub %ebx, %edx { edx = dest }
+ lea (%eax,%edx), %eax { eax = src }
+ pop %ebx
+ jmp Move { Remaining piece ("a" in the example above). Can save a bit of jumps by doing first something like: if ecx >= 16, jump directly to "movups -16(%eax,%ecx), %xmm5". }
+
.Lntb:
cmp $-NtThreshold, %eax
- jnb .Lloop32b
+ jnb .LNtIsNotBetterB
sub $PrefetchDistance+32, %ecx

.balign 16
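
The forward-path hunks above rework the NT/ERMS dispatch: .Lntf is only worth entering for counts of at least NtThreshold, it bails out (jb .LNtIsNotBetterF) when source and destination are too close for non-temporal stores to pay off, and rep movsb is used only from ErmsThreshold up and only when fast_large_repmovstosb is set. A minimal Free Pascal sketch of that decision order follows; PickForwardStrategy and the threshold values are illustrative assumptions for reading the diff, not the RTL's code (the real constants are the RTL's own NtThreshold and ErmsThreshold).

program forwardpolicy;
{$mode objfpc}

type
  TMoveStrategy = (msSseLoop, msRepMovsb, msNonTemporal);

const
  { Illustrative values only; the RTL defines its own NtThreshold/ErmsThreshold. }
  ErmsThreshold = 1536;
  NtThreshold   = 256 * 1024;

{ count = bytes to move, distance = src - dest (positive on the forward path),
  erms = the fast_large_repmovstosb flag. }
function PickForwardStrategy(count, distance: SizeUInt; erms: Boolean): TMoveStrategy;
begin
  if (count >= NtThreshold) and (distance >= NtThreshold) then
    Result := msNonTemporal  { .Lntf: prefetched movntps loop, closed with sfence }
  else if (count >= ErmsThreshold) and erms then
    Result := msRepMovsb     { .LRepMovsF: plain rep movsb }
  else
    Result := msSseLoop;     { .Lloop32f: 32 bytes per iteration with movups/movaps }
end;

begin
  { Close regions: NT is skipped even for a large count, ERMS wins. }
  writeln(PickForwardStrategy(512 * 1024, 64, True) = msRepMovsb);
  writeln(PickForwardStrategy(512 * 1024, 512 * 1024, True) = msNonTemporal);
end.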
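
The comment block at .LRepMovsB describes the overlapping backward case: when dst > src and the regions overlap, the data can still be copied with a forward rep movsb by walking right to left in chunks of step = dst - src bytes, leaving a tail of 0 to step - 1 bytes for a restarted Move. A minimal Free Pascal sketch of the same idea, with ChunkedBackwardMove as a hypothetical stand-in for the assembly (the RTL does not contain this code):

program chunkedbackward;
{$mode objfpc}
{$pointermath on}

{ Copy count bytes from src to dst, where dst > src and the ranges overlap:
  walk right to left in chunks of step = dst - src bytes. Each chunk does not
  overlap its source, so a plain forward copy (rep movsb in the assembly) is
  safe; the remaining 0..step-1 bytes are done last. }
procedure ChunkedBackwardMove(src, dst: PByte; count: SizeUInt);
var
  step, remaining: SizeUInt;
begin
  step := PtrUInt(dst) - PtrUInt(src);
  remaining := count;
  while remaining >= step do
  begin
    Dec(remaining, step);
    Move(src[remaining], dst[remaining], step); { one non-overlapping chunk }
  end;
  if remaining > 0 then
    Move(src^, dst^, remaining); { the tail ("a" in the diagram) }
end;

var
  s: ShortString;
begin
  s := 'abcdefghijXXX';
  ChunkedBackwardMove(PByte(@s[1]), PByte(@s[4]), 10); { src offset 0, dst offset 3 }
  writeln(s); { abcabcdefghij }
end.

Running it reproduces the diagram step by step: HIJ, then EFG, then BCD are copied as non-overlapping chunks, and the single leftover byte "a" is handled last, which the assembly does by tail-jumping back into Move.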