Attempt to ERMS backward i386 ‘Move’s.

Rika Ichinose 1 year ago
commit f14aced9c5
1 changed file with 78 additions and 12 deletions

rtl/i386/fastmove.inc (+78, -12)

@@ -258,16 +258,16 @@ asm
     ret
 
 .L9to16:
-    movq   (%eax), %xmm0
-    movq   -8(%eax,%ecx), %xmm1
-    movq   %xmm0, (%edx)
-    movq   %xmm1, -8(%edx,%ecx)
+    movlps (%eax), %xmm0
+    movlps -8(%eax,%ecx), %xmm1
+    movlps %xmm0, (%edx)
+    movlps %xmm1, -8(%edx,%ecx)
 .Lquit:
 {$ifdef FPC_PIC}
     pop    %ebx
 {$endif}
     ret
-    .byte  {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte  {$ifndef FPC_PIC}102,{$endif}144 { Turns .balign 16 before .Lloop32f into a no-op. }
 
 .L33OrMore:
     sub    %edx, %eax            { eax = src - dest }
@@ -277,7 +277,7 @@ asm
 {$endif}
     jnb    .LForward             { src>dest => forward move }
 
-    mov    %ecx, %ebx
+    lea    -1(%ecx), %ebx
     add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
     jb     .Lback                { if no overlap, still do forward move }
 
@@ -293,10 +293,10 @@ asm
     jbe    .LPost32f
     cmp    $NtThreshold-32, %ecx
     jae    .Lntf                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
-.LNtIsNotBetter:
+.LNtIsNotBetterF:
     cmp    $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT. }
     jae    .LRepMovsF
-.LRepMovsIsNotBetter:
+.LRepMovsIsNotBetterF:
     test   $15, %eax
     jz     .Lalignedloop32f
 
@@ -353,7 +353,7 @@ asm
 {$else FPC_PIC}
     cmpb   $1, fast_large_repmovstosb
 {$endif FPC_PIC}
-    jne    .LRepMovsIsNotBetter
+    jne    .LRepMovsIsNotBetterF
     push   %esi
     push   %edi
     lea    (%eax,%edx), %esi
@@ -368,7 +368,7 @@ asm
 
 .Lntf:
     cmp    $NtThreshold, %eax    { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
-    jb     .LNtIsNotBetter       { (this check is performed here to not stand in the way of smaller counts) }
+    jb     .LNtIsNotBetterF      { (this check is performed here to not stand in the way of smaller counts) }
     sub    $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
     test   $15, %eax
     jz     .Lalignedntloop64f
@@ -410,7 +410,7 @@ asm
     sfence
     add    $PrefetchDistance+64, %ecx
     jmp    .LRestAfterNTf
-    .byte  102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte  102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
 
 { backwards move }
 .Lback:
@@ -425,6 +425,9 @@ asm
     jbe    .LPost32b
     cmp    $NtThreshold-32, %ecx
     jae    .Lntb
+.LNtIsNotBetterB:
+    cmp    $ErmsThreshold-32, %ecx
+    jae    .LRepMovsB
 
     .balign 16                   { no-op }
 .Lloop32b:
@@ -448,9 +451,72 @@ asm
     pop    %ebx
     ret
 
+.LRepMovsB:
+    {         dst = 3
+              v
+      Move(abcdefghijXXX, count=10)
+           ^
+           src = 0
+
+         = abcABCDEFGHIJ
+
+      can be moved right to left in non-overlapping groups of “dst - src”:
+
+      abcdefghijHIJ
+             ^^^
+
+      abcdefgEFGhij
+          ^^^
+
+      abcdBCDefghij
+       ^^^
+
+      abcAbcdefghij <- tail is handled by restarting the Move with corresponding count instead, as it can have 0 to dst - src - 1 bytes.
+      ^
+
+      Only REP MOVs with DF=0 are fast with ERMS, in case you’re wondering why not just use DF=1. }
+
+    cmp    $-ErmsThreshold, %eax
+    jnbe   .Lloop32b             { Unfortunately this branch can’t benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
+{$ifdef FPC_PIC}
+    push   %ebx
+    call   fpc_geteipasebx
+    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
+    movl   fast_large_repmovstosb@GOT(%ebx), %ebx
+    cmpb   $1, (%ebx)
+    pop    %ebx
+{$else FPC_PIC}
+    cmpb   $1, fast_large_repmovstosb
+{$endif FPC_PIC}
+    jne    .Lloop32b
+    movups %xmm5, -15(%ebx)      { Write last 16 bytes right away, freeing up ebx. Unlike .LFirstAndLast16f, it is safe here because step is >= 16... assuming ErmsThreshold >= 16 :). }
+    lea    32(%ecx), %ebx        { ebx = remaining }
+    add    %eax, %ebx            { eax = src - dst = -step; remaining -= step. }
+    jnc    .LRepMovsTailB        { CF=0 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
+    push   %esi                  { ^ Because of aligning dest, step > remaining is not guaranteed even for the first time, i.e. the ranges may have stopped overlapping by here. }
+    push   %edi
+.LRepMovsNextPieceB:
+    add    %eax, %edx            { dst -= step }
+    lea    (%eax,%edx), %esi     { esi = src = rep movsb source }
+    mov    %edx, %edi            { edi = dst = rep movsb dest }
+    mov    %eax, %ecx
+    neg    %ecx                  { ecx = step = rep movsb count }
+    rep movsb
+    add    %eax, %ebx            { remaining -= step }
+    jc     .LRepMovsNextPieceB
+    pop    %edi
+    pop    %esi
+.LRepMovsTailB:
+    sub    %eax, %ebx            { ebx = remaining }
+    mov    %ebx, %ecx            { ecx = remaining }
+    sub    %ebx, %edx            { edx = dest }
+    lea    (%eax,%edx), %eax     { eax = src }
+    pop    %ebx
+    jmp    Move                  { Remaining piece ("a" in the example above). Can save a bit of jumps by doing first something like: if ecx >= 16, jump directly to "movups -16(%eax,%ecx), %xmm5". }
+
 .Lntb:
     cmp    $-NtThreshold, %eax
-    jnb    .Lloop32b
+    jnb    .LNtIsNotBetterB
     sub    $PrefetchDistance+32, %ecx
 
     .balign 16
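
For reference, a minimal high-level Pascal sketch of what the new .LRepMovsB path computes. ForwardCopy and ChunkedBackwardMove are illustrative names, not RTL routines: ForwardCopy stands in for REP MOVSB with DF=0, and the sketch omits the real code's destination alignment and the SSE store of the last 16 bytes.

program ChunkedBackwardMoveDemo;
{$mode objfpc}
{$pointermath on}

{ Stands in for REP MOVSB with DF=0: a plain ascending byte copy.
  count = 0 copies nothing, like REP MOVSB with ECX = 0. }
procedure ForwardCopy(src, dest: PByte; count: SizeUint);
var
  i: SizeUint;
begin
  for i := 1 to count do
  begin
    dest^ := src^;
    Inc(src);
    Inc(dest);
  end;
end;

{ Overlapping backward move (dest > src, dest - src < count) expressed as
  non-overlapping forward copies of step = dest - src bytes, right to left.
  Each piece's source has not yet been overwritten when the piece is copied. }
procedure ChunkedBackwardMove(src, dest: PByte; count: SizeUint);
var
  step, remaining: SizeUint;
begin
  step := SizeUint(dest - src);
  remaining := count;
  while remaining > step do
  begin
    remaining := remaining - step;
    ForwardCopy(src + remaining, dest + remaining, step);
  end;
  { Tail of 0 .. step - 1 bytes ("a" in the diagram above); the assembly
    restarts Move for it instead of copying inline. }
  ForwardCopy(src, dest, remaining);
end;

var
  buf: array [0 .. 12] of Char = 'abcdefghijXXX';
  i: Integer;
begin
  { Reproduces the diagram: move 10 bytes from offset 0 to offset 3. }
  ChunkedBackwardMove(PByte(@buf[0]), PByte(@buf[3]), 10);
  for i := 0 to High(buf) do
    Write(buf[i]);
  Writeln; { prints abcabcdefghij }
end.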