1 rok temu · e395166cb7
--- a/rtl/i386/fastmove.inc
+++ b/rtl/i386/fastmove.inc
@@ -7,35 +7,34 @@ procedure Move_8OrMore_Valgrind; assembler; nostackframe;
 
															 { eax = source, edx = dest, ecx = count (ecx >= 8).
														
 
															   If FPC_PIC: ebx pushed. }
														
 
															 asm
														
 
															-{$ifndef FPC_PIC}
														
 
															-    push   %ebx
														
 
															-{$endif}
														
 
															-    sub    %edx, %eax
														
 
															-    jae    .LForward
														
 
															-    mov    %ecx, %ebx
														
 
															-    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
														
 
															-    jb     .LBack                { if no overlap, still do forward move }
														
 
															+    sub    %eax, %edx            { edx = dest - src }
														
 
															+    cmp    %edx, %ecx
														
 
															+    ja     .LBack                { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
														
 
															-.LForward:
														
 
															 {$ifdef FPC_ENABLED_CLD}
														
 
															     cld
														
 
															 {$endif FPC_ENABLED_CLD}
														
 
															     push   %esi
														
 
															     push   %edi
														
 
															-    lea    (%eax,%edx), %esi
														
 
															-    mov    %edx, %edi
														
 
															+    mov    %eax, %esi
														
 
															+    lea    (%edx,%eax), %edi
														
 
															     rep movsb
														
 
															     pop    %edi
														
 
															     pop    %esi
														
 
															+{$ifdef FPC_PIC}
														
 
															     pop    %ebx
														
 
															+{$endif}
														
 
															     ret
														
 
															 .LBack:
														
 
															-    add    %ecx, %edx
														
 
															+{$ifndef FPC_PIC}
														
 
															+    push   %ebx
														
 
															+{$endif}
														
 
															+    add    %ecx, %eax
														
 
															 .LNextb:
														
 
															-    dec    %edx
														
 
															-    mov    (%eax,%edx), %bl
														
 
															-    mov    %bl, (%edx)
														
 
															+    dec    %eax
														
 
															+    mov    (%eax), %bl
														
 
															+    mov    %bl, (%edx,%eax)
														
 
															     dec    %ecx
														
 
															     jnz    .LNextb
														
 
															     pop    %ebx
														
@@ -77,13 +76,11 @@ asm
 
															 {$ifndef FPC_PIC}
														
 
															     push   %ebx
														
 
															 {$endif}
														
 
															-    jnb    .LForward             { src>dest => forward move }
														
 
															+    mov    %eax, %ebx
														
 
															+    neg    %ebx
														
 
															+    cmp    %ebx, %ecx
														
 
															+    ja     .Lback                { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
														
 
															-    mov    %ecx, %ebx
														
 
															-    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
														
 
															-    jb     .Lback                { if no overlap, still do forward move }
														
 
															-
														
 
															-.LForward:
														
 
															     mov    %edx, %ebx            { remember original dest to write first 16 bytes }
														
 
															     add    %edx, %ecx            { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
														
 
															     add    $8, %edx
														
@@ -161,13 +158,11 @@ asm
 
															     movq   -8(%eax,%ecx), %mm5
														
 
															     sub    %edx, %eax            { eax = src - dest }
														
 
															     jz     .Lquit                { exit if src=dest }
														
 
															-    jnb    .LForward             { src>dest => forward move }
														
 
															-
														
 
															-    mov    %ecx, %ebx
														
 
															-    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
														
 
															-    jb     .Lback                { if no overlap, still do forward move }
														
 
															+    mov    %eax, %ebx
														
 
															+    neg    %ebx
														
 
															+    cmp    %ebx, %ecx
														
 
															+    ja     .Lback                { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
														
 
															-.LForward:
														
 
															     mov    %edx, %ebx            { remember original dest to write first 16 bytes }
														
 
															     add    %edx, %ecx            { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
														
 
															     add    $8, %edx
														
@@ -237,7 +232,7 @@ end;
 
															 {$ifndef FASTMOVE_DISABLE_SSE}
														
 
															 label
														
 
															-  Move_8OrMore_SSE_9to16, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
														
 
															+  Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
														
 
															 const
														
 
															   Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
														
@@ -248,20 +243,20 @@ procedure Move_8OrMore_SSE; assembler; nostackframe;
 
															 const
														
 
															   PrefetchDistance = 512;
														
 
															 asm
														
 
															-    cmp    $16, %ecx
														
 
															-    jle    Move_8OrMore_SSE_9to16
														
 
															+    cmp    $15, %ecx
														
 
															+    jle    Move_8OrMore_SSE_9to15
														
 
															     movups (%eax), %xmm4         { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and 17–32 branch. }
														
 
															     movups -16(%eax,%ecx), %xmm5
														
 
															     cmp    $32, %ecx
														
 
															     jg     Move_8OrMore_SSE_33OrMore
														
 
															-    movups %xmm4, (%edx)         { 17–32 bytes }
														
 
															+    movups %xmm4, (%edx)         { 16–32 bytes }
														
 
															     movups %xmm5, -16(%edx,%ecx)
														
 
															 {$ifdef FPC_PIC}
														
 
															     pop    %ebx
														
 
															 {$endif}
														
 
															     ret
														
 
															-Move_8OrMore_SSE_9to16:
														
 
															+Move_8OrMore_SSE_9to15:
														
 
															     movlps (%eax), %xmm0
														
 
															     movlps -8(%eax,%ecx), %xmm1
														
 
															     movlps %xmm0, (%edx)
														
@@ -271,7 +266,7 @@ Move_8OrMore_SSE_9to16:
 
															     pop    %ebx
														
 
															 {$endif}
														
 
															     ret
														
 
															-    .byte  {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
														
 
															+    .byte  {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
														
 
															 Move_8OrMore_SSE_33OrMore:
														
 
															     sub    %edx, %eax            { eax = src - dest }
														
@@ -279,13 +274,11 @@ Move_8OrMore_SSE_33OrMore:
 
															 {$ifndef FPC_PIC}
														
 
															     push   %ebx
														
 
															 {$endif}
														
 
															-    jnb    .LForward             { src>dest => forward move }
														
 
															+    mov    %eax, %ebx
														
 
															+    neg    %ebx
														
 
															+    cmp    %ebx, %ecx
														
 
															+    ja     .Lback                { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
														
 
															-    lea    -1(%ecx), %ebx
														
 
															-    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
														
 
															-    jb     .Lback                { if no overlap, still do forward move }
														
 
															-
														
 
															-.LForward:
														
 
															     mov    %edx, %ebx            { remember original dest to write first 16 bytes }
														
 
															     add    %edx, %ecx            { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
														
 
															     add    $16, %edx
														
@@ -466,15 +459,15 @@ procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
 
															 const
														
 
															     ErmsThreshold = 1536;
														
 
															 asm
														
 
															-    cmp    $16, %ecx
														
 
															-    jle    Move_8OrMore_SSE_9to16
														
 
															+    cmp    $15, %ecx
														
 
															+    jle    Move_8OrMore_SSE_9to15
														
 
															     cmp    $ErmsThreshold, %ecx
														
 
															     jae    .LRepMovs
														
 
															     movups (%eax), %xmm4         { Same as in Move_8OrMore_SSE. }
														
 
															     movups -16(%eax,%ecx), %xmm5
														
 
															     cmp    $32, %ecx
														
 
															     jg     Move_8OrMore_SSE_33OrMore
														
 
															-    movups %xmm4, (%edx)         { 17–32 bytes }
														
 
															+    movups %xmm4, (%edx)         { 16–32 bytes }
														
 
															     movups %xmm5, -16(%edx,%ecx)
														
 
															 {$ifdef FPC_PIC}
														
 
															     pop    %ebx
														
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -134,18 +134,17 @@ asm
 
															     mov    %r9, -8(%rdx,%r8)
														
 
															 .Lquit:
														
 
															     ret
														
 
															-    .byte  0x90,0x90,0x90        { Turns .balign 16 before .Lloop32f into a no-op. }
														
 
															+    .byte  102,144               { Turns .balign 16 before .Lloop32f into a no-op. }
														
 
															 .L33OrMore:
														
 
															     sub    %rdx, %rcx            { rcx = src - dest }
														
 
															     jz     .Lquit                { exit if src=dest }
														
 
															-    jnb    .LForward             { src>dest => forward move }
														
 
															-    mov    %r8, %rax
														
 
															-    add    %rcx, %rax            { rcx is negative => r8+rcx > 0 if regions overlap }
														
 
															-    jb     .Lback                { if no overlap, still do forward move }
														
 
															+    mov    %rcx, %rax
														
 
															+    neg    %rax
														
 
															+    cmp    %rax, %r8
														
 
															+    ja     .Lback                { count (r8) > unsigned(dest - src) (rax) if regions overlap }
														
 
															-.LForward:
														
 
															     mov    %rdx, %r9             { remember original dest to write first 16 bytes }
														
 
															     add    %rdx, %r8             { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
														
 
															     add    $16, %rdx
														
@@ -217,7 +216,7 @@ asm
 
															     mfence
														
 
															     add    $0x1000, %r8
														
 
															     jmpq   .LRestAfterNTf        { go handle remaining bytes }
														
 
															-    .byte  0x90,0x90,0x90        { Turns .balign 16 before .Lloop32b into a no-op. }
														
 
															+    .byte  102,102,144           { Turns .balign 16 before .Lloop32b into a no-op. }
														
 
															 { backwards move }
														
 
															 .Lback: