
Simplify the non-temporal loops in x86_64.inc:Move, and adjust the thresholds below which the distance between source and destination is considered too short for NT stores.

Rika Ichinose committed 1 year ago · commit 12f18177ae
1 changed file with 33 additions and 60 deletions

rtl/x86_64/x86_64.inc · +33 −60
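
For orientation before the diff: the rewritten forward path fuses prefetching into the copy loop itself. Each 64-byte iteration issues one prefetchnta roughly PrefetchDistance bytes ahead of the load cursor, streams four 16-byte blocks with movntdq, and a single sfence after the loop orders the weakly-ordered streaming stores. Below is a minimal C sketch of that shape, assuming SSE2 intrinsics; nt_copy_forward and NT_PREFETCH_DISTANCE are illustrative names, not RTL symbols, and the destination is assumed 16-byte aligned with count a multiple of 64, as the surrounding Move code arranges before entering the loop.

    #include <emmintrin.h>  /* SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence */
    #include <stddef.h>

    #define NT_PREFETCH_DISTANCE 512  /* mirrors PrefetchDistance in the diff */

    /* Hypothetical demo, not the RTL implementation: copy count bytes forward
       with non-temporal stores. Assumes count is a multiple of 64 and dst is
       16-byte aligned. */
    static void nt_copy_forward(const char *src, char *dst, size_t count)
    {
        for (size_t off = 0; off < count; off += 64) {
            /* one prefetch per 64-byte iteration, a fixed distance ahead */
            _mm_prefetch(src + off + NT_PREFETCH_DISTANCE, _MM_HINT_NTA);
            for (size_t i = 0; i < 64; i += 16) {
                __m128i v = _mm_loadu_si128((const __m128i *)(src + off + i));
                _mm_stream_si128((__m128i *)(dst + off + i), v);  /* movntdq */
            }
        }
        _mm_sfence();  /* order the streaming stores before subsequent writes */
    }

Note the diff's "sub $PrefetchDistance+32, %r8" accounting: the real loop stops PrefetchDistance+64 bytes before the end and hands the tail back to the cached path, so the lookahead prefetch never strays far past the source buffer.
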

@@ -86,6 +86,9 @@ end;
 procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
 { Linux: rdi source, rsi dest, rdx count
   win64: rcx source, rdx dest, r8 count }
+const
+  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
+  PrefetchDistance = 512;
 asm
 {$ifndef win64}
     mov    %rdx, %r8
@@ -157,7 +160,7 @@ asm
 .LRestAfterNTf:
 sub    $32, %r8              { During the N× loop, r8 is N bytes less than what actually remains, allowing sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
     jbe    .LPost32f
-    cmp    $0x40000, %r8         { this limit must be processor-specific (1/2 L2 cache size) }
+    cmp    $NtThreshold-32, %r8
     jae    .Lntf                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
 
     .balign 16                   { no-op }
@@ -176,46 +179,31 @@ asm
     movdqu %xmm4, (%r9)          { Important for <16-byte step between src and dest. }
     ret
 
+    .balign 16
 .Lntf:
-    cmp    $0x1000, %rcx         { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
+    cmp    $NtThreshold, %rcx    { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
     jb     .Lloop32f             { (this check is performed here to not stand in the way of smaller counts) }
-    sub    $0xFE0, %r8           { r8 = remaining - 0x1000, but 32 was subtracted already, so must subtract only (0x1000 - 32) = 0xFE0. }
+    sub    $PrefetchDistance+32, %r8 { r8 = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
 
-.Lntloopf:
-    mov    $32, %eax
-
-    .balign 16
-.Lpref:
-    prefetchnta (%rcx,%rdx,1)
-    prefetchnta 0x40(%rcx,%rdx,1)
-    add    $0x80, %rdx
-    dec    %eax
-    jnz    .Lpref
-
-    sub    $0x1000, %rdx
-    mov    $64, %eax
-
-    .balign 16
+    .balign 16                   { no-op }
 .Lntloop64f:
+    prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
+    movdqu (%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, (%rdx)
+    movdqu 16(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, 16(%rdx)
+    movdqu 32(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, 32(%rdx)
+    movdqu 48(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, 48(%rdx)
     add    $64, %rdx
-    movdqu -64(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, -64(%rdx)
-    movdqu -48(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, -48(%rdx)
-    movdqu -32(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, -32(%rdx)
-    movdqu -16(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, -16(%rdx)
-    dec    %eax
-    jnz    .Lntloop64f
-
-    sub    $0x1000, %r8
-    jae    .Lntloopf
-
-    mfence
-    add    $0x1000, %r8
+    sub    $64, %r8
+    jae    .Lntloop64f
+
+    sfence
+    add    $PrefetchDistance+64, %r8
     jmpq   .LRestAfterNTf        { go handle remaining bytes }
-    .byte  102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte  102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
 
 { backwards move }
 .Lback:
@@ -229,7 +217,7 @@ asm
 .LRestAfterNTb:
     sub    $32, %r8
     jbe    .LPost32b
-    cmp    $0x40000, %r8
+    cmp    $NtThreshold-32, %r8
     jae    .Lntb
 
     .balign 16                   { no-op }
@@ -249,27 +237,15 @@ asm
     movdqu %xmm5, -16(%r9)
     ret
 
+    .balign 16
 .Lntb:
-    cmp    $0xfffffffffffff000,%rcx
+    cmp    $-NtThreshold,%rcx
     jnb    .Lloop32b
-    sub    $0xFE0, %r8
-
-.Lntloopb:
-    mov    $32, %eax
+    sub    $PrefetchDistance+32, %r8
 
-    .balign 16
-.Lprefb:
-    sub    $0x80, %rdx
-    prefetchnta (%rcx,%rdx,1)
-    prefetchnta 0x40(%rcx,%rdx,1)
-    dec    %eax
-    jnz    .Lprefb
-
-    add    $0x1000, %rdx
-    mov    $0x40, %eax
-
-    .balign 16
+    .balign 16                   { no-op }
 .Lntloop64b:
+    prefetchnta -PrefetchDistance(%rcx,%rdx,1)
     sub    $64, %rdx
     movdqu 48(%rcx,%rdx,1), %xmm0
     movntdq %xmm0, 48(%rdx)
@@ -279,14 +255,11 @@ asm
     movntdq %xmm0, 16(%rdx)
     movdqu (%rcx,%rdx,1), %xmm0
     movntdq %xmm0, (%rdx)
-    dec    %eax
-    jnz    .Lntloop64b
-
-    sub    $0x1000, %r8
-    jae    .Lntloopb
+    sub    $64, %r8
+    jae    .Lntloop64b
 
-    mfence
-    add    $0x1000, %r8
+    sfence
+    add    $PrefetchDistance+64, %r8
     jmpq   .LRestAfterNTb
 end;
 {$endif FPC_SYSTEM_HAS_MOVE}
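
Taken together, the new constants gate the non-temporal path twice: once on the byte count (cmp $NtThreshold-32, %r8) and once on the distance between source and destination (%rcx holds source minus destination). A short C sketch of that dispatch, under the same 256 KiB assumption; use_nt_path is a hypothetical helper, not an RTL routine:

    #include <stddef.h>

    #define NT_THRESHOLD (256 * 1024)  /* NtThreshold: ~1/2 L2, ideally per-CPU */

    /* Hypothetical helper, not an RTL routine: mirrors the two checks in the
       diff. count is what r8 carries (modulo the -32 bookkeeping); dist is the
       signed source-minus-destination offset kept in %rcx. */
    static int use_nt_path(size_t count, ptrdiff_t dist)
    {
        if (count < NT_THRESHOLD)
            return 0;                    /* small move: stay in cache */
        if (dist < 0)
            dist = -dist;
        return dist >= NT_THRESHOLD;     /* nearby src/dest: data likely reused */
    }

The distance check is the second threshold the commit title refers to: raising it from 4 KiB (0x1000) to NtThreshold means blocks that sit close together keep using ordinary cached stores, since their data is likely to be touched again soon.
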