@@ -102,12 +102,12 @@ asm
     jle .L4to8
     cmp $16, %r8
     jle .L9to16
-    movdqu (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
-    movdqu -16(%rcx,%r8), %xmm5
+    movups (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
+    movups -16(%rcx,%r8), %xmm5
     cmp $32, %r8
     jg .L33OrMore
-    movdqu %xmm4, (%rdx) { 17–32 bytes }
-    movdqu %xmm5, -16(%rdx,%r8)
+    movups %xmm4, (%rdx) { 17–32 bytes }
+    movups %xmm5, -16(%rdx,%r8)
     ret

     .balign 16
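Why this substitution is free: the SSE2 integer moves carry an extra prefix byte (F3 for movdqu, 66 for movdqa and movntdq), while the SSE1 movups/movaps/movntps encodings are one byte shorter and, for plain whole-register loads and stores, behave the same; the int/float type tag matters for register forwarding, not for a memory copy. The 17–32 branch itself is the usual overlapping head/tail trick: load the first and last 16 bytes before storing anything, then store both; for counts below 32 the two stores simply overlap. A minimal C sketch with SSE2 intrinsics (function name hypothetical, not part of the routine):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Copy 17..32 bytes with memmove semantics: both loads happen before
       either store, so overlapping src/dest regions are handled too. */
    static void copy17to32(void *dest, const void *src, size_t count)
    {
        __m128i head = _mm_loadu_si128((const __m128i *)src);
        __m128i tail = _mm_loadu_si128(
            (const __m128i *)((const char *)src + count - 16));
        _mm_storeu_si128((__m128i *)dest, head);
        _mm_storeu_si128(
            (__m128i *)((char *)dest + count - 16), tail);
    }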
@@ -137,10 +137,10 @@ asm
     mov %r9, -8(%rdx,%r8)
 .Lquit:
     ret
-    .byte 102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte 102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 .L33OrMore:
-    movdqu -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
+    movups -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
     { but -32(%rcx,%r8) is about to become harder to address (rcx is repurposed just below), .Lback is rare, and a small .Lback is even rarer / matters even less. }

     sub %rdx, %rcx { rcx = src - dest }
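The hand-counted .byte pads change because every movdqu/movdqa/movntdq rewritten as movups/movaps/movntps shrinks by one byte, shifting the code that follows; the pads are re-counted so the next hot loop label still lands on a 16-byte boundary and the following .balign 16 emits nothing. 102 is 0x66 (the operand-size prefix) and 144 is 0x90 (nop), so each pad also decodes as a single long NOP, although both sit after an unconditional ret or jmp and never execute. Decoded for reference (illustrative C, array name hypothetical):

    /* 11 x 0x66 prefixes + 0x90: a 12-byte filler that decodes as one
       long NOP; its length is counted so that .Lloop32f ends up
       16-byte aligned without help from .balign. */
    static const unsigned char pad_before_lloop32f[12] = {
        0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
        0x66, 0x66, 0x66, 0x66, 0x66, 0x90,
    };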
@@ -165,18 +165,18 @@ asm

     .balign 16 { no-op }
 .Lloop32f:
-    movdqu (%rcx,%rdx), %xmm0
-    movdqa %xmm0, (%rdx)
-    movdqu 16(%rcx,%rdx), %xmm0
-    movdqa %xmm0, 16(%rdx)
+    movups (%rcx,%rdx), %xmm0
+    movaps %xmm0, (%rdx)
+    movups 16(%rcx,%rdx), %xmm0
+    movaps %xmm0, 16(%rdx)
     add $32, %rdx
     sub $32, %r8
     ja .Lloop32f

 .LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
-    movdqu %xmm3, (%rdx,%r8)
-    movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
-    movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
+    movups %xmm3, (%rdx,%r8)
+    movups %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
+    movups %xmm4, (%r9) { Important for <16-byte step between src and dest. }
     ret

     .balign 16
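The forward bulk loop pairs unaligned loads with aligned stores (dest was rounded up to a 16-byte boundary earlier in the routine), and all three fixup vectors were loaded before the first store. A simplified C sketch of the same store discipline, stepping 16 bytes where the asm does 32 (assumes count > 32; names hypothetical):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    static void move_forward(char *dest, const char *src, size_t count)
    {
        /* All fixup vectors are read before the first store. */
        __m128i head   = _mm_loadu_si128((const __m128i *)src);                /* xmm4 */
        __m128i penult = _mm_loadu_si128((const __m128i *)(src + count - 32)); /* xmm3 */
        __m128i tail   = _mm_loadu_si128((const __m128i *)(src + count - 16)); /* xmm5 */

        /* Round dest up to a 16-byte boundary; the bytes below it are
           covered by the head store at the end. */
        size_t done = 16 - ((uintptr_t)dest & 15);
        for (; done + 16 <= count - 16; done += 16)
            _mm_store_si128((__m128i *)(dest + done),
                            _mm_loadu_si128((const __m128i *)(src + done)));

        /* Unaligned fixups go last, the head store very last: with a
           src-dest distance under 16 bytes an earlier head store would
           clobber bytes the loop still has to load. */
        _mm_storeu_si128((__m128i *)(dest + count - 32), penult);
        _mm_storeu_si128((__m128i *)(dest + count - 16), tail);
        _mm_storeu_si128((__m128i *)dest, head);
    }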
@@ -188,14 +188,14 @@ asm
     .balign 16 { no-op }
 .Lntloop64f:
     prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
-    movdqu (%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, (%rdx)
-    movdqu 16(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, 16(%rdx)
-    movdqu 32(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, 32(%rdx)
-    movdqu 48(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, 48(%rdx)
+    movups (%rcx,%rdx,1), %xmm0
+    movntps %xmm0, (%rdx)
+    movups 16(%rcx,%rdx,1), %xmm0
+    movntps %xmm0, 16(%rdx)
+    movups 32(%rcx,%rdx,1), %xmm0
+    movntps %xmm0, 32(%rdx)
+    movups 48(%rcx,%rdx,1), %xmm0
+    movntps %xmm0, 48(%rdx)
     add $64, %rdx
     sub $64, %r8
     jae .Lntloop64f
@@ -203,11 +203,11 @@ asm
     sfence
     add $PrefetchDistance+64, %r8
     jmpq .LRestAfterNTf { go handle remaining bytes }
-    .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte 102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 { backwards move }
 .Lback:
-    movdqu 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
+    movups 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
     lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
     lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
     and $-16, %r8
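For counts past the non-temporal threshold (chosen elsewhere in the routine), the loop above streams 64 bytes per iteration with movntps, which writes around the cache; prefetchnta pulls the source in PrefetchDistance bytes ahead, and the sfence is needed because non-temporal stores are weakly ordered. The same shape in C intrinsics (a sketch; the 4096-byte distance, 16-byte-aligned dest, and count being a multiple of 64 are assumptions for brevity, not the routine's actual parameters):

    #include <xmmintrin.h>
    #include <stddef.h>

    #define PREFETCH_DISTANCE 4096  /* assumption, not FPC's PrefetchDistance */

    static void nt_copy_forward(char *dest, const char *src, size_t count)
    {
        for (size_t off = 0; off < count; off += 64) {
            _mm_prefetch(src + off + PREFETCH_DISTANCE, _MM_HINT_NTA);
            for (size_t v = 0; v < 64; v += 16) {
                __m128 x = _mm_loadu_ps((const float *)(src + off + v));
                _mm_stream_ps((float *)(dest + off + v), x); /* bypasses the cache */
            }
        }
        _mm_sfence(); /* make the weakly ordered NT stores globally visible */
    }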
@@ -223,18 +223,18 @@ asm
     .balign 16 { no-op }
 .Lloop32b:
     sub $32, %rdx
-    movdqu 16(%rcx,%rdx), %xmm0
-    movdqa %xmm0, 16(%rdx)
-    movdqu (%rcx,%rdx), %xmm0
-    movdqa %xmm0, (%rdx)
+    movups 16(%rcx,%rdx), %xmm0
+    movaps %xmm0, 16(%rdx)
+    movups (%rcx,%rdx), %xmm0
+    movaps %xmm0, (%rdx)
     sub $32, %r8
     ja .Lloop32b

 .LPost32b:
     sub %r8, %rdx
-    movdqu %xmm3, -16(%rdx)
-    movdqu %xmm4, -32(%rdx)
-    movdqu %xmm5, -16(%r9)
+    movups %xmm3, -16(%rdx)
+    movups %xmm4, -32(%rdx)
+    movups %xmm5, -16(%r9)
     ret

     .balign 16
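The backward path mirrors the forward one: it is taken when dest overlaps src from above, so the copy must run from the end toward the start, and the fixups at .LPost32b come in the mirrored order (second vector, first vector, then the tail through r9). A simplified C sketch under the same assumptions as before (count > 32, 16 bytes per step, hypothetical names):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    static void move_backward(char *dest, const char *src, size_t count)
    {
        /* All fixup vectors are read before the first store. */
        __m128i head   = _mm_loadu_si128((const __m128i *)src);                /* xmm4 */
        __m128i second = _mm_loadu_si128((const __m128i *)(src + 16));         /* xmm3 */
        __m128i tail   = _mm_loadu_si128((const __m128i *)(src + count - 16)); /* xmm5 */

        /* Round the end of dest down to a 16-byte boundary and walk down
           with aligned stores; stop once at most 32 bytes remain below,
           which the head and second-vector stores cover. */
        size_t done = count - ((uintptr_t)(dest + count) & 15);
        while (done > 32) {
            done -= 16;
            _mm_store_si128((__m128i *)(dest + done),
                            _mm_loadu_si128((const __m128i *)(src + done)));
        }

        /* Mirrored fixup order, as in .LPost32b. */
        _mm_storeu_si128((__m128i *)(dest + 16), second);
        _mm_storeu_si128((__m128i *)dest, head);
        _mm_storeu_si128((__m128i *)(dest + count - 16), tail);
    }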
@@ -247,14 +247,14 @@ asm
 .Lntloop64b:
     prefetchnta -PrefetchDistance(%rcx,%rdx,1)
     sub $64, %rdx
-    movdqu 48(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, 48(%rdx)
-    movdqu 32(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, 32(%rdx)
-    movdqu 16(%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, 16(%rdx)
-    movdqu (%rcx,%rdx,1), %xmm0
-    movntdq %xmm0, (%rdx)
+    movups 48(%rcx,%rdx,1), %xmm0
+    movntps %xmm0, 48(%rdx)
+    movups 32(%rcx,%rdx,1), %xmm0
+    movntps %xmm0, 32(%rdx)
+    movups 16(%rcx,%rdx,1), %xmm0
+    movntps %xmm0, 16(%rdx)
+    movups (%rcx,%rdx,1), %xmm0
+    movntps %xmm0, (%rdx)
    sub $64, %r8
    jae .Lntloop64b
