@@ -63,19 +63,19 @@ asm
  ret

 .Lcancel:
+ fstp %st(0) { Pop the “second int64 from the end” .L33OrMore loads. }
  fucompp { Pop two elements loaded at the beginning. }
-{$ifdef FPC_PIC}
  pop %ebx
-{$endif}
  ret
- .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
+ .byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }

 .L33OrMore:
- sub %edx, %eax { eax = src - dest }
- jz .Lcancel { exit if src=dest }
+ fildq -16(%eax,%ecx) { Second int64 from the end. }
 {$ifndef FPC_PIC}
  push %ebx
 {$endif}
+ sub %edx, %eax { eax = src - dest }
+ jz .Lcancel { exit if src=dest }
  mov %eax, %ebx
  neg %ebx
  cmp %ebx, %ecx
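
All the hunks below apply one idea, so it is worth spelling out once. The old epilogue handled "one leftover 8-byte chunk, maybe" with a compare and a branch (cmp $-8, %ecx / jle .LFirstAndLast8f / copy / label). The new code instead loads the second chunk from the end once, before the loop, and stores it unconditionally afterwards: the store may overlap bytes that the loop or the final fixed stores also write, which is harmless because every chunk involved was read before anything was written. On this x87 path the preloaded value occupies an FPU stack slot, which is why .Lcancel gains fstp %st(0) to discard it on the src=dest early exit. A minimal Pascal sketch of the forward case (illustrative only, not the RTL code: the names are invented, n >= 33 with dst <= src or non-overlapping blocks is assumed, and x86's tolerance of unaligned accesses stands in for the asm's explicit alignment work):

{$mode objfpc}
{$pointermath on}
procedure ForwardMoveSketch(src, dst: PByte; n: SizeUint);
var
  head, secondLast, last: QWord;
  i: SizeUint;
begin
  { Read everything the epilogue will store before any write happens. }
  head       := PQWord(src)^;          { first 8 bytes }
  secondLast := PQWord(src + n - 16)^; { the "second int64 from the end" }
  last       := PQWord(src + n - 8)^;  { last 8 bytes }
  i := 8;                              { the real code aligns dst here instead }
  while i < n - 16 do
  begin
    PQWord(dst + i)^ := PQWord(src + i)^;
    Inc(i, 8);
  end;
  { Branch-free epilogue: no "is one more chunk needed?" test.
    Overlapping stores are fine because all reads happened up front. }
  PQWord(dst + n - 16)^ := secondLast;
  PQWord(dst + n - 8)^  := last;
  PQWord(dst)^          := head; { written last, as in the asm }
end;

The .byte strings change in step with the code size: 102 is $66 (the operand-size prefix) and 144 is $90 (nop), so each string assembles into one long NOP sized to leave the next loop label already 16-byte aligned, turning the following .balign 16 into a no-op. Every hunk that grows or shrinks the preceding instructions re-tunes the count.
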
@@ -101,19 +101,17 @@ asm
  ja .Lloop16f

 .LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
- cmp $-8, %ecx
- jle .LFirstAndLast8f
- fildq (%eax,%edx)
- fistpq (%edx)
-.LFirstAndLast8f:
+ fistpq (%edx,%ecx)
  fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
  fistpq (%ebx) { Important for <8-byte step between src and dest. }
  pop %ebx
  ret
- .byte 102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+ .byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }

 { backwards move }
 .Lback:
+ fstp %st(0)
+ fildq 8(%eax,%edx) { Second int64 from the start. }
  lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
  mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
  and $-8, %ecx
@@ -134,12 +132,8 @@ asm
  ja .Lloop16b

 .LPost16b:
- cmp $-8, %ecx
- jle .LFirstAndLast8b
- fildq -8(%eax,%edx)
- fistpq -8(%edx)
-.LFirstAndLast8b:
  sub %ecx, %edx
+ fistpq -8(%edx)
  fistpq -7(%ebx)
  fistpq -16(%edx)
  pop %ebx
@@ -156,6 +150,7 @@ asm
 {$endif}
  movq (%eax), %mm4 { First and last 8 bytes. }
  movq -8(%eax,%ecx), %mm5
+ movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
  sub %edx, %eax { eax = src - dest }
  jz .Lquit { exit if src=dest }
  mov %eax, %ebx
@@ -183,21 +178,18 @@
  ja .Lloop16f

 .LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
- cmp $-8, %ecx
- jle .LFirstAndLast8f
- movq (%eax,%edx), %mm0
- movq %mm0, (%edx)
-.LFirstAndLast8f:
+ movq %mm3, (%edx,%ecx)
  movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
  movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
 .Lquit:
  emms
  pop %ebx
  ret
- .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+ .byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }

 { backwards move }
 .Lback:
+ movq 8(%eax,%edx), %mm3 { Second vector from the start. }
  lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
  mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
  and $-8, %ecx
@@ -218,12 +210,8 @@
  ja .Lloop16b

 .LPost16b:
- cmp $-8, %ecx
- jle .LFirstAndLast8b
- movq -8(%eax,%edx), %mm0
- movq %mm0, -8(%edx)
-.LFirstAndLast8b:
  sub %ecx, %edx
+ movq %mm3, -8(%edx)
  movq %mm4, -16(%edx)
  movq %mm5, -7(%ebx)
  emms
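
The MMX hunks are the same transformation with %mm3 holding the preloaded chunk, and the backward path mirrors it from the other end: at .Lback, eax still holds src - dest, so 8(%eax,%edx) addresses src + 8, the "second int64/vector from the start", which is stored unconditionally once sub %ecx, %edx has lowered edx past the last full chunk. A matching sketch of the backward case (same illustrative assumptions as the previous sketch, this time with dst > src overlap in mind):

{ Same directives and assumptions as ForwardMoveSketch. }
procedure BackwardMoveSketch(src, dst: PByte; n: SizeUint);
var
  first, second, last: QWord;
  i: SizeUint;
begin
  first  := PQWord(src)^;         { first 8 bytes }
  second := PQWord(src + 8)^;     { the "second int64 from the start" }
  last   := PQWord(src + n - 8)^; { last 8 bytes }
  i := n - 16;                    { walk downward so unread source stays intact }
  while i > 8 do
  begin
    PQWord(dst + i)^ := PQWord(src + i)^;
    Dec(i, 8);
  end;
  { Branch-free epilogue, mirroring .LPost16b. }
  PQWord(dst + 8)^     := second;
  PQWord(dst)^         := first;
  PQWord(dst + n - 8)^ := last;
end;
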
@@ -266,9 +254,12 @@ Move_8OrMore_SSE_9to15:
  pop %ebx
 {$endif}
  ret
- .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 Move_8OrMore_SSE_33OrMore:
+ movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
+ { but -32(%eax,%ecx) is about to become not so easily accessible, .Lback is rare, and a small .Lback is rarer still / matters even less. }
+
  sub %edx, %eax { eax = src - dest }
  jz .Lquit { exit if src=dest }
 {$ifndef FPC_PIC}
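
Two details of this hunk deserve a note. The %xmm3 load is hoisted above sub %edx, %eax because after the subtraction eax holds src - dest rather than src, so the source tail would no longer be addressable as -32(%eax,%ecx); doing the load early costs one wasted read on the rare backward path, which the added comment judges acceptable. The forward/backward dispatch that follows (the mov %eax, %ebx / neg %ebx / cmp %ebx, %ecx sequence visible as context in the first hunk) is the usual unsigned overlap test; in Pascal terms, a sketch reusing the procedures above:

{ Backward copying is needed only when dest lands inside the source block,
  i.e. when the forward distance dest - src is smaller than the count.
  The unsigned comparison handles dest < src for free: dest - src wraps
  around to a huge value and the forward path is taken. }
if SizeUint(dst - src) < n then
  BackwardMoveSketch(src, dst, n)
else
  ForwardMoveSketch(src, dst, n);
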
@@ -305,11 +296,7 @@ Move_8OrMore_SSE_33OrMore:
  ja .Lloop32f

 .LPost32f: { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
- cmp $-16, %ecx
- jle .LFirstAndLast16f
- movups (%eax,%edx), %xmm0
- movaps %xmm0, (%edx)
-.LFirstAndLast16f:
+ movups %xmm3, (%edx,%ecx)
  movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
  movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
  pop %ebx
@@ -326,11 +313,7 @@ Move_8OrMore_SSE_33OrMore:
  ja .Lalignedloop32f

 .LalignedPost32f:
- cmp $-16, %ecx
- jle .LalignedFirstAndLast16f
- movaps (%eax,%edx), %xmm0
- movaps %xmm0, (%edx)
-.LalignedFirstAndLast16f:
+ movups %xmm3, (%edx,%ecx)
  movups %xmm5, 16(%edx,%ecx)
  movups %xmm4, (%ebx)
  pop %ebx
@@ -380,7 +363,7 @@ Move_8OrMore_SSE_33OrMore:
  sfence
  add $PrefetchDistance+64, %ecx
  jmp .LRestAfterNTf
- .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+ .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 Move_8OrMore_SSE_CancelERMSBackwards:
 { Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren’t read, ebx isn't pushed if not FPC_PIC. }
@@ -394,6 +377,7 @@ Move_8OrMore_SSE_CancelERMSBackwards:

 { backwards move }
 .Lback:
+ movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
  lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
  mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
  and $-16, %ecx
@@ -417,12 +401,8 @@ Move_8OrMore_SSE_CancelERMSBackwards:
  ja .Lloop32b

 .LPost32b:
- cmp $-16, %ecx
- jle .LFirstAndLast16b
- movups -16(%eax,%edx), %xmm0
- movaps %xmm0, -16(%edx)
-.LFirstAndLast16b:
  sub %ecx, %edx
+ movups %xmm3, -16(%edx)
  movups %xmm4, -32(%edx)
  movups %xmm5, -15(%ebx)
  pop %ebx