Write the last two values after the 2× loops unconditionally instead of with an extra check.

Rika Ichinose, 1 year ago
Commit 0b5998ee8b
2 files changed: 32 additions, 56 deletions
  1. rtl/i386/fastmove.inc (+24 -44)
  2. rtl/x86_64/x86_64.inc (+8 -12)
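
Both files make the same change: the conditional tail copy after the main 2× copy loop is replaced by an unconditional store of a block preloaded before the loop. As a reading aid, here is a minimal C sketch of the forward case; copy_forward_sketch and every name in it are invented for illustration and are not part of the RTL code below. It uses 16-byte blocks as in the SSE paths; the i386 FPU path does the same with 8-byte blocks.

    #include <stdint.h>
    #include <string.h>

    /* Forward move of n >= 33 bytes: 16-byte blocks, two per loop pass,
       mirroring the SSE paths below.  All names invented for illustration. */
    static void copy_forward_sketch(uint8_t *dest, const uint8_t *src, size_t n)
    {
        uint8_t first[16], last[16], second_last[16];
        memcpy(first, src, 16);
        memcpy(last, src + n - 16, 16);
        memcpy(second_last, src + n - 32, 16); /* NEW: read before the loop,
                                                  while src's tail is intact */
        size_t i = 16;
        for (; n - i > 32; i += 32) {          /* the "2x" loop */
            uint8_t tmp[32];
            memcpy(tmp, src + i, 32);          /* staged through a local so a
                                                  close overlap stays defined */
            memcpy(dest + i, tmp, 32);
        }
        /* 1..32 bytes remain at [i, n).
           OLD: if (n - i > 16) copy one more block at i  -- the extra check.
           NEW: store the two last blocks unconditionally; at worst they
                overlap blocks the loop already wrote.                       */
        memcpy(dest + n - 32, second_last, 16);
        memcpy(dest + n - 16, last, 16);
        memcpy(dest, first, 16);               /* first block written last:
                                                  important for a <16-byte
                                                  step between src and dest  */
    }

Preloading second_last is what makes the unconditional store safe: when dest trails src by less than 32 bytes, the loop has already overwritten src's tail by the time the tail stores run, so it can no longer be re-read there the way the old guarded copy re-read src + i.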

rtl/i386/fastmove.inc (+24 -44)

@@ -63,19 +63,19 @@ asm
     ret
 
 .Lcancel:
+    fstp   %st(0)                { Pop the “second int64 from the end” that .L33OrMore loads. }
     fucompp                      { Pop two elements loaded at the beginning. }
-{$ifdef FPC_PIC}
     pop    %ebx
-{$endif}
     ret
-    .byte  {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
+    .byte  102,102,144           { Turns .balign 16 before .Lloop16f into a no-op. }
 
 .L33OrMore:
-    sub    %edx, %eax            { eax = src - dest }
-    jz     .Lcancel              { exit if src=dest }
+    fildq  -16(%eax,%ecx)        { Second int64 from the end. }
 {$ifndef FPC_PIC}
     push   %ebx
 {$endif}
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lcancel              { exit if src=dest }
     mov    %eax, %ebx
     neg    %ebx
     cmp    %ebx, %ecx
@@ -101,19 +101,17 @@ asm
     ja     .Lloop16f
 
 .LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
-    cmp    $-8, %ecx
-    jle    .LFirstAndLast8f
-    fildq  (%eax,%edx)
-    fistpq (%edx)
-.LFirstAndLast8f:
+    fistpq (%edx,%ecx)
     fistpq 8(%edx,%ecx)          { Write first and last 8 bytes after everything else. }
     fistpq (%ebx)                { Important for <8-byte step between src and dest. }
     pop    %ebx
     ret
-    .byte  102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+    .byte  102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
 
 { backwards move }
 .Lback:
+    fstp   %st(0)                { Pop the “second int64 from the end” that .L33OrMore loaded; the backwards path uses the second int64 from the start instead. }
+    fildq  8(%eax,%edx)          { Second int64 from the start. }
     lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
     mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
     and    $-8, %ecx
@@ -134,12 +132,8 @@ asm
     ja     .Lloop16b
 
 .LPost16b:
-    cmp    $-8, %ecx
-    jle    .LFirstAndLast8b
-    fildq  -8(%eax,%edx)
-    fistpq -8(%edx)
-.LFirstAndLast8b:
     sub    %ecx, %edx
+    fistpq -8(%edx)
     fistpq -7(%ebx)
     fistpq -16(%edx)
     pop    %ebx
@@ -156,6 +150,7 @@ asm
 {$endif}
     movq   (%eax), %mm4          { First and last 8 bytes. }
     movq   -8(%eax,%ecx), %mm5
+    movq   -16(%eax,%ecx), %mm3  { Second vector from the end. }
     sub    %edx, %eax            { eax = src - dest }
     jz     .Lquit                { exit if src=dest }
     mov    %eax, %ebx
@@ -183,21 +178,18 @@ asm
     ja     .Lloop16f
 
 .LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
-    cmp    $-8, %ecx
-    jle    .LFirstAndLast8f
-    movq   (%eax,%edx), %mm0
-    movq   %mm0, (%edx)
-.LFirstAndLast8f:
+    movq   %mm3, (%edx,%ecx)
     movq   %mm5, 8(%edx,%ecx)    { Write first and last 8 bytes after everything else. }
     movq   %mm4, (%ebx)          { Important for <8-byte step between src and dest. }
 .Lquit:
     emms
     pop    %ebx
     ret
-    .byte  102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+    .byte  144 { Turns .balign 16 before .Lloop16b into a no-op. }
 
 { backwards move }
 .Lback:
+    movq   8(%eax,%edx), %mm3    { Second vector from the start. }
     lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
     mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
     and    $-8, %ecx
@@ -218,12 +210,8 @@ asm
     ja     .Lloop16b
 
 .LPost16b:
-    cmp    $-8, %ecx
-    jle    .LFirstAndLast8b
-    movq   -8(%eax,%edx), %mm0
-    movq   %mm0, -8(%edx)
-.LFirstAndLast8b:
     sub    %ecx, %edx
+    movq   %mm3, -8(%edx)
     movq   %mm4, -16(%edx)
     movq   %mm5, -7(%ebx)
     emms
@@ -266,9 +254,12 @@ Move_8OrMore_SSE_9to15:
     pop    %ebx
 {$endif}
     ret
-    .byte  {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte  {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
 
 Move_8OrMore_SSE_33OrMore:
+    movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
+                                 { but eax is about to be repurposed as src - dest, making -32(%eax,%ecx) hard to address; .Lback is rare, and a small .Lback is rarer still and matters even less. }
+
     sub    %edx, %eax            { eax = src - dest }
     jz     .Lquit                { exit if src=dest }
 {$ifndef FPC_PIC}
@@ -305,11 +296,7 @@ Move_8OrMore_SSE_33OrMore:
     ja     .Lloop32f
 
 .LPost32f:                       { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
-    cmp    $-16, %ecx
-    jle    .LFirstAndLast16f
-    movups (%eax,%edx), %xmm0
-    movaps %xmm0, (%edx)
-.LFirstAndLast16f:
+    movups %xmm3, (%edx,%ecx)
     movups %xmm5, 16(%edx,%ecx)  { Write first and last 16 bytes after everything else. }
     movups %xmm4, (%ebx)         { Important for <16-byte step between src and dest. }
     pop    %ebx
@@ -326,11 +313,7 @@ Move_8OrMore_SSE_33OrMore:
     ja     .Lalignedloop32f
 
 .LalignedPost32f:
-    cmp    $-16, %ecx
-    jle    .LalignedFirstAndLast16f
-    movaps (%eax,%edx), %xmm0
-    movaps %xmm0, (%edx)
-.LalignedFirstAndLast16f:
+    movups %xmm3, (%edx,%ecx)
     movups %xmm5, 16(%edx,%ecx)
     movups %xmm4, (%ebx)
     pop    %ebx
@@ -380,7 +363,7 @@ Move_8OrMore_SSE_33OrMore:
     sfence
     add    $PrefetchDistance+64, %ecx
     jmp    .LRestAfterNTf
-    .byte  {$ifdef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte  {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
 
 Move_8OrMore_SSE_CancelERMSBackwards:
    { Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren't read, ebx isn't pushed if not FPC_PIC. }
@@ -394,6 +377,7 @@ Move_8OrMore_SSE_CancelERMSBackwards:
 
 { backwards move }
 .Lback:
+    movups 16(%eax,%edx), %xmm3  { Second vector from the start. }
     lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 16 bytes }
     mov    %ebx, %ecx            { move dest to the previous 16-byte boundary... }
     and    $-16, %ecx
@@ -417,12 +401,8 @@ Move_8OrMore_SSE_CancelERMSBackwards:
     ja     .Lloop32b
 
 .LPost32b:
-    cmp    $-16, %ecx
-    jle    .LFirstAndLast16b
-    movups -16(%eax,%edx), %xmm0
-    movaps %xmm0, -16(%edx)
-.LFirstAndLast16b:
     sub    %ecx, %edx
+    movups %xmm3, -16(%edx)
     movups %xmm4, -32(%edx)
     movups %xmm5, -15(%ebx)
     pop    %ebx

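The backwards (.Lback) paths apply the mirror image of the same idea: preload the second block from the start, copy from the top down, then store the two leading blocks unconditionally. A matching sketch, under the same illustrative assumptions as copy_forward_sketch above:

    #include <stdint.h>
    #include <string.h>

    /* Backward move of n >= 33 bytes, top down; same illustrative
       conventions as copy_forward_sketch above.                    */
    static void copy_backward_sketch(uint8_t *dest, const uint8_t *src, size_t n)
    {
        uint8_t first[16], second[16], last[16];
        memcpy(first, src, 16);
        memcpy(second, src + 16, 16);          /* NEW: second block from the
                                                  start, read while intact  */
        memcpy(last, src + n - 16, 16);

        size_t j = n - 16;                     /* bytes [0, j) still to copy */
        for (; j > 32; j -= 32) {              /* the "2x" loop, descending  */
            uint8_t tmp[32];
            memcpy(tmp, src + j - 32, 32);
            memcpy(dest + j - 32, tmp, 32);
        }
        /* 1..32 bytes remain at [0, j).
           OLD: a compare-and-branch decided whether one more block copy
                was needed here.
           NEW: the two preloaded leading blocks are stored unconditionally;
                at worst they overlap blocks the loop already wrote.        */
        memcpy(dest + 16, second, 16);
        memcpy(dest, first, 16);
        memcpy(dest + n - 16, last, 16);       /* write order mirrors the
                                                  assembly's tail           */
    }
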
rtl/x86_64/x86_64.inc (+8 -12)

@@ -134,9 +134,12 @@ asm
     mov    %r9, -8(%rdx,%r8)
 .Lquit:
     ret
-    .byte  102,144               { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte  102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
 
 .L33OrMore:
+    movdqu -32(%rcx,%r8), %xmm3  { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
+                                 { but rcx is about to be repurposed as src - dest, making -32(%rcx,%r8) hard to address; .Lback is rare, and a small .Lback is rarer still and matters even less. }
+
     sub    %rdx, %rcx            { rcx = src - dest }
     jz     .Lquit                { exit if src=dest }
 
@@ -168,11 +171,7 @@ asm
     ja     .Lloop32f
 
 .LPost32f:                       { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
-    cmp    $-16, %r8
-    jle    .LFirstAndLast16f
-    movdqu (%rcx,%rdx), %xmm0
-    movdqa %xmm0, (%rdx)
-.LFirstAndLast16f:
+    movdqu %xmm3, (%rdx,%r8)
     movdqu %xmm5, 16(%rdx,%r8)   { Write first and last 16 bytes after everything else. }
     movdqu %xmm4, (%r9)          { Important for <16-byte step between src and dest. }
     ret
@@ -216,10 +215,11 @@ asm
     mfence
     add    $0x1000, %r8
     jmpq   .LRestAfterNTf        { go handle remaining bytes }
-    .byte  102,102,144           { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte  102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
 
 { backwards move }
 .Lback:
+    movdqu 16(%rcx,%rdx), %xmm3  { Second vector from the start. }
     lea    (%rdx,%r8), %r9       { points to the end of dest; remember to write last 16 bytes }
     lea    -1(%r9), %r8          { move dest to the previous 16-byte boundary... }
     and    $-16, %r8
@@ -243,12 +243,8 @@ asm
     ja     .Lloop32b
 
 .LPost32b:
-    cmp    $-16, %r8
-    jle    .LFirstAndLast16b
-    movdqu -16(%rcx,%rdx), %xmm0
-    movdqa %xmm0, -16(%rdx)
-.LFirstAndLast16b:
     sub    %r8, %rdx
+    movdqu %xmm3, -16(%rdx)
     movdqu %xmm4, -32(%rdx)
     movdqu %xmm5, -16(%r9)
     ret