
Use non-conservative Fill thresholds.

Rika Ichinose, 1 year ago
commit b87e22151a
2 changed files with 90 additions and 95 deletions
  1. rtl/i386/i386.inc (+39 -40)
  2. rtl/x86_64/x86_64.inc (+51 -55)

+ 39 - 40
rtl/i386/i386.inc

@@ -214,7 +214,7 @@ asm
         cmp    $32, %edx
         ja     .LMoreThanTwoVectors
         ret
-        .byte  102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
+        .byte  144 { Turn .balign 16 before .L64x_Body into a no-op. }
 
       { x can start and end misaligned on the vector boundary:
         x = ~~][H1][H2][...][T2][T1]~
@@ -227,56 +227,58 @@ asm
         mov    %eax, %ecx
         shl    $3, %ecx { ecx = misalignment of x in bits }
         rol    %cl, %esi { misalign the pattern }
-        movd   %esi, %xmm1
-        pshufd $0, %xmm1, %xmm1
+        movd   %esi, %xmm0
+        pshufd $0, %xmm0, %xmm0
         pop    %esi
 
 { FillChar (to skip the misaligning above) and FillQWord jump here.
-  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
+  eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
         lea    -65(%eax,%edx), %ecx
         and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
-        and    $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
-        movdqa %xmm1, 16(%eax) { Write H1. }
-        cmp    $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+        mov    %ecx, %edx { Remember T4 to edx. }
+        and    $-16, %eax { eax = H1 − 16. }
+        sub    %eax, %ecx { ecx = aligned byte count − 48. }
+        movdqa %xmm0, 16(%eax) { Write H1. }
+        cmp    $32-48, %ecx
         jle    .LOneAlignedTailWrite
-        movdqa %xmm1, 32(%eax) { Write H2. }
-        cmp    $81, %edx  { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+        movdqa %xmm0, 32(%eax) { Write H2. }
+        cmp    $64-48, %ecx
         jle    .LTwoAlignedTailWrites
-        cmp    $113, %edx  { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
-        jle    .LFourAlignedTailWrites
+        sub    $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
+        jle    .LFourAlignedTailWrites { ecx was ≤ 96−48 }
 
-        add    $48, %eax
-        cmp    $NtThreshold, %edx
+        add    $48, %eax { eax = H3. }
+        cmp    $NtThreshold, %ecx
         jae    .L64xNT_Body
 
 .balign 16 { no-op }
 .L64x_Body:
-        movdqa %xmm1, (%eax)
-        movdqa %xmm1, 16(%eax)
-        movdqa %xmm1, 32(%eax)
-        movdqa %xmm1, 48(%eax)
-        add    $64,  %eax
-        cmp    %ecx, %eax
-        jb     .L64x_Body
+        movdqa %xmm0, (%eax)
+        movdqa %xmm0, 16(%eax)
+        movdqa %xmm0, 32(%eax)
+        movdqa %xmm0, 48(%eax)
+        add    $64, %eax
+        sub    $64, %ecx
+        ja     .L64x_Body
 .LFourAlignedTailWrites:
-        movdqa %xmm1, (%ecx) { T4 }
-        movdqa %xmm1, 16(%ecx) { T3 }
+        movdqa %xmm0, (%edx) { T4 }
+        movdqa %xmm0, 16(%edx) { T3 }
 .LTwoAlignedTailWrites:
-        movdqa %xmm1, 32(%ecx) { T2 }
+        movdqa %xmm0, 32(%edx) { T2 }
 .LOneAlignedTailWrite:
-        movdqa %xmm1, 48(%ecx) { T1 }
+        movdqa %xmm0, 48(%edx) { T1 }
         ret
 
 .balign 16
 .L64xNT_Body:
-        movntdq %xmm1, (%eax)
-        movntdq %xmm1, 16(%eax)
-        movntdq %xmm1, 32(%eax)
-        movntdq %xmm1, 48(%eax)
+        movntdq %xmm0, (%eax)
+        movntdq %xmm0, 16(%eax)
+        movntdq %xmm0, 32(%eax)
+        movntdq %xmm0, 48(%eax)
         add    $64, %eax
-        cmp    %ecx, %eax
-        jb     .L64xNT_Body
+        sub    $64, %ecx
+        ja     .L64xNT_Body
         sfence
         jmp    .LFourAlignedTailWrites
 end;
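
This hunk is the core of the change: instead of testing the raw byte count against the conservative cutoffs 49/81/113, the code derives the exact byte span that the aligned stores must cover (T4 − (H1 − 16) = aligned byte count − 48, kept in ecx) and branches on that, and the 64-byte loops now count this value down with sub/ja instead of comparing a pointer against a bound. A minimal Free Pascal sketch of the new dispatch, assuming the same block layout; ChoosePath and its return strings are illustrative only, not RTL code:

{$mode objfpc}
program FillPathSketch;

function ChoosePath(x, count: PtrUInt): string;
var
  span: PtrInt; { = aligned byte count - 48, the value the new code keeps in ecx/rax }
begin
  span := PtrInt((x + count - 65) and not PtrUInt(15))   { T4 }
          - PtrInt(x and not PtrUInt(15));               { H1 - 16 }
  if span <= 32 - 48 then
    Result := 'H1 + T1'                      { 2 aligned stores }
  else if span <= 64 - 48 then
    Result := 'H1-H2 + T2-T1'                { 4 aligned stores }
  else if span <= 96 - 48 then
    Result := 'H1-H2 + T4-T1'                { 6 aligned stores }
  else
    Result := 'H1-H2 + 64-byte loop + T4-T1';
end;

begin
  WriteLn(ChoosePath($1001, 50));  { H1 + T1 }
  WriteLn(ChoosePath($100F, 50));  { H1-H2 + T2-T1 }
end.

The old code compared only the byte count, so both 50-byte calls above would have written H1, H2, T2 and T1; with the exact span the favourably misaligned one gets away with two aligned stores.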
@@ -367,7 +369,6 @@ asm
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
         movdqu %xmm0, (%eax)
         movdqu %xmm0, -16(%eax,%edx)
-        movdqa %xmm0, %xmm1
         cmp    $32, %edx
         ja     FillXxxx_MoreThanTwoXMMs
 end;
@@ -388,7 +389,6 @@ asm
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
         movdqu %xmm0, (%eax)
         movdqu %xmm0, -16(%eax,%edx)
-        movdqa %xmm0, %xmm1
         cmp    $32, %edx
         ja     FillXxxx_MoreThanTwoXMMs
 end;
@@ -613,20 +613,19 @@ asm
         shl     $3, %edx
         movdqu  %xmm0, (%eax)
         movdqu  %xmm0, -16(%eax,%edx)
-        movdqa  %xmm0, %xmm1
         test    $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
         jz      FillXxxx_MoreThanTwoXMMs
-        mov     %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x into xmm1. }
+        mov     %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
         shl     $3, %ecx
         and     $63, %ecx
-        movd    %ecx, %xmm3
-        psllq   %xmm3, %xmm1
+        movd    %ecx, %xmm2
+        movdqa  %xmm0, %xmm1
+        psllq   %xmm2, %xmm1
         neg     %ecx      { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof.  }
         and     $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
-        movd    %ecx, %xmm3
-        movdqa  %xmm0, %xmm2
-        psrlq   %xmm3, %xmm2
-        por     %xmm2, %xmm1
+        movd    %ecx, %xmm2
+        psrlq   %xmm2, %xmm0
+        por     %xmm1, %xmm0
         jmp     FillXxxx_MoreThanTwoXMMs
 
 .L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
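
With the separate aligned-pattern register gone, the FillQWord prologue above rotates the pattern in place: the psllq/psrlq/por pair is a rotate-left of each qword lane by the misalignment of x in bits, after which xmm0 holds the pattern for aligned writes. A minimal Free Pascal sketch of that rotation (MisalignQWordPattern is an illustrative name, not RTL code); x is known to be misaligned here, so the bit count stays in 8..56 and the right shift never degenerates to a shift by 64:

{$mode objfpc}
program MisalignPatternSketch;

{ Equivalent to RolQWord(pattern, bits) and to the "rol %cl, %rax" used by the
  x86_64 FillQWord; the i386 code spells it with SSE shifts because, as the
  comment in the hunk notes, misaligning the 64-bit pattern is less trivial there. }
function MisalignQWordPattern(pattern: QWord; x: PtrUInt): QWord;
var
  bits: PtrUInt;
begin
  bits := (x and 7) * 8;                                  { misalignment of x in bits }
  Result := (pattern shl bits) or (pattern shr (64 - bits));
end;

begin
  { For a pointer misaligned by 3 bytes this matches the RTL's RolQWord: }
  WriteLn(MisalignQWordPattern(QWord($1122334455667788), $1003) =
          RolQWord(QWord($1122334455667788), 24));
end.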

+ 51 - 55
rtl/x86_64/x86_64.inc

@@ -272,8 +272,8 @@ procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
 { Input:
   rcx = 'x'
   rdx = byte count
-  xmm0 = pattern for unaligned writes
-  xmm1 = pattern for aligned writes }
+  xmm0 = pattern for ALIGNED writes
+  First and last 16 bytes are written. }
 const
 {$ifdef use_fast_repmovstos}
   ErmsThreshold = 1536;
@@ -291,56 +291,56 @@ asm
       H1 and so on are called “aligned heads” or just “heads”.
       T1 and so on are called “aligned tails” or just “tails”.
 
-      UT (“unaligned tail”) is written with another 'movdqu' after the loop.
+      UT (“unaligned tail”) is written by the caller as well.
       At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }
 
-    lea    -65(%rcx,%rdx), %r8 { r8 = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
-    and    $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
-    movdqa %xmm1, 16(%rcx) { Write H1. }
-    mov    %r8, %rax
-    and    $-16, %rax { rax = “T4” (possibly fictive) = aligned r8 = loop bound. }
-    cmp    $49, %rdx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+    lea    -65(%rcx,%rdx), %rax
+    and    $-16, %rax { rax = “T4” (possibly fictive). }
+    mov    %rax, %rdx { Remember T4 to rdx. }
+    and    $-16, %rcx { rcx = H1 − 16. }
+    sub    %rcx, %rax { rax = aligned byte count − 48. }
+    movdqa %xmm0, 16(%rcx) { Write H1. }
+    cmp    $32-48, %rax
     jle    .LOneAlignedTailWrite
-    movdqa %xmm1, 32(%rcx) { Write H2. }
-    cmp    $81, %rdx  { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+    movdqa %xmm0, 32(%rcx) { Write H2. }
+    cmp    $64-48, %rax
     jle    .LTwoAlignedTailWrites
-    cmp    $113, %rdx  { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
+    sub    $48, %rax { rax = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
     jle    .LFourAlignedTailWrites
 
-    add    $48, %rcx
+    add    $48, %rcx { rcx = H3. }
 {$ifdef use_fast_repmovstos}
-    cmp    $ErmsThreshold, %rdx
+    cmp    $ErmsThreshold-64, %rax { Need to write aligned byte count − 32 bytes already written. rax = aligned byte count − 96, so compare rax + 64 to ErmsThreshold, or rax to ErmsThreshold − 64. }
     jae    .LRepStos
 {$else}
-    cmp    $NtThreshold, %rdx
+    cmp    $NtThreshold, %rax
     jae    .L64xNT_Body
 {$endif}
 
 .balign 16
 .L64x_Body:
-    movdqa %xmm1, (%rcx)
-    movdqa %xmm1, 16(%rcx)
-    movdqa %xmm1, 32(%rcx)
-    movdqa %xmm1, 48(%rcx)
+    movdqa %xmm0, (%rcx)
+    movdqa %xmm0, 16(%rcx)
+    movdqa %xmm0, 32(%rcx)
+    movdqa %xmm0, 48(%rcx)
     add    $64, %rcx
-    cmp    %rax, %rcx
-    jb     .L64x_Body
+    sub    $64, %rax
+    ja     .L64x_Body
 
 .LFourAlignedTailWrites:
-    movdqa %xmm1, (%rax) { T4 }
-    movdqa %xmm1, 16(%rax) { T3 }
+    movdqa %xmm0, (%rdx) { T4 }
+    movdqa %xmm0, 16(%rdx) { T3 }
 .LTwoAlignedTailWrites:
-    movdqa %xmm1, 32(%rax) { T2 }
+    movdqa %xmm0, 32(%rdx) { T2 }
 .LOneAlignedTailWrite:
-    movdqa %xmm1, 48(%rax) { T1 }
-    movdqu %xmm0, 65-16(%r8) { UT }
+    movdqa %xmm0, 48(%rdx) { T1 }
     ret
 
 {$ifdef use_fast_repmovstos}
 .LRepStos:
 {$ifdef FPC_PIC}
-    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r9
-    cmpb   $1, (%r9)
+    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r8
+    cmpb   $1, (%r8)
 {$else FPC_PIC}
     cmpb   $1, fast_large_repmovstosb(%rip)
 {$endif FPC_PIC}
@@ -349,12 +349,10 @@ asm
     push   %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
 {$endif}
     mov    %rcx, %rdi { rdi = REP STOS destination. }
-    lea    65-16+8-1(%r8), %rcx
-    sub    %rdi, %rcx
-    shr    $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
-    movq   %xmm1, %rax { recover pattern for aligned writes back to GPR :) }
+    lea    64(%rax), %rcx
+    shr    $3, %rcx { rcx = count of REP STOSQ blocks up to T1 (might be 1 more than strictly required if T1 and UT overlap is 8 or more, don’t care). }
+    movq   %xmm0, %rax { recover pattern for aligned writes back to GPR :) }
     rep stosq
-    movdqu %xmm0, 65-16(%r8) { UT }
 {$ifdef win64}
     pop    %rdi
 {$endif}
@@ -362,18 +360,18 @@ asm
 {$endif}
 
 .LRepStosIsNotBetter:
-    cmp    $NtThreshold, %rdx
+    cmp    $NtThreshold-64, %rax
     jb     .L64x_Body
 
 .balign 16
 .L64xNT_Body:
-    movntdq %xmm1, (%rcx)
-    movntdq %xmm1, 16(%rcx)
-    movntdq %xmm1, 32(%rcx)
-    movntdq %xmm1, 48(%rcx)
+    movntdq %xmm0, (%rcx)
+    movntdq %xmm0, 16(%rcx)
+    movntdq %xmm0, 32(%rcx)
+    movntdq %xmm0, 48(%rcx)
     add    $64, %rcx
-    cmp    %rax, %rcx
-    jb     .L64xNT_Body
+    sub    $64, %rax
+    ja     .L64xNT_Body
     sfence
     jmp    .LFourAlignedTailWrites
 end;
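
On the ERMS path the threshold test and the store count are now derived from the same register: rax holds the aligned byte count minus 96, so comparing it against ErmsThreshold − 64 is the same as comparing the bytes still to be written (rax + 64) against ErmsThreshold, and "lea 64(%rax), %rcx; shr $3, %rcx" yields the qword count directly. A minimal sketch of that arithmetic (RepStosqCount is an illustrative name, assuming the register layout described in the new comments):

{$mode objfpc}
program RepStosqCountSketch;

{ Mirrors "lea 64(%rax), %rcx; shr $3, %rcx" under the assumption that rax is
  aligned byte count - 96 at .LRepStos. H1 and H2 (32 bytes) are already
  written, so REP STOSQ has to cover aligned byte count - 32 bytes, from H3
  through the end of T1. That is a multiple of 16, so the shift by 3 loses
  nothing, though the last qwords may land on bytes the caller's trailing
  unaligned store already wrote (harmless, as the code comments note). }
function RepStosqCount(raxAtRepStos: PtrUInt): PtrUInt;
begin
  Result := (raxAtRepStos + 64) shr 3;
end;

begin
  { A 2048-byte aligned span: rax = 1952, (1952 + 64) div 8 = 252 qwords = 2016 bytes = 2048 - 32. }
  WriteLn(RepStosqCount(2048 - 96));
end.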
@@ -400,11 +398,9 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0
     movdqu %xmm0, (%rcx)
-    movdqa %xmm0, %xmm1
-
+    movdqu %xmm0, -16(%rcx,%rdx)
     cmp    $32, %rdx
     jg     FillXxxx_MoreThanTwoXmms
-    movdqu %xmm0, -16(%rcx,%rdx)
     ret
 
 .L4to15:
@@ -452,23 +448,21 @@ procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
-
+    movdqu %xmm0, -16(%rcx,%rdx,2)
     cmp    $16, %rdx
-    jle    .LTail
+    jg     .LMoreThanTwoXMMs
+    ret
 
+.LMoreThanTwoXMMs:
     shl    $1, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %eax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movd   %eax, %xmm1
-    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms
 
-.LTail:
-    movdqu %xmm0, -16(%rcx,%rdx,2)
-    ret
-
 .L4to8:
     mov    %eax, %r8d
     shl    $32, %r8
@@ -508,14 +502,15 @@ procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
+    movdqu %xmm0, -16(%rcx,%rdx,4)
 
     shl    $2, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %eax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movd   %eax, %xmm1
-    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms
 
 .L4to8:
@@ -561,14 +556,15 @@ procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
     movq   %rax, %xmm0
     pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
+    movdqu %xmm0, -16(%rcx,%rdx,8)
 
     shl    $3, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %rax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movq   %rax, %xmm1
-    pshufd $0b01000100, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movq   %rax, %xmm0
+    pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms
 
 .L3to6:
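
Since both ports now share this head/tail layout, with the trailing unaligned store moved into the callers, a quick way to convince oneself that no byte is missed for any misalignment is to replay the store pattern over a shadow buffer. A small self-contained check, sketch only and not part of the RTL (the NT and ERMS branches cover the same bytes, so only the plain path is modelled):

{$mode objfpc}
program FillCoverageCheck;

var
  mem: array[0..511] of Boolean;

procedure Store16(base: PtrUInt);            { mark one 16-byte store }
var
  i: PtrUInt;
begin
  for i := base to base + 15 do
    mem[i] := True;
end;

procedure Check(x, count: PtrUInt);
var
  t4, dst: PtrUInt;
  span: PtrInt;
  i: PtrUInt;
begin
  FillChar(mem, SizeOf(mem), 0);
  Store16(x);                                { first 16 bytes, unaligned (caller) }
  Store16(x + count - 16);                   { last 16 bytes, unaligned (caller) }
  t4 := (x + count - 65) and not PtrUInt(15);
  dst := x and not PtrUInt(15);              { H1 - 16 }
  span := PtrInt(t4) - PtrInt(dst);          { aligned byte count - 48 }
  Store16(dst + 16);                         { H1 }
  if span > 32 - 48 then
  begin
    Store16(dst + 32);                       { H2 }
    if span > 64 - 48 then
    begin
      Dec(span, 48);
      Inc(dst, 48);                          { H3 }
      while span > 0 do                      { the sub $64 / ja countdown }
      begin
        Store16(dst); Store16(dst + 16); Store16(dst + 32); Store16(dst + 48);
        Inc(dst, 64);
        Dec(span, 64);
      end;
      Store16(t4);                           { T4 }
      Store16(t4 + 16);                      { T3 }
    end;
    Store16(t4 + 32);                        { T2 }
  end;
  Store16(t4 + 48);                          { T1 }
  for i := x to x + count - 1 do
    if not mem[i] then
      WriteLn('gap: x=', x, ' count=', count, ' offset=', i - x);
end;

var
  mis, cnt: PtrUInt;
begin
  for mis := 0 to 15 do                      { every misalignment of x }
    for cnt := 33 to 300 do                  { every count that reaches the shared path }
      Check(64 + mis, cnt);
  WriteLn('done.');
end.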