Use non-conservative Fill thresholds.

Rika Ichinose committed 1 year ago · commit b87e22151a

2 changed files with 90 additions and 95 deletions:
  1. rtl/i386/i386.inc (+39 -40)
  2. rtl/x86_64/x86_64.inc (+51 -55)

rtl/i386/i386.inc (+39 -40)

@@ -214,7 +214,7 @@ asm
         cmp    $32, %edx
         ja     .LMoreThanTwoVectors
         ret
-        .byte  102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
+        .byte  144 { Turn .balign 16 before .L64x_Body into a no-op. }

       { x can start and end misaligned on the vector boundary:
         x = ~~][H1][H2][...][T2][T1]~
@@ -227,56 +227,58 @@ asm
         mov    %eax, %ecx
         shl    $3, %ecx { ecx = misalignment of x in bits }
         rol    %cl, %esi { misalign the pattern }
-        movd   %esi, %xmm1
-        pshufd $0, %xmm1, %xmm1
+        movd   %esi, %xmm0
+        pshufd $0, %xmm0, %xmm0
         pop    %esi

 { FillChar (to skip the misaligning above) and FillQWord jump here.
-  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
+  eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
         lea    -65(%eax,%edx), %ecx
         and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
-        and    $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
-        movdqa %xmm1, 16(%eax) { Write H1. }
-        cmp    $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+        mov    %ecx, %edx { Remember T4 to edx. }
+        and    $-16, %eax { eax = H1 − 16. }
+        sub    %eax, %ecx { ecx = aligned byte count − 48. }
+        movdqa %xmm0, 16(%eax) { Write H1. }
+        cmp    $32-48, %ecx
         jle    .LOneAlignedTailWrite
-        movdqa %xmm1, 32(%eax) { Write H2. }
-        cmp    $81, %edx  { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+        movdqa %xmm0, 32(%eax) { Write H2. }
+        cmp    $64-48, %ecx
         jle    .LTwoAlignedTailWrites
-        cmp    $113, %edx  { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
-        jle    .LFourAlignedTailWrites
+        sub    $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
+        jle    .LFourAlignedTailWrites { ecx was ≤ 96−48 }

-        add    $48, %eax
-        cmp    $NtThreshold, %edx
+        add    $48, %eax { eax = H3. }
+        cmp    $NtThreshold, %ecx
         jae    .L64xNT_Body

 .balign 16 { no-op }
 .L64x_Body:
-        movdqa %xmm1, (%eax)
-        movdqa %xmm1, 16(%eax)
-        movdqa %xmm1, 32(%eax)
-        movdqa %xmm1, 48(%eax)
-        add    $64,  %eax
-        cmp    %ecx, %eax
-        jb     .L64x_Body
+        movdqa %xmm0, (%eax)
+        movdqa %xmm0, 16(%eax)
+        movdqa %xmm0, 32(%eax)
+        movdqa %xmm0, 48(%eax)
+        add    $64, %eax
+        sub    $64, %ecx
+        ja     .L64x_Body
 .LFourAlignedTailWrites:
-        movdqa %xmm1, (%ecx) { T4 }
-        movdqa %xmm1, 16(%ecx) { T3 }
+        movdqa %xmm0, (%edx) { T4 }
+        movdqa %xmm0, 16(%edx) { T3 }
 .LTwoAlignedTailWrites:
-        movdqa %xmm1, 32(%ecx) { T2 }
+        movdqa %xmm0, 32(%edx) { T2 }
 .LOneAlignedTailWrite:
-        movdqa %xmm1, 48(%ecx) { T1 }
+        movdqa %xmm0, 48(%edx) { T1 }
         ret

 .balign 16
 .L64xNT_Body:
-        movntdq %xmm1, (%eax)
-        movntdq %xmm1, 16(%eax)
-        movntdq %xmm1, 32(%eax)
-        movntdq %xmm1, 48(%eax)
+        movntdq %xmm0, (%eax)
+        movntdq %xmm0, 16(%eax)
+        movntdq %xmm0, 32(%eax)
+        movntdq %xmm0, 48(%eax)
         add    $64, %eax
-        cmp    %ecx, %eax
-        jb     .L64xNT_Body
+        sub    $64, %ecx
+        ja     .L64xNT_Body
         sfence
         jmp    .LFourAlignedTailWrites
 end;
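
A worked example of the new bookkeeping above (illustration only; the address and count are invented, register roles as in the comments):

    { Take x = $1007 in eax and count = 40 in edx (the caller already stored the
      unaligned head at $1007 and the unaligned tail at $101F):
        T4      = ($1007 + 40 - 65) and -16 = $0FE0   -> kept in edx
        H1 - 16 =  $1007            and -16 = $1000   -> eax
        ecx     = T4 - (H1 - 16)            = -32     -> "aligned byte count - 48"
      "cmp $32-48, %ecx" compares ecx with -16; -32 <= -16, so control jumps to
      .LOneAlignedTailWrite, whose single store at 48(%edx) = $1010 is the same
      16-byte block as H1 at 16(%eax).  Together with the caller's two unaligned
      stores this covers exactly $1007..$102E, i.e. all 40 bytes. }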
@@ -367,7 +369,6 @@ asm
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
         movdqu %xmm0, (%eax)
         movdqu %xmm0, -16(%eax,%edx)
-        movdqa %xmm0, %xmm1
         cmp    $32, %edx
         ja     FillXxxx_MoreThanTwoXMMs
 end;
@@ -388,7 +389,6 @@ asm
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
         movdqu %xmm0, (%eax)
         movdqu %xmm0, -16(%eax,%edx)
-        movdqa %xmm0, %xmm1
         cmp    $32, %edx
         ja     FillXxxx_MoreThanTwoXMMs
 end;
@@ -613,20 +613,19 @@ asm
         shl     $3, %edx
         movdqu  %xmm0, (%eax)
         movdqu  %xmm0, -16(%eax,%edx)
-        movdqa  %xmm0, %xmm1
         test    $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
         jz      FillXxxx_MoreThanTwoXMMs
-        mov     %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x into xmm1. }
+        mov     %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
         shl     $3, %ecx
         and     $63, %ecx
-        movd    %ecx, %xmm3
-        psllq   %xmm3, %xmm1
+        movd    %ecx, %xmm2
+        movdqa  %xmm0, %xmm1
+        psllq   %xmm2, %xmm1
         neg     %ecx      { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof.  }
         and     $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
-        movd    %ecx, %xmm3
-        movdqa  %xmm0, %xmm2
-        psrlq   %xmm3, %xmm2
-        por     %xmm2, %xmm1
+        movd    %ecx, %xmm2
+        psrlq   %xmm2, %xmm0
+        por     %xmm1, %xmm0
         jmp     FillXxxx_MoreThanTwoXMMs

 .L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
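
On i386 there is no 64-bit general-purpose register to rol, so FillQWord rotates the pattern inside the XMM register instead. A scalar Pascal model of the psllq/psrlq/por sequence above (a sketch for illustration; the function name is mine, not part of the RTL):

    { Rotates the 8-byte pattern left by the byte misalignment of x, so that
      aligned 16-byte stores still lay the qwords down at the right offsets. }
    function MisalignQWordPattern(pattern: QWord; x: PtrUInt): QWord;
    var
      bits: PtrUInt;
    begin
      bits := (x and 7) * 8;     { shl $3 / and $63 in the asm above }
      if bits = 0 then
        result := pattern        { the asm shortcuts this case with "test $7, %eax; jz" }
      else
        { psllq fills the high part, psrlq wraps the low part around, por merges them }
        result := (pattern shl bits) or (pattern shr (64 - bits));
    end;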

rtl/x86_64/x86_64.inc (+51 -55)

@@ -272,8 +272,8 @@ procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
 { Input:
   rcx = 'x'
   rdx = byte count
-  xmm0 = pattern for unaligned writes
-  xmm1 = pattern for aligned writes }
+  xmm0 = pattern for ALIGNED writes
+  First and last 16 bytes are written. }
 const
 {$ifdef use_fast_repmovstos}
   ErmsThreshold = 1536;
@@ -291,56 +291,56 @@ asm
       H1 and so on are called “aligned heads” or just “heads”.
       T1 and so on are called “aligned tails” or just “tails”.

-      UT (“unaligned tail”) is written with another 'movdqu' after the loop.
+      UT (“unaligned tail”) is written by the caller as well.
       At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }

-    lea    -65(%rcx,%rdx), %r8 { r8 = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
-    and    $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
-    movdqa %xmm1, 16(%rcx) { Write H1. }
-    mov    %r8, %rax
-    and    $-16, %rax { rax = “T4” (possibly fictive) = aligned r8 = loop bound. }
-    cmp    $49, %rdx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+    lea    -65(%rcx,%rdx), %rax
+    and    $-16, %rax { rax = “T4” (possibly fictive). }
+    mov    %rax, %rdx { Remember T4 to rdx. }
+    and    $-16, %rcx { rcx = H1 − 16. }
+    sub    %rcx, %rax { rax = aligned byte count − 48. }
+    movdqa %xmm0, 16(%rcx) { Write H1. }
+    cmp    $32-48, %rax
     jle    .LOneAlignedTailWrite
-    movdqa %xmm1, 32(%rcx) { Write H2. }
-    cmp    $81, %rdx  { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+    movdqa %xmm0, 32(%rcx) { Write H2. }
+    cmp    $64-48, %rax
     jle    .LTwoAlignedTailWrites
-    cmp    $113, %rdx  { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
+    sub    $48, %rax { rax = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
     jle    .LFourAlignedTailWrites

-    add    $48, %rcx
+    add    $48, %rcx { rcx = H3. }
 {$ifdef use_fast_repmovstos}
-    cmp    $ErmsThreshold, %rdx
+    cmp    $ErmsThreshold-64, %rax { Need to write aligned byte count − 32 bytes already written. rax = aligned byte count − 96, so compare rax + 64 to ErmsThreshold, or rax to ErmsThreshold − 64. }
     jae    .LRepStos
 {$else}
-    cmp    $NtThreshold, %rdx
+    cmp    $NtThreshold, %rax
     jae    .L64xNT_Body
 {$endif}

 .balign 16
 .L64x_Body:
-    movdqa %xmm1, (%rcx)
-    movdqa %xmm1, 16(%rcx)
-    movdqa %xmm1, 32(%rcx)
-    movdqa %xmm1, 48(%rcx)
+    movdqa %xmm0, (%rcx)
+    movdqa %xmm0, 16(%rcx)
+    movdqa %xmm0, 32(%rcx)
+    movdqa %xmm0, 48(%rcx)
     add    $64, %rcx
-    cmp    %rax, %rcx
-    jb     .L64x_Body
+    sub    $64, %rax
+    ja     .L64x_Body

 .LFourAlignedTailWrites:
-    movdqa %xmm1, (%rax) { T4 }
-    movdqa %xmm1, 16(%rax) { T3 }
+    movdqa %xmm0, (%rdx) { T4 }
+    movdqa %xmm0, 16(%rdx) { T3 }
 .LTwoAlignedTailWrites:
-    movdqa %xmm1, 32(%rax) { T2 }
+    movdqa %xmm0, 32(%rdx) { T2 }
 .LOneAlignedTailWrite:
-    movdqa %xmm1, 48(%rax) { T1 }
-    movdqu %xmm0, 65-16(%r8) { UT }
+    movdqa %xmm0, 48(%rdx) { T1 }
     ret

 {$ifdef use_fast_repmovstos}
 .LRepStos:
 {$ifdef FPC_PIC}
-    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r9
-    cmpb   $1, (%r9)
+    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r8
+    cmpb   $1, (%r8)
 {$else FPC_PIC}
     cmpb   $1, fast_large_repmovstosb(%rip)
 {$endif FPC_PIC}
@@ -349,12 +349,10 @@ asm
     push   %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
 {$endif}
     mov    %rcx, %rdi { rdi = REP STOS destination. }
-    lea    65-16+8-1(%r8), %rcx
-    sub    %rdi, %rcx
-    shr    $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
-    movq   %xmm1, %rax { recover pattern for aligned writes back to GPR :) }
+    lea    64(%rax), %rcx
+    shr    $3, %rcx { rcx = count of REP STOSQ blocks up to T1 (might be 1 more than strictly required if T1 and UT overlap is 8 or more, don’t care). }
+    movq   %xmm0, %rax { recover pattern for aligned writes back to GPR :) }
     rep stosq
-    movdqu %xmm0, 65-16(%r8) { UT }
 {$ifdef win64}
     pop    %rdi
 {$endif}
@@ -362,18 +360,18 @@ asm
 {$endif}

 .LRepStosIsNotBetter:
-    cmp    $NtThreshold, %rdx
+    cmp    $NtThreshold-64, %rax
     jb     .L64x_Body

 .balign 16
 .L64xNT_Body:
-    movntdq %xmm1, (%rcx)
-    movntdq %xmm1, 16(%rcx)
-    movntdq %xmm1, 32(%rcx)
-    movntdq %xmm1, 48(%rcx)
+    movntdq %xmm0, (%rcx)
+    movntdq %xmm0, 16(%rcx)
+    movntdq %xmm0, 32(%rcx)
+    movntdq %xmm0, 48(%rcx)
     add    $64, %rcx
-    cmp    %rax, %rcx
-    jb     .L64xNT_Body
+    sub    $64, %rax
+    ja     .L64xNT_Body
     sfence
     jmp    .LFourAlignedTailWrites
 end;
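
How the rewritten thresholds relate to the old byte-count checks (my reading of the comments above, with one concrete number):

    { After "sub $48, %rax" the register holds aligned byte count - 96, while H1/H2
      (32 bytes) are already written and T4..T1 (64 bytes) still follow the loop,
      so the bytes left for REP STOS or the 64-byte loop equal rax + 64.
      "cmp $ErmsThreshold-64, %rax" is therefore the old "remaining bytes >= ErmsThreshold"
      test: with ErmsThreshold = 1536 the REP STOS path is taken once rax >= 1472.
      "cmp $NtThreshold-64, %rax" in .LRepStosIsNotBetter plays the same role for
      the non-temporal loop. }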
@@ -400,11 +398,9 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0
     movdqu %xmm0, (%rcx)
-    movdqa %xmm0, %xmm1
-
+    movdqu %xmm0, -16(%rcx,%rdx)
     cmp    $32, %rdx
     jg     FillXxxx_MoreThanTwoXmms
-    movdqu %xmm0, -16(%rcx,%rdx)
     ret

 .L4to15:
@@ -452,23 +448,21 @@ procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
-
+    movdqu %xmm0, -16(%rcx,%rdx,2)
     cmp    $16, %rdx
-    jle    .LTail
+    jg     .LMoreThanTwoXMMs
+    ret

+.LMoreThanTwoXMMs:
     shl    $1, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %eax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movd   %eax, %xmm1
-    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms

-.LTail:
-    movdqu %xmm0, -16(%rcx,%rdx,2)
-    ret
-
 .L4to8:
     mov    %eax, %r8d
     shl    $32, %r8
@@ -508,14 +502,15 @@ procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
+    movdqu %xmm0, -16(%rcx,%rdx,4)

     shl    $2, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %eax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movd   %eax, %xmm1
-    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms

 .L4to8:
@@ -561,14 +556,15 @@ procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
     movq   %rax, %xmm0
     pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
+    movdqu %xmm0, -16(%rcx,%rdx,8)

     shl    $3, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %rax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movq   %rax, %xmm1
-    pshufd $0b01000100, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movq   %rax, %xmm0
+    pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms

 .L3to6:
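
The rol fix-up used by FillWord/FillDWord/FillQWord above is easiest to see with a concrete value (illustration only; the value and misalignment are invented):

    { FillDWord with value = $AABBCCDD at an address ending in 1 (misaligned by 1).
      Seen from x, memory must read  DD CC BB AA DD CC BB AA ...  A 4-aligned
      address falls one byte before a pattern period, so its first byte must be
      the byte that precedes DD in the sequence, i.e. AA.  "shl $3, %ecx" makes
      cl = 8, and "rol %cl, %eax" turns $AABBCCDD into $BBCCDDAA, which stored
      little-endian is  AA DD CC BB  - the same periodic pattern, re-phased for
      aligned stores. }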