
Use non-conservative Fill thresholds.

Rika Ichinose, 1 year ago
commit b87e22151a
2 changed files with 90 additions and 95 deletions
  1. rtl/i386/i386.inc (+39 -40)
  2. rtl/x86_64/x86_64.inc (+51 -55)

+ 39 - 40
rtl/i386/i386.inc

@@ -214,7 +214,7 @@ asm
         cmp    $32, %edx
         ja     .LMoreThanTwoVectors
         ret
-        .byte  102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
+        .byte  144 { Turn .balign 16 before .L64x_Body into a no-op. }
 
       { x can start and end misaligned on the vector boundary:
         x = ~~][H1][H2][...][T2][T1]~
@@ -227,56 +227,58 @@ asm
         mov    %eax, %ecx
         shl    $3, %ecx { ecx = misalignment of x in bits }
         rol    %cl, %esi { misalign the pattern }
-        movd   %esi, %xmm1
-        pshufd $0, %xmm1, %xmm1
+        movd   %esi, %xmm0
+        pshufd $0, %xmm0, %xmm0
         pop    %esi
 
 { FillChar (to skip the misaligning above) and FillQWord jump here.
-  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
+  eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
         lea    -65(%eax,%edx), %ecx
         and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
-        and    $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
-        movdqa %xmm1, 16(%eax) { Write H1. }
-        cmp    $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+        mov    %ecx, %edx { Remember T4 to edx. }
+        and    $-16, %eax { eax = H1 − 16. }
+        sub    %eax, %ecx { ecx = aligned byte count − 48. }
+        movdqa %xmm0, 16(%eax) { Write H1. }
+        cmp    $32-48, %ecx
         jle    .LOneAlignedTailWrite
-        movdqa %xmm1, 32(%eax) { Write H2. }
-        cmp    $81, %edx  { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+        movdqa %xmm0, 32(%eax) { Write H2. }
+        cmp    $64-48, %ecx
         jle    .LTwoAlignedTailWrites
-        cmp    $113, %edx  { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
-        jle    .LFourAlignedTailWrites
+        sub    $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
+        jle    .LFourAlignedTailWrites { ecx was ≤ 96−48 }
 
-        add    $48, %eax
-        cmp    $NtThreshold, %edx
+        add    $48, %eax { eax = H3. }
+        cmp    $NtThreshold, %ecx
         jae    .L64xNT_Body
 
 .balign 16 { no-op }
 .L64x_Body:
-        movdqa %xmm1, (%eax)
-        movdqa %xmm1, 16(%eax)
-        movdqa %xmm1, 32(%eax)
-        movdqa %xmm1, 48(%eax)
-        add    $64,  %eax
-        cmp    %ecx, %eax
-        jb     .L64x_Body
+        movdqa %xmm0, (%eax)
+        movdqa %xmm0, 16(%eax)
+        movdqa %xmm0, 32(%eax)
+        movdqa %xmm0, 48(%eax)
+        add    $64, %eax
+        sub    $64, %ecx
+        ja     .L64x_Body
 .LFourAlignedTailWrites:
-        movdqa %xmm1, (%ecx) { T4 }
-        movdqa %xmm1, 16(%ecx) { T3 }
+        movdqa %xmm0, (%edx) { T4 }
+        movdqa %xmm0, 16(%edx) { T3 }
 .LTwoAlignedTailWrites:
-        movdqa %xmm1, 32(%ecx) { T2 }
+        movdqa %xmm0, 32(%edx) { T2 }
 .LOneAlignedTailWrite:
-        movdqa %xmm1, 48(%ecx) { T1 }
+        movdqa %xmm0, 48(%edx) { T1 }
         ret
 
 .balign 16
 .L64xNT_Body:
-        movntdq %xmm1, (%eax)
-        movntdq %xmm1, 16(%eax)
-        movntdq %xmm1, 32(%eax)
-        movntdq %xmm1, 48(%eax)
+        movntdq %xmm0, (%eax)
+        movntdq %xmm0, 16(%eax)
+        movntdq %xmm0, 32(%eax)
+        movntdq %xmm0, 48(%eax)
         add    $64, %eax
-        cmp    %ecx, %eax
-        jb     .L64xNT_Body
+        sub    $64, %ecx
+        ja     .L64xNT_Body
         sfence
         jmp    .LFourAlignedTailWrites
 end;
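
This hunk is the core of the change: instead of testing the raw byte count against the conservative cutoffs 49/81/113, the code derives the exact byte span that the aligned stores must cover (T4 − (H1 − 16) = aligned byte count − 48, kept in ecx) and branches on that, and the 64-byte loops now count this value down with sub/ja instead of comparing a pointer against a bound. A minimal Free Pascal sketch of the new dispatch, assuming the same block layout; ChoosePath and its return strings are illustrative only, not RTL code:

{$mode objfpc}
program FillPathSketch;

function ChoosePath(x, count: PtrUInt): string;
var
  span: PtrInt; { = aligned byte count - 48, the value the new code keeps in ecx/rax }
begin
  span := PtrInt((x + count - 65) and not PtrUInt(15))   { T4 }
          - PtrInt(x and not PtrUInt(15));               { H1 - 16 }
  if span <= 32 - 48 then
    Result := 'H1 + T1'                      { 2 aligned stores }
  else if span <= 64 - 48 then
    Result := 'H1-H2 + T2-T1'                { 4 aligned stores }
  else if span <= 96 - 48 then
    Result := 'H1-H2 + T4-T1'                { 6 aligned stores }
  else
    Result := 'H1-H2 + 64-byte loop + T4-T1';
end;

begin
  WriteLn(ChoosePath($1001, 50));  { H1 + T1 }
  WriteLn(ChoosePath($100F, 50));  { H1-H2 + T2-T1 }
end.

The old code compared only the byte count, so both 50-byte calls above would have written H1, H2, T2 and T1; with the exact span the favourably misaligned one gets away with two aligned stores.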
@@ -367,7 +369,6 @@ asm
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
         movdqu %xmm0, (%eax)
         movdqu %xmm0, -16(%eax,%edx)
-        movdqa %xmm0, %xmm1
         cmp    $32, %edx
         ja     FillXxxx_MoreThanTwoXMMs
 end;
@@ -388,7 +389,6 @@ asm
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
         movdqu %xmm0, (%eax)
         movdqu %xmm0, -16(%eax,%edx)
-        movdqa %xmm0, %xmm1
         cmp    $32, %edx
         ja     FillXxxx_MoreThanTwoXMMs
 end;
@@ -613,20 +613,19 @@ asm
         shl     $3, %edx
         movdqu  %xmm0, (%eax)
         movdqu  %xmm0, -16(%eax,%edx)
-        movdqa  %xmm0, %xmm1
         test    $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
         jz      FillXxxx_MoreThanTwoXMMs
-        mov     %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x into xmm1. }
+        mov     %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
         shl     $3, %ecx
         and     $63, %ecx
-        movd    %ecx, %xmm3
-        psllq   %xmm3, %xmm1
+        movd    %ecx, %xmm2
+        movdqa  %xmm0, %xmm1
+        psllq   %xmm2, %xmm1
         neg     %ecx      { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof.  }
         and     $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
-        movd    %ecx, %xmm3
-        movdqa  %xmm0, %xmm2
-        psrlq   %xmm3, %xmm2
-        por     %xmm2, %xmm1
+        movd    %ecx, %xmm2
+        psrlq   %xmm2, %xmm0
+        por     %xmm1, %xmm0
         jmp     FillXxxx_MoreThanTwoXMMs
 
 .L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
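
With the separate aligned-pattern register gone, the FillQWord prologue above rotates the pattern in place: the psllq/psrlq/por pair is a rotate-left of each qword lane by the misalignment of x in bits, after which xmm0 holds the pattern for aligned writes. A minimal Free Pascal sketch of that rotation (MisalignQWordPattern is an illustrative name, not RTL code); x is known to be misaligned here, so the bit count stays in 8..56 and the right shift never degenerates to a shift by 64:

{$mode objfpc}
program MisalignPatternSketch;

{ Equivalent to RolQWord(pattern, bits) and to the "rol %cl, %rax" used by the
  x86_64 FillQWord; the i386 code spells it with SSE shifts because, as the
  comment in the hunk notes, misaligning the 64-bit pattern is less trivial there. }
function MisalignQWordPattern(pattern: QWord; x: PtrUInt): QWord;
var
  bits: PtrUInt;
begin
  bits := (x and 7) * 8;                                  { misalignment of x in bits }
  Result := (pattern shl bits) or (pattern shr (64 - bits));
end;

begin
  { For a pointer misaligned by 3 bytes this matches the RTL's RolQWord: }
  WriteLn(MisalignQWordPattern(QWord($1122334455667788), $1003) =
          RolQWord(QWord($1122334455667788), 24));
end.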

+ 51 - 55
rtl/x86_64/x86_64.inc

@@ -272,8 +272,8 @@ procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
 { Input:
   rcx = 'x'
   rdx = byte count
-  xmm0 = pattern for unaligned writes
-  xmm1 = pattern for aligned writes }
+  xmm0 = pattern for ALIGNED writes
+  First and last 16 bytes are written. }
 const
 {$ifdef use_fast_repmovstos}
   ErmsThreshold = 1536;
@@ -291,56 +291,56 @@ asm
       H1 and so on are called “aligned heads” or just “heads”.
       T1 and so on are called “aligned tails” or just “tails”.
 
-      UT (“unaligned tail”) is written with another 'movdqu' after the loop.
+      UT (“unaligned tail”) is written by the caller as well.
       At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }
 
-    lea    -65(%rcx,%rdx), %r8 { r8 = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
-    and    $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
-    movdqa %xmm1, 16(%rcx) { Write H1. }
-    mov    %r8, %rax
-    and    $-16, %rax { rax = “T4” (possibly fictive) = aligned r8 = loop bound. }
-    cmp    $49, %rdx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+    lea    -65(%rcx,%rdx), %rax
+    and    $-16, %rax { rax = “T4” (possibly fictive). }
+    mov    %rax, %rdx { Remember T4 to rdx. }
+    and    $-16, %rcx { rcx = H1 − 16. }
+    sub    %rcx, %rax { rax = aligned byte count − 48. }
+    movdqa %xmm0, 16(%rcx) { Write H1. }
+    cmp    $32-48, %rax
     jle    .LOneAlignedTailWrite
-    movdqa %xmm1, 32(%rcx) { Write H2. }
-    cmp    $81, %rdx  { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+    movdqa %xmm0, 32(%rcx) { Write H2. }
+    cmp    $64-48, %rax
     jle    .LTwoAlignedTailWrites
-    cmp    $113, %rdx  { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
+    sub    $48, %rax { rax = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
     jle    .LFourAlignedTailWrites
 
-    add    $48, %rcx
+    add    $48, %rcx { rcx = H3. }
 {$ifdef use_fast_repmovstos}
-    cmp    $ErmsThreshold, %rdx
+    cmp    $ErmsThreshold-64, %rax { Need to write aligned byte count − 32 bytes already written. rax = aligned byte count − 96, so compare rax + 64 to ErmsThreshold, or rax to ErmsThreshold − 64. }
     jae    .LRepStos
 {$else}
-    cmp    $NtThreshold, %rdx
+    cmp    $NtThreshold, %rax
     jae    .L64xNT_Body
 {$endif}
 
 .balign 16
 .L64x_Body:
-    movdqa %xmm1, (%rcx)
-    movdqa %xmm1, 16(%rcx)
-    movdqa %xmm1, 32(%rcx)
-    movdqa %xmm1, 48(%rcx)
+    movdqa %xmm0, (%rcx)
+    movdqa %xmm0, 16(%rcx)
+    movdqa %xmm0, 32(%rcx)
+    movdqa %xmm0, 48(%rcx)
     add    $64, %rcx
-    cmp    %rax, %rcx
-    jb     .L64x_Body
+    sub    $64, %rax
+    ja     .L64x_Body
 
 .LFourAlignedTailWrites:
-    movdqa %xmm1, (%rax) { T4 }
-    movdqa %xmm1, 16(%rax) { T3 }
+    movdqa %xmm0, (%rdx) { T4 }
+    movdqa %xmm0, 16(%rdx) { T3 }
 .LTwoAlignedTailWrites:
-    movdqa %xmm1, 32(%rax) { T2 }
+    movdqa %xmm0, 32(%rdx) { T2 }
 .LOneAlignedTailWrite:
-    movdqa %xmm1, 48(%rax) { T1 }
-    movdqu %xmm0, 65-16(%r8) { UT }
+    movdqa %xmm0, 48(%rdx) { T1 }
     ret
 
 {$ifdef use_fast_repmovstos}
 .LRepStos:
 {$ifdef FPC_PIC}
-    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r9
-    cmpb   $1, (%r9)
+    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r8
+    cmpb   $1, (%r8)
 {$else FPC_PIC}
     cmpb   $1, fast_large_repmovstosb(%rip)
 {$endif FPC_PIC}
@@ -349,12 +349,10 @@ asm
     push   %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
 {$endif}
     mov    %rcx, %rdi { rdi = REP STOS destination. }
-    lea    65-16+8-1(%r8), %rcx
-    sub    %rdi, %rcx
-    shr    $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
-    movq   %xmm1, %rax { recover pattern for aligned writes back to GPR :) }
+    lea    64(%rax), %rcx
+    shr    $3, %rcx { rcx = count of REP STOSQ blocks up to T1 (might be 1 more than strictly required if T1 and UT overlap is 8 or more, don’t care). }
+    movq   %xmm0, %rax { recover pattern for aligned writes back to GPR :) }
     rep stosq
-    movdqu %xmm0, 65-16(%r8) { UT }
 {$ifdef win64}
     pop    %rdi
 {$endif}
@@ -362,18 +360,18 @@ asm
 {$endif}
 
 .LRepStosIsNotBetter:
-    cmp    $NtThreshold, %rdx
+    cmp    $NtThreshold-64, %rax
     jb     .L64x_Body
 
 .balign 16
 .L64xNT_Body:
-    movntdq %xmm1, (%rcx)
-    movntdq %xmm1, 16(%rcx)
-    movntdq %xmm1, 32(%rcx)
-    movntdq %xmm1, 48(%rcx)
+    movntdq %xmm0, (%rcx)
+    movntdq %xmm0, 16(%rcx)
+    movntdq %xmm0, 32(%rcx)
+    movntdq %xmm0, 48(%rcx)
     add    $64, %rcx
-    cmp    %rax, %rcx
-    jb     .L64xNT_Body
+    sub    $64, %rax
+    ja     .L64xNT_Body
     sfence
     jmp    .LFourAlignedTailWrites
 end;
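
On the ERMS path the threshold test and the store count are now derived from the same register: rax holds the aligned byte count minus 96, so comparing it against ErmsThreshold − 64 is the same as comparing the bytes still to be written (rax + 64) against ErmsThreshold, and "lea 64(%rax), %rcx; shr $3, %rcx" yields the qword count directly. A minimal sketch of that arithmetic (RepStosqCount is an illustrative name, assuming the register layout described in the new comments):

{$mode objfpc}
program RepStosqCountSketch;

{ Mirrors "lea 64(%rax), %rcx; shr $3, %rcx" under the assumption that rax is
  aligned byte count - 96 at .LRepStos. H1 and H2 (32 bytes) are already
  written, so REP STOSQ has to cover aligned byte count - 32 bytes, from H3
  through the end of T1. That is a multiple of 16, so the shift by 3 loses
  nothing, though the last qwords may land on bytes the caller's trailing
  unaligned store already wrote (harmless, as the code comments note). }
function RepStosqCount(raxAtRepStos: PtrUInt): PtrUInt;
begin
  Result := (raxAtRepStos + 64) shr 3;
end;

begin
  { A 2048-byte aligned span: rax = 1952, (1952 + 64) div 8 = 252 qwords = 2016 bytes = 2048 - 32. }
  WriteLn(RepStosqCount(2048 - 96));
end.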
@@ -400,11 +398,9 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0
     movdqu %xmm0, (%rcx)
-    movdqa %xmm0, %xmm1
-
+    movdqu %xmm0, -16(%rcx,%rdx)
     cmp    $32, %rdx
     jg     FillXxxx_MoreThanTwoXmms
-    movdqu %xmm0, -16(%rcx,%rdx)
     ret
 
 .L4to15:
@@ -452,23 +448,21 @@ procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
-
+    movdqu %xmm0, -16(%rcx,%rdx,2)
     cmp    $16, %rdx
-    jle    .LTail
+    jg     .LMoreThanTwoXMMs
+    ret
 
+.LMoreThanTwoXMMs:
     shl    $1, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %eax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movd   %eax, %xmm1
-    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms
 
-.LTail:
-    movdqu %xmm0, -16(%rcx,%rdx,2)
-    ret
-
 .L4to8:
     mov    %eax, %r8d
     shl    $32, %r8
@@ -508,14 +502,15 @@ procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
     movd   %eax, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
+    movdqu %xmm0, -16(%rcx,%rdx,4)
 
     shl    $2, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %eax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movd   %eax, %xmm1
-    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms
 
 .L4to8:
@@ -561,14 +556,15 @@ procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
     movq   %rax, %xmm0
     pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
+    movdqu %xmm0, -16(%rcx,%rdx,8)
 
     shl    $3, %rdx { rdx = byte count }
     mov    %rcx, %r8
     shl    $3, %ecx
     rol    %cl, %rax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
-    movq   %rax, %xmm1
-    pshufd $0b01000100, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    movq   %rax, %xmm0
+    pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms
 
 .L3to6:
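
Since both ports now share this head/tail layout, with the trailing unaligned store moved into the callers, a quick way to convince oneself that no byte is missed for any misalignment is to replay the store pattern over a shadow buffer. A small self-contained check, sketch only and not part of the RTL (the NT and ERMS branches cover the same bytes, so only the plain path is modelled):

{$mode objfpc}
program FillCoverageCheck;

var
  mem: array[0..511] of Boolean;

procedure Store16(base: PtrUInt);            { mark one 16-byte store }
var
  i: PtrUInt;
begin
  for i := base to base + 15 do
    mem[i] := True;
end;

procedure Check(x, count: PtrUInt);
var
  t4, dst: PtrUInt;
  span: PtrInt;
  i: PtrUInt;
begin
  FillChar(mem, SizeOf(mem), 0);
  Store16(x);                                { first 16 bytes, unaligned (caller) }
  Store16(x + count - 16);                   { last 16 bytes, unaligned (caller) }
  t4 := (x + count - 65) and not PtrUInt(15);
  dst := x and not PtrUInt(15);              { H1 - 16 }
  span := PtrInt(t4) - PtrInt(dst);          { aligned byte count - 48 }
  Store16(dst + 16);                         { H1 }
  if span > 32 - 48 then
  begin
    Store16(dst + 32);                       { H2 }
    if span > 64 - 48 then
    begin
      Dec(span, 48);
      Inc(dst, 48);                          { H3 }
      while span > 0 do                      { the sub $64 / ja countdown }
      begin
        Store16(dst); Store16(dst + 16); Store16(dst + 32); Store16(dst + 48);
        Inc(dst, 64);
        Dec(span, 64);
      end;
      Store16(t4);                           { T4 }
      Store16(t4 + 16);                      { T3 }
    end;
    Store16(t4 + 32);                        { T2 }
  end;
  Store16(t4 + 48);                          { T1 }
  for i := x to x + count - 1 do
    if not mem[i] then
      WriteLn('gap: x=', x, ' count=', count, ' offset=', i - x);
end;

var
  mis, cnt: PtrUInt;
begin
  for mis := 0 to 15 do                      { every misalignment of x }
    for cnt := 33 to 300 do                  { every count that reaches the shared path }
      Check(64 + mis, cnt);
  WriteLn('done.');
end.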