
Supposedly faster FillChar for x64.

Rika Ichinose, 2 years ago
parent commit b56cbad50e
1 changed file with 88 additions and 90 deletions

+ 88 - 90
rtl/x86_64/x86_64.inc

@@ -362,107 +362,105 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
     mov    %rdi, %rcx
 {$endif win64}
 
-    cmp    $8, %rdx
-    jl     .Ltiny
+    mov    $0x01010101, %r9d
+    movzbl %r8b, %eax
+    imul   %r9d, %eax
+
+    cmp    $16, %rdx
+    jge    .LVecOrMore
+    cmp    $3, %rdx
+    jle    .L3OrLess
+
+    mov    %eax, (%rcx)
+    cmp    $8, %edx
+    jle    .LLast4
+    mov    %eax, 4(%rcx)
+    mov    %eax, -8(%rcx,%rdx)
+.LLast4:
+    mov    %eax, -4(%rcx,%rdx)
+    ret
 
-// TODO: movz?q and movs?q are not accepted by FPC asmreader, it needs fixing.
-// `movzbl' instead is accepted and generates correct code with internal assembler,
-// but breaks targets using external GAS (Mantis #19188).
-// So use a different instruction for now.
+.L3OrLess:
+    test   %rdx, %rdx
+    jle    .LQuit
+    mov    %al, (%rcx)
+    mov    %al, -1(%rcx,%rdx)
+    shr    $1, %edx
+    mov    %al, (%rcx,%rdx)
+.LQuit:
+    ret
 
-    { expand byte value  }
-    andq   $0xff, %r8
-{
-    movzbq %r8b, %r8
-}
-    mov    $0x0101010101010101,%r9
-    imul   %r9, %r8
-
-    test   $7, %cl
-    je     .Laligned
-
-    { align dest to 8 bytes }
-    test   $1, %cl
-    je     .L2
-    movb   %r8b, (%rcx)
-    add    $1, %rcx
-    sub    $1, %rdx
-.L2:
-    test   $2, %cl
-    je     .L4
-    movw   %r8w, (%rcx)
-    add    $2, %rcx
-    sub    $2, %rdx
-.L4:
-    test   $4, %cl
-    je     .Laligned
-    movl   %r8d, (%rcx)
-    add    $4, %rcx
-    sub    $4, %rdx
+.balign 16
+.LVecOrMore:
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0
 
-.Laligned:
-    mov    %rdx, %rax
-    and    $0x3f, %rdx
-    shr    $6, %rax
-    jne    .Lmore64
+    { x can start and end aligned or misaligned on the vector boundary:
 
-.Lless64:
-    mov    %rdx, %rax
-    and    $7, %rdx
-    shr    $3, %rax
-    je     .Ltiny
+      x = [UH][H1][H2][...][T2][T1]
+      x = UH][H1][H2][...][T2][T1][UT
 
-    .balign 16
-.Lloop8:                               { max. 8 iterations }
-    mov    %r8, (%rcx)
-    add    $8, %rcx
-    dec    %rax
-    jne    .Lloop8
-.Ltiny:
-    test   %rdx, %rdx
-    jle    .Lquit
-.Lloop1:
-    movb   %r8b, (%rcx)
-    inc    %rcx
-    dec    %rdx
-    jnz    .Lloop1
-.Lquit:
-    retq
+      UH ("unaligned head") is written, potentially overlapping with H1, with the 'movdqu'. Has 1–16 bytes.
+      H1 and so on are “heads”.
+      T1 and so on are “tails”.
+      UT ("unaligned tail") is written with another 'movdqu' after the loop. Has 0–15 bytes. }
 
-.Lmore64:
-    cmp    $0x2000,%rax
-    jae    .Lloop64nti
+    movdqu %xmm0, (%rcx)
+    lea    -64(%rcx,%rdx), %r8 { r8 = end of x - 64, loop bound }
 
-    .balign 16
-.Lloop64:
+    cmp    $32, %rdx
+    jle    .LLastVec
+
+    and    $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
+    movdqa %xmm0, 16(%rcx) { Write H1. }
+    mov    %r8, %rax
+    and    $-16, %rax { rax = “T4” (possibly fictive) = aligned r8. }
+    cmp    $48, %rdx { 33~48 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+    jle    .LOneAlignedTailWrite
+    movdqa %xmm0, 32(%rcx) { Write H2. }
+    cmp    $80, %rdx  { 49~80 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+    jle    .LTwoAlignedTailWrites
+    movdqa %xmm0, 48(%rcx) { Write H3. }
+    cmp    $112, %rdx  { 81~112 bytes might contain 4~6 heads+tails; write as H1–3 and T3–1. }
+    jle    .LThreeAlignedTailWrites
+
+    add    $48, %rcx
+    cmp    $0x80000, %rdx
+    jae    .L64xNT_Body
+
+.balign 16
+.L64x_Body:
+    movdqa %xmm0, (%rcx)
+    movdqa %xmm0, 16(%rcx)
+    movdqa %xmm0, 32(%rcx)
+    movdqa %xmm0, 48(%rcx)
     add    $64, %rcx
-    mov    %r8, -64(%rcx)
-    mov    %r8, -56(%rcx)
-    mov    %r8, -48(%rcx)
-    mov    %r8, -40(%rcx)
-    dec    %rax
-    mov    %r8, -32(%rcx)
-    mov    %r8, -24(%rcx)
-    mov    %r8, -16(%rcx)
-    mov    %r8, -8(%rcx)
-    jne    .Lloop64
-    jmp    .Lless64
+    cmp    %r8, %rcx
+    jb     .L64x_Body
+
+.LLoopEnd:
+    movdqa %xmm0, (%rax)
+.LThreeAlignedTailWrites:
+    movdqa %xmm0, 16(%rax)
+.LTwoAlignedTailWrites:
+    movdqa %xmm0, 32(%rax)
+.LOneAlignedTailWrite:
+    movdqa %xmm0, 48(%rax)
+.LLastVec:
+    movdqu %xmm0, 48(%r8)
+    ret
 
-    .balign 16
-.Lloop64nti:
+.balign 16
+.L64xNT_Body:
+    movntdq %xmm0, (%rcx)
+    movntdq %xmm0, 16(%rcx)
+    movntdq %xmm0, 32(%rcx)
+    movntdq %xmm0, 48(%rcx)
     add    $64, %rcx
-    movnti %r8, -64(%rcx)
-    movnti %r8, -56(%rcx)
-    movnti %r8, -48(%rcx)
-    movnti %r8, -40(%rcx)
-    dec    %rax
-    movnti %r8, -32(%rcx)
-    movnti %r8, -24(%rcx)
-    movnti %r8, -16(%rcx)
-    movnti %r8, -8(%rcx)
-    jnz    .Lloop64nti
+    cmp    %r8, %rcx
+    jb     .L64xNT_Body
     mfence
-    jmp    .Lless64
+    jmp    .LLoopEnd
   end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}
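
For reference, the new scalar paths (counts below 16) map to plain code as follows. This is a minimal C sketch written for this page, not code from the commit: the name fill_small_c is invented, and memcpy is used for the 4-byte stores to stay within standard C (a compiler turns it into a single mov, as in the assembly). The 1 to 3 byte case writes the first, last and middle byte unconditionally, so counts 1 and 2 simply produce overlapping stores and no further branching is needed.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the < 16 byte paths; n < 16 is assumed, the caller dispatches
   larger counts to the vector path (cmp $16,%rdx; jge .LVecOrMore). */
static void fill_small_c(unsigned char *x, size_t n, unsigned char value)
{
    uint32_t pat = value * 0x01010101u;    /* movzbl %r8b,%eax + imul $0x01010101 */

    if (n >= 4) {                          /* 4..15 bytes: 2 or 4 overlapping dword stores */
        memcpy(x, &pat, 4);                /* mov %eax,(%rcx) */
        if (n > 8) {
            memcpy(x + 4, &pat, 4);        /* mov %eax,4(%rcx) */
            memcpy(x + n - 8, &pat, 4);    /* mov %eax,-8(%rcx,%rdx) */
        }
        memcpy(x + n - 4, &pat, 4);        /* mov %eax,-4(%rcx,%rdx) */
        return;
    }
    if (n == 0)                            /* test %rdx,%rdx; jle .LQuit */
        return;
    /* 1..3 bytes, branch-free: first, last and middle byte. */
    x[0] = value;                          /* mov %al,(%rcx) */
    x[n - 1] = value;                      /* mov %al,-1(%rcx,%rdx) */
    x[n >> 1] = value;                     /* shr $1,%edx; mov %al,(%rcx,%rdx) */
}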
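The vector path described by the UH/H1/.../T1/UT comment can be sketched the same way. Again this is an illustration under stated assumptions, not the committed code: fill_vec_c and NT_THRESHOLD are invented names, the body loop stores 16 bytes per iteration instead of the unrolled 64, and the T1..T3 aligned tail writes are folded into the loop bound; 0x80000 is the threshold the assembly compares against before switching to movntdq.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

#define NT_THRESHOLD 0x80000   /* cmp $0x80000,%rdx in the diff */

/* Sketch of the >= 16 byte path; count >= 16 is assumed
   (guarded by cmp $16,%rdx; jge .LVecOrMore in the assembly). */
static void fill_vec_c(unsigned char *x, size_t count, unsigned char value)
{
    __m128i v = _mm_set1_epi8((char)value);             /* movd %eax,%xmm0 + pshufd $0 */

    _mm_storeu_si128((__m128i *)x, v);                   /* unaligned head UH */
    _mm_storeu_si128((__m128i *)(x + count - 16), v);    /* unaligned tail UT */
    if (count <= 32)
        return;                                          /* UH and UT already cover everything */

    /* Aligned body: start just past UH, rounded down to 16; stop once the
       remaining bytes fit inside UT. Overlap with UH and UT is harmless. */
    unsigned char *p    = (unsigned char *)(((uintptr_t)x + 16) & ~(uintptr_t)15);
    unsigned char *stop = x + count - 16;

    if (count < NT_THRESHOLD) {
        for (; p < stop; p += 16)
            _mm_store_si128((__m128i *)p, v);            /* movdqa */
    } else {
        for (; p < stop; p += 16)
            _mm_stream_si128((__m128i *)p, v);           /* movntdq, bypasses the cache */
        _mm_mfence();                                    /* the assembly issues mfence here */
    }
}

The point of the overlapping movdqu head and tail is that no byte-granular alignment prologue or epilogue is needed: misalignment costs at most two redundant partial stores, while every store in the hot loop is 16-byte aligned.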