
Fill* for x64, physically sharing half of the code with FillChar.

Rika Ichinose, 2 years ago
commit a4c324ee23
2 changed files with 238 additions and 66 deletions
  1. HEAD (+0 -0)
  2. rtl/x86_64/x86_64.inc (+238 -66)

rtl/x86_64/x86_64.inc  +238 -66

@@ -287,27 +287,111 @@ asm
 end;
 {$endif FPC_SYSTEM_HAS_MOVE}
 
+{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
+  or not defined(FPC_SYSTEM_HAS_FILLWORD)
+  or not defined(FPC_SYSTEM_HAS_FILLDWORD)
+  or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
+procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
+{ Input:
+  rcx = 'x'
+  rdx = byte count
+  xmm0 = pattern for unaligned writes
+  xmm1 = pattern for aligned writes }
+asm
+    { x can start and end misaligned on the vector boundary:
+
+      x = ~~][H1][H2][...][T2][T1]~
+          [UH]                 [UT]
+
+      UH (“unaligned head”) potentially overlaps with H1 and is already written with 'movdqu' by the caller.
+      At least 1 of its bytes is exclusive to it, i.e. if x is already aligned, H1 starts at byte 16.
+
+      H1 and so on are called “aligned heads” or just “heads”.
+      T1 and so on are called “aligned tails” or just “tails”.
+
+      UT (“unaligned tail”) is written with another 'movdqu' after the loop.
+      At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }
+
+    lea    -65(%rcx,%rdx), %r8 { r8 = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
+    and    $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
+    movdqa %xmm1, 16(%rcx) { Write H1. }
+    mov    %r8, %rax
+    and    $-16, %rax { rax = “T4” (possibly fictive) = aligned r8 = loop bound. }
+    cmp    $49, %rdx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+    jle    .LOneAlignedTailWrite
+    movdqa %xmm1, 32(%rcx) { Write H2. }
+    cmp    $81, %rdx  { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+    jle    .LTwoAlignedTailWrites
+    cmp    $113, %rdx  { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
+    jle    .LFourAlignedTailWrites
+
+    add    $48, %rcx
+    cmp    $0x80000, %rdx
+    jae    .L64xNT_Body
+
+.balign 16
+.L64x_Body:
+    movdqa %xmm1, (%rcx)
+    movdqa %xmm1, 16(%rcx)
+    movdqa %xmm1, 32(%rcx)
+    movdqa %xmm1, 48(%rcx)
+    add    $64, %rcx
+    cmp    %rax, %rcx
+    jb     .L64x_Body
+
+.LFourAlignedTailWrites:
+    movdqa %xmm1, (%rax) { T4 }
+    movdqa %xmm1, 16(%rax) { T3 }
+.LTwoAlignedTailWrites:
+    movdqa %xmm1, 32(%rax) { T2 }
+.LOneAlignedTailWrite:
+    movdqa %xmm1, 48(%rax) { T1 }
+    movdqu %xmm0, 49(%r8) { UT }
+    ret
+
+.balign 16
+.L64xNT_Body:
+    movntdq %xmm1, (%rcx)
+    movntdq %xmm1, 16(%rcx)
+    movntdq %xmm1, 32(%rcx)
+    movntdq %xmm1, 48(%rcx)
+    add    $64, %rcx
+    cmp    %rax, %rcx
+    jb     .L64xNT_Body
+    sfence
+    jmp    .LFourAlignedTailWrites
+end;
+{$endif FPC_SYSTEM_HAS_FILLxxxx}
+
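For orientation, the control flow of the shared helper can be modelled in plain Pascal. This is only an illustrative sketch under the helper's own precondition (count > 32): the name FillSixteenByteBlocksSketch is hypothetical, it folds the caller's unaligned head write into the routine, and it reuses one pattern for both the unaligned and the aligned writes, which is only valid for the FillChar case. The real code additionally keeps a rotated pattern in xmm1 for the aligned stores, unrolls the loop four times, and switches to movntdq plus sfence once the byte count reaches $80000.

{ Hypothetical Pascal model of the overlapping head/tail scheme above. }
procedure FillSixteenByteBlocksSketch(var x; count: SizeInt; const pattern16: array of byte);
var
  start, p, limit: PtrUInt;
begin
  Assert((count > 32) and (Length(pattern16) >= 16));
  start := PtrUInt(@x);
  limit := start + PtrUInt(count);
  Move(pattern16[0], PByte(start)^, 16);       { UH: unaligned head, may overlap H1 }
  p := (start and not PtrUInt(15)) + 16;       { H1: first aligned 16-byte block after the head }
  while p + 16 <= limit do
  begin
    Move(pattern16[0], PByte(p)^, 16);         { aligned heads, loop body and tails }
    Inc(p, 16);
  end;
  Move(pattern16[0], PByte(limit - 16)^, 16);  { UT: unaligned tail, overlaps the last full block }
end;
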
 {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
 {$define FPC_SYSTEM_HAS_FILLCHAR}
 Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
   asm
 { win64: rcx dest, rdx count, r8b value
   linux: rdi dest, rsi count, rdx value }
+    movzbl {$ifdef win64} %r8b {$else} %dl {$endif}, %eax
+    imul   $0x01010101, %eax
 {$ifndef win64}
-    mov    %rdx, %r8
     mov    %rsi, %rdx
     mov    %rdi, %rcx
 {$endif win64}
 
-    mov    $0x01010101, %r9d
-    movzbl %r8b, %eax
-    imul   %r9d, %eax
-
-    cmp    $16, %rdx
-    jge    .LVecOrMore
     cmp    $3, %rdx
     jle    .L3OrLess
+    cmp    $16, %rdx
+    jl     .L4to15
+
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0
+    movdqu %xmm0, (%rcx)
+    movdqa %xmm0, %xmm1
+
+    cmp    $32, %rdx
+    jg     FillXxxx_MoreThanTwoXmms
+    movdqu %xmm0, -16(%rcx,%rdx)
+    ret
 
+.L4to15:
     mov    %eax, (%rcx)
     cmp    $8, %edx
     jle    .LLast4
@@ -325,81 +409,169 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
     shr    $1, %edx
     mov    %al, (%rcx,%rdx)
 .LQuit:
-    ret
+  end;
+{$endif FPC_SYSTEM_HAS_FILLCHAR}
+
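The movzbl plus imul $0x01010101 pair at the top of FillChar replicates the fill byte into every byte of eax before pshufd broadcasts it to all 16 lanes of xmm0. A throwaway demonstration of that multiplication identity (hypothetical program, not part of the commit):

program ByteBroadcastDemo;
var
  b: Byte;
  pattern: DWord;
begin
  b := $AB;
  pattern := DWord(b) * $01010101;  { same effect as movzbl + imul $0x01010101 }
  WriteLn(HexStr(pattern, 8));      { prints ABABABAB }
end.
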
+{$ifndef FPC_SYSTEM_HAS_FILLWORD}
+{$define FPC_SYSTEM_HAS_FILLWORD}
+procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
+  asm
+{$ifdef win64}
+    movzwl %r8w, %eax
+    shl    $16, %r8d
+    or     %r8d, %eax
+{$else}
+    movzwl %dx, %eax
+    shl    $16, %edx
+    or     %edx, %eax
+    mov    %rsi, %rdx
+    mov    %rdi, %rcx
+{$endif}
+
+    cmp    $3, %rdx
+    jle    .L3OrLess
+    cmp    $8, %rdx
+    jle    .L4to8
 
-.balign 16
-.LVecOrMore:
     movd   %eax, %xmm0
-    pshufd $0, %xmm0, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%rcx)
 
-    { x can start and end aligned or misaligned on the vector boundary:
+    cmp    $16, %rdx
+    jle    .LTail
 
-      x = [UH][H1][H2][...][T2][T1]
-      x = UH][H1][H2][...][T2][T1][UT
+    shl    $1, %rdx { rdx = byte count }
+    mov    %rcx, %r8
+    shl    $3, %ecx
+    rol    %cl, %eax { misalign the pattern by the misalignment of x }
+    mov    %r8, %rcx
+    movd   %eax, %xmm1
+    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    jmp    FillXxxx_MoreThanTwoXmms
+
+.LTail:
+    movdqu %xmm0, -16(%rcx,%rdx,2)
+    ret
 
-      UH (“unaligned head”) is written, potentially overlapping with H1, with the 'movdqu'. Has 1–16 bytes.
-      H1 and so on are “heads”.
-      T1 and so on are “tails”.
-      UT (“unaligned tail”) is written with another 'movdqu' after the loop. Has 0–15 bytes. }
+.L4to8:
+    mov    %eax, %r8d
+    shl    $32, %r8
+    or     %r8, %rax
+    mov    %rax, (%rcx)
+    mov    %rax, -8(%rcx,%rdx,2)
+    ret
 
+.L3OrLess:
+    test   %rdx, %rdx
+    jle    .LQuit
+    mov    %ax, (%rcx)
+    mov    %ax, -2(%rcx,%rdx,2)
+    shr    $1, %edx
+    mov    %ax, (%rcx,%rdx,2)
+.LQuit:
+  end;
+{$endif FPC_SYSTEM_HAS_FILLWORD}
+
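The shl $3, %ecx / rol %cl, %eax sequence above compensates for the fact that the aligned stores issued by FillXxxx_MoreThanTwoXmms do not start at x: rotating by cl rotates the 32-bit pattern by 8*(address mod 4) bits, which realigns the word pattern's phase with memory. A hedged illustration using the RTL's RolDWord, with made-up values:

program PatternRotationDemo;
var
  base, aligned: DWord;
begin
  { FillWord pattern for value $ABCD: the low byte comes first in memory, so base = $CDABCDAB. }
  base := $CDABCDAB;
  { If x sits at an address congruent to 3 mod 4, the aligned blocks see the
    pattern shifted by 3 bytes, i.e. rotated left by 24 bits. }
  aligned := RolDWord(base, 8 * 3);
  WriteLn(HexStr(base, 8), ' -> ', HexStr(aligned, 8));  { CDABCDAB -> ABCDABCD }
end.

FillDWord applies the same rotation to its 32-bit pattern, and FillQWord does it on 64 bits, where rol %cl, %rax rotates by 8*(address mod 8) bits.
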
+{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
+{$define FPC_SYSTEM_HAS_FILLDWORD}
+procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
+  asm
+{$ifdef win64}
+    mov    %r8d, %eax
+{$else}
+    mov    %edx, %eax
+    mov    %rsi, %rdx
+    mov    %rdi, %rcx
+{$endif win64}
+
+    cmp    $3, %rdx
+    jle    .L3OrLess
+    cmp    $8, %rdx
+    jle    .L4to8
+
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
-    lea    -64(%rcx,%rdx), %r8 { r8 = end of x - 64, loop bound }
 
-    cmp    $32, %rdx
-    jle    .LLastVec
+    shl    $2, %rdx { rdx = byte count }
+    mov    %rcx, %r8
+    shl    $3, %ecx
+    rol    %cl, %eax { misalign the pattern by the misalignment of x }
+    mov    %r8, %rcx
+    movd   %eax, %xmm1
+    pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    jmp    FillXxxx_MoreThanTwoXmms
 
-    and    $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
-    movdqa %xmm0, 16(%rcx) { Write H1. }
+.L4to8:
+{$ifndef win64} { on win64, eax = r8d already. }
+    mov    %eax, %r8d
+{$endif}
+    shl    $32, %r8
+    or     %r8, %rax
+    mov    %rax, (%rcx)
+    mov    %rax, 8(%rcx)
+    mov    %rax, -16(%rcx,%rdx,4)
+    mov    %rax, -8(%rcx,%rdx,4)
+    ret
+
+.L3OrLess:
+    test   %rdx, %rdx
+    jle    .LQuit
+    mov    %eax, (%rcx)
+    mov    %eax, -4(%rcx,%rdx,4)
+    shr    $1, %edx
+    mov    %eax, (%rcx,%rdx,4)
+.LQuit:
+  end;
+{$endif FPC_SYSTEM_HAS_FILLDWORD}
+
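The .L4to8 branch above covers any count from 4 to 8 dwords with four 8-byte stores, two anchored at the start and two at the end, overlapping in the middle instead of looping. A hypothetical Pascal model of that branch (the name and the {$POINTERMATH} usage are mine, not the commit's):

{$POINTERMATH ON}
procedure FillDWord4To8Sketch(p: PDWord; count: SizeInt; value: DWord);
var
  pair: QWord;
begin
  Assert((count >= 4) and (count <= 8));
  pair := (QWord(value) shl 32) or value;  { two copies of the dword, like the shl $32 / or above }
  PQWord(p)^             := pair;          { elements 0..1 }
  PQWord(p + 2)^         := pair;          { elements 2..3 }
  PQWord(p + count - 4)^ := pair;          { elements count-4..count-3, may overlap the front pair }
  PQWord(p + count - 2)^ := pair;          { elements count-2..count-1 }
end;

The same front/back overlap shows up in the .L3to6 and .L3OrLess/.L2OrLess branches of the other routines.
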
+{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
+{$define FPC_SYSTEM_HAS_FILLQWORD}
+procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
+  asm
+{$ifdef win64}
     mov    %r8, %rax
-    and    $-16, %rax { rax = “T4” (possibly fictive) = aligned r8. }
-    cmp    $48, %rdx { 33~48 bytes might contain 1~2 heads+tails; write as H1 and T1. }
-    jle    .LOneAlignedTailWrite
-    movdqa %xmm0, 32(%rcx) { Write H2. }
-    cmp    $80, %rdx  { 49~80 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
-    jle    .LTwoAlignedTailWrites
-    movdqa %xmm0, 48(%rcx) { Write H3. }
-    cmp    $112, %rdx  { 81~112 bytes might contain 4~6 heads+tails; write as H1–3 and T3–1. }
-    jle    .LThreeAlignedTailWrites
+{$else}
+    mov    %rdx, %rax
+    mov    %rsi, %rdx
+    mov    %rdi, %rcx
+{$endif win64}
 
-    add    $48, %rcx
-    cmp    $0x80000, %rdx
-    jae    .L64xNT_Body
+    cmp    $2, %rdx
+    jle    .L2OrLess
+    cmp    $6, %rdx
+    jle    .L3to6
 
-.balign 16
-.L64x_Body:
-    movdqa %xmm0, (%rcx)
-    movdqa %xmm0, 16(%rcx)
-    movdqa %xmm0, 32(%rcx)
-    movdqa %xmm0, 48(%rcx)
-    add    $64, %rcx
-    cmp    %r8, %rcx
-    jb     .L64x_Body
+    movq   %rax, %xmm0
+    pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%rcx)
 
-.LLoopEnd:
-    movdqa %xmm0, (%rax)
-.LThreeAlignedTailWrites:
-    movdqa %xmm0, 16(%rax)
-.LTwoAlignedTailWrites:
-    movdqa %xmm0, 32(%rax)
-.LOneAlignedTailWrite:
-    movdqa %xmm0, 48(%rax)
-.LLastVec:
-    movdqu %xmm0, 48(%r8)
+    shl    $3, %rdx { rdx = byte count }
+    mov    %rcx, %r8
+    shl    $3, %ecx
+    rol    %cl, %rax { misalign the pattern by the misalignment of x }
+    mov    %r8, %rcx
+    movq   %rax, %xmm1
+    pshufd $0b01000100, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
+    jmp    FillXxxx_MoreThanTwoXmms
+
+.L3to6:
+    mov    %rax, (%rcx)
+    mov    %rax, 8(%rcx)
+    mov    %rax, 16(%rcx)
+    mov    %rax, -24(%rcx,%rdx,8)
+    mov    %rax, -16(%rcx,%rdx,8)
+    mov    %rax, -8(%rcx,%rdx,8)
     ret
 
-.balign 16
-.L64xNT_Body:
-    movntdq %xmm0, (%rcx)
-    movntdq %xmm0, 16(%rcx)
-    movntdq %xmm0, 32(%rcx)
-    movntdq %xmm0, 48(%rcx)
-    add    $64, %rcx
-    cmp    %r8, %rcx
-    jb     .L64xNT_Body
-    mfence
-    jmp    .LLoopEnd
+.L2OrLess:
+    test   %rdx, %rdx
+    jle    .LQuit
+    mov    %rax, (%rcx)
+    mov    %rax, -8(%rcx,%rdx,8)
+.LQuit:
   end;
-{$endif FPC_SYSTEM_HAS_FILLCHAR}
+{$endif FPC_SYSTEM_HAS_FILLQWORD}
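
Since FillChar, FillWord, FillDWord and FillQWord now funnel through one helper, a brute-force comparison against the expected contents is a cheap way to exercise the head/tail bookkeeping for every small count. A hypothetical smoke test, not part of the commit:

program FillQWordSmokeTest;
var
  buf: array[0..63] of QWord;
  n: SizeInt;
  i: Integer;
begin
  for n := 0 to Length(buf) do
  begin
    FillQWord(buf, Length(buf), 0);               { clear everything }
    FillQWord(buf, n, QWord($123456789ABCDEF0));  { fill the first n qwords }
    for i := 0 to High(buf) do
      if (buf[i] <> 0) <> (i < n) then
        WriteLn('mismatch: count=', n, ' index=', i);
  end;
  WriteLn('done');
end.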
 
 {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
 {$define FPC_SYSTEM_HAS_INDEXBYTE}