
Supposedly faster FillChar for x64.

Rika Ichinose, 2 years ago
parent commit b56cbad50e
1 changed file with 88 additions and 90 deletions

+ 88 - 90
rtl/x86_64/x86_64.inc

@@ -362,107 +362,105 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
     mov    %rdi, %rcx
 {$endif win64}
 
-    cmp    $8, %rdx
-    jl     .Ltiny
+    mov    $0x01010101, %r9d
+    movzbl %r8b, %eax
+    imul   %r9d, %eax
+
+    cmp    $16, %rdx
+    jge    .LVecOrMore
+    cmp    $3, %rdx
+    jle    .L3OrLess
+
+    mov    %eax, (%rcx)
+    cmp    $8, %edx
+    jle    .LLast4
+    mov    %eax, 4(%rcx)
+    mov    %eax, -8(%rcx,%rdx)
+.LLast4:
+    mov    %eax, -4(%rcx,%rdx)
+    ret
 
-// TODO: movz?q and movs?q are not accepted by FPC asmreader, it needs fixing.
-// `movzbl' instead is accepted and generates correct code with internal assembler,
-// but breaks targets using external GAS (Mantis #19188).
-// So use a different instruction for now.
+.L3OrLess:
+    test   %rdx, %rdx
+    jle    .LQuit
+    mov    %al, (%rcx)
+    mov    %al, -1(%rcx,%rdx)
+    shr    $1, %edx
+    mov    %al, (%rcx,%rdx)
+.LQuit:
+    ret
 
-    { expand byte value  }
-    andq   $0xff, %r8
-{
-    movzbq %r8b, %r8
-}
-    mov    $0x0101010101010101,%r9
-    imul   %r9, %r8
-
-    test   $7, %cl
-    je     .Laligned
-
-    { align dest to 8 bytes }
-    test   $1, %cl
-    je     .L2
-    movb   %r8b, (%rcx)
-    add    $1, %rcx
-    sub    $1, %rdx
-.L2:
-    test   $2, %cl
-    je     .L4
-    movw   %r8w, (%rcx)
-    add    $2, %rcx
-    sub    $2, %rdx
-.L4:
-    test   $4, %cl
-    je     .Laligned
-    movl   %r8d, (%rcx)
-    add    $4, %rcx
-    sub    $4, %rdx
+.balign 16
+.LVecOrMore:
+    movd   %eax, %xmm0
+    pshufd $0, %xmm0, %xmm0
 
-.Laligned:
-    mov    %rdx, %rax
-    and    $0x3f, %rdx
-    shr    $6, %rax
-    jne    .Lmore64
+    { x can start and end aligned or misaligned on the vector boundary:
 
-.Lless64:
-    mov    %rdx, %rax
-    and    $7, %rdx
-    shr    $3, %rax
-    je     .Ltiny
+      x = [UH][H1][H2][...][T2][T1]
+      x = UH][H1][H2][...][T2][T1][UT
 
-    .balign 16
-.Lloop8:                               { max. 8 iterations }
-    mov    %r8, (%rcx)
-    add    $8, %rcx
-    dec    %rax
-    jne    .Lloop8
-.Ltiny:
-    test   %rdx, %rdx
-    jle    .Lquit
-.Lloop1:
-    movb   %r8b, (%rcx)
-    inc    %rcx
-    dec    %rdx
-    jnz    .Lloop1
-.Lquit:
-    retq
+      UH ("unaligned head") is written, potentially overlapping with H1, with the 'movdqu'. Has 1–16 bytes.
+      H1 and so on are “heads”.
+      T1 and so on are “tails”.
+      UT ("unaligned tail") is written with another 'movdqu' after the loop. Has 0–15 bytes. }
 
-.Lmore64:
-    cmp    $0x2000,%rax
-    jae    .Lloop64nti
+    movdqu %xmm0, (%rcx)
+    lea    -64(%rcx,%rdx), %r8 { r8 = end of x - 64, loop bound }
 
-    .balign 16
-.Lloop64:
+    cmp    $32, %rdx
+    jle    .LLastVec
+
+    and    $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
+    movdqa %xmm0, 16(%rcx) { Write H1. }
+    mov    %r8, %rax
+    and    $-16, %rax { rax = “T4” (possibly fictive) = aligned r8. }
+    cmp    $48, %rdx { 33~48 bytes might contain 1~2 heads+tails; write as H1 and T1. }
+    jle    .LOneAlignedTailWrite
+    movdqa %xmm0, 32(%rcx) { Write H2. }
+    cmp    $80, %rdx  { 49~80 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
+    jle    .LTwoAlignedTailWrites
+    movdqa %xmm0, 48(%rcx) { Write H3. }
+    cmp    $112, %rdx  { 81~112 bytes might contain 4~6 heads+tails; write as H1–3 and T3–1. }
+    jle    .LThreeAlignedTailWrites
+
+    add    $48, %rcx
+    cmp    $0x80000, %rdx
+    jae    .L64xNT_Body
+
+.balign 16
+.L64x_Body:
+    movdqa %xmm0, (%rcx)
+    movdqa %xmm0, 16(%rcx)
+    movdqa %xmm0, 32(%rcx)
+    movdqa %xmm0, 48(%rcx)
     add    $64, %rcx
-    mov    %r8, -64(%rcx)
-    mov    %r8, -56(%rcx)
-    mov    %r8, -48(%rcx)
-    mov    %r8, -40(%rcx)
-    dec    %rax
-    mov    %r8, -32(%rcx)
-    mov    %r8, -24(%rcx)
-    mov    %r8, -16(%rcx)
-    mov    %r8, -8(%rcx)
-    jne    .Lloop64
-    jmp    .Lless64
+    cmp    %r8, %rcx
+    jb     .L64x_Body
+
+.LLoopEnd:
+    movdqa %xmm0, (%rax)
+.LThreeAlignedTailWrites:
+    movdqa %xmm0, 16(%rax)
+.LTwoAlignedTailWrites:
+    movdqa %xmm0, 32(%rax)
+.LOneAlignedTailWrite:
+    movdqa %xmm0, 48(%rax)
+.LLastVec:
+    movdqu %xmm0, 48(%r8)
+    ret
 
-    .balign 16
-.Lloop64nti:
+.balign 16
+.L64xNT_Body:
+    movntdq %xmm0, (%rcx)
+    movntdq %xmm0, 16(%rcx)
+    movntdq %xmm0, 32(%rcx)
+    movntdq %xmm0, 48(%rcx)
     add    $64, %rcx
-    movnti %r8, -64(%rcx)
-    movnti %r8, -56(%rcx)
-    movnti %r8, -48(%rcx)
-    movnti %r8, -40(%rcx)
-    dec    %rax
-    movnti %r8, -32(%rcx)
-    movnti %r8, -24(%rcx)
-    movnti %r8, -16(%rcx)
-    movnti %r8, -8(%rcx)
-    jnz    .Lloop64nti
+    cmp    %r8, %rcx
+    jb     .L64xNT_Body
     mfence
-    jmp    .Lless64
+    jmp    .LLoopEnd
   end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}
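
For reference, the new scalar paths (counts below 16) map to plain code as follows. This is a minimal C sketch written for this page, not code from the commit: the name fill_small_c is invented, and memcpy is used for the 4-byte stores to stay within standard C (a compiler turns it into a single mov, as in the assembly). The 1 to 3 byte case writes the first, last and middle byte unconditionally, so counts 1 and 2 simply produce overlapping stores and no further branching is needed.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the < 16 byte paths; n < 16 is assumed, the caller dispatches
   larger counts to the vector path (cmp $16,%rdx; jge .LVecOrMore). */
static void fill_small_c(unsigned char *x, size_t n, unsigned char value)
{
    uint32_t pat = value * 0x01010101u;    /* movzbl %r8b,%eax + imul $0x01010101 */

    if (n >= 4) {                          /* 4..15 bytes: 2 or 4 overlapping dword stores */
        memcpy(x, &pat, 4);                /* mov %eax,(%rcx) */
        if (n > 8) {
            memcpy(x + 4, &pat, 4);        /* mov %eax,4(%rcx) */
            memcpy(x + n - 8, &pat, 4);    /* mov %eax,-8(%rcx,%rdx) */
        }
        memcpy(x + n - 4, &pat, 4);        /* mov %eax,-4(%rcx,%rdx) */
        return;
    }
    if (n == 0)                            /* test %rdx,%rdx; jle .LQuit */
        return;
    /* 1..3 bytes, branch-free: first, last and middle byte. */
    x[0] = value;                          /* mov %al,(%rcx) */
    x[n - 1] = value;                      /* mov %al,-1(%rcx,%rdx) */
    x[n >> 1] = value;                     /* shr $1,%edx; mov %al,(%rcx,%rdx) */
}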
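The vector path described by the UH/H1/.../T1/UT comment can be sketched the same way. Again this is an illustration under stated assumptions, not the committed code: fill_vec_c and NT_THRESHOLD are invented names, the body loop stores 16 bytes per iteration instead of the unrolled 64, and the T1..T3 aligned tail writes are folded into the loop bound; 0x80000 is the threshold the assembly compares against before switching to movntdq.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

#define NT_THRESHOLD 0x80000   /* cmp $0x80000,%rdx in the diff */

/* Sketch of the >= 16 byte path; count >= 16 is assumed
   (guarded by cmp $16,%rdx; jge .LVecOrMore in the assembly). */
static void fill_vec_c(unsigned char *x, size_t count, unsigned char value)
{
    __m128i v = _mm_set1_epi8((char)value);             /* movd %eax,%xmm0 + pshufd $0 */

    _mm_storeu_si128((__m128i *)x, v);                   /* unaligned head UH */
    _mm_storeu_si128((__m128i *)(x + count - 16), v);    /* unaligned tail UT */
    if (count <= 32)
        return;                                          /* UH and UT already cover everything */

    /* Aligned body: start just past UH, rounded down to 16; stop once the
       remaining bytes fit inside UT. Overlap with UH and UT is harmless. */
    unsigned char *p    = (unsigned char *)(((uintptr_t)x + 16) & ~(uintptr_t)15);
    unsigned char *stop = x + count - 16;

    if (count < NT_THRESHOLD) {
        for (; p < stop; p += 16)
            _mm_store_si128((__m128i *)p, v);            /* movdqa */
    } else {
        for (; p < stop; p += 16)
            _mm_stream_si128((__m128i *)p, v);           /* movntdq, bypasses the cache */
        _mm_mfence();                                    /* the assembly issues mfence here */
    }
}

The point of the overlapping movdqu head and tail is that no byte-granular alignment prologue or epilogue is needed: misalignment costs at most two redundant partial stores, while every store in the hot loop is 16-byte aligned.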