@@ -199,10 +199,8 @@ asm
 end;
 {$endif FillChar/Word/DWord required.}

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
 label
   FillXxxx_MoreThanTwoXMMs;
-{$endif FillQWord required.}

 procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
 { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
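
The register contract above takes a ready-made uint32 pattern, so the byte- and word-sized entry points have to widen their value before jumping in. A minimal sketch of that widening, assuming the usual multiply-to-replicate idiom (function names invented for illustration):

function PatternOfByte(value: byte): uint32; inline;
begin
  PatternOfByte := uint32(value) * $01010101; { replicate into all 4 bytes }
end;

function PatternOfWord(value: word): uint32; inline;
begin
  PatternOfWord := uint32(value) * $00010001; { replicate into both words }
end;
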
@@ -212,11 +210,11 @@ asm
     movd   %ecx, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
     cmp    $32, %edx
     ja     .LMoreThanTwoVectors
-    movdqu %xmm0, -16(%eax,%edx)
     ret
-    .byte  102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
+    .byte  102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }

{ x can start and end misaligned on the vector boundary:
  x = ~~][H1][H2][...][T2][T1]~
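
Hoisting the second movdqu above the `cmp $32` means both exits can now assume the first and the last 16 bytes are written: the 17..32-byte case is complete after just these two possibly overlapping stores, and the long path no longer needs a separate unaligned tail write. A plain-Pascal stand-in for the idea (illustrative only; assumes pattern16 holds at least 16 bytes):

{$POINTERMATH ON}
procedure FillWithTwoVectors(var x; count: SizeUint; const pattern16: array of byte);
var
  p: PByte;
begin
  { Valid for 16 <= count <= 32; the two writes may overlap, which is
    harmless because they store the same pattern. }
  p := PByte(@x);
  Move(pattern16[0], p[0], 16);          { movdqu %xmm0, (%eax) }
  Move(pattern16[0], p[count - 16], 16); { movdqu %xmm0, -16(%eax,%edx) }
end;
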
@@ -228,22 +226,18 @@ asm
     mov    %ecx, %esi { esi = pattern }
     mov    %eax, %ecx
     shl    $3, %ecx { ecx = misalignment of x in bits }
-    rol    %cl, %esi { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
+    rol    %cl, %esi { misalign the pattern }
     movd   %esi, %xmm1
     pshufd $0, %xmm1, %xmm1
+    pop    %esi

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
-{ FillQWord jumps here.
-  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes.
-  Expects first 16 bytes written...
-  ...and ESI pushed! }
+{ FillChar (to skip the misaligning above) and FillQWord jump here.
+  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
-{$endif FillQWord required.}
-    lea    -65(%eax,%edx), %ecx { ecx = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
+    lea    -65(%eax,%edx), %ecx
+    and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
     and    $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
     movdqa %xmm1, 16(%eax) { Write H1. }
-    mov    %ecx, %esi
-    and    $-16, %esi { esi = “T4” (possibly fictive) = aligned ecx = loop bound. }
     cmp    $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
     jle    .LOneAlignedTailWrite
     movdqa %xmm1, 32(%eax) { Write H2. }
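
The `rol %cl, %esi` above rotates the dword pattern left by (x mod 4)*8 bits (rol only looks at cl mod 32), so that storing the rotated pattern at 16-aligned addresses lays down exactly the bytes an unaligned repeating pattern starting at x would have. A runnable, illustrative check of that byte algebra:

program RolPatternDemo;
var
  pattern, rotated, k, i: uint32;
  ok: boolean;
begin
  pattern := $AABBCCDD;
  ok := true;
  for k := 0 to 3 do { k = x mod 4, the misalignment in bytes }
  begin
    rotated := RolDWord(pattern, k * 8);
    { Byte i of the rotated dword must equal byte (i - k) mod 4 of the
      original pattern, i.e. what the unaligned repeating fill leaves at
      aligned offset i. }
    for i := 0 to 3 do
      if (rotated shr (i * 8)) and $FF <>
         (pattern shr (((i + 4 - k) mod 4) * 8)) and $FF then
        ok := false;
  end;
  WriteLn(ok); { TRUE }
end.
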
@@ -256,24 +250,22 @@ FillXxxx_MoreThanTwoXMMs:
     cmp    $NtThreshold, %edx
     jae    .L64xNT_Body

-.balign 16
+.balign 16 { no-op }
 .L64x_Body:
     movdqa %xmm1, (%eax)
     movdqa %xmm1, 16(%eax)
     movdqa %xmm1, 32(%eax)
     movdqa %xmm1, 48(%eax)
     add    $64, %eax
-    cmp    %esi, %eax
+    cmp    %ecx, %eax
     jb     .L64x_Body
 .LFourAlignedTailWrites:
-    movdqa %xmm1, (%esi) { T4 }
-    movdqa %xmm1, 16(%esi) { T3 }
+    movdqa %xmm1, (%ecx) { T4 }
+    movdqa %xmm1, 16(%ecx) { T3 }
 .LTwoAlignedTailWrites:
-    movdqa %xmm1, 32(%esi) { T2 }
+    movdqa %xmm1, 32(%ecx) { T2 }
 .LOneAlignedTailWrite:
-    movdqa %xmm1, 48(%esi) { T1 }
-    movdqu %xmm0, 49(%ecx) { UT }
-    pop    %esi
+    movdqa %xmm1, 48(%ecx) { T1 }
     ret

 .balign 16
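
For reference, the arithmetic behind “T4”: the lea/and pair computes T4 = (x + count - 65) and not 15, the highest 16-aligned address whose four tail stores (T4 .. T4+48) still end strictly before the final byte; the 1..16 bytes past T4+64 are exactly what the unaligned store at x+count-16 already covered on entry. A hedged sketch plus one worked instance (names invented):

function TailBase(x, count: SizeUint): SizeUint; inline;
begin
  { lea -65(%eax,%edx), %ecx ; and $-16, %ecx }
  TailBase := (x + count - 65) and not SizeUint(15);
end;

{ Worked instance: x = $1003, count = 200 -> x + count = $10CB, so
  TailBase = $108A and not 15 = $1080. The four tail writes cover
  $1080..$10BF, and the entry movdqu at x + count - 16 = $10BB covers
  $10BB..$10CA, the true last byte. }
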
@@ -283,7 +275,7 @@ FillXxxx_MoreThanTwoXMMs:
     movntdq %xmm1, 32(%eax)
     movntdq %xmm1, 48(%eax)
     add    $64, %eax
-    cmp    %esi, %eax
+    cmp    %ecx, %eax
     jb     .L64xNT_Body
     sfence
     jmp    .LFourAlignedTailWrites
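
At or above NtThreshold the same loop runs with movntdq, which writes around the cache, and the sfence makes those stores globally visible before normal code resumes. A self-contained FPC sketch of the idea (zero fill only; invented name, not the RTL routine; assumes the default register convention with dest in eax and bytes in edx, dest 16-aligned, bytes a positive multiple of 64):

{$ASMMODE ATT}
procedure ZeroFillNT(dest: Pointer; bytes: SizeUint); assembler; nostackframe;
asm
    pxor    %xmm0, %xmm0      { xmm0 = 16 zero bytes }
    add     %eax, %edx        { edx = one past the end }
.LLoop:
    movntdq %xmm0, (%eax)     { non-temporal: bypass the cache }
    movntdq %xmm0, 16(%eax)
    movntdq %xmm0, 32(%eax)
    movntdq %xmm0, 48(%eax)
    add     $64, %eax
    cmp     %edx, %eax
    jb      .LLoop
    sfence                    { publish NT stores before returning }
end;
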
@@ -369,8 +361,15 @@ asm
     cmp    $16, %edx
     jbe    FillXxxx_U32Pattern_Ladder_4to16
     cmp    $FillXxxx_RepStosThreshold_NoERMS, %edx
-    jb     FillXxxx_U32Pattern_SSE2_16OrMore
-    jmp    FillXxxx_U32Pattern_RepStos_8OrMore
+    jae    FillXxxx_U32Pattern_RepStos_8OrMore
+
+    movd   %ecx, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
+    movdqa %xmm0, %xmm1
+    cmp    $32, %edx
+    ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
@@ -383,8 +382,15 @@ asm
     cmp    $16, %edx
     jbe    FillXxxx_U32Pattern_Ladder_4to16
     cmp    $FillXxxx_RepStosThreshold_ERMS, %edx
-    jb     FillXxxx_U32Pattern_SSE2_16OrMore
-    jmp    FillXxxx_U32Pattern_RepStos_8OrMore
+    jae    FillXxxx_U32Pattern_RepStos_8OrMore
+
+    movd   %ecx, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
+    movdqa %xmm0, %xmm1
+    cmp    $32, %edx
+    ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
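
The two dispatchers above share one branch shape and differ only in the rep-stos threshold (NoERMS vs ERMS). A hedged outline of that shape, with an invented constant standing in for the tuned RTL thresholds:

program DispatchShape;
const
  RepStosThreshold = 512 * 1024; { stand-in for FillXxxx_RepStosThreshold_(No)ERMS }

procedure Classify(count: SizeInt);
begin
  if count <= 16 then
    WriteLn(count, ': FillXxxx_U32Pattern_Ladder_4to16')
  else if count >= RepStosThreshold then
    WriteLn(count, ': FillXxxx_U32Pattern_RepStos_8OrMore')
  else if count <= 32 then
    WriteLn(count, ': two overlapping movdqu, done')
  else
    WriteLn(count, ': two movdqu, then FillXxxx_MoreThanTwoXMMs');
end;

begin
  Classify(8);
  Classify(24);
  Classify(100);
  Classify(1 shl 20);
end.
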
@@ -599,14 +605,14 @@ asm
     punpcklqdq %xmm0, %xmm0
     { Stack is 12 bytes:
       [esp] = return address, [esp + 4] = value (not required anymore).
-      Convert to 8 bytes expected by FillXxxx_MoreThanTwoXMMs:
-      [esp] = esi, [esp + 4] = return address. }
+      Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
+      [esp] = return address. }
     mov    (%esp), %ecx
-    add    $4, %esp
-    mov    %esi, (%esp)
-    mov    %ecx, 4(%esp)
+    add    $8, %esp
+    mov    %ecx, (%esp)
     shl    $3, %edx
     movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
     movdqa %xmm0, %xmm1
     test   $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
     jz     FillXxxx_MoreThanTwoXMMs
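
With ESI no longer expected on the stack, FillQWord's entry only has to drop its 8-byte value argument and leave the return address on top before falling into the shared path. A hedged regression sketch that checks the routine against a plain-Pascal reference across start alignments and counts:

program FillQWordCheck;
{$POINTERMATH ON}
uses
  SysUtils; { CompareMem }
var
  buf, ref: array[0..271] of byte;
  ofs, cnt, i: SizeInt;
  v: QWord;
  p: PQWord;
begin
  v := QWord($1122334455667788);
  for ofs := 0 to 15 do        { every byte misalignment of the start }
    for cnt := 0 to 24 do      { 0..24 qwords, crossing the <=32-byte paths }
    begin
      FillChar(buf, SizeOf(buf), $CC); { canary bytes around the fill }
      FillChar(ref, SizeOf(ref), $CC);
      FillQWord(buf[ofs], cnt, v);
      p := PQWord(@ref[ofs]);
      for i := 0 to cnt - 1 do
        p[i] := v;             { reference fill }
      if not CompareMem(@buf, @ref, SizeOf(buf)) then
        WriteLn('FAIL ofs=', ofs, ' cnt=', cnt);
    end;
  WriteLn('done');
end.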