@@ -199,10 +199,8 @@ asm
 end;
 {$endif FillChar/Word/DWord required.}

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
 label
   FillXxxx_MoreThanTwoXMMs;
-{$endif FillQWord required.}

 procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
 { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
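
The register contract above takes a ready-made uint32 pattern, so the byte- and word-sized entry points have to widen their value before jumping in. A minimal sketch of that widening, assuming the usual multiply-to-replicate idiom (function names invented for illustration):

function PatternOfByte(value: byte): uint32; inline;
begin
  PatternOfByte := uint32(value) * $01010101; { replicate into all 4 bytes }
end;

function PatternOfWord(value: word): uint32; inline;
begin
  PatternOfWord := uint32(value) * $00010001; { replicate into both words }
end;
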
@@ -212,11 +210,11 @@ asm
     movd   %ecx, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
     cmp    $32, %edx
     ja     .LMoreThanTwoVectors
-    movdqu %xmm0, -16(%eax,%edx)
     ret
-    .byte  102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
+    .byte  102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }

{ x can start and end misaligned on the vector boundary:
  x = ~~][H1][H2][...][T2][T1]~
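
Hoisting the second movdqu above the `cmp $32` means both exits can now assume the first and the last 16 bytes are written: the 17..32-byte case is complete after just these two possibly overlapping stores, and the long path no longer needs a separate unaligned tail write. A plain-Pascal stand-in for the idea (illustrative only; assumes pattern16 holds at least 16 bytes):

{$POINTERMATH ON}
procedure FillWithTwoVectors(var x; count: SizeUint; const pattern16: array of byte);
var
  p: PByte;
begin
  { Valid for 16 <= count <= 32; the two writes may overlap, which is
    harmless because they store the same pattern. }
  p := PByte(@x);
  Move(pattern16[0], p[0], 16);          { movdqu %xmm0, (%eax) }
  Move(pattern16[0], p[count - 16], 16); { movdqu %xmm0, -16(%eax,%edx) }
end;
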
@@ -228,22 +226,18 @@ asm
     mov    %ecx, %esi { esi = pattern }
     mov    %eax, %ecx
     shl    $3, %ecx { ecx = misalignment of x in bits }
-    rol    %cl, %esi { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
+    rol    %cl, %esi { misalign the pattern }
     movd   %esi, %xmm1
     pshufd $0, %xmm1, %xmm1
+    pop    %esi

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
-{ FillQWord jumps here.
-  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes.
-  Expects first 16 bytes written...
-  ...and ESI pushed! }
+{ FillChar (to skip the misaligning above) and FillQWord jump here.
+  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
-{$endif FillQWord required.}
-    lea    -65(%eax,%edx), %ecx { ecx = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
+    lea    -65(%eax,%edx), %ecx
+    and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
     and    $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
     movdqa %xmm1, 16(%eax) { Write H1. }
-    mov    %ecx, %esi
-    and    $-16, %esi { esi = “T4” (possibly fictive) = aligned ecx = loop bound. }
     cmp    $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
     jle    .LOneAlignedTailWrite
     movdqa %xmm1, 32(%eax) { Write H2. }
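
The `rol %cl, %esi` above rotates the dword pattern left by (x mod 4)*8 bits (rol only looks at cl mod 32), so that storing the rotated pattern at 16-aligned addresses lays down exactly the bytes an unaligned repeating pattern starting at x would have. A runnable, illustrative check of that byte algebra:

program RolPatternDemo;
var
  pattern, rotated, k, i: uint32;
  ok: boolean;
begin
  pattern := $AABBCCDD;
  ok := true;
  for k := 0 to 3 do { k = x mod 4, the misalignment in bytes }
  begin
    rotated := RolDWord(pattern, k * 8);
    { Byte i of the rotated dword must equal byte (i - k) mod 4 of the
      original pattern, i.e. what the unaligned repeating fill leaves at
      aligned offset i. }
    for i := 0 to 3 do
      if (rotated shr (i * 8)) and $FF <>
         (pattern shr (((i + 4 - k) mod 4) * 8)) and $FF then
        ok := false;
  end;
  WriteLn(ok); { TRUE }
end.
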
@@ -256,24 +250,22 @@ FillXxxx_MoreThanTwoXMMs:
     cmp    $NtThreshold, %edx
     jae    .L64xNT_Body

-.balign 16
+.balign 16 { no-op }
 .L64x_Body:
     movdqa %xmm1, (%eax)
     movdqa %xmm1, 16(%eax)
     movdqa %xmm1, 32(%eax)
     movdqa %xmm1, 48(%eax)
     add    $64, %eax
-    cmp    %esi, %eax
+    cmp    %ecx, %eax
     jb     .L64x_Body
 .LFourAlignedTailWrites:
-    movdqa %xmm1, (%esi) { T4 }
-    movdqa %xmm1, 16(%esi) { T3 }
+    movdqa %xmm1, (%ecx) { T4 }
+    movdqa %xmm1, 16(%ecx) { T3 }
 .LTwoAlignedTailWrites:
-    movdqa %xmm1, 32(%esi) { T2 }
+    movdqa %xmm1, 32(%ecx) { T2 }
 .LOneAlignedTailWrite:
-    movdqa %xmm1, 48(%esi) { T1 }
-    movdqu %xmm0, 49(%ecx) { UT }
-    pop    %esi
+    movdqa %xmm1, 48(%ecx) { T1 }
     ret

 .balign 16
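
For reference, the arithmetic behind “T4”: the lea/and pair computes T4 = (x + count - 65) and not 15, the highest 16-aligned address whose four tail stores (T4 .. T4+48) still end strictly before the final byte; the 1..16 bytes past T4+64 are exactly what the unaligned store at x+count-16 already covered on entry. A hedged sketch plus one worked instance (names invented):

function TailBase(x, count: SizeUint): SizeUint; inline;
begin
  { lea -65(%eax,%edx), %ecx ; and $-16, %ecx }
  TailBase := (x + count - 65) and not SizeUint(15);
end;

{ Worked instance: x = $1003, count = 200 -> x + count = $10CB, so
  TailBase = $108A and not 15 = $1080. The four tail writes cover
  $1080..$10BF, and the entry movdqu at x + count - 16 = $10BB covers
  $10BB..$10CA, the true last byte. }
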
@@ -283,7 +275,7 @@ FillXxxx_MoreThanTwoXMMs:
     movntdq %xmm1, 32(%eax)
     movntdq %xmm1, 48(%eax)
     add    $64, %eax
-    cmp    %esi, %eax
+    cmp    %ecx, %eax
     jb     .L64xNT_Body
     sfence
     jmp    .LFourAlignedTailWrites
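
At or above NtThreshold the same loop runs with movntdq, which writes around the cache, and the sfence makes those stores globally visible before normal code resumes. A self-contained FPC sketch of the idea (zero fill only; invented name, not the RTL routine; assumes the default register convention with dest in eax and bytes in edx, dest 16-aligned, bytes a positive multiple of 64):

{$ASMMODE ATT}
procedure ZeroFillNT(dest: Pointer; bytes: SizeUint); assembler; nostackframe;
asm
    pxor    %xmm0, %xmm0      { xmm0 = 16 zero bytes }
    add     %eax, %edx        { edx = one past the end }
.LLoop:
    movntdq %xmm0, (%eax)     { non-temporal: bypass the cache }
    movntdq %xmm0, 16(%eax)
    movntdq %xmm0, 32(%eax)
    movntdq %xmm0, 48(%eax)
    add     $64, %eax
    cmp     %edx, %eax
    jb      .LLoop
    sfence                    { publish NT stores before returning }
end;
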
@@ -369,8 +361,15 @@ asm
     cmp    $16, %edx
     jbe    FillXxxx_U32Pattern_Ladder_4to16
     cmp    $FillXxxx_RepStosThreshold_NoERMS, %edx
-    jb     FillXxxx_U32Pattern_SSE2_16OrMore
-    jmp    FillXxxx_U32Pattern_RepStos_8OrMore
+    jae    FillXxxx_U32Pattern_RepStos_8OrMore
+
+    movd   %ecx, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
+    movdqa %xmm0, %xmm1
+    cmp    $32, %edx
+    ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
@@ -383,8 +382,15 @@ asm
     cmp    $16, %edx
     jbe    FillXxxx_U32Pattern_Ladder_4to16
     cmp    $FillXxxx_RepStosThreshold_ERMS, %edx
-    jb     FillXxxx_U32Pattern_SSE2_16OrMore
-    jmp    FillXxxx_U32Pattern_RepStos_8OrMore
+    jae    FillXxxx_U32Pattern_RepStos_8OrMore
+
+    movd   %ecx, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
+    movdqa %xmm0, %xmm1
+    cmp    $32, %edx
+    ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
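
The two dispatchers above share one branch shape and differ only in the rep-stos threshold (NoERMS vs ERMS). A hedged outline of that shape, with an invented constant standing in for the tuned RTL thresholds:

program DispatchShape;
const
  RepStosThreshold = 512 * 1024; { stand-in for FillXxxx_RepStosThreshold_(No)ERMS }

procedure Classify(count: SizeInt);
begin
  if count <= 16 then
    WriteLn(count, ': FillXxxx_U32Pattern_Ladder_4to16')
  else if count >= RepStosThreshold then
    WriteLn(count, ': FillXxxx_U32Pattern_RepStos_8OrMore')
  else if count <= 32 then
    WriteLn(count, ': two overlapping movdqu, done')
  else
    WriteLn(count, ': two movdqu, then FillXxxx_MoreThanTwoXMMs');
end;

begin
  Classify(8);
  Classify(24);
  Classify(100);
  Classify(1 shl 20);
end.
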
@@ -599,14 +605,14 @@ asm
     punpcklqdq %xmm0, %xmm0
     { Stack is 12 bytes:
       [esp] = return address, [esp + 4] = value (not required anymore).
-      Convert to 8 bytes expected by FillXxxx_MoreThanTwoXMMs:
-      [esp] = esi, [esp + 4] = return address. }
+      Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
+      [esp] = return address. }
     mov    (%esp), %ecx
-    add    $4, %esp
-    mov    %esi, (%esp)
-    mov    %ecx, 4(%esp)
+    add    $8, %esp
+    mov    %ecx, (%esp)
     shl    $3, %edx
     movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
     movdqa %xmm0, %xmm1
     test   $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
     jz     FillXxxx_MoreThanTwoXMMs
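
With ESI no longer expected on the stack, FillQWord's entry only has to drop its 8-byte value argument and leave the return address on top before falling into the shared path. A hedged regression sketch that checks the routine against a plain-Pascal reference across start alignments and counts:

program FillQWordCheck;
{$POINTERMATH ON}
uses
  SysUtils; { CompareMem }
var
  buf, ref: array[0..271] of byte;
  ofs, cnt, i: SizeInt;
  v: QWord;
  p: PQWord;
begin
  v := QWord($1122334455667788);
  for ofs := 0 to 15 do        { every byte misalignment of the start }
    for cnt := 0 to 24 do      { 0..24 qwords, crossing the <=32-byte paths }
    begin
      FillChar(buf, SizeOf(buf), $CC); { canary bytes around the fill }
      FillChar(ref, SizeOf(ref), $CC);
      FillQWord(buf[ofs], cnt, v);
      p := PQWord(@ref[ofs]);
      for i := 0 to cnt - 1 do
        p[i] := v;             { reference fill }
      if not CompareMem(@buf, @ref, SizeOf(buf)) then
        WriteLn('FAIL ofs=', ofs, ' cnt=', cnt);
    end;
  WriteLn('done');
end.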