@@ -1307,11 +1307,14 @@ asm
end;
{$endif ndef CPUX86_HAS_SSE2}

-function {$ifdef CPUX86_HAS_SSE2} CompareByte {$else} CompareByte_SSE2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
+label
+ CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;
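+ { Plain Pascal labels instead of '.L' locals: CompareByte_AVX2 below jumps into these tails of CompareByte_SSE2. }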
+
+function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
{ eax = buf1, edx = buf2, ecx = len }
cmp $1, %ecx
- jle .L1OrLess
+ jle CompareByte_1OrLess

push %ebx
cmp $16, %ecx
@@ -1345,6 +1348,24 @@ asm
xor %eax, %eax
ret

+.LAligned32xLoop_TwoVectorsDiffer:
+ add %eax, %edx { restore edx = buf2 }
+ pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
+ inc %cx
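+ { pcmpeqb sets a byte lane to $FF when equal, so the 16-bit mask is all ones iff the vectors match; mask + 1 is then zero, and otherwise its lowest set bit marks the first differing byte. }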
+ jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) + 1 from the loop body's 'inc %bx'. }
+ mov %ecx, %ebx
+.LVec0Differs:
+ bsf %ebx, %ebx
+ movzbl (%eax,%ebx), %eax
+ movzbl (%edx,%ebx), %edx
+ sub %edx, %eax
+ pop %ebx
+ ret
+
+ .byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
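+ { 144 = $90 = nop; with this extra byte the code above is sized so that .balign 16 pads nothing. }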
+CompareByte_CantOverReadBoth_AVX2:
+ cmp $16, %ecx
+ jb .LCantOverReadBoth
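+ { Entered from CompareByte_AVX2 with 2 to 31 bytes too close to a page end: 16 to 31 bytes fall through and are compared as the first and the (overlapping) last XMM vector, shorter inputs take the byte-wise tail. }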
.LVecOrMore:
{ Compare first vectors. }
movdqu (%eax), %xmm0
@@ -1354,7 +1375,7 @@ asm
inc %bx
jnz .LVec0Differs

- sub $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
+ sub $32, %ecx { now ecx is len - 32. }
jbe .LLastVec

{ Compare second vectors. }
@@ -1365,41 +1386,10 @@ asm
inc %bx
jnz .LVec1Differs

- { More than four vectors: aligned loop. }
cmp $32, %ecx
- ja .LAligned32xLoop_Prepare
-
- { Compare last two vectors. }
- movdqu (%eax,%ecx), %xmm0
- movdqu (%edx,%ecx), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %ebx
- inc %bx
- jnz .LVecEm2Differs
-.LLastVec:
- movdqu 16(%eax,%ecx), %xmm0
- movdqu 16(%edx,%ecx), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %ebx
- inc %bx
- jnz .LVecEm1Differs
- pop %ebx
- xor %eax, %eax
- ret
+ jbe .LLastTwoVectors

-.LVecEm2Differs:
- sub $16, %ecx
-.LVecEm1Differs:
- bsf %ebx, %ebx
- add %ecx, %ebx
- movzbl 16(%eax,%ebx), %eax
- movzbl 16(%edx,%ebx), %edx
- sub %edx, %eax
- pop %ebx
- ret
- nop { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
-
-.LAligned32xLoop_Prepare:
+ { More than four vectors: aligned loop. }
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
sub %eax, %edx { edx = buf2 - buf1 }
and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
@@ -1418,40 +1408,33 @@ asm
jnz .LAligned32xLoop_TwoVectorsDiffer
sub $32, %ecx
ja .LAligned32xLoop_Body
-
- { Compare last two vectors after the loop by doing one more loop iteration, modified. }
- lea 32(%eax,%ecx), %eax
- movdqu (%edx,%eax), %xmm0
- movdqu (%eax), %xmm2
- pcmpeqb %xmm2, %xmm0
- movdqu 16(%edx,%eax), %xmm1
- movdqu 16(%eax), %xmm2
- pcmpeqb %xmm2, %xmm1
- pand %xmm0, %xmm1
- pmovmskb %xmm1, %ebx
+ add %eax, %edx { restore edx = buf2 }
+ add $32, %ecx
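+ { The loop exits with ecx in (-32, 0]; after the +32 correction, (%eax,%ecx) addresses the last 32 bytes, buf1 + len - 32. }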
+.LLastTwoVectors:
+ movdqu (%eax,%ecx), %xmm0
+ movdqu (%edx,%ecx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %ebx
inc %bx
- jnz .LAligned32xLoop_TwoVectorsDiffer
+ jnz .LVecEm2Differs
+.LLastVec:
+ movdqu 16(%eax,%ecx), %xmm0
+ movdqu 16(%edx,%ecx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %ebx
+ inc %bx
+ jnz .LVecEm1Differs
pop %ebx
xor %eax, %eax
ret

-.LAligned32xLoop_TwoVectorsDiffer:
- add %eax, %edx { restore edx = buf2 }
- pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
- inc %cx
- jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
- bsf %ecx, %ebx
- movzbl (%eax,%ebx), %eax
- movzbl (%edx,%ebx), %edx
- sub %edx, %eax
- pop %ebx
- ret
-
.LVec1Differs:
- add $16, %eax
- add $16, %edx
-.LVec0Differs:
+ xor %ecx, %ecx
+.LVecEm1Differs:
+ add $16, %ecx
+.LVecEm2Differs:
bsf %ebx, %ebx
+ add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
@@ -1510,7 +1493,7 @@ asm
pop %ebx
ret

-.L1OrLess:
+CompareByte_1OrLess:
jl .LUnbounded_Prepare
movzbl (%eax), %eax
movzbl (%edx), %edx
@@ -1535,7 +1518,126 @@ asm
or $1, %eax
end;

-{$ifndef CPUX86_HAS_SSE2}
+function {$ifdef CPUX86_HAS_BMI1} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
+asm
+ { eax = buf1, edx = buf2, ecx = len }
+ cmp $1, %ecx
+ jle CompareByte_1OrLess
+
+ push %ebx
+ cmp $32, %ecx
+ jae .LVecOrMore
+
+ { 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses one fewer register and two fewer instructions. }
+ mov %eax, %ebx
+ or %edx, %ebx
+ and $4095, %ebx
+ cmp $4064, %ebx
+ ja CompareByte_CantOverReadBoth_AVX2
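+ { 4064 = 4096 - 32: if ((buf1 or buf2) and 4095) > 4064, one of the pointers may lie within 32 bytes of a page end, so a 32-byte load could fault. Or-ing the addresses is what makes the check pessimistic. }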
+
+ { Over-read both as YMMs. }
+ vmovdqu (%eax), %ymm0
+ vpcmpeqb (%edx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ { bzhi %ecx, %ebx, %ecx }
+ .byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
+ jnz .LVec0Differs
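+ { The bzhi clears mask bits at positions >= len, so a mismatch found only in the over-read tail is ignored; the jnz above consumed ZF as set by bzhi itself. }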
+ vzeroupper
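+ { Every exit that touched a ymm register issues vzeroupper first, avoiding AVX-to-SSE transition penalties in the caller. }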
+ pop %ebx
+ xor %eax, %eax
+ ret
+
+ .byte 144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
+.LAligned64xLoop_TwoVectorsDiffer:
+ add %eax, %edx { restore edx = buf2 }
+ vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
+ inc %ecx
+ jz .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) + 1 from the loop body's 'inc %ebx'. }
+ mov %ecx, %ebx
+.LVec0Differs:
+ vzeroupper
+ tzcnt %ebx, %ebx
+ movzbl (%eax,%ebx), %eax
+ movzbl (%edx,%ebx), %edx
+ sub %edx, %eax
+ pop %ebx
+ ret
+
+.LVecOrMore:
+ { Compare first vectors. }
+ vmovdqu (%eax), %ymm0
+ vpcmpeqb (%edx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVec0Differs
+
+ sub $64, %ecx { now ecx is len - 64. }
+ jbe .LLastVec
+
+ { Compare second vectors. }
+ vmovdqu 32(%eax), %ymm0
+ vpcmpeqb 32(%edx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVec1Differs
+
+ cmp $64, %ecx
+ jbe .LLastTwoVectors
+
+ { More than four vectors: aligned loop. }
+ lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
+ sub %eax, %edx { edx = buf2 - buf1 }
+ and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
+ sub %eax, %ecx { ecx = count to be handled with loop }
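+ { From here on only eax advances; buf2 is addressed as (%edx,%eax) with edx frozen at buf2 - buf1. }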
+.balign 16 { No-op. }
+.LAligned64xLoop_Body:
+ add $64, %eax
+ { Compare two YMMs, reduce the result with 'and'. }
+ vmovdqu (%edx,%eax), %ymm0
+ vpcmpeqb (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
+ vmovdqu 32(%edx,%eax), %ymm1
+ vpcmpeqb 32(%eax), %ymm1, %ymm1
+ vpand %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
+ vpmovmskb %ymm1, %ebx
+ inc %ebx
+ jnz .LAligned64xLoop_TwoVectorsDiffer
+ sub $64, %ecx
+ ja .LAligned64xLoop_Body
+ add %eax, %edx { restore edx = buf2 }
+ add $64, %ecx
+.LLastTwoVectors:
+ vmovdqu (%eax,%ecx), %ymm0
+ vpcmpeqb (%edx,%ecx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVecEm2Differs
+.LLastVec:
+ vmovdqu 32(%eax,%ecx), %ymm0
+ vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVecEm1Differs
+ vzeroupper
+ pop %ebx
+ xor %eax, %eax
+ ret
+
+.LVec1Differs:
+ xor %ecx, %ecx
+.LVecEm1Differs:
+ add $32, %ecx
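+ { Fall-through ladder: .LVec1Differs is the second vector of the first pair (base 32), .LVecEm1Differs the last vector (base ecx + 32), .LVecEm2Differs the one before it (base ecx); the tzcnt below supplies the lane index. }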
+.LVecEm2Differs:
+ vzeroupper
+ tzcnt %ebx, %ebx
+ add %ecx, %ebx
+ movzbl (%eax,%ebx), %eax
+ movzbl (%edx,%ebx), %edx
+ sub %edx, %eax
+ pop %ebx
+end;
+
+{$ifndef CPUX86_HAS_BMI1}
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
@@ -1544,11 +1646,15 @@ var
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
  if not fpc_cpucodeinit_performed then
-    exit(CompareByte_Plain(buf1, buf2, len));
-  if has_sse2_support then
+    exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len));
+  if has_avx2_support then
+    CompareByte_Impl:=@CompareByte_AVX2
+  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    CompareByte_Impl:=@CompareByte_SSE2
+{$ifndef CPUX86_HAS_SSE2}
  else
-    CompareByte_Impl:=@CompareByte_Plain;
+    CompareByte_Impl:=@CompareByte_Plain
+{$endif};
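+  { Until fpc_cpucodeinit_performed is set, fall back without repointing CompareByte_Impl; afterwards it is assigned once and subsequent calls bypass this dispatcher. }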
  result:=CompareByte_Impl(buf1, buf2, len);
end;

@@ -1556,7 +1662,7 @@ function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareByte_Impl(buf1, buf2, len);
end;
-{$endif ndef CPUX86_HAS_SSE2 (need CompareByte dispatcher)}
+{$endif ndef CPUX86_HAS_BMI1 (need CompareByte dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}