2 years ago · 9917350ef0
--- a/rtl/i386/i386.inc
+++ b/rtl/i386/i386.inc
@@ -1307,11 +1307,14 @@ asm
 
															 end;
														
 
															 {$endif ndef CPUX86_HAS_SSE2}
														
 
															-function {$ifdef CPUX86_HAS_SSE2} CompareByte {$else} CompareByte_SSE2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
														
 
															+label
														
 
															+  CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;
														
 
															+
														
 
															+function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
														
 
															 asm
														
 
															         { eax = buf1, edx = buf2, ecx = len }
														
 
															         cmp      $1, %ecx
														
 
															-        jle      .L1OrLess
														
 
															+        jle      CompareByte_1OrLess
														
 
															         push     %ebx
														
 
															         cmp      $16, %ecx
														
@@ -1345,6 +1348,24 @@ asm
 
															         xor      %eax, %eax
														
 
															         ret
														
 
															+.LAligned32xLoop_TwoVectorsDiffer:
														
 
															+        add      %eax, %edx { restore edx = buf2 }
														
 
															+        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
														
 
															+        inc      %cx
														
 
															+        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
														
 
															+        mov      %ecx, %ebx
														
 
															+.LVec0Differs:
														
 
															+        bsf      %ebx, %ebx
														
 
															+        movzbl   (%eax,%ebx), %eax
														
 
															+        movzbl   (%edx,%ebx), %edx
														
 
															+        sub      %edx, %eax
														
 
															+        pop      %ebx
														
 
															+        ret
														
 
															+
														
 
															+        .byte    144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
														
 
															+CompareByte_CantOverReadBoth_AVX2:
														
 
															+        cmp      $16, %ecx
														
 
															+        jb       .LCantOverReadBoth
														
 
															 .LVecOrMore:
														
 
															         { Compare first vectors. }
														
 
															         movdqu   (%eax), %xmm0
														
@@ -1354,7 +1375,7 @@ asm
 
															         inc      %bx
														
 
															         jnz      .LVec0Differs
														
 
															-        sub      $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
														
 
															+        sub      $32, %ecx { now ecx is len - 32. }
														
 
															         jbe      .LLastVec
														
 
															         { Compare second vectors. }
														
@@ -1365,41 +1386,10 @@ asm
 
															         inc      %bx
														
 
															         jnz      .LVec1Differs
														
 
															-        { More than four vectors: aligned loop. }
														
 
															         cmp      $32, %ecx
														
 
															-        ja       .LAligned32xLoop_Prepare
														
 
															-
														
 
															-        { Compare last two vectors. }
														
 
															-        movdqu   (%eax,%ecx), %xmm0
														
 
															-        movdqu   (%edx,%ecx), %xmm1
														
 
															-        pcmpeqb  %xmm1, %xmm0
														
 
															-        pmovmskb %xmm0, %ebx
														
 
															-        inc      %bx
														
 
															-        jnz      .LVecEm2Differs
														
 
															-.LLastVec:
														
 
															-        movdqu   16(%eax,%ecx), %xmm0
														
 
															-        movdqu   16(%edx,%ecx), %xmm1
														
 
															-        pcmpeqb  %xmm1, %xmm0
														
 
															-        pmovmskb %xmm0, %ebx
														
 
															-        inc      %bx
														
 
															-        jnz      .LVecEm1Differs
														
 
															-        pop      %ebx
														
 
															-        xor      %eax, %eax
														
 
															-        ret
														
 
															+        jbe      .LLastTwoVectors
														
 
															-.LVecEm2Differs:
														
 
															-        sub      $16, %ecx
														
 
															-.LVecEm1Differs:
														
 
															-        bsf      %ebx, %ebx
														
 
															-        add      %ecx, %ebx
														
 
															-        movzbl   16(%eax,%ebx), %eax
														
 
															-        movzbl   16(%edx,%ebx), %edx
														
 
															-        sub      %edx, %eax
														
 
															-        pop      %ebx
														
 
															-        ret
														
 
															-        nop      { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
														
 
															-
														
 
															-.LAligned32xLoop_Prepare:
														
 
															+        { More than four vectors: aligned loop. }
														
 
															         lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
														
 
															         sub      %eax, %edx { edx = buf2 - buf1 }
														
 
															         and      $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
														
@@ -1418,40 +1408,33 @@ asm
 
															         jnz      .LAligned32xLoop_TwoVectorsDiffer
														
 
															         sub      $32, %ecx
														
 
															         ja       .LAligned32xLoop_Body
														
 
															-
														
 
															-        { Compare last two vectors after the loop by doing one more loop iteration, modified. }
														
 
															-        lea      32(%eax,%ecx), %eax
														
 
															-        movdqu   (%edx,%eax), %xmm0
														
 
															-        movdqu   (%eax), %xmm2
														
 
															-        pcmpeqb  %xmm2, %xmm0
														
 
															-        movdqu   16(%edx,%eax), %xmm1
														
 
															-        movdqu   16(%eax), %xmm2
														
 
															-        pcmpeqb  %xmm2, %xmm1
														
 
															-        pand     %xmm0, %xmm1
														
 
															-        pmovmskb %xmm1, %ebx
														
 
															+        add      %eax, %edx { restore edx = buf2 }
														
 
															+        add      $32, %ecx
														
 
															+.LLastTwoVectors:
														
 
															+        movdqu   (%eax,%ecx), %xmm0
														
 
															+        movdqu   (%edx,%ecx), %xmm1
														
 
															+        pcmpeqb  %xmm1, %xmm0
														
 
															+        pmovmskb %xmm0, %ebx
														
 
															         inc      %bx
														
 
															-        jnz      .LAligned32xLoop_TwoVectorsDiffer
														
 
															+        jnz      .LVecEm2Differs
														
 
															+.LLastVec:
														
 
															+        movdqu   16(%eax,%ecx), %xmm0
														
 
															+        movdqu   16(%edx,%ecx), %xmm1
														
 
															+        pcmpeqb  %xmm1, %xmm0
														
 
															+        pmovmskb %xmm0, %ebx
														
 
															+        inc      %bx
														
 
															+        jnz      .LVecEm1Differs
														
 
															         pop      %ebx
														
 
															         xor      %eax, %eax
														
 
															         ret
														
 
															-.LAligned32xLoop_TwoVectorsDiffer:
														
 
															-        add      %eax, %edx { restore edx = buf2 }
														
 
															-        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
														
 
															-        inc      %cx
														
 
															-        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
														
 
															-        bsf      %ecx, %ebx
														
 
															-        movzbl   (%eax,%ebx), %eax
														
 
															-        movzbl   (%edx,%ebx), %edx
														
 
															-        sub      %edx, %eax
														
 
															-        pop      %ebx
														
 
															-        ret
														
 
															-
														
 
															 .LVec1Differs:
														
 
															-        add      $16, %eax
														
 
															-        add      $16, %edx
														
 
															-.LVec0Differs:
														
 
															+        xor      %ecx, %ecx
														
 
															+.LVecEm1Differs:
														
 
															+        add      $16, %ecx
														
 
															+.LVecEm2Differs:
														
 
															         bsf      %ebx, %ebx
														
 
															+        add      %ecx, %ebx
														
 
															         movzbl   (%eax,%ebx), %eax
														
 
															         movzbl   (%edx,%ebx), %edx
														
 
															         sub      %edx, %eax
														
@@ -1510,7 +1493,7 @@ asm
 
															         pop      %ebx
														
 
															         ret
														
 
															-.L1OrLess:
														
 
															+CompareByte_1OrLess:
														
 
															         jl       .LUnbounded_Prepare
														
 
															         movzbl   (%eax), %eax
														
 
															         movzbl   (%edx), %edx
														
@@ -1535,7 +1518,126 @@ asm
 
															         or       $1, %eax
														
 
															 end;
														
 
															-{$ifndef CPUX86_HAS_SSE2}
														
 
															+function {$ifdef CPUX86_HAS_BMI1} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe; { AVX2 variant for len > 1: returns 0 when the first len bytes match, otherwise the buf1 byte minus the buf2 byte at the first differing index. NOTE(review): bzhi below is a BMI2 instruction while this guard tests CPUX86_HAS_BMI1 - confirm every BMI1 target also provides BMI2 and AVX2. }
														
 
															+asm
														
 
															+        { eax = buf1, edx = buf2, ecx = len }
														
 
															+        cmp       $1, %ecx
														
 
															+        jle       CompareByte_1OrLess { len <= 1 is handled by the shared label declared inside CompareByte_SSE2. }
														
 
															+
														
 
															+        push      %ebx { ebx is scratch from here on; the exit paths below pop it. }
														
 
															+        cmp       $32, %ecx
														
 
															+        jae       .LVecOrMore { len >= 32: at least one full YMM vector is available. }
														
 
															+
														
 
															+        { 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
														
 
															+        mov       %eax, %ebx
														
 
															+        or        %edx, %ebx
														
 
															+        and       $4095, %ebx
														
 
															+        cmp       $4064, %ebx
														
 
															+        ja        CompareByte_CantOverReadBoth_AVX2 { 4064 = 4096 - 32: a 32-byte read might cross a page, so continue at the label inside CompareByte_SSE2 with ebx already pushed. }
														
 
															+
														
 
															+        { Over-read both as YMMs. }
														
 
															+        vmovdqu   (%eax), %ymm0
														
 
															+        vpcmpeqb  (%edx), %ymm0, %ymm0
														
 
															+        vpmovmskb %ymm0, %ebx
														
 
															+        inc       %ebx { ebx = equality mask + 1: zero if all 32 bytes match, otherwise its lowest set bit marks the first mismatch. }
														
 
															+        { bzhi      %ecx, %ebx, %ecx }
														
 
															+        .byte     0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
														
 
															+        jnz       .LVec0Differs { bzhi cleared the bits at positions >= len, so a non-zero result means a mismatch inside the first len bytes. }
														
 
															+        vzeroupper
														
 
															+        pop       %ebx
														
 
															+        xor       %eax, %eax
														
 
															+        ret
														
 
															+
														
 
															+        .byte     144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
														
 
															+.LAligned64xLoop_TwoVectorsDiffer:
														
 
															+        add       %eax, %edx { restore edx = buf2 }
														
 
															+        vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
														
 
															+        inc       %ecx
														
 
															+        jz        .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) from the loop body. }
														
 
															+        mov       %ecx, %ebx
														
 
															+.LVec0Differs:
														
 
															+        vzeroupper
														
 
															+        tzcnt     %ebx, %ebx { ebx = index of the first differing byte. }
														
 
															+        movzbl    (%eax,%ebx), %eax
														
 
															+        movzbl    (%edx,%ebx), %edx
														
 
															+        sub       %edx, %eax { result = buf1 byte - buf2 byte at that index. }
														
 
															+        pop       %ebx
														
 
															+        ret
														
 
															+
														
 
															+.LVecOrMore:
														
 
															+        { Compare first vectors. }
														
 
															+        vmovdqu   (%eax), %ymm0
														
 
															+        vpcmpeqb  (%edx), %ymm0, %ymm0
														
 
															+        vpmovmskb %ymm0, %ebx
														
 
															+        inc       %ebx
														
 
															+        jnz       .LVec0Differs
														
 
															+
														
 
															+        sub       $64, %ecx { now ecx is len - 64. }
														
 
															+        jbe       .LLastVec { len <= 64: the vector ending at the buffer end (it may overlap the first one) finishes the job. }
														
 
															+
														
 
															+        { Compare second vectors. }
														
 
															+        vmovdqu   32(%eax), %ymm0
														
 
															+        vpcmpeqb  32(%edx), %ymm0, %ymm0
														
 
															+        vpmovmskb %ymm0, %ebx
														
 
															+        inc       %ebx
														
 
															+        jnz       .LVec1Differs
														
 
															+
														
 
															+        cmp       $64, %ecx
														
 
															+        jbe       .LLastTwoVectors { len <= 128: the last two vectors (they may overlap the first two) finish the job. }
														
 
															+
														
 
															+        { More than four vectors: aligned loop. }
														
 
															+        lea       -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
														
 
															+        sub       %eax, %edx { edx = buf2 - buf1 }
														
 
															+        and       $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
														
 
															+        sub       %eax, %ecx { ecx = count to be handled with loop }
														
 
															+.balign 16 { No-op. }
														
 
															+.LAligned64xLoop_Body:
														
 
															+        add       $64, %eax
														
 
															+        { Compare two YMMs, reduce the result with 'and'. }
														
 
															+        vmovdqu   (%edx,%eax), %ymm0
														
 
															+        vpcmpeqb  (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
														
 
															+        vmovdqu   32(%edx,%eax), %ymm1
														
 
															+        vpcmpeqb  32(%eax), %ymm1, %ymm1
														
 
															+        vpand     %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
														
 
															+        vpmovmskb %ymm1, %ebx
														
 
															+        inc       %ebx
														
 
															+        jnz       .LAligned64xLoop_TwoVectorsDiffer
														
 
															+        sub       $64, %ecx
														
 
															+        ja        .LAligned64xLoop_Body
														
 
															+        add       %eax, %edx { restore edx = buf2 }
														
 
															+        add       $64, %ecx { eax + ecx is the address of the last 64 bytes of buf1 again, as .LLastTwoVectors expects. }
														
 
															+.LLastTwoVectors:
														
 
															+        vmovdqu   (%eax,%ecx), %ymm0
														
 
															+        vpcmpeqb  (%edx,%ecx), %ymm0, %ymm0
														
 
															+        vpmovmskb %ymm0, %ebx
														
 
															+        inc       %ebx
														
 
															+        jnz       .LVecEm2Differs
														
 
															+.LLastVec:
														
 
															+        vmovdqu   32(%eax,%ecx), %ymm0 { Last 32 bytes of buf1, compared against the last 32 bytes of buf2. }
														
 
															+        vpcmpeqb  32(%edx,%ecx), %ymm0, %ymm0
														
 
															+        vpmovmskb %ymm0, %ebx
														
 
															+        inc       %ebx
														
 
															+        jnz       .LVecEm1Differs
														
 
															+        vzeroupper
														
 
															+        pop       %ebx
														
 
															+        xor       %eax, %eax
														
 
															+        ret
														
 
															+
														
 
															+.LVec1Differs:
														
 
															+        xor      %ecx, %ecx { The mismatching vector starts 32 bytes past eax / edx. }
														
 
															+.LVecEm1Differs:
														
 
															+        add      $32, %ecx { ecx = byte offset of the mismatching vector relative to eax / edx. }
														
 
															+.LVecEm2Differs:
														
 
															+        vzeroupper
														
 
															+        tzcnt    %ebx, %ebx
														
 
															+        add      %ecx, %ebx { ebx = vector offset + index of the first differing byte inside it. }
														
 
															+        movzbl   (%eax,%ebx), %eax
														
 
															+        movzbl   (%edx,%ebx), %edx
														
 
															+        sub      %edx, %eax
														
 
															+        pop      %ebx { NOTE(review): no explicit ret follows, unlike the other exits; presumably the compiler emits the return at end - TODO confirm. }
														
 
															+end;
														
 
															+
														
 
															+{$ifndef CPUX86_HAS_BMI1}
														
 
															 function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
														
 
															 var
														
@@ -1544,11 +1646,15 @@ var
 
															 function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
														
 
															 begin
														
 
															   if not fpc_cpucodeinit_performed then
														
 
															-    exit(CompareByte_Plain(buf1, buf2, len));
														
 
															-  if has_sse2_support then
														
 
															+    exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len)); { CPU detection has not run yet: call the variant chosen at compile time and leave CompareByte_Impl untouched. }
														
 
															+  if has_avx2_support then { NOTE(review): CompareByte_AVX2 also executes bzhi, a BMI2 instruction - confirm has_avx2_support implies BMI2. }
														
 
															+    CompareByte_Impl:=@CompareByte_AVX2
														
 
															+  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif} { The runtime SSE2 test is compiled only when CPUX86_HAS_SSE2 is not defined. }
														
 
															     CompareByte_Impl:=@CompareByte_SSE2
														
 
															+{$ifndef CPUX86_HAS_SSE2} { The plain fallback branch exists only when CPUX86_HAS_SSE2 is not defined. }
														
 
															   else
														
 
															-    CompareByte_Impl:=@CompareByte_Plain;
														
 
															+    CompareByte_Impl:=@CompareByte_Plain
														
 
															+{$endif}; { This semicolon ends the whole if / else chain in either configuration. }
														
 
															   result:=CompareByte_Impl(buf1, buf2, len);
														
 
															 end;
														
@@ -1556,7 +1662,7 @@ function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
 
															 begin
														
 
															   result:=CompareByte_Impl(buf1, buf2, len);
														
 
															 end;
														
 
															-{$endif ndef CPUX86_HAS_SSE2 (need CompareByte dispatcher)}
														
 
															+{$endif ndef CPUX86_HAS_BMI1 (need CompareByte dispatcher)}
														
 
															 {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
														
--- a/rtl/inc/systemh.inc
+++ b/rtl/inc/systemh.inc
@@ -921,7 +921,7 @@ function  Indexword(const buf;len:SizeInt;b:word):SizeInt; {$if defined(cpui386)
 
															 function  IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
														
 
															 function  IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; {$if (defined(cpui386) or defined(cpux86_64)) and not defined(CPUX86_HAS_SSE4_1)} inline; {$endif}
														
 
															 function  CompareChar(const buf1,buf2;len:SizeInt):SizeInt;
														
 
															-function  CompareByte(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
														
 
															+function  CompareByte(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_BMI1)} inline; {$endif}
														
 
															 function  CompareWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
														
 
															 function  CompareDWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
														
 
															 procedure MoveChar0(const buf1;var buf2;len:SizeInt);