@@ -1307,11 +1307,14 @@ asm
end;
{$endif ndef CPUX86_HAS_SSE2}

-function {$ifdef CPUX86_HAS_SSE2} CompareByte {$else} CompareByte_SSE2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
+label
+ CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;
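+ { Plain Pascal labels instead of '.L' locals: CompareByte_AVX2 below jumps into these tails of CompareByte_SSE2. }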
+
+function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
{ eax = buf1, edx = buf2, ecx = len }
cmp $1, %ecx
- jle .L1OrLess
+ jle CompareByte_1OrLess

push %ebx
cmp $16, %ecx
@@ -1345,6 +1348,24 @@ asm
xor %eax, %eax
ret

+.LAligned32xLoop_TwoVectorsDiffer:
+ add %eax, %edx { restore edx = buf2 }
+ pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
+ inc %cx
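+ { pcmpeqb sets a byte lane to $FF when equal, so the 16-bit mask is all ones iff the vectors match; mask + 1 is then zero, and otherwise its lowest set bit marks the first differing byte. }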
+ jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) + 1 from the loop body's 'inc %bx'. }
+ mov %ecx, %ebx
+.LVec0Differs:
+ bsf %ebx, %ebx
+ movzbl (%eax,%ebx), %eax
+ movzbl (%edx,%ebx), %edx
+ sub %edx, %eax
+ pop %ebx
+ ret
+
+ .byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
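+ { 144 = $90 = nop; with this extra byte the code above is sized so that .balign 16 pads nothing. }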
+CompareByte_CantOverReadBoth_AVX2:
+ cmp $16, %ecx
+ jb .LCantOverReadBoth
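+ { Entered from CompareByte_AVX2 with 2 to 31 bytes too close to a page end: 16 to 31 bytes fall through and are compared as the first and the (overlapping) last XMM vector, shorter inputs take the byte-wise tail. }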
.LVecOrMore:
{ Compare first vectors. }
movdqu (%eax), %xmm0
@@ -1354,7 +1375,7 @@ asm
inc %bx
jnz .LVec0Differs

- sub $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
+ sub $32, %ecx { now ecx is len - 32. }
jbe .LLastVec

{ Compare second vectors. }
@@ -1365,41 +1386,10 @@ asm
inc %bx
jnz .LVec1Differs

- { More than four vectors: aligned loop. }
cmp $32, %ecx
- ja .LAligned32xLoop_Prepare
-
- { Compare last two vectors. }
- movdqu (%eax,%ecx), %xmm0
- movdqu (%edx,%ecx), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %ebx
- inc %bx
- jnz .LVecEm2Differs
-.LLastVec:
- movdqu 16(%eax,%ecx), %xmm0
- movdqu 16(%edx,%ecx), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %ebx
- inc %bx
- jnz .LVecEm1Differs
- pop %ebx
- xor %eax, %eax
- ret
+ jbe .LLastTwoVectors

-.LVecEm2Differs:
- sub $16, %ecx
-.LVecEm1Differs:
- bsf %ebx, %ebx
- add %ecx, %ebx
- movzbl 16(%eax,%ebx), %eax
- movzbl 16(%edx,%ebx), %edx
- sub %edx, %eax
- pop %ebx
- ret
- nop { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
-
-.LAligned32xLoop_Prepare:
+ { More than four vectors: aligned loop. }
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
sub %eax, %edx { edx = buf2 - buf1 }
and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
@@ -1418,40 +1408,33 @@ asm
jnz .LAligned32xLoop_TwoVectorsDiffer
sub $32, %ecx
ja .LAligned32xLoop_Body
-
- { Compare last two vectors after the loop by doing one more loop iteration, modified. }
- lea 32(%eax,%ecx), %eax
- movdqu (%edx,%eax), %xmm0
- movdqu (%eax), %xmm2
- pcmpeqb %xmm2, %xmm0
- movdqu 16(%edx,%eax), %xmm1
- movdqu 16(%eax), %xmm2
- pcmpeqb %xmm2, %xmm1
- pand %xmm0, %xmm1
- pmovmskb %xmm1, %ebx
+ add %eax, %edx { restore edx = buf2 }
+ add $32, %ecx
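+ { The loop exits with ecx in (-32, 0]; after the +32 correction, (%eax,%ecx) addresses the last 32 bytes, buf1 + len - 32. }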
+.LLastTwoVectors:
+ movdqu (%eax,%ecx), %xmm0
+ movdqu (%edx,%ecx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %ebx
inc %bx
- jnz .LAligned32xLoop_TwoVectorsDiffer
+ jnz .LVecEm2Differs
+.LLastVec:
+ movdqu 16(%eax,%ecx), %xmm0
+ movdqu 16(%edx,%ecx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %ebx
+ inc %bx
+ jnz .LVecEm1Differs
pop %ebx
xor %eax, %eax
ret

-.LAligned32xLoop_TwoVectorsDiffer:
- add %eax, %edx { restore edx = buf2 }
- pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
- inc %cx
- jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
- bsf %ecx, %ebx
- movzbl (%eax,%ebx), %eax
- movzbl (%edx,%ebx), %edx
- sub %edx, %eax
- pop %ebx
- ret
-
.LVec1Differs:
- add $16, %eax
- add $16, %edx
-.LVec0Differs:
+ xor %ecx, %ecx
+.LVecEm1Differs:
+ add $16, %ecx
+.LVecEm2Differs:
bsf %ebx, %ebx
+ add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
@@ -1510,7 +1493,7 @@ asm
pop %ebx
ret

-.L1OrLess:
+CompareByte_1OrLess:
jl .LUnbounded_Prepare
movzbl (%eax), %eax
movzbl (%edx), %edx
@@ -1535,7 +1518,126 @@ asm
or $1, %eax
end;

-{$ifndef CPUX86_HAS_SSE2}
+function {$ifdef CPUX86_HAS_BMI1} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
+asm
+ { eax = buf1, edx = buf2, ecx = len }
+ cmp $1, %ecx
+ jle CompareByte_1OrLess
+
+ push %ebx
+ cmp $32, %ecx
+ jae .LVecOrMore
+
+ { 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses one fewer register and two fewer instructions. }
+ mov %eax, %ebx
+ or %edx, %ebx
+ and $4095, %ebx
+ cmp $4064, %ebx
+ ja CompareByte_CantOverReadBoth_AVX2
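+ { 4064 = 4096 - 32: if ((buf1 or buf2) and 4095) > 4064, one of the pointers may lie within 32 bytes of a page end, so a 32-byte load could fault. Or-ing the addresses is what makes the check pessimistic. }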
+
+ { Over-read both as YMMs. }
+ vmovdqu (%eax), %ymm0
+ vpcmpeqb (%edx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ { bzhi %ecx, %ebx, %ecx }
+ .byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
+ jnz .LVec0Differs
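+ { The bzhi clears mask bits at positions >= len, so a mismatch found only in the over-read tail is ignored; the jnz above consumed ZF as set by bzhi itself. }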
+ vzeroupper
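+ { Every exit that touched a ymm register issues vzeroupper first, avoiding AVX-to-SSE transition penalties in the caller. }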
+ pop %ebx
+ xor %eax, %eax
+ ret
+
+ .byte 144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
+.LAligned64xLoop_TwoVectorsDiffer:
+ add %eax, %edx { restore edx = buf2 }
+ vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
+ inc %ecx
+ jz .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) + 1 from the loop body's 'inc %ebx'. }
+ mov %ecx, %ebx
+.LVec0Differs:
+ vzeroupper
+ tzcnt %ebx, %ebx
+ movzbl (%eax,%ebx), %eax
+ movzbl (%edx,%ebx), %edx
+ sub %edx, %eax
+ pop %ebx
+ ret
+
+.LVecOrMore:
+ { Compare first vectors. }
+ vmovdqu (%eax), %ymm0
+ vpcmpeqb (%edx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVec0Differs
+
+ sub $64, %ecx { now ecx is len - 64. }
+ jbe .LLastVec
+
+ { Compare second vectors. }
+ vmovdqu 32(%eax), %ymm0
+ vpcmpeqb 32(%edx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVec1Differs
+
+ cmp $64, %ecx
+ jbe .LLastTwoVectors
+
+ { More than four vectors: aligned loop. }
+ lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
+ sub %eax, %edx { edx = buf2 - buf1 }
+ and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
+ sub %eax, %ecx { ecx = count to be handled with loop }
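+ { From here on only eax advances; buf2 is addressed as (%edx,%eax) with edx frozen at buf2 - buf1. }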
+.balign 16 { No-op. }
+.LAligned64xLoop_Body:
+ add $64, %eax
+ { Compare two YMMs, reduce the result with 'and'. }
+ vmovdqu (%edx,%eax), %ymm0
+ vpcmpeqb (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
+ vmovdqu 32(%edx,%eax), %ymm1
+ vpcmpeqb 32(%eax), %ymm1, %ymm1
+ vpand %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
+ vpmovmskb %ymm1, %ebx
+ inc %ebx
+ jnz .LAligned64xLoop_TwoVectorsDiffer
+ sub $64, %ecx
+ ja .LAligned64xLoop_Body
+ add %eax, %edx { restore edx = buf2 }
+ add $64, %ecx
+.LLastTwoVectors:
+ vmovdqu (%eax,%ecx), %ymm0
+ vpcmpeqb (%edx,%ecx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVecEm2Differs
+.LLastVec:
+ vmovdqu 32(%eax,%ecx), %ymm0
+ vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0
+ vpmovmskb %ymm0, %ebx
+ inc %ebx
+ jnz .LVecEm1Differs
+ vzeroupper
+ pop %ebx
+ xor %eax, %eax
+ ret
+
+.LVec1Differs:
+ xor %ecx, %ecx
+.LVecEm1Differs:
+ add $32, %ecx
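+ { Fall-through ladder: .LVec1Differs is the second vector of the first pair (base 32), .LVecEm1Differs the last vector (base ecx + 32), .LVecEm2Differs the one before it (base ecx); the tzcnt below supplies the lane index. }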
+.LVecEm2Differs:
+ vzeroupper
+ tzcnt %ebx, %ebx
+ add %ecx, %ebx
+ movzbl (%eax,%ebx), %eax
+ movzbl (%edx,%ebx), %edx
+ sub %edx, %eax
+ pop %ebx
+end;
+
+{$ifndef CPUX86_HAS_BMI1}
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
@@ -1544,11 +1646,15 @@ var
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
  if not fpc_cpucodeinit_performed then
-    exit(CompareByte_Plain(buf1, buf2, len));
-  if has_sse2_support then
+    exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len));
+  if has_avx2_support then
+    CompareByte_Impl:=@CompareByte_AVX2
+  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    CompareByte_Impl:=@CompareByte_SSE2
+{$ifndef CPUX86_HAS_SSE2}
  else
-    CompareByte_Impl:=@CompareByte_Plain;
+    CompareByte_Impl:=@CompareByte_Plain
+{$endif};
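+  { Until fpc_cpucodeinit_performed is set, fall back without repointing CompareByte_Impl; afterwards it is assigned once and subsequent calls bypass this dispatcher. }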
  result:=CompareByte_Impl(buf1, buf2, len);
end;

@@ -1556,7 +1662,7 @@ function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareByte_Impl(buf1, buf2, len);
end;
-{$endif ndef CPUX86_HAS_SSE2 (need CompareByte dispatcher)}
+{$endif ndef CPUX86_HAS_BMI1 (need CompareByte dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}