Browse Source

AVX2 CompareByte for i386.

Rika Ichinose 1 year ago
parent
commit
9917350ef0
2 changed files with 176 additions and 70 deletions
  1. 175 69
      rtl/i386/i386.inc
  2. 1 1
      rtl/inc/systemh.inc

+ 175 - 69
rtl/i386/i386.inc

@@ -1307,11 +1307,14 @@ asm
 end;
 end;
 {$endif ndef CPUX86_HAS_SSE2}
 {$endif ndef CPUX86_HAS_SSE2}
 
 
-function {$ifdef CPUX86_HAS_SSE2} CompareByte {$else} CompareByte_SSE2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
+label
+  CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;
+
+function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
 asm
 asm
         { eax = buf1, edx = buf2, ecx = len }
         { eax = buf1, edx = buf2, ecx = len }
         cmp      $1, %ecx
         cmp      $1, %ecx
-        jle      .L1OrLess
+        jle      CompareByte_1OrLess
 
 
         push     %ebx
         push     %ebx
         cmp      $16, %ecx
         cmp      $16, %ecx
@@ -1345,6 +1348,24 @@ asm
         xor      %eax, %eax
         xor      %eax, %eax
         ret
         ret
 
 
+.LAligned32xLoop_TwoVectorsDiffer:
+        add      %eax, %edx { restore edx = buf2 }
+        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
+        inc      %cx
+        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
+        mov      %ecx, %ebx
+.LVec0Differs:
+        bsf      %ebx, %ebx
+        movzbl   (%eax,%ebx), %eax
+        movzbl   (%edx,%ebx), %edx
+        sub      %edx, %eax
+        pop      %ebx
+        ret
+
+        .byte    144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
+CompareByte_CantOverReadBoth_AVX2:
+        cmp      $16, %ecx
+        jb       .LCantOverReadBoth
 .LVecOrMore:
 .LVecOrMore:
         { Compare first vectors. }
         { Compare first vectors. }
         movdqu   (%eax), %xmm0
         movdqu   (%eax), %xmm0
@@ -1354,7 +1375,7 @@ asm
         inc      %bx
         inc      %bx
         jnz      .LVec0Differs
         jnz      .LVec0Differs
 
 
-        sub      $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
+        sub      $32, %ecx { now ecx is len - 32. }
         jbe      .LLastVec
         jbe      .LLastVec
 
 
         { Compare second vectors. }
         { Compare second vectors. }
@@ -1365,41 +1386,10 @@ asm
         inc      %bx
         inc      %bx
         jnz      .LVec1Differs
         jnz      .LVec1Differs
 
 
-        { More than four vectors: aligned loop. }
         cmp      $32, %ecx
         cmp      $32, %ecx
-        ja       .LAligned32xLoop_Prepare
-
-        { Compare last two vectors. }
-        movdqu   (%eax,%ecx), %xmm0
-        movdqu   (%edx,%ecx), %xmm1
-        pcmpeqb  %xmm1, %xmm0
-        pmovmskb %xmm0, %ebx
-        inc      %bx
-        jnz      .LVecEm2Differs
-.LLastVec:
-        movdqu   16(%eax,%ecx), %xmm0
-        movdqu   16(%edx,%ecx), %xmm1
-        pcmpeqb  %xmm1, %xmm0
-        pmovmskb %xmm0, %ebx
-        inc      %bx
-        jnz      .LVecEm1Differs
-        pop      %ebx
-        xor      %eax, %eax
-        ret
+        jbe      .LLastTwoVectors
 
 
-.LVecEm2Differs:
-        sub      $16, %ecx
-.LVecEm1Differs:
-        bsf      %ebx, %ebx
-        add      %ecx, %ebx
-        movzbl   16(%eax,%ebx), %eax
-        movzbl   16(%edx,%ebx), %edx
-        sub      %edx, %eax
-        pop      %ebx
-        ret
-        nop      { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
-
-.LAligned32xLoop_Prepare:
+        { More than four vectors: aligned loop. }
         lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
         lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
         sub      %eax, %edx { edx = buf2 - buf1 }
         sub      %eax, %edx { edx = buf2 - buf1 }
         and      $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
         and      $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
@@ -1418,40 +1408,33 @@ asm
         jnz      .LAligned32xLoop_TwoVectorsDiffer
         jnz      .LAligned32xLoop_TwoVectorsDiffer
         sub      $32, %ecx
         sub      $32, %ecx
         ja       .LAligned32xLoop_Body
         ja       .LAligned32xLoop_Body
-
-        { Compare last two vectors after the loop by doing one more loop iteration, modified. }
-        lea      32(%eax,%ecx), %eax
-        movdqu   (%edx,%eax), %xmm0
-        movdqu   (%eax), %xmm2
-        pcmpeqb  %xmm2, %xmm0
-        movdqu   16(%edx,%eax), %xmm1
-        movdqu   16(%eax), %xmm2
-        pcmpeqb  %xmm2, %xmm1
-        pand     %xmm0, %xmm1
-        pmovmskb %xmm1, %ebx
+        add      %eax, %edx { restore edx = buf2 }
+        add      $32, %ecx
+.LLastTwoVectors:
+        movdqu   (%eax,%ecx), %xmm0
+        movdqu   (%edx,%ecx), %xmm1
+        pcmpeqb  %xmm1, %xmm0
+        pmovmskb %xmm0, %ebx
         inc      %bx
         inc      %bx
-        jnz      .LAligned32xLoop_TwoVectorsDiffer
+        jnz      .LVecEm2Differs
+.LLastVec:
+        movdqu   16(%eax,%ecx), %xmm0
+        movdqu   16(%edx,%ecx), %xmm1
+        pcmpeqb  %xmm1, %xmm0
+        pmovmskb %xmm0, %ebx
+        inc      %bx
+        jnz      .LVecEm1Differs
         pop      %ebx
         pop      %ebx
         xor      %eax, %eax
         xor      %eax, %eax
         ret
         ret
 
 
-.LAligned32xLoop_TwoVectorsDiffer:
-        add      %eax, %edx { restore edx = buf2 }
-        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
-        inc      %cx
-        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
-        bsf      %ecx, %ebx
-        movzbl   (%eax,%ebx), %eax
-        movzbl   (%edx,%ebx), %edx
-        sub      %edx, %eax
-        pop      %ebx
-        ret
-
 .LVec1Differs:
 .LVec1Differs:
-        add      $16, %eax
-        add      $16, %edx
-.LVec0Differs:
+        xor      %ecx, %ecx
+.LVecEm1Differs:
+        add      $16, %ecx
+.LVecEm2Differs:
         bsf      %ebx, %ebx
         bsf      %ebx, %ebx
+        add      %ecx, %ebx
         movzbl   (%eax,%ebx), %eax
         movzbl   (%eax,%ebx), %eax
         movzbl   (%edx,%ebx), %edx
         movzbl   (%edx,%ebx), %edx
         sub      %edx, %eax
         sub      %edx, %eax
@@ -1510,7 +1493,7 @@ asm
         pop      %ebx
         pop      %ebx
         ret
         ret
 
 
-.L1OrLess:
+CompareByte_1OrLess:
         jl       .LUnbounded_Prepare
         jl       .LUnbounded_Prepare
         movzbl   (%eax), %eax
         movzbl   (%eax), %eax
         movzbl   (%edx), %edx
         movzbl   (%edx), %edx
@@ -1535,7 +1518,126 @@ asm
         or       $1, %eax
         or       $1, %eax
 end;
 end;
 
 
-{$ifndef CPUX86_HAS_SSE2}
+function {$ifdef CPUX86_HAS_BMI1} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
+asm
+        { eax = buf1, edx = buf2, ecx = len.
+          Result in eax for the paths in this body: 0 when no difference is found, otherwise
+          buf1[i] - buf2[i] (both bytes zero-extended) at the first differing offset i.
+          ebx is the only register saved and restored here (push / pop).
+          NOTE(review): the body uses AVX2 instructions and BZHI (a BMI2 instruction) while the build-time
+          guard is CPUX86_HAS_BMI1; confirm that this define implies both. }
+        cmp       $1, %ecx
+        jle       CompareByte_1OrLess { len <= 1 (signed): continue at a label defined inside CompareByte_SSE2; ebx is not pushed yet. }
+
+        push      %ebx
+        cmp       $32, %ecx
+        jae       .LVecOrMore { At least one full 32-byte vector. }
+
+        { 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
+        mov       %eax, %ebx
+        or        %edx, %ebx
+        and       $4095, %ebx
+        cmp       $4064, %ebx { 4064 = 4096 - 32: above this offset a 32-byte load could cross the 4096-byte boundary. }
+        ja        CompareByte_CantOverReadBoth_AVX2 { Label defined inside CompareByte_SSE2; ebx stays pushed for it. }
+
+        { Over-read both as YMMs. }
+        vmovdqu   (%eax), %ymm0
+        vpcmpeqb  (%edx), %ymm0, %ymm0
+        vpmovmskb %ymm0, %ebx
+        inc       %ebx { All 32 bytes equal: the mask is all ones and wraps to 0. Otherwise the lowest set bit marks the first mismatch. }
+        { bzhi      %ecx, %ebx, %ecx
+          ecx := ebx with the bits at positions >= len cleared, so mismatches in the over-read bytes are ignored;
+          ZF is set when nothing remains. ebx keeps the unmasked value for tzcnt at .LVec0Differs. }
+        .byte     0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
+        jnz       .LVec0Differs
+        vzeroupper
+        pop       %ebx
+        xor       %eax, %eax
+        ret
+
+        .byte     144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
+.LAligned64xLoop_TwoVectorsDiffer:
+        add       %eax, %edx { restore edx = buf2 }
+        vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
+        inc       %ecx
+        jz        .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) from the loop body. }
+        mov       %ecx, %ebx { The difference is in the first vector: continue with its incremented mask. }
+.LVec0Differs:
+        vzeroupper
+        tzcnt     %ebx, %ebx { ebx = offset of the first differing byte from eax / edx. }
+        movzbl    (%eax,%ebx), %eax
+        movzbl    (%edx,%ebx), %edx
+        sub       %edx, %eax
+        pop       %ebx
+        ret
+
+.LVecOrMore:
+        { Compare first vectors. }
+        vmovdqu   (%eax), %ymm0
+        vpcmpeqb  (%edx), %ymm0, %ymm0
+        vpmovmskb %ymm0, %ebx
+        inc       %ebx
+        jnz       .LVec0Differs
+
+        sub       $64, %ecx { now ecx is len - 64. }
+        jbe       .LLastVec { len <= 64: only the vector ending at the buffer end is left; it may overlap the first one. }
+
+        { Compare second vectors. }
+        vmovdqu   32(%eax), %ymm0
+        vpcmpeqb  32(%edx), %ymm0, %ymm0
+        vpmovmskb %ymm0, %ebx
+        inc       %ebx
+        jnz       .LVec1Differs
+
+        cmp       $64, %ecx
+        jbe       .LLastTwoVectors { len <= 128: the two vectors ending at the buffer end cover the rest. }
+
+        { More than four vectors: aligned loop. }
+        lea       -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
+        sub       %eax, %edx { edx = buf2 - buf1 }
+        and       $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
+        sub       %eax, %ecx { ecx = count to be handled with loop }
+.balign 16 { No-op. }
+.LAligned64xLoop_Body:
+        add       $64, %eax
+        { Compare two YMMs, reduce the result with 'and'. }
+        vmovdqu   (%edx,%eax), %ymm0
+        vpcmpeqb  (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
+        vmovdqu   32(%edx,%eax), %ymm1
+        vpcmpeqb  32(%eax), %ymm1, %ymm1
+        vpand     %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
+        vpmovmskb %ymm1, %ebx
+        inc       %ebx
+        jnz       .LAligned64xLoop_TwoVectorsDiffer
+        sub       $64, %ecx
+        ja        .LAligned64xLoop_Body
+        add       %eax, %edx { restore edx = buf2 }
+        add       $64, %ecx { eax + ecx and edx + ecx are again buffer end - 64, as .LLastTwoVectors expects. }
+.LLastTwoVectors:
+        vmovdqu   (%eax,%ecx), %ymm0
+        vpcmpeqb  (%edx,%ecx), %ymm0, %ymm0
+        vpmovmskb %ymm0, %ebx
+        inc       %ebx
+        jnz       .LVecEm2Differs
+.LLastVec:
+        vmovdqu   32(%eax,%ecx), %ymm0
+        vpcmpeqb  32(%edx,%ecx), %ymm0, %ymm0
+        vpmovmskb %ymm0, %ebx
+        inc       %ebx
+        jnz       .LVecEm1Differs
+        vzeroupper
+        pop       %ebx
+        xor       %eax, %eax
+        ret
+
+.LVec1Differs:
+        xor      %ecx, %ecx { The second vector sits at offset 32: start from 0 and fall through to the add below. }
+.LVecEm1Differs:
+        add      $32, %ecx { The last vector was loaded from offset 32 + ecx. }
+.LVecEm2Differs:
+        vzeroupper
+        tzcnt    %ebx, %ebx
+        add      %ecx, %ebx { ebx = offset of the first differing byte from eax / edx. }
+        movzbl   (%eax,%ebx), %eax
+        movzbl   (%edx,%ebx), %edx
+        sub      %edx, %eax
+        pop      %ebx { No explicit ret follows; presumably the compiler emits the return at the closing end - confirm. }
+end;
+
+{$ifndef CPUX86_HAS_BMI1}
 function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
 function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
 
 
 var
 var
@@ -1544,11 +1646,15 @@ var
 function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
 function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
 begin
 begin
   if not fpc_cpucodeinit_performed then
   if not fpc_cpucodeinit_performed then
-    exit(CompareByte_Plain(buf1, buf2, len));
-  if has_sse2_support then
+    exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len)); { Flag not set yet: call the build-time baseline routine directly and leave CompareByte_Impl unchanged. }
+  if has_avx2_support then { AVX2 routine is tried first. }
+    CompareByte_Impl:=@CompareByte_AVX2
+  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif} { The run-time SSE2 test is compiled only when SSE2 is not guaranteed at build time. }
     CompareByte_Impl:=@CompareByte_SSE2
     CompareByte_Impl:=@CompareByte_SSE2
+{$ifndef CPUX86_HAS_SSE2}
   else
   else
-    CompareByte_Impl:=@CompareByte_Plain;
+    CompareByte_Impl:=@CompareByte_Plain
+{$endif}; { This semicolon ends the if-chain in both configurations. }
   result:=CompareByte_Impl(buf1, buf2, len);
   result:=CompareByte_Impl(buf1, buf2, len);
 end;
 end;
 
 
@@ -1556,7 +1662,7 @@ function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
 begin
 begin
   result:=CompareByte_Impl(buf1, buf2, len);
   result:=CompareByte_Impl(buf1, buf2, len);
 end;
 end;
-{$endif ndef CPUX86_HAS_SSE2 (need CompareByte dispatcher)}
+{$endif ndef CPUX86_HAS_BMI1 (need CompareByte dispatcher)}
 {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
 {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
 
 
 
 

+ 1 - 1
rtl/inc/systemh.inc

@@ -921,7 +921,7 @@ function  Indexword(const buf;len:SizeInt;b:word):SizeInt; {$if defined(cpui386)
 function  IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
 function  IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
 function  IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; {$if (defined(cpui386) or defined(cpux86_64)) and not defined(CPUX86_HAS_SSE4_1)} inline; {$endif}
 function  IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; {$if (defined(cpui386) or defined(cpux86_64)) and not defined(CPUX86_HAS_SSE4_1)} inline; {$endif}
 function  CompareChar(const buf1,buf2;len:SizeInt):SizeInt;
 function  CompareChar(const buf1,buf2;len:SizeInt):SizeInt;
-function  CompareByte(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
+function  CompareByte(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_BMI1)} inline; {$endif}
 function  CompareWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
 function  CompareWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
 function  CompareDWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
 function  CompareDWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386) and not defined(CPUX86_HAS_SSE2)} inline; {$endif}
 procedure MoveChar0(const buf1;var buf2;len:SizeInt);
 procedure MoveChar0(const buf1;var buf2;len:SizeInt);