Post-modern CompareByte for i386/SSE2.

Rika Ichinose, 2 years ago · commit 9f491a40c7
1 changed file with 198 additions and 93 deletions

rtl/i386/i386.inc (+198 −93)

@@ -871,125 +871,230 @@ end;
 
 
 function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
 asm
-        cmp      $3, %ecx
-        push     %esi
-        lea      (%eax,%ecx), %esi { esi = buf1 end }
-        jle      .LBytewise_Test
+        { eax = buf1, edx = buf2, ecx = len }
+        cmp      $1, %ecx
+        jle      .L1OrLess
+
         push     %ebx
-        and      $-16, %ecx
-        lea      (%eax,%ecx), %ebx { ebx = end of full XMMs in buf1 }
-        cmp      %ebx, %eax
-        jne      .L16x_Body
-        lea      15(%ebx), %eax { check if tails don't cross page boundaries and can be over-read to XMMs }
-        lea      15(%edx), %ecx
-        xor      %ebx, %eax
-        xor      %edx, %ecx
-        or       %ecx, %eax
-        cmp      $4095, %eax
+        cmp      $16, %ecx
+        jae      .LVecOrMore
+
+        { 2 to 15 bytes: check for a page cross. Pessimistic variant that has false positives, but uses one fewer register and two fewer instructions. }
+        mov      %eax, %ebx
+        or       %edx, %ebx
+        and      $4095, %ebx
+        cmp      $4080, %ebx
         ja       .LCantOverReadBoth
-        movdqu   (%ebx), %xmm0
-        movdqu   (%edx), %xmm2
-        pcmpeqb  %xmm2, %xmm0
-        pmovmskb %xmm0, %eax
-        xor      $65535, %eax
-        jz       .LReturnEAX
-        bsf      %eax, %ecx
-        add      %ecx, %ebx
-        cmp      %esi, %ebx { ignore over-read garbage bytes }
-        jnb      .L16x_Nothing
-        movzbl   (%ebx), %eax
-        movzbl   (%edx,%ecx), %edx
+
+        { Over-read both as XMMs. }
+        movdqu   (%eax), %xmm0
+        movdqu   (%edx), %xmm1
+        pcmpeqb  %xmm1, %xmm0
+        pmovmskb %xmm0, %ebx
+        inc      %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
+        jz       .LNothing
+        bsf      %ebx, %ebx
+        cmp      %ecx, %ebx { Ignore garbage beyond 'len'. }
+        jae      .LNothing
+        movzbl   (%eax,%ebx), %eax
+        movzbl   (%edx,%ebx), %edx
         sub      %edx, %eax
-.LReturnEAX:
         pop      %ebx
-        pop      %esi
         ret
 
 
-.balign 16
-.L16x_Body:
-        movdqu   (%edx), %xmm0
-        movdqu   (%eax), %xmm1
+.LNothing:
+        pop      %ebx
+        xor      %eax, %eax
+        ret
+
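For reference, the page-cross test above works because a 16-byte load starting at offset p within a 4 KiB page stays inside that page exactly when (p mod 4096) <= 4096 - 16 = 4080. A minimal C sketch of the same check (illustrative only; the helper name is hypothetical):

#include <stdint.h>

/* Pessimistic page-cross check: OR-ing the two addresses folds both tests
   into one. Since ((a|b) & 4095) >= max(a & 4095, b & 4095), any genuinely
   unsafe pair is flagged; some safe pairs are flagged too (false positives),
   which only costs a detour through the slower .LCantOverReadBoth path. */
static int may_cross_page(uintptr_t buf1, uintptr_t buf2)
{
    return ((buf1 | buf2) & 4095) > 4096 - 16;
}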
+.LVecOrMore:
+        { Compare first vectors. }
+        movdqu   (%eax), %xmm0
+        movdqu   (%edx), %xmm1
         pcmpeqb  %xmm1, %xmm0
-        pmovmskb %xmm0, %ecx
-        xor      $65535, %ecx
-        jnz      .L16x_Found
-        add      $16, %eax
-        add      $16, %edx
-        cmp      %eax, %ebx
-        jne      .L16x_Body
+        pmovmskb %xmm0, %ebx
+        inc      %bx
+        jnz      .LVec0Differs
 
 
-        cmp      %ebx, %esi
-        je       .L16x_Nothing
+        sub      $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets, improving .LAligned32xLoop_Body alignment :) }
+        jbe      .LLastVec
 
 
-        sub      %eax, %edx
-        lea      -16(%esi), %eax
-        add      %eax, %edx
-        movdqu   (%edx), %xmm0
-        movdqu   (%eax), %xmm1
+        { Compare second vectors. }
+        movdqu   16(%eax), %xmm0
+        movdqu   16(%edx), %xmm1
         pcmpeqb  %xmm1, %xmm0
-        pmovmskb %xmm0, %ecx
-        xor      $65535, %ecx
-        jnz      .L16x_Found
-.L16x_Nothing:
+        pmovmskb %xmm0, %ebx
+        inc      %bx
+        jnz      .LVec1Differs
+
+        { More than four vectors: aligned loop. }
+        cmp      $32, %ecx
+        ja       .LAligned32xLoop_Prepare
+
+        { Compare last two vectors. }
+        movdqu   (%eax,%ecx), %xmm0
+        movdqu   (%edx,%ecx), %xmm1
+        pcmpeqb  %xmm1, %xmm0
+        pmovmskb %xmm0, %ebx
+        inc      %bx
+        jnz      .LVecEm2Differs
+.LLastVec:
+        movdqu   16(%eax,%ecx), %xmm0
+        movdqu   16(%edx,%ecx), %xmm1
+        pcmpeqb  %xmm1, %xmm0
+        pmovmskb %xmm0, %ebx
+        inc      %bx
+        jnz      .LVecEm1Differs
         pop      %ebx
         xor      %eax, %eax
-        pop      %esi
         ret
 
 
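Every vector block above follows the same pattern: pcmpeqb yields 0xFF per equal byte, pmovmskb packs those into a 16-bit mask (0xFFFF when the vectors match), and the 'inc r16' trick wraps 0xFFFF to zero so one flag test detects equality. A C sketch of one such step (illustrative; __builtin_ctz is a GCC/Clang builtin standing in for bsf):

#include <emmintrin.h>

/* One 16-byte comparison step: returns -1 if equal, else the index of the
   first differing byte (what bsf computes from the inverted mask). */
static int cmp16_first_diff(const void *a, const void *b)
{
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(va, vb));
    if (mask == 0xFFFF)
        return -1;                        /* all 16 bytes equal */
    return __builtin_ctz(~mask & 0xFFFF); /* position of first difference */
}

The .LVecEm2Differs/.LVecEm1Differs paths account for the last two vectors being loaded at len - 32 and len - 16: they may overlap bytes already compared, which is harmless since any difference they report still lies within the buffer.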
-.balign 16
-.L16x_Found:
-        bsf      %ecx, %ecx
+.LVecEm2Differs:
+        sub      $16, %ecx
+.LVecEm1Differs:
+        bsf      %ebx, %ebx
+        add      %ecx, %ebx
+        movzbl   16(%eax,%ebx), %eax
+        movzbl   16(%edx,%ebx), %edx
+        sub      %edx, %eax
         pop      %ebx
-        movzbl   (%eax,%ecx), %eax
-        movzbl   (%edx,%ecx), %edx
-        pop      %esi
+        ret
+        nop      { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
+
+.LAligned32xLoop_Prepare:
+        lea      -32(%eax,%ecx), %ecx { ecx = buffer end - 32 (the last two vectors are handled separately) - another 32 implicitly, because ecx was still len - 32 (the first two vectors are already analyzed) }
+        sub      %eax, %edx { edx = buf2 - buf1 }
+        and      $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
+        sub      %eax, %ecx { ecx = count to be handled with loop }
+.balign 16 { No-op. }
+.LAligned32xLoop_Body:
+        add      $32, %eax
+        { Compare two XMMs, reduce the result with 'and'. }
+        movdqu   (%edx,%eax), %xmm0
+        pcmpeqb  (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
+        movdqu   16(%edx,%eax), %xmm1
+        pcmpeqb  16(%eax), %xmm1
+        pand     %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
+        pmovmskb %xmm1, %ebx
+        inc      %bx
+        jnz      .LAligned32xLoop_TwoVectorsDiffer
+        sub      $32, %ecx
+        ja       .LAligned32xLoop_Body
+
+        { Compare the last two vectors after the loop with one more, slightly modified, loop iteration. }
+        lea      32(%eax,%ecx), %eax
+        movdqu   (%edx,%eax), %xmm0
+        movdqu   (%eax), %xmm2
+        pcmpeqb  %xmm2, %xmm0
+        movdqu   16(%edx,%eax), %xmm1
+        movdqu   16(%eax), %xmm2
+        pcmpeqb  %xmm2, %xmm1
+        pand     %xmm0, %xmm1
+        pmovmskb %xmm1, %ebx
+        inc      %bx
+        jnz      .LAligned32xLoop_TwoVectorsDiffer
+        pop      %ebx
+        xor      %eax, %eax
+        ret
+
+.LAligned32xLoop_TwoVectorsDiffer:
+        add      %eax, %edx { restore edx = buf2 }
+        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
+        inc      %cx
+        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
+        bsf      %ecx, %ebx
+        movzbl   (%eax,%ebx), %eax
+        movzbl   (%edx,%ebx), %edx
+        sub      %edx, %eax
+        pop      %ebx
+        ret
+
+.LVec1Differs:
+        add      $16, %eax
+        add      $16, %edx
+.LVec0Differs:
+        bsf      %ebx, %ebx
+        movzbl   (%eax,%ebx), %eax
+        movzbl   (%edx,%ebx), %edx
         sub      %edx, %eax
+        pop      %ebx
         ret
 
 
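The main loop trades precision for throughput: two pcmpeqb results are folded with pand, so a single pmovmskb/inc pair tests 32 bytes, and only the mismatch path consults the still-live xmm0 to learn which half differed. A rough C model of one iteration (a sketch under the same setup as .LAligned32xLoop_Prepare: buf1 16-byte aligned, buf2 reached through the precomputed edx = buf2 - buf1; the function name is hypothetical):

#include <emmintrin.h>
#include <stddef.h>

/* 32 bytes per iteration: aligned loads from buf1, unaligned from buf2.
   Returns nonzero when all 32 bytes match. */
static int blocks32_equal(const unsigned char *buf1, ptrdiff_t buf2_minus_buf1)
{
    __m128i eq0 = _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)buf1),
                                 _mm_loadu_si128((const __m128i *)(buf1 + buf2_minus_buf1)));
    __m128i eq1 = _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)(buf1 + 16)),
                                 _mm_loadu_si128((const __m128i *)(buf1 + 16 + buf2_minus_buf1)));
    /* pand: a zero byte in either comparison survives into the combined mask. */
    return _mm_movemask_epi8(_mm_and_si128(eq0, eq1)) == 0xFFFF;
}

Because the loop body adds 32 to eax before loading, the first iteration automatically skips the two vectors that were already compared before the loop was entered.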
 .LCantOverReadBoth:
-        mov      %esi, %eax
-        sub      %ebx, %eax
-        and      $-4, %eax
-        add      %ebx, %eax
-        cmp      %eax, %ebx
-        je       .LPopEbxAndGoBytewise
-.L4x_Body:
-        mov      (%ebx), %ecx
-        cmp      (%edx), %ecx
-        jne      .L4x_Found
-        add      $4, %ebx
-        add      $4, %edx
-        cmp      %ebx, %eax
-        jne      .L4x_Body
-.LPopEbxAndGoBytewise:
+        cmp      $3, %ecx
+        jle      .L2to3
+        push     %esi
+        mov      (%eax), %ebx
+        mov      (%edx), %esi
+        cmp      %esi, %ebx
+        jne      .L4xDiffer
+        cmp      $8, %ecx
+        jbe      .LLast4x
+        mov      4(%eax), %ebx
+        mov      4(%edx), %esi
+        cmp      %esi, %ebx
+        jne      .L4xDiffer
+        mov      -8(%eax,%ecx), %ebx
+        mov      -8(%edx,%ecx), %esi
+        cmp      %esi, %ebx
+        jne      .L4xDiffer
+.LLast4x:
+        mov      -4(%eax,%ecx), %ebx
+        mov      -4(%edx,%ecx), %esi
+        cmp      %esi, %ebx
+        jne      .L4xDiffer
+        pop      %esi
         pop      %ebx
-.LBytewise_Test:
-        cmp      %esi, %eax
-        je       .LBytewise_Nothing
-.LBytewise_Body:
-        movzbl   (%edx), %ecx
-        cmp      (%eax), %cl
-        jne      .LDoSbb
-        add      $1, %eax
-        add      $1, %edx
-        cmp      %esi, %eax
-        jne      .LBytewise_Body
-.LBytewise_Nothing:
         xor      %eax, %eax
+        ret
+
+.L4xDiffer:
+        bswap    %ebx
+        bswap    %esi
+        cmp      %esi, %ebx
         pop      %esi
+        sbb      %eax, %eax
+        or       $1, %eax
+        pop      %ebx
         ret
 
 
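The bswap pair in .L4xDiffer converts both dwords to big-endian so that the byte memcmp would examine first becomes the most significant; the unsigned compare plus sbb/or then yields -1 or +1 without branches. A C sketch (little-endian x86 assumed; __builtin_bswap32 is a GCC/Clang builtin, the function name is hypothetical):

#include <stdint.h>
#include <string.h>

/* Called only when the two dwords differ: returns -1 or +1 with
   memcmp semantics. */
static int dword_order(const void *a, const void *b)
{
    uint32_t x, y;
    memcpy(&x, a, 4);             /* unaligned-safe 4-byte loads */
    memcpy(&y, b, 4);
    x = __builtin_bswap32(x);     /* first byte -> most significant */
    y = __builtin_bswap32(y);
    return x < y ? -1 : 1;        /* what 'cmp; sbb %eax,%eax; or $1,%eax' computes */
}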
-.L4x_Found:
-        mov      (%edx), %eax
-        bswap    %ecx
-        bswap    %eax
-        cmp      %ecx, %eax
+.L2to3:
+        movzwl   (%edx), %ebx
+        bswap    %ebx
+        shr      $1, %ebx
+        mov      -1(%edx,%ecx), %bl
+        movzwl   (%eax), %edx
+        bswap    %edx
+        shr      $1, %edx
+        mov      -1(%eax,%ecx), %dl
+        mov      %edx, %eax
+        sub      %ebx, %eax
         pop      %ebx
-.LDoSbb:
+        ret
+
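The .L2to3 block packs 2 or 3 bytes into one word that compares like memcmp would: after the bswap, the first byte sits in the top bits, shr $1 keeps bit 31 clear so the final subtraction cannot overflow, and the trailing byte (which for len = 2 is simply the second byte again) is merged into bits 0..7. An equivalent C sketch (the helper name is hypothetical):

#include <stdint.h>
#include <stddef.h>

/* Build a lexicographically ordered key from 2 or 3 bytes: byte 0 in bits
   23..30, byte 1 in bits 15..22, the last byte in bits 0..7. Keys stay
   below 2^31, so key(a) - key(b) is an overflow-free signed result. */
static int32_t key_2to3(const unsigned char *p, size_t len) /* len = 2 or 3 */
{
    uint32_t k = ((uint32_t)p[0] << 23) | ((uint32_t)p[1] << 15);
    return (int32_t)(k | p[len - 1]);
}

CompareByte for these lengths then reduces to key_2to3(buf1, len) - key_2to3(buf2, len), which is exactly the 'sub %ebx, %eax' above.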
+.L1OrLess:
+        jl       .LUnbounded_Prepare
+        movzbl   (%eax), %eax
+        movzbl   (%edx), %edx
+        sub      %edx, %eax
+        ret
+
+.LUnbounded_Prepare:
+        sub      %eax, %edx { edx = buf2 - buf1 }
+        test     %ecx, %ecx
+        jnz      .LUnbounded_Body
+        xor      %eax, %eax
+        ret
+
+.balign 16
+.LUnbounded_Next:
+        add      $1, %eax
+.LUnbounded_Body:
+        movzbl   (%edx,%eax), %ecx
+        cmp      %cl, (%eax)
+        je       .LUnbounded_Next
         sbb      %eax, %eax
-        and      $2, %eax
-        sub      $1, %eax
-        pop      %esi
+        or       $1, %eax
 end;
 
 
 function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;