Parcourir la source

Post-modern CompareByte for x86-64/SSE2.

Rika Ichinose il y a 2 ans
Parent
commit
c07f36b30b
1 fichier modifié avec 166 ajouts et 76 suppressions
  1. 166 76
      rtl/x86_64/x86_64.inc

+ 166 - 76
rtl/x86_64/x86_64.inc

@@ -646,108 +646,198 @@ asm
     mov      %rsi, %rdx
     mov      %rsi, %rdx
     mov      %rdi, %rcx
     mov      %rdi, %rcx
 {$endif win64}
 {$endif win64}
-    lea      (%rcx,%r8), %r10 { r10 = buf1 end }
-    cmp      $3, %r8
-    jle      .LBytewise_Test
-    mov      %r8, %r9
-    and      $-16, %r9
-    add      %rcx, %r9 { r9 = end of full XMMs in buf1 }
-    cmp      %r9, %rcx
-    jne      .L16x_Body
-    lea      15(%r9), %eax { check if tails don't cross page boundaries and can be over-read to XMMs }
-    lea      15(%rdx), %ecx
-    xor      %r9d, %eax
-    xor      %edx, %ecx
-    or       %ecx, %eax
-    cmp      $4095, %eax
-    ja       .L4x_Prepare
-    movdqu   (%r9), %xmm0
+    { rcx = buf1, rdx = buf2, r8 = len }
+    cmp      $1, %r8
+    jle      .L1OrLess
+
+    cmp      $16, %r8
+    jae      .LVecOrMore
+
+    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but is faster. }
+    mov      %ecx, %eax
+    or       %edx, %eax
+    and      $4095, %eax
+    cmp      $4080, %eax
+    ja       .LCantOverReadBoth
+
+    { Over-read both as XMMs. }
+    movdqu   (%rcx), %xmm0
     movdqu   (%rdx), %xmm1
     movdqu   (%rdx), %xmm1
     pcmpeqb  %xmm1, %xmm0
     pcmpeqb  %xmm1, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
-    xor      $65535, %eax
-    jz       .L16x_Nothing
-    bsf      %eax, %ecx
-    add      %rcx, %r9
-    cmp      %r10, %r9 { ignore over-read garbage bytes }
-    jnb      .L16x_Nothing
-    movzbl   (%r9), %eax
-    movzbl   (%rdx,%rcx), %edx
+    inc      %ax
+    jz       .LNothing
+    bsf      %eax, %eax
+    cmp      %r8d, %eax { Ignore garbage beyond 'len'. }
+    jae      .LNothing
+    movzbl   (%rdx,%rax), %edx
+    movzbl   (%rcx,%rax), %eax
     sub      %rdx, %rax
     sub      %rdx, %rax
     ret
     ret
 
 
 .balign 16
 .balign 16
-.L16x_Body:
-    movdqu   (%rdx), %xmm0
-    movdqu   (%rcx), %xmm1
+.LNothing:
+    xor      %eax, %eax
+    ret
+
+.LAligned32xLoop_TwoVectorsDiffer:
+    add      %rcx, %rdx { restore rdx = buf2 }
+    pmovmskb %xmm0, %r8d { Is there a difference in the first vector? }
+    inc      %r8w
+    jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, eax = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
+    mov      %r8d, %eax
+.LVec0Differs:
+    bsf      %eax, %eax
+    movzbl   (%rdx,%rax), %edx
+    movzbl   (%rcx,%rax), %eax
+    sub      %rdx, %rax
+    ret
+    .byte    0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
+
+.LVecOrMore:
+    { Compare first vectors. }
+    movdqu   (%rcx), %xmm0
+    movdqu   (%rdx), %xmm1
     pcmpeqb  %xmm1, %xmm0
     pcmpeqb  %xmm1, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
-    xor      $65535, %eax
-    jnz      .L16x_Found
-    add      $16, %rcx
-    add      $16, %rdx
-    cmp      %rcx, %r9
-    jne      .L16x_Body
+    inc      %ax
+    jnz      .LVec0Differs
 
 
-    cmp      %r9, %r10
-    je       .L16x_Nothing
+    sub      $32, %r8
+    jbe      .LLastVec
 
 
-    sub      %rcx, %rdx
-    lea      -16(%r10), %rcx
-    add      %rcx, %rdx
-    movdqu   (%rdx), %xmm0
-    movdqu   (%rcx), %xmm1
+    { Compare second vectors. }
+    movdqu   16(%rcx), %xmm0
+    movdqu   16(%rdx), %xmm1
     pcmpeqb  %xmm1, %xmm0
     pcmpeqb  %xmm1, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
-    xor      $65535, %eax
-    jnz      .L16x_Found
-.L16x_Nothing:
+    inc      %ax
+    jnz      .LVec1Differs
+
+    cmp      $32, %r8
+    jbe      .LLastTwoVectors
+
+    { More than four vectors: aligned loop. }
+    lea      -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact r8 was still len - 32). }
+    sub      %rcx, %rdx { rdx = buf2 - buf1 }
+    and      $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
+    sub      %rcx, %r8 { r8 = count to be handled with loop }
+.balign 16 { no-op }
+.LAligned32xLoop_Body:
+    add      $32, %rcx
+    { Compare two XMMs, reduce the result with 'and'. }
+    movdqu   (%rdx,%rcx), %xmm0
+    pcmpeqb  (%rcx), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
+    movdqu   16(%rdx,%rcx), %xmm1
+    pcmpeqb  16(%rcx), %xmm1
+    pand     %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
+    pmovmskb %xmm1, %eax
+    inc      %ax
+    jnz      .LAligned32xLoop_TwoVectorsDiffer
+    sub      $32, %r8
+    ja       .LAligned32xLoop_Body
+    add      %rcx, %rdx { restore rdx = buf2 }
+    add      $32, %r8
+.LLastTwoVectors:
+    movdqu   (%rcx,%r8), %xmm0
+    movdqu   (%rdx,%r8), %xmm1
+    pcmpeqb  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax
+    inc      %ax
+    jnz      .LVecEm2Differs
+.LLastVec:
+    movdqu   16(%rcx,%r8), %xmm0
+    movdqu   16(%rdx,%r8), %xmm1
+    pcmpeqb  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax
+    inc      %ax
+    jnz      .LVecEm1Differs
     xor      %eax, %eax
     xor      %eax, %eax
     ret
     ret
 
 
-.balign 16
-.L16x_Found:
+.LVec1Differs:
+    xor      %r8d, %r8d
+.LVecEm1Differs:
+    add      $16, %r8
+.LVecEm2Differs:
     bsf      %eax, %eax
     bsf      %eax, %eax
+    add      %r8, %rax
     movzbl   (%rdx,%rax), %edx
     movzbl   (%rdx,%rax), %edx
     movzbl   (%rcx,%rax), %eax
     movzbl   (%rcx,%rax), %eax
     sub      %rdx, %rax
     sub      %rdx, %rax
     ret
     ret
 
 
-.L4x_Prepare:
-    and      $12, %r8d { count to be handled with uint32s for 1 <= len <= 15: len mod 16 div 4 * 4 = len and %1100 = len and 12 }
-    lea      (%r9,%r8), %rcx
-    cmp      %rcx, %r9
-    je       .LBytewise_Body
-.L4x_Body:
-    mov      (%r9), %eax
-    mov      (%rdx), %r8d
-    cmp      %r8d, %eax
-    jne      .L4x_Found
-    add      $4, %r9
-    add      $4, %rdx
-    cmp      %r9, %rcx
-    jne      .L4x_Body
-.LBytewise_Test:
-    cmp      %r10, %rcx
-    je       .LNothing
-.LBytewise_Body:
-    movzbl   (%rcx), %eax
-    movzbl   (%rdx), %r8d
-    sub      %r8, %rax
-    jne      .LReturnRAX
-    add      $1, %rcx
-    add      $1, %rdx
-    cmp      %r10, %rcx
-    jne      .LBytewise_Body
-.LNothing:
+.LCantOverReadBoth:
+    cmp      $8, %r8d
+    ja       .L9to15
+    cmp      $3, %r8d
+    jle      .L2to3
+    mov      (%rcx), %eax
+    mov      (%rdx), %r9d
+    cmp      %r9d, %eax
+    jne      .L4xOr8xDiffer
+    mov      -4(%rcx,%r8), %eax
+    mov      -4(%rdx,%r8), %r9d
+    cmp      %r9d, %eax
+    jne      .L4xOr8xDiffer
     xor      %eax, %eax
     xor      %eax, %eax
-.LReturnRAX:
     ret
     ret
 
 
-.L4x_Found:
-    bswap    %r8d
+.L9to15:
+    mov      (%rcx), %rax
+    mov      (%rdx), %r9
+    cmp      %r9, %rax
+    jne      .L4xOr8xDiffer
+    mov      -8(%rcx,%r8), %rax
+    mov      -8(%rdx,%r8), %r9
+    cmp      %r9, %rax
+    jne      .L4xOr8xDiffer
+    xor      %eax, %eax
+    ret
+
+.L4xOr8xDiffer:
+    bswap    %r9
+    bswap    %rax
+    cmp      %r9, %rax
+    sbb      %rax, %rax
+    or       $1, %rax
+    ret
+
+.L2to3:
+    movzwl   (%rcx), %eax
     bswap    %eax
     bswap    %eax
-    sub      %r8, %rax
+    shr      $1, %eax
+    mov      -1(%rcx,%r8), %al
+    movzwl   (%rdx), %ecx
+    bswap    %ecx
+    shr      $1, %ecx
+    mov      -1(%rdx,%r8), %cl
+    sub      %rcx, %rax
+    ret
+
+.L1OrLess:
+    jl       .LUnbounded_Prepare
+    movzbl   (%rcx), %eax
+    movzbl   (%rdx), %edx
+    sub      %rdx, %rax
+    ret
+
+.LUnbounded_Prepare:
+    sub      %rcx, %rdx { rdx = buf2 - buf1 }
+    test     %r8, %r8
+    jnz      .LUnbounded_Body
+    xor      %eax, %eax
+    ret
+
+.balign 16
+.LUnbounded_Next:
+    add      $1, %rcx
+.LUnbounded_Body:
+    movzbl   (%rdx,%rcx), %eax
+    cmp      %al, (%rcx)
+    je       .LUnbounded_Next
+    sbb      %rax, %rax
+    or       $1, %rax
 end;
 end;
 {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
 {$endif FPC_SYSTEM_HAS_COMPAREBYTE}