Browse Source

Improved CompareByte for i386 and x86_64.

Rika Ichinose 2 years ago
parent
commit
524589231f
2 changed files with 149 additions and 93 deletions
  1. 76 61
      rtl/i386/i386.inc
  2. 73 32
      rtl/x86_64/x86_64.inc

+ 76 - 61
rtl/i386/i386.inc

@@ -466,71 +466,86 @@ end;
 {$define FPC_SYSTEM_HAS_COMPAREBYTE}
 function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
 asm
-        cmpl    $57,%ecx          { empirically determined value on a Core 2 Duo Conroe }
-        jg      .LCmpbyteFull
-        testl   %ecx,%ecx
-        je      .LCmpbyteZero
-
-        pushl   %ebx
-.LCmpbyteLoop:
-        movb    (%eax),%bl
-        cmpb    (%edx),%bl
-        leal    1(%eax),%eax
-        leal    1(%edx),%edx
-        jne     .LCmpbyteExitFast
-        decl    %ecx
-        jne     .LCmpbyteLoop
-.LCmpbyteExitFast:
-        movzbl  -1(%edx),%ecx     { Compare last position }
-        movzbl  %bl,%eax
-        subl    %ecx,%eax
-        popl    %ebx
+        sub     %eax, %edx        { the code uses eax = buf1, edx = buf2, ecx = len; edx becomes buf2 - buf1 so (reg,%edx) is the buf2 byte paired with buf1 address reg }
+        cmp     $6, %ecx          { flags from this compare are consumed by the jle below (push and lea leave them untouched) }
+        push    %esi
+        lea     (%eax,%ecx), %esi { esi = buf1 + len, the end of buf1 }
+        jle     .LBytewiseTail_Prepare { len <= 6 (signed): compare everything byte by byte }
+        push    %ebx
+        lea     3(%eax), %ebx
+        and     $-4, %ebx         { ebx = buf1 rounded up to a multiple of 4 }
+        cmp     %ebx, %eax
+        jne     .LBytewiseHead_Body { buf1 is not 4-aligned: compare the leading bytes up to ebx one at a time }
+.L4x_Prepare:
+        mov     %esi, %eax
+        and     $-4, %eax         { eax = end of buf1 rounded down to a multiple of 4 = stop address of the dword loop }
+        jmp     .L4x_Body
+
+.balign 16
+.L4x_Next:
+        add     $4, %ebx
+        cmp     %ebx, %eax
+        je      .LBytewiseTail_PrepareFromHeadAnd4x { dword loop finished; leftover bytes are handled by the tail loop }
+.L4x_Body:
+        mov     (%ebx,%edx), %ecx { ecx = dword from buf2 }
+        cmp     %ecx, (%ebx)      { compare with the dword at the same offset in buf1 }
+        je      .L4x_Next
+        mov     (%ebx), %eax      { mismatch: eax = dword from buf1 }
+{$ifdef CPUX86_HAS_BSWAP}
+        bswap   %ecx              { reverse byte order so the first byte in memory becomes the most significant }
+{$else}
+        rol     $8, %cx           { three rotates perform the same byte reversal without BSWAP }
+        rol     $16, %ecx
+        rol     $8, %cx
+{$endif}
+        pop     %ebx
+        pop     %esi
+{$ifdef CPUX86_HAS_BSWAP}
+        bswap   %eax
+{$else}
+        rol     $8, %ax
+        rol     $16, %eax
+        rol     $8, %ax
+{$endif}
+        cmp     %eax, %ecx        { CF = 1 when the reversed buf2 dword is below the reversed buf1 dword (unsigned) }
+        sbb     %eax, %eax        { eax = -1 if CF was set, otherwise 0 }
+        and     $2, %eax          { eax = 2 or 0 }
+        sub     $1, %eax          { result: +1 when buf1 is greater, -1 otherwise }
         ret
 
-.LCmpbyteZero:
-        movl    $0,%eax
+.LBytewiseHead_Next:
+        add     $1, %eax
+        cmp     %eax, %ebx
+        je      .L4x_Prepare      { reached the 4-aligned address: switch to the dword loop }
+.LBytewiseHead_Body:
+        movzbl  (%eax,%edx), %ecx { ecx = byte from buf2 }
+        cmp     (%eax), %cl
+        je      .LBytewiseHead_Next
+        pop     %ebx              { restore ebx here; the shared mismatch exit only pops esi }
+        jmp     .LBytesDiffer
+
+.LBytewiseTail_PrepareFromHeadAnd4x:
+        pop     %ebx
+.LBytewiseTail_Prepare:
+        cmp     %esi, %eax
+        jne     .LBytewiseTail_Body
+.LNothingFound:
+        xor     %eax, %eax        { every byte matched (or nothing to compare): return 0 }
+        pop     %esi
         ret
 
-.LCmpbyteFull:
-        pushl   %esi
-        pushl   %edi
-{$ifdef FPC_ENABLED_CLD}
-        cld
-{$endif FPC_ENABLED_CLD}
-        movl    %eax,%edi
-        movl    %edx,%esi
-        movl    %ecx,%eax
-
-        movl    %edi,%ecx       { Align on 32bits }
-        negl    %ecx            { calc bytes to align   (%edi and 3) xor 3= -%edi and 3 }
-        andl    $3,%ecx
-        subl    %ecx,%eax       { Subtract from number of bytes to go }
-        orl     %ecx,%ecx
-        repe
-        cmpsb                   { The actual 32-bit Aligning }
-        jne     .LCmpbyte3
-        movl    %eax,%ecx       { bytes to do, divide by 4 }
-        andl    $3,%eax         { remainder }
-        shrl    $2,%ecx         {  The actual division }
-        orl     %ecx,%ecx       { Sets zero flag if ecx=0 -> no cmp }
-        repe
-        cmpsl
-        je      .LCmpbyte2      { All equal? then to the left over bytes }
-        movl    $4,%eax         { Not equal. Rescan the last 4 bytes bytewise }
-        subl    %eax,%esi
-        subl    %eax,%edi
-.LCmpbyte2:
-        movl    %eax,%ecx       { bytes still to (re)scan }
-        orl     %eax,%eax       { prevent disaster in case %eax=0 }
-        repe
-        cmpsb
-.LCmpbyte3:
-        movzbl  -1(%esi),%ecx
-        movzbl  -1(%edi),%eax   { Compare failing (or equal) position }
-        subl    %ecx,%eax
-.LCmpbyteExit:
-        popl    %edi
-        popl    %esi
+.LBytewiseTail_Next:
+        add     $1, %eax
+        cmp     %eax, %esi
+        je      .LNothingFound
+.LBytewiseTail_Body:
+        movzbl  (%eax,%edx), %ecx { ecx = byte from buf2 }
+        cmp     (%eax), %cl
+        je      .LBytewiseTail_Next
+.LBytesDiffer:
+        movzbl  (%eax), %eax      { eax = differing byte from buf1; ecx already holds the buf2 byte }
+        pop     %esi
+        sub     %ecx, %eax        { result = buf1 byte - buf2 byte; no explicit ret follows - NOTE(review): presumably the compiler emits the return at "end" - confirm }
 end;
 {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
 

+ 73 - 32
rtl/x86_64/x86_64.inc

@@ -636,39 +636,80 @@ function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackfra
   linux: rdi buf, rsi buf, rdx len }
 asm
 {$ifndef win64}
-    mov    %rdx, %r8
-    mov    %rsi, %rdx
-    mov    %rdi, %rcx
+    mov      %rdx, %r8            { r8 = len }
+    mov      %rsi, %rdx           { rdx = buf2 }
+    mov      %rdi, %rcx           { rcx = buf1; from here both ABIs use rcx = buf1, rdx = buf2, r8 = len }
 {$endif win64}
-    negq    %r8
-    jz      .LCmpbyteZero
-
-    subq    %r8, %rcx
-    subq    %r8, %rdx
-
-    .balign 16
-.LCmpbyteLoop:
-{$ifdef oldbinutils}
-// for the reason why this alternate coding of movzbl is given here
-// see the comments in FillChar above
-    .byte 0x42,0x0F,0xB6,0x04,0x01
-{$else}
-    movzbl  (%rcx,%r8), %eax
-{$endif}    cmpb    (%rdx,%r8), %al
-    jne     .LCmpbyteExitFast
-    incq     %r8
-    jne     .LCmpbyteLoop
-.LCmpbyteZero:
-     xorl    %eax, %eax
-     retq
-
-.LCmpbyteExitFast:
-{$ifdef oldbinutils}
-    .byte 0x42,0x0F,0xB6,0x0C,0x02
-{$else}
-     movzbl  (%rdx,%r8), %ecx    { Compare last position }
-{$endif}
-     subq    %rcx, %rax
+    sub      %rcx, %rdx           { rdx = buf2 - buf1, so (%rcx,%rdx) is the buf2 byte paired with buf1 address rcx }
+    lea      (%rcx,%r8), %r9      { r9 = buf1 + len, the end of buf1 }
+    cmp      $15, %r8
+    jle      .LLessThanXMM        { fewer than 16 bytes (signed compare): skip the 16-byte loop }
+    and      $-16, %r8            { r8 = len rounded down to a multiple of 16 }
+    lea      (%rcx,%r8), %rax     { rax = stop address of the 16-byte loop }
+    jmp      .L16x_Body
+
+.balign 16
+.L16x_Next:
+    add      $16, %rcx
+    cmp      %rcx, %rax
+    je       .L4x_PrepareAfter16x
+.L16x_Body:
+    movdqu   (%rcx,%rdx), %xmm0   { 16 bytes from buf2 (unaligned load) }
+    movdqu   (%rcx), %xmm1        { 16 bytes from buf1 (unaligned load) }
+    pcmpeqb  %xmm1, %xmm0         { each byte lane becomes $FF where equal, 0 where different }
+    pmovmskb %xmm0, %r8d          { r8d = one bit per lane, 1 = bytes equal }
+    xor      $65535, %r8d         { flip the 16 mask bits: 1 = bytes differ; ZF is set when all 16 matched }
+    je       .L16x_Next
+    bsf      %r8d, %r8d           { r8 = index of the first differing byte }
+    add      %rcx, %rdx           { rdx = current address inside buf2 }
+    movzbl   (%rcx,%r8), %eax     { eax = differing byte from buf1 }
+    movzbl   (%rdx,%r8), %edx     { edx = differing byte from buf2 }
+    sub      %rdx, %rax           { result = buf1 byte - buf2 byte }
+    ret
+
+.L4x_PrepareAfter16x:
+    mov      %r9, %r8
+    sub      %rcx, %r8            { r8 = bytes left after the 16-byte loop }
+.LLessThanXMM:
+    cmp      $3, %r8
+    jle      .LBytewiseTail_Prepare { fewer than 4 bytes left: go straight to the bytewise tail }
+.L4x_Prepare:
+    and      $-4, %r8
+    add      %rcx, %r8            { r8 = stop address of the dword loop }
+    cmp      %r8, %rcx
+    jne      .L4x_Body            { NOTE(review): r8 was at least 4 after the cmp $3 check above, so this branch looks always taken - confirm }
+    jmp      .LBytewiseTail_Prepare
+
+.L4x_Next:
+    add      $4, %rcx
+    cmp      %rcx, %r8
+    je       .LBytewiseTail_Prepare
+.L4x_Body:
+    mov      (%rcx,%rdx), %r10d   { r10d = dword from buf2 }
+    mov      (%rcx), %eax         { eax = dword from buf1 }
+    cmp      %r10d, %eax
+    je       .L4x_Next
+    bswap    %eax                 { reverse byte order so the first byte in memory becomes the most significant }
+    bswap    %r10d
+    sub      %r10, %rax           { 64-bit difference of two zero-extended 32-bit values; its sign gives the ordering }
+    ret
+
+.LBytewiseTail_Prepare:
+    cmp      %rcx, %r9
+    jne      .LBytewiseTail_Body
+    xor      %eax, %eax           { nothing left to compare: return 0 }
+    ret
+
+.LBytewiseTail_Next:
+    add      $1, %rcx
+    cmp      %rcx, %r9
+    je       .LReturnRAX          { end reached; rax is still 0 from the last sub }
+.LBytewiseTail_Body:
+    movzbl   (%rcx,%rdx), %r8d    { r8d = byte from buf2 }
+    movzbl   (%rcx), %eax         { eax = byte from buf1 }
+    sub      %r8, %rax            { rax = buf1 byte - buf2 byte; ZF set when they match }
+    je       .LBytewiseTail_Next
+.LReturnRAX:                      { result is already in rax; no explicit ret follows - NOTE(review): presumably the compiler emits the return at "end" - confirm }
 end;
 {$endif FPC_SYSTEM_HAS_COMPAREBYTE}