Browse Source

Improved CompareWord for i386 and x86_64.

Rika Ichinose 2 years ago
parent
commit
da12cfc867
2 changed files with 145 additions and 73 deletions
  1. 54 73
      rtl/i386/i386.inc
  2. 91 0
      rtl/x86_64/x86_64.inc

+ 54 - 73
rtl/i386/i386.inc

@@ -554,83 +554,64 @@ end;
 {$define FPC_SYSTEM_HAS_COMPAREWORD}
 function CompareWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
 asm  { on entry: eax = @buf1, edx = @buf2, ecx = len counted in 16-bit words; result is returned in eax }
-        cmpl    $32,%ecx          { empirical average value, on a Athlon XP the
-                                    break even is at 14, on a Core 2 Duo > 100 }
-        jg      .LCmpWordFull
-        testl   %ecx,%ecx
-        je      .LCmpWordZero
-
-        pushl   %ebx
-.LCmpWordLoop:
-        movw    (%eax),%bx
-        cmpw    (%edx),%bx
-        leal    2(%eax),%eax
-        leal    2(%edx),%edx
-        jne     .LCmpWordExitFast
-        decl    %ecx
-        jne     .LCmpWordLoop
-.LCmpWordExitFast:
-        movzwl  -2(%edx),%ecx     { Compare last position }
-        movzwl  %bx,%eax
-        subl    %ecx,%eax
-        popl    %ebx
+        sub     %eax, %edx  { edx = buf2 - buf1, so (%edx,%eax) reads buf2 at the same offset at which (%eax) reads buf1 }
+        push    %esi  { esi and ebx are scratch below; both are popped again before every ret }
+        push    %ebx
+        cmp     $1073741823, %ecx  { $3FFFFFFF; the ja below is an unsigned test, so negative len lands on the unbounded path too }
+        ja      .LUnbounded
+        cmp     $3, %ecx
+        lea     (%eax,%ecx,2), %esi  { esi = buf1 + len*2 = end of buf1; lea does not touch the flags set by the cmp above }
+        jle     .LWordwise_Test  { len <= 3: too short for the dword loop, compare word by word }
+        test    $3, %al  { are the low two address bits of buf1 zero, i.e. is buf1 4-byte aligned? }
+        je      .LPtrUintWise_Prepare
+        movzwl  (%edx,%eax), %ebx  { not aligned: compare the first word on its own; ebx = buf2 word }
+        cmp     (%eax), %bx  { flags = buf2 word - buf1 word, consumed by .LDoSbb on mismatch }
+        jne     .LDoSbb
+        add     $2, %eax  { skip the word just compared (reaches a 4-byte boundary when buf1 was 2-byte aligned) }
+        sub     $1, %ecx  { one word fewer remains }
+.LPtrUintWise_Prepare:
+        and     $-2, %ecx  { round the remaining word count down to even, i.e. to whole dwords; still >= 2 because len >= 4 here }
+        lea     (%eax,%ecx,2), %ecx  { ecx = address in buf1 where the dword loop must stop }
+.balign 16
+.LPtrUintWise_Next:
+        mov     (%edx,%eax), %ebx  { ebx = dword (two words) of buf2 }
+        cmp     (%eax), %ebx
+        jne     .LPtrUintsDiffer  { some word inside this dword differs }
+        add     $4, %eax
+        cmp     %eax, %ecx
+        jne     .LPtrUintWise_Next
+.LWordwise_Test:
+        cmp     %esi, %eax  { anything left before the end of buf1? }
+        je      .LNothingFound
+.LWordwise_Body:
+        movzwl  (%edx,%eax), %ecx  { ecx = buf2 word; ecx is free to reuse at this point }
+        cmp     (%eax), %cx  { flags = buf2 word - buf1 word }
+        jne     .LDoSbb
+        add     $2, %eax
+        cmp     %esi, %eax
+        jne     .LWordwise_Body
+.LNothingFound:
+        pop     %ebx
+        xor     %eax, %eax  { all compared words were equal: return 0 }
+        pop     %esi
         ret
 
-.LCmpWordZero:
-        movl    $0,%eax
+.LPtrUintsDiffer:
+        cmp     (%eax), %bx  { low half of ebx is the word at the lower address: check it first }
+        jne     .LDoSbb
+        shr     $16, %ebx  { low words equal, so the high words must differ; bring buf2's high word into bx }
+        cmp     2(%eax), %bx  { flags = buf2 high word - buf1 high word; falls through into .LDoSbb }
+.LDoSbb:
+        sbb     %eax, %eax  { CF=1 here means buf2 word < buf1 word (unsigned): eax = -1 if CF else 0 }
+        and     $2, %eax  { eax = 2 if CF else 0 }
+        sub     $1, %eax  { eax = +1 when buf1 word > buf2 word, -1 otherwise }
+        pop     %ebx
+        pop     %esi
         ret
 
-.LCmpWordFull:
-        pushl   %esi
-        pushl   %edi
-        pushl   %ebx
-{$ifdef FPC_ENABLED_CLD}
-        cld
-{$endif FPC_ENABLED_CLD}
-        movl    %eax,%edi
-        movl    %edx,%esi
-        movl    %ecx,%eax
-        movl    (%edi),%ebx     // Compare alignment bytes.
-        cmpl    (%esi),%ebx
-        jne     .LCmpword2      // Aligning will go wrong already. Max 2 words will be scanned Branch NOW
-        shll    $1,%eax         {Convert word count to bytes}
-        movl    %edi,%edx       { Align comparing is already done, so simply add}
-        negl    %edx            { calc bytes to align  -%edi and 3}
-        andl    $3,%edx
-        addl    %edx,%esi       { Skip max 3 bytes alignment}
-        addl    %edx,%edi
-        subl    %edx,%eax       { Subtract from number of bytes to go}
-        movl    %eax,%ecx       { Make copy of bytes to go}
-        andl    $3,%eax         { Calc remainder (mod 4) }
-        andl    $1,%edx         { %edx is 1 if array not 2-aligned, 0 otherwise}
-        shrl    $2,%ecx         { divide bytes to go by 4, DWords to go}
-        orl     %ecx,%ecx       { Sets zero flag if ecx=0 -> no cmp}
-        repe                    { Compare entire DWords}
-        cmpsl
-        je      .LCmpword2a     { All equal? then to the left over bytes}
-        movl    $4,%eax         { Not equal. Rescan the last 4 bytes bytewise}
-        subl    %eax,%esi       { Go back one DWord}
-        subl    %eax,%edi
-        incl    %eax            {if not odd then this does nothing, else it makes
-                                  sure that adding %edx increases from 2 to 3 words}
-.LCmpword2a:
-        subl    %edx,%esi       { Subtract alignment}
-        subl    %edx,%edi
-        addl    %edx,%eax
-        shrl    $1,%eax
-.LCmpword2:
-        movl    %eax,%ecx       {words still to (re)scan}
-        orl     %eax,%eax       {prevent disaster in case %eax=0}
-        repe
-        cmpsw
-.LCmpword3:
-        movzwl  -2(%esi),%ecx
-        movzwl  -2(%edi),%eax    // Compare failing (or equal) position
-        subl    %ecx,%eax        // calculate end result.
-.LCmpwordExit:
-        popl    %ebx
-        popl    %edi
-        popl    %esi
+.LUnbounded:
+        mov     %eax, %esi  { end marker = start address: the word loop then stops only on a mismatch (or once eax wraps back to its start) }
+        jmp     .LWordwise_Body
 end;
 {$endif FPC_SYSTEM_HAS_COMPAREWORD}
 

+ 91 - 0
rtl/x86_64/x86_64.inc

@@ -854,6 +854,97 @@ end;
 {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
 
 
+{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
+{$define FPC_SYSTEM_HAS_COMPAREWORD}
+function CompareWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
+asm  { compares len 16-bit words of buf1 and buf2; rax = 0 if all equal, else -1/+1 by unsigned order of the first differing word }
+{$ifndef win64}  { non-win64: arguments arrive in rdi, rsi, rdx; move them so the shared code below sees rcx = @buf1, rdx = @buf2, r8 = len }
+    mov      %rdx, %r8
+    mov      %rsi, %rdx
+    mov      %rdi, %rcx
+{$endif win64}
+    mov      %r8, %rax
+    shr      $62, %rax  { non-zero when either of the top two bits of len is set (huge or negative len) }
+    jnz      .LUnbounded
+    lea      (%rcx,%r8,2), %r9  { r9 = buf1 + len*2 = end of buf1 }
+    cmp      $3, %r8
+    jle      .LWordwise_Test  { len <= 3: plain word-by-word compare }
+    and      $-8, %r8  { round len down to a multiple of 8 words = whole 16-byte blocks }
+    lea      (%rcx,%r8,2), %r8  { r8 = address in buf1 where the 16-byte block loop stops }
+    cmp      %r8, %rcx
+    jne      .L8x_Body  { at least one whole block; otherwise (len 4..7) fall into the tail code }
+.L8x_Tail:  { here r8 = current position in buf1, rdx = current position in buf2 }
+    cmp      %r8, %r9
+    je       .LNothing  { no leftover words: buffers are equal }
+    lea      15(%r8), %eax  { (p+15) xor p exceeds 4095 exactly when p and p+15 lie in different 4096-byte blocks }
+    lea      15(%rdx), %ecx  { same test for the buf2 tail; note this clobbers rcx }
+    xor      %r8d, %eax
+    xor      %edx, %ecx
+    or       %ecx, %eax  { combine both tests }
+    cmp      $4095, %eax
+    ja       .LCantOverReadBothTails  { a 16-byte read of one of the tails would cross a 4096-byte boundary (presumably a page that might be unmapped): do not over-read }
+    movdqu   (%r8), %xmm0  { read 16 bytes of each tail, possibly beyond the end of the buffers }
+    movdqu   (%rdx), %xmm2
+    pcmpeqw  %xmm2, %xmm0  { each equal word becomes $FFFF, each differing word 0 }
+    pmovmskb %xmm0, %eax  { one mask bit per byte }
+    xor      $65535, %eax  { invert: set bits now mark differing bytes; zero means all 8 words equal }
+    je       .LNothing
+    bsf      %eax, %eax  { rax = byte offset of the first differing word }
+    add      %rax, %r8  { r8 = address of that word in buf1 }
+    cmp      %r9, %r8
+    jnb      .LNothing  { the difference lies at or past the end of buf1, i.e. in over-read bytes: ignore it }
+    movzwl   (%r8), %ecx  { ecx = buf1 word }
+    cmp      %cx, (%rdx,%rax)  { flags = buf2 word - buf1 word }
+    sbb      %rax, %rax  { rax = -1 if buf2 word < buf1 word (CF=1), else 0 }
+    and      $2, %eax  { 2 or 0 }
+    sub      $1, %rax  { +1 when buf1 word > buf2 word, -1 otherwise }
+    ret
+
+.balign 16
+.L8x_Next:
+    add      $16, %rcx  { advance both buffers by one 16-byte block }
+    add      $16, %rdx
+    cmp      %rcx, %r8
+    je       .L8x_Tail  { whole blocks exhausted: handle the remaining 0..7 words }
+.L8x_Body:
+    movdqu   (%rdx), %xmm0
+    movdqu   (%rcx), %xmm1
+    pcmpeqw  %xmm1, %xmm0  { each equal word becomes $FFFF }
+    pmovmskb %xmm0, %eax
+    xor      $65535, %eax  { set bits mark differing bytes }
+    je       .L8x_Next  { block identical: continue }
+    bsf      %eax, %eax  { rax = byte offset of the first differing word inside the block }
+    movzwl   (%rcx,%rax), %ecx  { ecx = buf1 word (rcx is no longer needed as a pointer) }
+    cmp      %cx, (%rdx,%rax)  { flags = buf2 word - buf1 word }
+.LDoSbb:  { expects CF from a 'buf2 word - buf1 word' compare of two words that differ }
+    sbb      %rax, %rax  { rax = -1 if CF else 0 }
+    and      $2, %eax  { 2 or 0 }
+    sub      $1, %rax  { +1 when buf1 word > buf2 word, -1 otherwise }
+    ret
+
+.LCantOverReadBothTails:
+    mov      %r8, %rcx  { rcx was clobbered by the boundary test; restore the buf1 position from r8 }
+.LWordwise_Body:
+    movzwl   (%rcx), %eax  { eax = buf1 word }
+    cmp      %ax, (%rdx)  { flags = buf2 word - buf1 word }
+    jne      .LDoSbb
+.LWordwise_Next:
+    add      $2, %rcx
+    add      $2, %rdx
+.LWordwise_Test:
+    cmp      %r9, %rcx  { reached the end of buf1? }
+    jne      .LWordwise_Body
+.LNothing:
+    xor      %eax, %eax  { no difference found: return 0 (a 32-bit write also clears the upper half of rax) }
+    ret
+
+.LUnbounded:
+    mov     %rcx, %r9  { end marker = start address: the word loop then stops only on a mismatch (or once rcx wraps back to its start) }
+    jmp     .LWordwise_Body
+end;
+{$endif FPC_SYSTEM_HAS_COMPAREWORD}
+
+
 {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
 { does a thread safe inc/dec }
 function declocked(var l : longint) : boolean;assembler; nostackframe;