Browse Source

Faster path for IndexByte with a match at the beginning.

Rika Ichinose 1 year ago
parent
commit
ca0e04a346
2 changed files with 97 additions and 35 deletions
  1. rtl/i386/i386.inc (+46 −17)
  2. rtl/x86_64/x86_64.inc (+51 −18)

+ 46 - 17
rtl/i386/i386.inc

@@ -786,31 +786,38 @@ function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (con
 asm
 asm
         test      %edx, %edx
         test      %edx, %edx
         jz        .Lnotfound                 { exit if len=0 }
         jz        .Lnotfound                 { exit if len=0 }
-        push      %ebx
+
         movd      %ecx, %xmm1
         movd      %ecx, %xmm1
-        lea       16(%eax), %ecx             { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
+        mov       %eax, %ecx
         punpcklbw %xmm1, %xmm1
         punpcklbw %xmm1, %xmm1
-        and       $-0x10, %ecx               { first aligned address after buf }
         punpcklbw %xmm1, %xmm1
         punpcklbw %xmm1, %xmm1
+        and       $4095, %ecx
         pshufd    $0, %xmm1, %xmm1
         pshufd    $0, %xmm1, %xmm1
-        movdqa    -16(%ecx), %xmm0           { Fetch first 16 bytes (up to 15 bytes before target) }
-        sub       %eax, %ecx                 { ecx=number of valid bytes, eax=original ptr }
 
 
-        pcmpeqb   %xmm1, %xmm0               { compare with pattern and get bitmask }
-        pmovmskb  %xmm0, %ebx
+        cmp       $4080, %ecx
+        ja        .LCrossPage
 
 
-        shl       %cl, %ebx                  { shift valid bits into high word }
-        and       $0xffff0000, %ebx          { clear low word containing invalid bits }
-        shr       %cl, %ebx                  { shift back }
-        jz        .Lcontinue
-.Lmatch:
-        bsf       %ebx, %ebx
-        lea       -16(%ecx,%ebx), %eax
-        pop       %ebx
-        cmp       %eax, %edx                 { check against the buffer length }
-        jbe       .Lnotfound
+        movdqu    (%eax), %xmm0              { Analyze first 16 bytes, unaligned. }
+        pcmpeqb   %xmm1, %xmm0
+        pmovmskb  %xmm0, %ecx
+        test      %ecx, %ecx
+        jz        .LContinueAligned
+
+        bsf       %ecx, %eax
+        cmp       %edx, %eax
+        jae       .Lnotfound
         ret
         ret
 
 
+        .byte     144                        { Make .balign 16 before .Lloop a no-op. }
+.LContinueAligned:
+        cmp       $16, %edx                  { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
+        jbe       .Lnotfound                 { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
+
+        push      %ebx
+        lea       16(%eax), %ecx             { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
+        and       $-0x10, %ecx               { first aligned address after buf }
+        sub       %eax, %ecx                 { ecx=number of valid bytes, eax=original ptr }
+
     .balign 16
     .balign 16
 .Lloop:
 .Lloop:
         movdqa    (%eax,%ecx), %xmm0         { eax and ecx may have any values, }
         movdqa    (%eax,%ecx), %xmm0         { eax and ecx may have any values, }
@@ -825,6 +832,28 @@ asm
         pop       %ebx
         pop       %ebx
 .Lnotfound:
 .Lnotfound:
         or        $-1, %eax
         or        $-1, %eax
+        ret
+
+.LCrossPage:
+        push      %ebx
+        lea       16(%eax), %ecx             { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
+        and       $-0x10, %ecx               { first aligned address after buf }
+        movdqa    -16(%ecx), %xmm0           { Fetch first 16 bytes (up to 15 bytes before target) }
+        sub       %eax, %ecx                 { ecx=number of valid bytes, eax=original ptr }
+
+        pcmpeqb   %xmm1, %xmm0               { compare with pattern and get bitmask }
+        pmovmskb  %xmm0, %ebx
+
+        shl       %cl, %ebx                  { shift valid bits into high word }
+        and       $0xffff0000, %ebx          { clear low word containing invalid bits }
+        shr       %cl, %ebx                  { shift back }
+        jz        .Lcontinue
+.Lmatch:
+        bsf       %ebx, %ebx
+        lea       -16(%ecx,%ebx), %eax
+        pop       %ebx
+        cmp       %eax, %edx                 { check against the buffer length }
+        jbe       .Lnotfound
 end;
 end;
 
 
 {$ifndef CPUX86_HAS_SSE2}
 {$ifndef CPUX86_HAS_SSE2}

+ 51 - 18
rtl/x86_64/x86_64.inc

@@ -595,34 +595,42 @@ function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackfram
 asm
 asm
     test   len, len
     test   len, len
     jz     .Lnotfound                  { exit if len=0 }
     jz     .Lnotfound                  { exit if len=0 }
+
     movd   {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
     movd   {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
+    mov    {$ifdef win64} %ecx {$else} %edi {$endif}, %eax
+    punpcklbw  %xmm1, %xmm1
+    punpcklbw  %xmm1, %xmm1
+    and    $4095, %eax
+    pshufd $0, %xmm1, %xmm1
+
+    cmp    $4080, %eax
+    ja     .LCrossPage
+
+    movdqu    ({$ifdef win64} %rcx {$else} %rdi {$endif}), %xmm0 { Analyze first 16 bytes, unaligned. }
+    pcmpeqb   %xmm1, %xmm0
+    pmovmskb  %xmm0, %eax
+    test      %eax, %eax
+    jz        .LContinueAligned
+
+    bsf    %eax, %eax
+    cmp    len, %rax
+    jae    .Lnotfound
+    ret
+
+    .byte  {$ifndef win64}102,102,102,102,{$endif}102,102,102,102,102,102,102,102,102,144 { Make .balign 16 before .Lloop a no-op. }
+.LContinueAligned:
+    cmp    $16, len                    { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
+    jbe    .Lnotfound                  { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
+
 {$ifdef win64}
 {$ifdef win64}
     mov    %rcx, %r8                   { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
     mov    %rcx, %r8                   { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
     add    $16, %rcx
     add    $16, %rcx
 {$else}
 {$else}
     lea    16(%rdi), %rcx              { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
     lea    16(%rdi), %rcx              { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
 {$endif}
 {$endif}
-    punpcklbw  %xmm1, %xmm1
     and    $-0x10, %rcx                { first aligned address after buf }
     and    $-0x10, %rcx                { first aligned address after buf }
-    punpcklbw  %xmm1, %xmm1
-    pshufd $0, %xmm1, %xmm1
-    movdqa -16(%rcx), %xmm0            { Fetch first 16 bytes (up to 15 bytes before target) }
     sub    {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
     sub    {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
 
 
-    pcmpeqb %xmm1, %xmm0               { compare with pattern and get bitmask }
-    pmovmskb %xmm0, %eax
-
-    shl    %cl, %eax                   { shift valid bits into high word }
-    and    $0xffff0000, %eax           { clear low word containing invalid bits }
-    shr    %cl, %eax                   { shift back }
-    jz     .Lcontinue
-.Lmatch:
-    bsf    %eax, %eax
-    lea    -16(%rcx,%rax), %rax
-    cmp    %rax, len                   { check against the buffer length }
-    jbe    .Lnotfound
-    ret
-
     .balign 16
     .balign 16
 .Lloop:
 .Lloop:
     movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
     movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
@@ -636,6 +644,31 @@ asm
     ja     .Lloop
     ja     .Lloop
 .Lnotfound:
 .Lnotfound:
     or     $-1, %rax
     or     $-1, %rax
+    ret
+
+.LCrossPage:
+{$ifdef win64}
+    mov    %rcx, %r8                   { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
+    add    $16, %rcx
+{$else}
+    lea    16(%rdi), %rcx              { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
+{$endif}
+    and    $-0x10, %rcx                { first aligned address after buf }
+    movdqa -16(%rcx), %xmm0            { Fetch first 16 bytes (up to 15 bytes before target) }
+    sub    {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
+
+    pcmpeqb %xmm1, %xmm0               { compare with pattern and get bitmask }
+    pmovmskb %xmm0, %eax
+
+    shl    %cl, %eax                   { shift valid bits into high word }
+    and    $0xffff0000, %eax           { clear low word containing invalid bits }
+    shr    %cl, %eax                   { shift back }
+    jz     .Lcontinue
+.Lmatch:
+    bsf    %eax, %eax
+    lea    -16(%rcx,%rax), %rax
+    cmp    %rax, len                   { check against the buffer length }
+    jbe    .Lnotfound
 end;
 end;
 {$endif FPC_SYSTEM_HAS_INDEXBYTE}
 {$endif FPC_SYSTEM_HAS_INDEXBYTE}