2
0
Эх сурвалжийг харах

+ x86_64 optimized assembler functions IndexByte and IndexWord
+ Extended tests of IndexByte with checks to verify correct operation when passed length is -1.

git-svn-id: trunk@17281 -

sergei 14 жил өмнө
parent
commit
c5e7902e4b

+ 149 - 0
rtl/x86_64/x86_64.inc

@@ -458,7 +458,156 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
   end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}
 
+{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
+{ based on libc/sysdeps/x86_64/memchr.S }
+{$define FPC_SYSTEM_HAS_INDEXBYTE}
+function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
+{ Returns the index of the first byte equal to b among the first len bytes
+  of buf, or -1 when there is no such byte or when len=0.  len is compared
+  as an unsigned quantity, so len=-1 is the largest possible limit.
+
+  The buffer is scanned in 16-byte chunks starting at buf rounded down to a
+  16-byte boundary; matches located before buf, or at or after buf+len, are
+  discarded.
+
+  Arguments on entry:
+    win64: rcx buf, rdx len, r8b b
+    linux: rdi buf, rsi len, dl  b
+  Register use after the prologue:
+    rax   buf rounded down to a multiple of 16
+    rcx   misalignment (buf-rax), later the bit mask of matches
+    rdx   len+misalignment, i.e. the limit measured from rax
+    r8    all-ones helper value, later the offset of the next chunk from rax
+    r9    saved misalignment
+    xmm1  b replicated into all 16 bytes }
+asm
+{$ifdef win64}
+    movd   %r8d, %xmm1
+{$else}
+    movd   %edx, %xmm1
+    movq   %rdi, %rcx
+    movq   %rsi, %rdx
+{$endif}
+    mov    %rcx, %rax                  { duplicate buf }
+    punpcklbw  %xmm1, %xmm1            { b -> 2 copies }
+    and    $0xfffffffffffffff0, %rax   { rax=buf rounded down to 16 bytes }
+    test   %rdx, %rdx
+    punpcklbw  %xmm1, %xmm1            { 2 -> 4 copies, flags are left untouched }
+    jz     .L3                         { exit if len=0 }
+    or     $-1, %r8                    { r8=-1 in all 64 bits: a 32-bit 'orl' would zero the }
+                                       {   upper half and cmovb below would load 4G-1, not -1 }
+    movdqa (%rax), %xmm0               { Fetch first 16 bytes (up to 15 bytes before target) }
+    pshufd $0, %xmm1, %xmm1            { 4 -> 16 copies }
+    sub    %rax, %rcx                  { rcx=misalignment }
+    pcmpeqb %xmm1, %xmm0
+    add    %rcx, %rdx                  { add misalignment to length }
+    cmovb  %r8, %rdx                   { if it overflows (happens when length=-1), set back to -1, }
+                                       {   otherwise loop will terminate too early }
+    mov    %rcx, %r9                   { and save it, will subtract back in the end }
+    shl    %cl, %r8d                   { r8d=mask with the low 'misalignment' bits cleared }
+    pmovmskb %xmm0, %ecx
+    andl   %r8d, %ecx                  { mask away matches before buffer start }
+    movl   $16, %r8d                   { r8=offset of the next chunk; mov keeps flags of andl }
+    jnz    .L1                         { got a match within buffer -> we're done (almost) }
+    cmpq   %r8, %rdx
+    jbe    .L3                         { limit lies within the first chunk -> not found }
+
+    .balign 16
+.L2:
+    movdqa (%rax,%r8), %xmm0           { aligned load of the next 16 bytes }
+    lea    16(%r8), %r8                { advance the offset without touching flags }
+    pcmpeqb %xmm1, %xmm0
+    pmovmskb %xmm0, %ecx
+    test   %ecx, %ecx
+    jnz    .L1
+    cmp    %r8, %rdx
+    ja     .L2                         { continue while the limit is beyond the scanned part }
+
+.L3:
+    or    $-1, %rax                    { not found: result is -1 }
+    jmp   .Ldone
+
+.L1:
+    bsfl   %ecx, %ecx                  { compute position of the first match }
+    lea    -16(%rcx,%r8), %rax         { rax=match offset from the aligned start }
+    cmp    %rax, %rdx
+    jbe    .L3                         { if it is after the specified length, ignore it }
+    sub    %r9, %rax                   { make the offset relative to buf }
+.Ldone:
+end;
+{$endif FPC_SYSTEM_HAS_INDEXBYTE}
+
+{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
+{$define FPC_SYSTEM_HAS_INDEXWORD}
+function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
+{ Returns the index, counted in words, of the first 16-bit word equal to b
+  among the first len words of buf, or -1 when there is no such word or when
+  len=0.  len is compared as an unsigned quantity, so len=-1 is the largest
+  possible limit.
+
+  When buf is word-aligned, 16-byte chunks are scanned with aligned loads
+  starting at buf rounded down to a 16-byte boundary, and matches located
+  before buf are masked off.  Otherwise a simpler loop with unaligned loads
+  starting exactly at buf is used.
+
+  Arguments on entry:
+    win64: rcx buf, rdx len, r8w b
+    linux: rdi buf, rsi len, dx  b
+  Register use after the prologue:
+    rax   base address of the scan (aligned-down buf, or buf itself)
+    rcx   misalignment in bytes, later the bit mask of matches
+    rdx   limit in words measured from rax
+    r8    all-ones helper value, later the word offset of the next chunk
+    r9    misalignment in words (0 on the unaligned path)
+    xmm1  b replicated into all 8 words }
+asm
+{$ifdef win64}
+    movd   %r8d, %xmm1
+{$else}
+    movd   %edx, %xmm1
+    movq   %rdi, %rcx
+    movq   %rsi, %rdx
+{$endif}
+    mov    %rcx, %rax                  { duplicate buf }
+    punpcklwd  %xmm1, %xmm1            { b -> 2 copies }
+    and    $0xfffffffffffffff0, %rax   { rax=buf rounded down to 16 bytes }
+    test   %rdx, %rdx
+    pshufd $0, %xmm1, %xmm1            { 2 -> 8 copies, flags are left untouched }
+    jz     .L3                         { exit if len=0 }
+    or     $-1, %r8                    { r8=-1 in all 64 bits: a 32-bit 'orl' would zero the }
+                                       {   upper half and cmovb below would load 4G-1, not -1 }
+    test   $1, %cl                     { if buffer isn't aligned to word boundary, }
+    jnz    .Lunaligned                 { fallback to slower unaligned loop }
+
+    movdqa (%rax), %xmm0               { Fetch first 16 bytes (up to 14 bytes before target) }
+    sub    %rax, %rcx                  { rcx=misalignment }
+    pcmpeqw %xmm1, %xmm0
+
+    mov    %rcx, %r9
+    shr    $1, %r9                     { save misalignment in words }
+
+    add    %r9, %rdx                   { add misalignment to length }
+    cmovb  %r8, %rdx                   { if it overflows (happens when length=-1), set back to -1, }
+                                       {   otherwise loop will terminate too early }
+    shl    %cl, %r8d                   { r8d=mask with the low 'misalignment' bits cleared }
+    pmovmskb %xmm0, %ecx               { two mask bits per compared word }
+    andl   %r8d, %ecx                  { mask away matches before buffer start }
+    movl   $8, %r8d                    { r8=word offset of the next chunk; mov keeps flags of andl }
+    jnz    .L1                         { got a match within buffer -> we're done (almost) }
+    cmpq   %r8, %rdx
+    jbe    .L3                         { limit lies within the first chunk -> not found }
+
+    .balign 16
+.L2:
+    movdqa (%rax,%r8,2), %xmm0         { aligned load of the next 8 words }
+    lea    8(%r8), %r8                 { advance the offset without touching flags }
+    pcmpeqw %xmm1, %xmm0
+    pmovmskb %xmm0, %ecx
+    test   %ecx, %ecx
+    jnz    .L1
+    cmp    %r8, %rdx
+    ja     .L2                         { continue while the limit is beyond the scanned part }
+
+.L3:
+    or    $-1, %rax                    { not found: result is -1 }
+    jmp   .Ldone
+
+.L1:                                   { shared by the aligned and the unaligned loop }
+    bsfl   %ecx, %ecx                  { compute position of the first match }
+    shr    $1, %ecx                    { in words }
+    lea    -8(%rcx,%r8), %rax          { rax=match offset in words from the scan base }
+    cmp    %rax, %rdx
+    jbe    .L3                         { if it is after the specified length, ignore it }
+    sub    %r9, %rax                   { make the offset relative to buf }
+.Ldone:
+    retq
+
+{ TODO: aligned processing is still possible, but for now
+  use the simplest form }
+{ NOTE(review): every movdqu below loads a full 16 bytes starting at buf+2*r8
+  no matter how many words remain, so it can read beyond buf+2*len; whether
+  that can reach inaccessible memory depends on the caller - confirm. }
+.Lunaligned:
+    xor    %r9, %r9                    { nothing to subtract at .L1 }
+    xor    %r8, %r8                    { word offset starts at 0 }
+    mov    %rcx, %rax                  { scan base is buf itself }
+
+    .balign 16
+.L2u:
+    movdqu (%rax,%r8,2), %xmm0         { unaligned load of the next 8 words }
+    lea    8(%r8), %r8
+    pcmpeqw %xmm1, %xmm0
+    pmovmskb %xmm0, %ecx
+    test   %ecx, %ecx
+    jnz    .L1
+    cmp    %r8, %rdx
+    ja     .L2u
+    or     $-1, %rax                   { not found: result is -1 }
+end;
+{$endif FPC_SYSTEM_HAS_INDEXWORD}
 
+{$asmmode att}
 {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
 { does a thread save inc/dec }
 function declocked(var l : longint) : boolean;assembler;

+ 35 - 1
tests/test/tindex.pp

@@ -46,12 +46,29 @@ begin
               writeln('indexbyte error 2 for (',i,',',j,',',k,')');
               halt(2);
             end;
+          {same for length=-1}
+          if indexbyte(b[k+4],-1,0)<>index then
+            begin
+              writeln(indexbyte(b[k+4],-1,0),' <> ',index);
+              writeln('indexbyte error 2a for (',i,',',j,',',k,')');
+              halt(22);
+            end;
+  
 
           if indexbyte(b[k+4],i,b[k+4+i-1])<>i-1 then
             begin
               writeln('indexbyte error 3 for (',i,',',j,',',k,')');
               halt(3);
             end;
+          {same for length=-1}  
+          if i<>0 then   // previous test will be no-op when i=0
+            if indexbyte(b[k+4],-1,b[k+4+i-1])<>i-1 then
+              begin
+                writeln('indexbyte error 3a for (',i,',',j,',',k,')');
+                halt(23);
+              end;
+
+
           if (i<1) then
             index:=-1
           else
@@ -62,6 +79,16 @@ begin
               writeln('indexbyte error 4 for (',i,',',j,',',k,')');
               halt(4);
             end;
+          {same for length=-1}
+          if i<>0 then  // previous test will be no-op when i=0
+            if indexbyte(b[k+4],-1,b[k+4+i shr 1])<>index then
+              begin
+                writeln(indexbyte(b[k+4],-1,b[k+4+i shr 1]),' <> ',index);
+                writeln('indexbyte error 4a for (',i,',',j,',',k,')');
+                halt(24);
+              end;
+
+
           if (i=0) then
             index:=-1
           else
@@ -69,8 +96,15 @@ begin
           if indexbyte(b[k+4],i,b[k+4])<>index then
             begin
               writeln('indexbyte error 5 for (',i,',',j,',',k,')');
-              halt(3);
+              halt(5);
             end;
+          {same for length=-1}
+          if i<>0 then
+            if indexbyte(b[k+4],-1,b[k+4])<>index then
+              begin
+                writeln('indexbyte error 5a for (',i,',',j,',',k,')');
+                halt(25);
+              end;
 
 
           if indexword(b[k+4],i shr 1,0)<>-1 then