Browse Source

SSE4.1 IndexQWord for i386 and x86-64.

Rika Ichinose 1 year ago
parent
commit
0ca608243c
3 changed files with 157 additions and 13 deletions
  1. 71 1
      rtl/i386/i386.inc
  2. 1 1
      rtl/inc/systemh.inc
  3. 85 11
      rtl/x86_64/x86_64.inc

+ 71 - 1
rtl/i386/i386.inc

@@ -26,6 +26,7 @@ var
   { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
   { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
   sse_check : boolean;
   sse_check : boolean;
   fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
   fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
+  has_sse41_support : boolean;
   fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
   fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
 
 
 {$asmmode ATT}
 {$asmmode ATT}
@@ -1074,7 +1075,7 @@ end;
 
 
 {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
 {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
 {$define FPC_SYSTEM_HAS_INDEXQWORD}
 {$define FPC_SYSTEM_HAS_INDEXQWORD}
-function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
+function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
 { eax = buf, edx = len, [esp+4] = b }
 { eax = buf, edx = len, [esp+4] = b }
 asm
 asm
         push    %ebx
         push    %ebx
@@ -1101,6 +1102,74 @@ asm
         pop     %ebx
         pop     %ebx
         mov     $-1, %eax
         mov     $-1, %eax
 end;
 end;
+
+function IndexQWord_SSE41(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
+{ eax = buf, edx = len, [esp+4] = b
+  Searches the len QWords at buf for b using SSE4.1 (pcmpeqq/ptest). Inputs with len <= 6 are
+  handed to IndexQWord_Plain. Otherwise 6 QWords (three 16-byte vectors) are compared per
+  iteration, and leftovers are handled by one extra pass over the last 6 QWords, which may
+  overlap QWords already compared. The vector path returns the index of the first match,
+  or -1 if there is none. }
+asm
+    cmp      $6, len
+    jle      IndexQWord_Plain { Tail jump; no argument has been modified yet. }
+    movddup  4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
+    mov      %eax, %ecx { ecx = original buf }
+    sub      $6, len { len = QWords left over after the 6 that the next iteration compares. }
+.balign 16
+.L6x_Loop:
+    movdqu   (%eax), %xmm1
+    pcmpeqq  %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
+    movdqu   16(%eax), %xmm2
+    pcmpeqq  %xmm0, %xmm2
+    por      %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
+    movdqu   32(%eax), %xmm3
+    pcmpeqq  %xmm0, %xmm3
+    por      %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
+    ptest    %xmm3, %xmm3 { ZF = 0 if any of the 6 QWords matched. }
+    jnz      .LFound
+    add      $48, %eax { Advance by 3 vectors = 6 QWords. }
+    sub      $6, len
+    jge      .L6x_Loop { At least 6 uncompared QWords remain. }
+    lea      (%eax,%edx,8), %eax { Point to last 3 vectors. }
+    cmp      $-5, len
+    jge      .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
+    mov      $-1, %eax { Not found. }
+    ret      $8 { Also pops the 8-byte stack argument b. }
+
+.LFound:
+    sub      %ecx, %eax { eax = byte offset of vec 0 from the original buf. }
+    ptest    %xmm1, %xmm1
+    jnz      .LFoundAtXmm1
+    ptest    %xmm2, %xmm2 { xmm1 is all zeros here, so xmm2 holds only the vec 1 result. }
+    jnz      .LFoundAtXmm2
+    add      $16, %eax { Match is in vec 2: this and the add below skip 32 bytes in total. }
+    movdqa   %xmm3, %xmm2
+.LFoundAtXmm2:
+    add      $16, %eax
+    movdqa   %xmm2, %xmm1
+.LFoundAtXmm1:
+    pmovmskb %xmm1, %ecx { One mask bit per byte of the vector that contains the match. }
+    bsf      %ecx, %ecx { ecx = byte position of the first matching QWord in that vector. }
+    add      %ecx, %eax
+    shr      $3, %eax { Byte offset -> QWord index. }
+end;
+
+function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;
+
+var
+  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;
+
+function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
+{ Resolver used as the initial target of IndexQWord_Impl.
+  Until fpc_cpucodeinit_performed is set, the call is served by IndexQWord_Plain and
+  IndexQWord_Impl is left untouched. Afterwards IndexQWord_Impl is rebound to the SSE4.1 or
+  the plain routine according to has_sse41_support, and the call is completed through it. }
+begin
+  if fpc_cpucodeinit_performed then
+    begin
+      if has_sse41_support then
+        IndexQWord_Impl:=@IndexQWord_SSE41
+      else
+        IndexQWord_Impl:=@IndexQWord_Plain;
+      result:=IndexQWord_Impl(buf,len,b);
+    end
+  else
+    result:=IndexQWord_Plain(buf,len,b);
+end;
+
+function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
+{ Public entry point: forwards to whatever IndexQWord_Impl currently points to
+  (its initial value is IndexQWord_Dispatch). }
+begin
+  result:=IndexQWord_Impl(buf,len,b);
+end;
 {$endif FPC_SYSTEM_HAS_INDEXQWORD}
 {$endif FPC_SYSTEM_HAS_INDEXQWORD}
 
 
 
 
@@ -2420,6 +2489,7 @@ procedure fpc_cpucodeinit;
           begin
           begin
             has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
             has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
             has_sse3_support:=((_ecx_cpuid1 and $200)<>0);
             has_sse3_support:=((_ecx_cpuid1 and $200)<>0);
+            has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1);
 
 
             { now avx }
             { now avx }
             asm
             asm

+ 1 - 1
rtl/inc/systemh.inc

@@ -919,7 +919,7 @@ function  IndexChar(const buf;len:SizeInt;b:widechar):SizeInt;
 function  IndexByte(const buf;len:SizeInt;b:byte):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  IndexByte(const buf;len:SizeInt;b:byte):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  Indexword(const buf;len:SizeInt;b:word):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  Indexword(const buf;len:SizeInt;b:word):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt; {$if defined(cpui386)} inline; {$endif}
-function  IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
+function  IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; {$if defined(cpui386) or defined(cpux86_64)} inline; {$endif}
 function  CompareChar(const buf1,buf2;len:SizeInt):SizeInt;
 function  CompareChar(const buf1,buf2;len:SizeInt):SizeInt;
 function  CompareByte(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  CompareByte(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  CompareWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386)} inline; {$endif}
 function  CompareWord(const buf1,buf2;len:SizeInt):SizeInt; {$if defined(cpui386)} inline; {$endif}

+ 85 - 11
rtl/x86_64/x86_64.inc

@@ -29,6 +29,8 @@
 var
 var
   fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
   fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
 {$endif}
 {$endif}
+var
+  has_sse41_support,fpc_cpuinit_performed : boolean;
 
 
 {$define FPC_SYSTEM_HAS_SPTR}
 {$define FPC_SYSTEM_HAS_SPTR}
 Function Sptr : Pointer;assembler;nostackframe;
 Function Sptr : Pointer;assembler;nostackframe;
@@ -554,7 +556,7 @@ procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
     jle    .L3to6
     jle    .L3to6
 
 
     movq   %rax, %xmm0
     movq   %rax, %xmm0
-    pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%rcx)
     movdqu %xmm0, (%rcx)
     movdqu %xmm0, -16(%rcx,%rdx,8)
     movdqu %xmm0, -16(%rcx,%rdx,8)
 
 
@@ -564,7 +566,7 @@ procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
     rol    %cl, %rax { misalign the pattern by the misalignment of x }
     rol    %cl, %rax { misalign the pattern by the misalignment of x }
     mov    %r8, %rcx
     mov    %r8, %rcx
     movq   %rax, %xmm0
     movq   %rax, %xmm0
-    pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
+    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
     jmp    FillXxxx_MoreThanTwoXmms
     jmp    FillXxxx_MoreThanTwoXmms
 
 
 .L3to6:
 .L3to6:
@@ -799,26 +801,96 @@ end;
 
 
 {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
 {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
 {$define FPC_SYSTEM_HAS_INDEXQWORD}
 {$define FPC_SYSTEM_HAS_INDEXQWORD}
-function IndexQWord(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
+function IndexQWord_Plain(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
 { win64: rcx=buf, rdx=len, r8=b
 { win64: rcx=buf, rdx=len, r8=b
   else:  rdi=buf, rsi=len, rdx=b }
   else:  rdi=buf, rsi=len, rdx=b }
 asm
 asm
-    mov      {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
+    mov      buf, %rax { rax = scan pointer; biased by -8 next because the loop increments first. }
     sub      $8, %rax
     sub      $8, %rax
 .balign 16
 .balign 16
 .LQwordwise_Next:
 .LQwordwise_Next:
     add      $8, %rax
     add      $8, %rax
-    sub      $1, {$ifdef win64} %rdx {$else} %rsi {$endif}
+    sub      $1, len { Sets CF once the remaining count was already 0. }
     jb       .LNothing
     jb       .LNothing
-    cmp      {$ifdef win64} %r8 {$else} %rdx {$endif}, (%rax)
+    cmpq     b, (%rax) { Compare the current QWord with b. }
     jne      .LQwordwise_Next
     jne      .LQwordwise_Next
-    sub      {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
+    sub      buf, %rax { rax = byte offset of the match; the shift below turns it into a QWord index. }
     shr      $3, %rax
     shr      $3, %rax
     ret
     ret
 
 
 .LNothing:
 .LNothing:
     mov      $-1, %rax
     mov      $-1, %rax
 end;
 end;
+
+function IndexQWord_SSE41(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
+{ win64: rcx=buf, rdx=len, r8=b
+  else:  rdi=buf, rsi=len, rdx=b
+  Searches the len QWords at buf for b using SSE4.1 (pcmpeqq/ptest). Inputs with len <= 6 are
+  handed to IndexQWord_Plain. Otherwise 6 QWords (three 16-byte vectors) are compared per
+  iteration, and leftovers are handled by one extra pass over the last 6 QWords, which may
+  overlap QWords already compared. The vector path returns the index of the first match,
+  or -1 if there is none. }
+asm
+    cmp      $6, len
+    jle      IndexQWord_Plain { Tail jump; no argument register has been modified yet. }
+    mov      buf, %rax { rax = scan pointer. }
+    movq     {$ifdef win64} %r8 {$else} %rdx {$endif}, %xmm0
+    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern of 'b's. }
+    sub      $6, len { len = QWords left over after the 6 that the next iteration compares. }
+.balign 16
+.L6x_Loop:
+    movdqu   (%rax), %xmm1
+    pcmpeqq  %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
+    movdqu   16(%rax), %xmm2
+    pcmpeqq  %xmm0, %xmm2
+    por      %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
+    movdqu   32(%rax), %xmm3
+    pcmpeqq  %xmm0, %xmm3
+    por      %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
+    ptest    %xmm3, %xmm3 { ZF = 0 if any of the 6 QWords matched. }
+    jnz      .LFound
+    add      $48, %rax { Advance by 3 vectors = 6 QWords. }
+    sub      $6, len
+    jge      .L6x_Loop { At least 6 uncompared QWords remain. }
+    lea      (%rax,{$ifdef win64} %rdx {$else} %rsi {$endif},8), %rax { Point to last 3 vectors. }
+    cmp      $-5, len
+    jge      .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
+    mov      $-1, %rax { Not found. }
+    ret
+
+.LFound:
+    sub      buf, %rax { rax = byte offset of vec 0 from buf; the buf register is still unmodified here. }
+    ptest    %xmm1, %xmm1
+    jnz      .LFoundAtXmm1
+    ptest    %xmm2, %xmm2 { xmm1 is all zeros here, so xmm2 holds only the vec 1 result. }
+    jnz      .LFoundAtXmm2
+    add      $16, %rax { Match is in vec 2: this and the add below skip 32 bytes in total. }
+    movdqa   %xmm3, %xmm2
+.LFoundAtXmm2:
+    add      $16, %rax
+    movdqa   %xmm2, %xmm1
+.LFoundAtXmm1:
+    pmovmskb %xmm1, %ecx { One mask bit per byte of the vector that contains the match. }
+    bsf      %ecx, %ecx { ecx = byte position of the first matching QWord in that vector. }
+    add      %rcx, %rax
+    shr      $3, %rax { Byte offset -> QWord index. }
+end;
+
+function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;
+
+var
+  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;
+
+function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
+{ Resolver used as the initial target of IndexQWord_Impl.
+  Until fpc_cpuinit_performed is set, the call is served by IndexQWord_Plain and
+  IndexQWord_Impl is left untouched. Afterwards IndexQWord_Impl is rebound to the SSE4.1 or
+  the plain routine according to has_sse41_support, and the call is completed through it. }
+begin
+  if fpc_cpuinit_performed then
+    begin
+      if has_sse41_support then
+        IndexQWord_Impl:=@IndexQWord_SSE41
+      else
+        IndexQWord_Impl:=@IndexQWord_Plain;
+      result:=IndexQWord_Impl(buf,len,b);
+    end
+  else
+    result:=IndexQWord_Plain(buf,len,b);
+end;
+
+function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
+{ Public entry point: forwards to whatever IndexQWord_Impl currently points to
+  (its initial value is IndexQWord_Dispatch). }
+begin
+  result:=IndexQWord_Impl(buf,len,b);
+end;
 {$endif FPC_SYSTEM_HAS_INDEXQWORD}
 {$endif FPC_SYSTEM_HAS_INDEXQWORD}
 
 
 {$endif freebsd}
 {$endif freebsd}
@@ -1472,14 +1544,15 @@ procedure fpc_cpuinit;
       xorl %eax,%eax
       xorl %eax,%eax
       cpuid
       cpuid
       movl %eax,_eax
       movl %eax,_eax
+      movl $1,%eax
+      xorl %ecx,%ecx
+      cpuid
+      movl %ecx,cpuid1_ecx
     end;
     end;
+    has_sse41_support:=boolean(cpuid1_ecx shr 19 and 1);
     if _eax>=7 then
     if _eax>=7 then
       begin
       begin
         asm
         asm
-          movl $1,%eax
-          xorl %ecx,%ecx
-          cpuid
-          movl %ecx,cpuid1_ecx
           movl $7,%eax
           movl $7,%eax
           xorl %ecx,%ecx
           xorl %ecx,%ecx
           cpuid
           cpuid
@@ -1503,6 +1576,7 @@ procedure fpc_cpuinit;
               end;
               end;
           end;
           end;
       end;
       end;
+    fpc_cpuinit_performed:=true;
   end;
   end;
 
 
 {$define FPC_SYSTEM_HAS_SYSINITFPU}
 {$define FPC_SYSTEM_HAS_SYSINITFPU}