Browse Source

Handle the Index* / Compare* tail by directly reading the last VECSIZE bytes when at least one full vector has already been processed.

Rika Ichinose 2 years ago
parent
commit
8e884d9acd
2 changed files with 155 additions and 50 deletions
  1. 76 22
      rtl/i386/i386.inc
  2. 79 28
      rtl/x86_64/x86_64.inc

+ 76 - 22
rtl/i386/i386.inc

@@ -670,32 +670,36 @@ asm
         movd     %ecx, %xmm2
         movd     %ecx, %xmm2
         add      %eax, %edx
         add      %eax, %edx
         pshufd   $0, %xmm2, %xmm1
         pshufd   $0, %xmm2, %xmm1
+
+.balign 16
 .L4x_Body:
 .L4x_Body:
         movdqu   (%eax), %xmm0
         movdqu   (%eax), %xmm0
         pcmpeqd  %xmm1, %xmm0
         pcmpeqd  %xmm1, %xmm0
         pmovmskb %xmm0, %edi
         pmovmskb %xmm0, %edi
         test     %edi, %edi
         test     %edi, %edi
-        jne      .L4x_Found
+        jnz      .L4x_Found
 .L4x_Next:
 .L4x_Next:
         add      $16, %eax
         add      $16, %eax
         cmp      %eax, %edx
         cmp      %eax, %edx
         jne      .L4x_Body
         jne      .L4x_Body
-        pop      %edi
-.LDWordwise_Test:
+
         cmp      %esi, %eax
         cmp      %esi, %eax
         je       .LNothing
         je       .LNothing
-.LDWordwise_Body:
-        cmp      %ecx, (%eax)
-        je       .LDWordwise_Found
-        add      $4, %eax
-        cmp      %esi, %eax
-        jne      .LDWordwise_Body
+
+        lea      -16(%esi), %eax
+        movdqu   (%eax), %xmm0
+        pcmpeqd  %xmm1, %xmm0
+        pmovmskb %xmm0, %edi
+        test     %edi, %edi
+        jnz      .L4x_Found
 .LNothing:
 .LNothing:
-        mov      $-1, %eax
+        pop      %edi
         pop      %ebx
         pop      %ebx
         pop      %esi
         pop      %esi
+        mov      $-1, %eax
         ret
         ret
 
 
+.balign 16
 .L4x_Found:
 .L4x_Found:
         bsf      %edi, %edi
         bsf      %edi, %edi
         add      %edi, %eax
         add      %edi, %eax
@@ -707,6 +711,19 @@ asm
         pop      %esi
         pop      %esi
         ret
         ret
 
 
+.balign 16
+.LDWordwise_Body:
+        cmp      %ecx, (%eax)
+        je       .LDWordwise_Found
+        add      $4, %eax
+.LDWordwise_Test:
+        cmp      %esi, %eax
+        jne      .LDWordwise_Body
+        mov      $-1, %eax
+        pop      %ebx
+        pop      %esi
+        ret
+
 .LUnbounded:
 .LUnbounded:
         mov      %eax, %esi
         mov      %eax, %esi
         jmp      .LDWordwise_Body
         jmp      .LDWordwise_Body
@@ -830,14 +847,13 @@ asm
         lea      (%eax,%ecx), %ebx { ebx = end of full XMMs in buf1 }
         lea      (%eax,%ecx), %ebx { ebx = end of full XMMs in buf1 }
         cmp      %ebx, %eax
         cmp      %ebx, %eax
         jne      .L16x_Body
         jne      .L16x_Body
-.L16x_Tail:
         lea      15(%ebx), %eax { check if tails don't cross page boundaries and can be over-read to XMMs }
         lea      15(%ebx), %eax { check if tails don't cross page boundaries and can be over-read to XMMs }
         lea      15(%edx), %ecx
         lea      15(%edx), %ecx
         xor      %ebx, %eax
         xor      %ebx, %eax
         xor      %edx, %ecx
         xor      %edx, %ecx
         or       %ecx, %eax
         or       %ecx, %eax
         cmp      $4095, %eax
         cmp      $4095, %eax
-        ja       .LCantOverReadBothTails
+        ja       .LCantOverReadBoth
         movdqu   (%ebx), %xmm0
         movdqu   (%ebx), %xmm0
         movdqu   (%edx), %xmm2
         movdqu   (%edx), %xmm2
         pcmpeqb  %xmm2, %xmm0
         pcmpeqb  %xmm2, %xmm0
@@ -868,14 +884,26 @@ asm
         add      $16, %edx
         add      $16, %edx
         cmp      %eax, %ebx
         cmp      %eax, %ebx
         jne      .L16x_Body
         jne      .L16x_Body
+
         cmp      %ebx, %esi
         cmp      %ebx, %esi
-        jne      .L16x_Tail
+        je       .L16x_Nothing
+
+        sub      %eax, %edx
+        lea      -16(%esi), %eax
+        add      %eax, %edx
+        movdqu   (%edx), %xmm0
+        movdqu   (%eax), %xmm1
+        pcmpeqb  %xmm1, %xmm0
+        pmovmskb %xmm0, %ecx
+        xor      $65535, %ecx
+        jnz      .L16x_Found
 .L16x_Nothing:
 .L16x_Nothing:
         pop      %ebx
         pop      %ebx
         xor      %eax, %eax
         xor      %eax, %eax
         pop      %esi
         pop      %esi
         ret
         ret
 
 
+.balign 16
 .L16x_Found:
 .L16x_Found:
         bsf      %ecx, %ecx
         bsf      %ecx, %ecx
         pop      %ebx
         pop      %ebx
@@ -885,7 +913,7 @@ asm
         sub      %edx, %eax
         sub      %edx, %eax
         ret
         ret
 
 
-.LCantOverReadBothTails:
+.LCantOverReadBoth:
         mov      %esi, %eax
         mov      %esi, %eax
         sub      %ebx, %eax
         sub      %ebx, %eax
         and      $-4, %eax
         and      $-4, %eax
@@ -1030,14 +1058,13 @@ asm
         lea      (%eax,%ecx,2), %esi { esi = end of full XMMs in buf1 }
         lea      (%eax,%ecx,2), %esi { esi = end of full XMMs in buf1 }
         cmp      %esi, %eax
         cmp      %esi, %eax
         jne      .L8x_Body
         jne      .L8x_Body
-.L8x_Tail:
         lea      15(%esi), %eax
         lea      15(%esi), %eax
         lea      15(%edx), %ecx
         lea      15(%edx), %ecx
         xor      %esi, %eax
         xor      %esi, %eax
         xor      %edx, %ecx
         xor      %edx, %ecx
         or       %ecx, %eax
         or       %ecx, %eax
         cmp      $4095, %eax
         cmp      $4095, %eax
-        ja       .LCantOverReadBothTails
+        ja       .LCantOverReadBoth
         movdqu   (%esi), %xmm0
         movdqu   (%esi), %xmm0
         movdqu   (%edx), %xmm2
         movdqu   (%edx), %xmm2
         pcmpeqw  %xmm2, %xmm0
         pcmpeqw  %xmm2, %xmm0
@@ -1071,8 +1098,19 @@ asm
         add      $16, %edx
         add      $16, %edx
         cmp      %eax, %esi
         cmp      %eax, %esi
         jne      .L8x_Body
         jne      .L8x_Body
+
         cmp      %esi, %ebx
         cmp      %esi, %ebx
-        jne      .L8x_Tail
+        je       .LNothing
+
+        sub      %eax, %edx
+        lea      -16(%ebx), %eax
+        add      %eax, %edx
+        movdqu   (%edx), %xmm0
+        movdqu   (%eax), %xmm1
+        pcmpeqw  %xmm1, %xmm0
+        pmovmskb %xmm0, %ecx
+        xor      $65535, %ecx
+        jnz      .L8x_Found
 .LNothing:
 .LNothing:
         xor      %eax, %eax
         xor      %eax, %eax
 .LReturnEAX:
 .LReturnEAX:
@@ -1086,7 +1124,7 @@ asm
         cmp      %ax, (%edx,%ecx)
         cmp      %ax, (%edx,%ecx)
         jmp      .L8x_DoSbb
         jmp      .L8x_DoSbb
 
 
-.LCantOverReadBothTails:
+.LCantOverReadBoth:
         mov      %esi, %eax
         mov      %esi, %eax
         pop      %esi
         pop      %esi
 .LWordwise_Body:
 .LWordwise_Body:
@@ -1189,19 +1227,35 @@ asm
         add      $16, %edx
         add      $16, %edx
         cmp      %eax, %ecx
         cmp      %eax, %ecx
         jne      .L4x_Body
         jne      .L4x_Body
-        pop      %ebx
-.LDWordwise_Test:
-        cmp      %esi, %eax
+
+        cmp      %esi, %ecx
         je       .LNothing
         je       .LNothing
+
+        sub      %eax, %edx
+        lea      -16(%esi), %eax
+        add      %eax, %edx
+        movdqu   (%edx), %xmm0
+        movdqu   (%eax), %xmm1
+        pcmpeqd  %xmm1, %xmm0
+        pmovmskb %xmm0, %ebx
+        xor      $65535, %ebx
+        jnz      .L4x_Found
+.LNothing:
+        pop      %ebx
+        pop      %esi
+        xor      %eax, %eax
+        ret
+
+.balign 16
 .LDWordwise_Body:
 .LDWordwise_Body:
         mov      (%eax), %ecx
         mov      (%eax), %ecx
         cmp      %ecx, (%edx)
         cmp      %ecx, (%edx)
         jne      .LDoSbb
         jne      .LDoSbb
         add      $4, %eax
         add      $4, %eax
         add      $4, %edx
         add      $4, %edx
+.LDWordwise_Test:
         cmp      %esi, %eax
         cmp      %esi, %eax
         jne      .LDWordwise_Body
         jne      .LDWordwise_Body
-.LNothing:
         xor      %eax, %eax
         xor      %eax, %eax
         pop      %esi
         pop      %esi
         ret
         ret

+ 79 - 28
rtl/x86_64/x86_64.inc

@@ -642,7 +642,7 @@ asm
     jnz      .LUnbounded
     jnz      .LUnbounded
     lea      (%rcx,%rdx,4), %r10
     lea      (%rcx,%rdx,4), %r10
     cmp      $3, %rdx
     cmp      $3, %rdx
-    jle      .LDWorwise_Test
+    jle      .LDWordwise_Test
     movd     %r8d, %xmm1
     movd     %r8d, %xmm1
     pshufd   $0, %xmm1, %xmm1
     pshufd   $0, %xmm1, %xmm1
     and      $-4, %rdx
     and      $-4, %rdx
@@ -654,23 +654,36 @@ asm
     pcmpeqd  %xmm1, %xmm0
     pcmpeqd  %xmm1, %xmm0
     pmovmskb %xmm0, %edx
     pmovmskb %xmm0, %edx
     test     %edx, %edx
     test     %edx, %edx
-    jne      .L4x_Found
+    jnz      .L4x_Found
     add      $16, %rax
     add      $16, %rax
     cmp      %r9, %rax
     cmp      %r9, %rax
     jne      .L4x_Body
     jne      .L4x_Body
-.LDWorwise_Test:
+
     cmp      %r10, %rax
     cmp      %r10, %rax
     je       .LNothing
     je       .LNothing
-.LDWorwise_Body:
+
+    lea      -16(%r10), %rax
+    movdqu   (%rax), %xmm0
+    pcmpeqd  %xmm1, %xmm0
+    pmovmskb %xmm0, %edx
+    test     %edx, %edx
+    jne      .L4x_Found
+.LNothing:
+    mov      $-1, %rax
+    ret
+
+.balign 16
+.LDWordwise_Body:
     cmp      %r8d, (%rax)
     cmp      %r8d, (%rax)
     je       .LFound
     je       .LFound
     add      $4, %rax
     add      $4, %rax
+.LDWordwise_Test:
     cmp      %r10, %rax
     cmp      %r10, %rax
-    jne      .LDWorwise_Body
-.LNothing:
+    jne      .LDWordwise_Body
     mov      $-1, %rax
     mov      $-1, %rax
     ret
     ret
 
 
+.balign 16
 .L4x_Found:
 .L4x_Found:
     bsf      %edx, %edx
     bsf      %edx, %edx
     add      %rdx, %rax
     add      %rdx, %rax
@@ -681,7 +694,7 @@ asm
 
 
 .LUnbounded:
 .LUnbounded:
     mov      %rcx, %r10
     mov      %rcx, %r10
-    jmp      .LDWorwise_Body
+    jmp      .LDWordwise_Body
 end;
 end;
 {$endif FPC_SYSTEM_HAS_INDEXDWORD}
 {$endif FPC_SYSTEM_HAS_INDEXDWORD}
 
 
@@ -706,7 +719,6 @@ asm
     add      %rcx, %r9 { r9 = end of full XMMs in buf1 }
     add      %rcx, %r9 { r9 = end of full XMMs in buf1 }
     cmp      %r9, %rcx
     cmp      %r9, %rcx
     jne      .L16x_Body
     jne      .L16x_Body
-.L16x_Tail:
     lea      15(%r9), %eax { check if tails don't cross page boundaries and can be over-read to XMMs }
     lea      15(%r9), %eax { check if tails don't cross page boundaries and can be over-read to XMMs }
     lea      15(%rdx), %ecx
     lea      15(%rdx), %ecx
     xor      %r9d, %eax
     xor      %r9d, %eax
@@ -719,7 +731,7 @@ asm
     pcmpeqb  %xmm1, %xmm0
     pcmpeqb  %xmm1, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
     xor      $65535, %eax
     xor      $65535, %eax
-    je       .L16x_Nothing
+    jz       .L16x_Nothing
     bsf      %eax, %ecx
     bsf      %eax, %ecx
     add      %rcx, %r9
     add      %rcx, %r9
     cmp      %r10, %r9 { ignore over-read garbage bytes }
     cmp      %r10, %r9 { ignore over-read garbage bytes }
@@ -736,17 +748,29 @@ asm
     pcmpeqb  %xmm1, %xmm0
     pcmpeqb  %xmm1, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
     xor      $65535, %eax
     xor      $65535, %eax
-    jne      .L16x_Found
+    jnz      .L16x_Found
     add      $16, %rcx
     add      $16, %rcx
     add      $16, %rdx
     add      $16, %rdx
     cmp      %rcx, %r9
     cmp      %rcx, %r9
     jne      .L16x_Body
     jne      .L16x_Body
+
     cmp      %r9, %r10
     cmp      %r9, %r10
-    jne      .L16x_Tail
+    je       .L16x_Nothing
+
+    sub      %rcx, %rdx
+    lea      -16(%r10), %rcx
+    add      %rcx, %rdx
+    movdqu   (%rdx), %xmm0
+    movdqu   (%rcx), %xmm1
+    pcmpeqb  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax
+    xor      $65535, %eax
+    jnz      .L16x_Found
 .L16x_Nothing:
 .L16x_Nothing:
     xor      %eax, %eax
     xor      %eax, %eax
     ret
     ret
 
 
+.balign 16
 .L16x_Found:
 .L16x_Found:
     bsf      %eax, %eax
     bsf      %eax, %eax
     movzbl   (%rdx,%rax), %edx
     movzbl   (%rdx,%rax), %edx
@@ -755,7 +779,7 @@ asm
     ret
     ret
 
 
 .L4x_Prepare:
 .L4x_Prepare:
-    and      $12, %r8d { count to be handled with uint32s after XMMs: len mod 16 div 4 * 4 = len and %1100 = len and 12 }
+    and      $12, %r8d { count to be handled with uint32s for 1 <= len <= 15: len mod 16 div 4 * 4 = len and %1100 = len and 12 }
     lea      (%r9,%r8), %rcx
     lea      (%r9,%r8), %rcx
     cmp      %rcx, %r9
     cmp      %rcx, %r9
     je       .LBytewise_Body
     je       .LBytewise_Body
@@ -812,20 +836,19 @@ asm
     lea      (%rcx,%r8,2), %r8
     lea      (%rcx,%r8,2), %r8
     cmp      %r8, %rcx
     cmp      %r8, %rcx
     jne      .L8x_Body
     jne      .L8x_Body
-.L8x_Tail:
     lea      15(%r8), %eax
     lea      15(%r8), %eax
     lea      15(%rdx), %ecx
     lea      15(%rdx), %ecx
     xor      %r8d, %eax
     xor      %r8d, %eax
     xor      %edx, %ecx
     xor      %edx, %ecx
     or       %ecx, %eax
     or       %ecx, %eax
     cmp      $4095, %eax
     cmp      $4095, %eax
-    ja       .LCantOverReadBothTails
+    ja       .LCantOverReadBoth
     movdqu   (%r8), %xmm0
     movdqu   (%r8), %xmm0
     movdqu   (%rdx), %xmm2
     movdqu   (%rdx), %xmm2
     pcmpeqw  %xmm2, %xmm0
     pcmpeqw  %xmm2, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
     xor      $65535, %eax
     xor      $65535, %eax
-    je       .LNothing
+    jz       .LNothing
     bsf      %eax, %eax
     bsf      %eax, %eax
     add      %rax, %r8
     add      %rax, %r8
     cmp      %r9, %r8
     cmp      %r9, %r8
@@ -844,17 +867,29 @@ asm
     pcmpeqw  %xmm1, %xmm0
     pcmpeqw  %xmm1, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
     xor      $65535, %eax
     xor      $65535, %eax
-    jne      .L8x_Found
+    jnz      .L8x_Found
     add      $16, %rcx
     add      $16, %rcx
     add      $16, %rdx
     add      $16, %rdx
     cmp      %rcx, %r8
     cmp      %rcx, %r8
     jne      .L8x_Body
     jne      .L8x_Body
+
     cmp      %r8, %r9
     cmp      %r8, %r9
-    jne      .L8x_Tail
+    je       .LNothing
+
+    sub      %rcx, %rdx
+    lea      -16(%r9), %rcx
+    add      %rcx, %rdx
+    movdqu   (%rdx), %xmm0
+    movdqu   (%rcx), %xmm1
+    pcmpeqw  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax
+    xor      $65535, %eax
+    jnz      .L8x_Found
 .LNothing:
 .LNothing:
     xor      %eax, %eax
     xor      %eax, %eax
     ret
     ret
 
 
+.balign 16
 .L8x_Found:
 .L8x_Found:
     bsf      %eax, %eax
     bsf      %eax, %eax
     movzwl   (%rcx,%rax), %ecx
     movzwl   (%rcx,%rax), %ecx
@@ -865,7 +900,7 @@ asm
     sub      $1, %rax
     sub      $1, %rax
     ret
     ret
 
 
-.LCantOverReadBothTails:
+.LCantOverReadBoth:
     mov      %r8, %rcx
     mov      %r8, %rcx
 .LWordwise_Body:
 .LWordwise_Body:
     movzwl   (%rcx), %eax
     movzwl   (%rcx), %eax
@@ -911,26 +946,29 @@ asm
     pcmpeqd  %xmm1, %xmm0
     pcmpeqd  %xmm1, %xmm0
     pmovmskb %xmm0, %eax
     pmovmskb %xmm0, %eax
     xor      $65535, %eax
     xor      $65535, %eax
-    jne      .L4x_Found
+    jnz      .L4x_Found
     add      $16, %rcx
     add      $16, %rcx
     add      $16, %rdx
     add      $16, %rdx
     cmp      %rcx, %r8
     cmp      %rcx, %r8
     jne      .L4x_Body
     jne      .L4x_Body
-.LDwordwise_Test:
+
     cmp      %rcx, %r9
     cmp      %rcx, %r9
     je       .LNothing
     je       .LNothing
-.LDwordwise_Body:
-    mov      (%rcx), %eax
-    cmp      %eax, (%rdx)
-    jne      .LDoSbb
-    add      $4, %rcx
-    add      $4, %rdx
-    cmp      %rcx, %r9
-    jne      .LDwordwise_Body
+
+    sub      %rcx, %rdx
+    lea      -16(%r9), %rcx
+    add      %rcx, %rdx
+    movdqu   (%rcx), %xmm1
+    movdqu   (%rdx), %xmm0
+    pcmpeqd  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax
+    xor      $65535, %eax
+    jnz      .L4x_Found
 .LNothing:
 .LNothing:
     xor      %eax, %eax
     xor      %eax, %eax
     ret
     ret
 
 
+.balign 16
 .L4x_Found:
 .L4x_Found:
     bsf      %eax, %eax
     bsf      %eax, %eax
     mov      (%rcx,%rax), %ecx
     mov      (%rcx,%rax), %ecx
@@ -941,6 +979,19 @@ asm
     sub      $1, %rax
     sub      $1, %rax
     ret
     ret
 
 
+.balign 16
+.LDwordwise_Body:
+    mov      (%rcx), %eax
+    cmp      %eax, (%rdx)
+    jne      .LDoSbb
+    add      $4, %rcx
+    add      $4, %rdx
+.LDwordwise_Test:
+    cmp      %rcx, %r9
+    jne      .LDwordwise_Body
+    xor      %eax, %eax
+    ret
+
 .LUnbounded:
 .LUnbounded:
     mov      %rcx, %r9
     mov      %rcx, %r9
     jmp      .LDwordwise_Body
     jmp      .LDwordwise_Body