Shortcut Compare*(a, a) before entering the aligned loop.
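In high-level terms, the change is an early exit for aliasing operands: during the aligned-loop setup each routine either computes buf2 - buf1 (sub %eax, %edx on i386, sub %rcx, %rdx on x86_64) or tests the already-computed offset (test %edx, %edx / test %rdx, %rdx), and a zero result means both pointers are identical, so the routine jumps to .LNothing and returns 0 without scanning memory that is guaranteed to compare equal. A minimal Pascal sketch of the idea, assuming a hypothetical CompareByteSketch helper rather than the actual RTL routines:

program compare_shortcut_sketch;
{$mode objfpc}

{ Illustration only, not the RTL code: once buf1 = buf2 is known, a
  CompareByte-style routine can return 0 immediately instead of running the
  aligned SIMD loop over memory that is guaranteed to be equal. }
function CompareByteSketch(const buf1, buf2; len: SizeInt): SizeInt;
var
  p1, p2: PByte;
  i: SizeInt;
begin
  p1 := PByte(@buf1);
  p2 := PByte(@buf2);
  if p1 = p2 then            { the new shortcut: identical pointers compare equal }
    exit(0);
  for i := 0 to len - 1 do   { scalar comparison standing in for the SIMD loop }
    if p1[i] <> p2[i] then
      exit(SizeInt(p1[i]) - SizeInt(p2[i]));
  result := 0;
end;

var
  a: array[0..15] of Byte;
begin
  FillChar(a, SizeOf(a), $AA);
  { Same buffer passed twice: returns 0 without touching the data. }
  WriteLn(CompareByteSketch(a, a, SizeOf(a)));
end.

The accompanying .byte 102,...,144 edits below do not change behaviour: 102 is $66 (an operand-size prefix) and 144 is $90 (nop), so each sequence encodes a single multi-byte NOP whose length is adjusted so that, per the existing comments, the .balign 16 directives before the loop bodies continue to emit no padding after the surrounding code size changed.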

Rika Ichinose · 1 year ago · parent · commit ce6db34224
2 changed files with 51 additions and 39 deletions:
  1. rtl/i386/i386.inc: +41 -36
  2. rtl/x86_64/x86_64.inc: +10 -3

rtl/i386/i386.inc: +41 -36

@@ -1363,26 +1363,7 @@ asm
         pop      %ebx
         ret
 
-.LNothing:
-        pop      %ebx
-        xor      %eax, %eax
-        ret
-
-.LAligned32xLoop_TwoVectorsDiffer:
-        add      %eax, %edx { restore edx = buf2 }
-        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
-        inc      %cx
-        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
-        mov      %ecx, %ebx
-.LVec0Differs:
-        bsf      %ebx, %ebx
-        movzbl   (%eax,%ebx), %eax
-        movzbl   (%edx,%ebx), %edx
-        sub      %edx, %eax
-        pop      %ebx
-        ret
-
-        .byte    144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
+        .byte    102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
 CompareByte_CantOverReadBoth_AVX2:
         cmp      $16, %ecx
         jb       .LCantOverReadBoth
@@ -1410,8 +1391,9 @@ CompareByte_CantOverReadBoth_AVX2:
         jbe      .LLastTwoVectors
 
         { More than four vectors: aligned loop. }
-        lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
         sub      %eax, %edx { edx = buf2 - buf1 }
+        jz       .LNothing { Exit if buf1 = buf2. }
+        lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
         and      $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
         sub      %eax, %ecx { ecx = count to be handled with loop }
 .balign 16 { No-op. }
@@ -1444,10 +1426,25 @@ CompareByte_CantOverReadBoth_AVX2:
         pmovmskb %xmm0, %ebx
         inc      %bx
         jnz      .LVecEm1Differs
+.LNothing:
         pop      %ebx
         xor      %eax, %eax
         ret
 
+.LAligned32xLoop_TwoVectorsDiffer:
+        add      %eax, %edx { restore edx = buf2 }
+        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
+        inc      %cx
+        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
+        mov      %ecx, %ebx
+.LVec0Differs:
+        bsf      %ebx, %ebx
+        movzbl   (%eax,%ebx), %eax
+        movzbl   (%edx,%ebx), %edx
+        sub      %edx, %eax
+        pop      %ebx
+        ret
+
 .LVec1Differs:
         xor      %ecx, %ecx
 .LVecEm1Differs:
@@ -1563,6 +1560,7 @@ asm
         { bzhi      %ecx, %ebx, %ecx }
         .byte     0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
         jnz       .LVec0Differs
+.LNothing:
         vzeroupper
         pop       %ebx
         xor       %eax, %eax
@@ -1584,6 +1582,20 @@ asm
         pop       %ebx
         ret
 
+.LVec1Differs:
+        xor      %ecx, %ecx
+.LVecEm1Differs:
+        add      $32, %ecx
+.LVecEm2Differs:
+        vzeroupper
+        tzcnt    %ebx, %ebx
+        add      %ecx, %ebx
+        movzbl   (%eax,%ebx), %eax
+        movzbl   (%edx,%ebx), %edx
+        sub      %edx, %eax
+        pop      %ebx
+        ret
+
 .LVecOrMore:
         { Compare first vectors. }
         vmovdqu   (%eax), %ymm0
@@ -1606,8 +1618,9 @@ asm
         jbe       .LLastTwoVectors
 
         { More than four vectors: aligned loop. }
-        lea       -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
         sub       %eax, %edx { edx = buf2 - buf1 }
+        jz        .LNothing { Exit if buf1 = buf2. }
+        lea       -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
         and       $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
         sub       %eax, %ecx { ecx = count to be handled with loop }
 .balign 16 { No-op. }
@@ -1641,20 +1654,6 @@ asm
         vzeroupper
         pop       %ebx
         xor       %eax, %eax
-        ret
-
-.LVec1Differs:
-        xor      %ecx, %ecx
-.LVecEm1Differs:
-        add      $32, %ecx
-.LVecEm2Differs:
-        vzeroupper
-        tzcnt    %ebx, %ebx
-        add      %ecx, %ebx
-        movzbl   (%eax,%ebx), %eax
-        movzbl   (%edx,%ebx), %edx
-        sub      %edx, %eax
-        pop      %ebx
 end;
 
 {$ifndef CPUX86_HAS_BMI2}
@@ -1795,6 +1794,7 @@ asm
         pop     %ebx
         ret
 
+        .byte    102,102,102,102,102,102,102,102,102,102,102,144
 .LVecOrMore:
         movdqu   (%edx,%eax), %xmm0 { Compare first vectors. }
         movdqu   (%eax), %xmm1
@@ -1807,6 +1807,8 @@ asm
         sub      $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
         jle      .LLastVec
 
+        test     %edx, %edx
+        jz       .LNothing { Exit if buf1 = buf2. }
         push     %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
         add      %eax, %ecx
         and      $-16, %eax { align buf1; +16 is performed by the loop. }
@@ -1931,6 +1933,8 @@ asm
         sub      $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
         jle      .LLastVec
 
+        test     %edx, %edx
+        jz       .LNothing { Exit if buf1 = buf2. }
         push     %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
         add      %eax, %ecx
         and      $-16, %eax { align buf1; +16 is performed by the loop. }
@@ -1955,6 +1959,7 @@ asm
         pmovmskb %xmm0, %ebx
         inc      %bx
         jnz      .LVec0Differs
+.LNothing:
         pop      %ebx
         xor      %eax, %eax
         ret

rtl/x86_64/x86_64.inc: +10 -3

@@ -987,7 +987,7 @@ asm
     movzbl   (%rcx,%rax), %eax
     sub      %rdx, %rax
     ret
-    .byte    0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
+    .byte    102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
 
 .LVecOrMore:
     { Compare first vectors. }
@@ -1013,8 +1013,9 @@ asm
     jbe      .LLastTwoVectors
 
     { More than four vectors: aligned loop. }
-    lea      -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
     sub      %rcx, %rdx { rdx = buf2 - buf1 }
+    jz       .LNothing { Exit if buf1 = buf2. }
+    lea      -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
     and      $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
     sub      %rcx, %r8 { r8 = count to be handled with loop }
 .balign 16 { no-op }
@@ -1200,6 +1201,7 @@ asm
     sub      %rdx, %rax
     ret
 
+    .byte    102,102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned8xLoop_Body into a no-op. }
 .LVecOrMore:
     movdqu   (%rdx,%rcx), %xmm0 { Compare first vectors. }
     movdqu   (%rcx), %xmm1
@@ -1212,12 +1214,14 @@ asm
     sub      $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
     jle      .LLastVec
 
+    test     %rdx, %rdx
+    jz       .LNothing { Exit if buf1 = buf2. }
     mov      %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
     add      %rcx, %r8
     and      $-16, %rcx { align buf1; +16 is performed by the loop. }
     sub      %rcx, %r8
 
-.balign 16
+.balign 16 { no-op }
 .LAligned8xLoop_Body:
     add      $16, %rcx
     movdqu   (%rdx,%rcx), %xmm0
@@ -1278,6 +1282,8 @@ asm
     sub      $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
     jle      .LLastVec
 
+    test     %rdx, %rdx
+    jz       .LNothing { Exit if buf1 = buf2. }
     mov      %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
     add      %rcx, %r8
     and      $-16, %rcx { align buf1; +16 is performed by the loop. }
@@ -1301,6 +1307,7 @@ asm
     pmovmskb %xmm0, %eax
     inc      %ax
     jnz      .LVec0Differs
+.LNothing:
     xor      %eax, %eax
     ret