1 year ago · ce6db34224
--- a/rtl/i386/i386.inc
+++ b/rtl/i386/i386.inc
@@ -1363,26 +1363,7 @@ asm
 
				         pop      %ebx
			
 
				         ret
			
 
				 
			
 
				-.LNothing:
			
 
				-        pop      %ebx
			
 
				-        xor      %eax, %eax
			
 
				-        ret
			
 
				-
			
 
				-.LAligned32xLoop_TwoVectorsDiffer:
			
 
				-        add      %eax, %edx { restore edx = buf2 }
			
 
				-        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
			
 
				-        inc      %cx
			
 
				-        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
			
 
				-        mov      %ecx, %ebx
			
 
				-.LVec0Differs:
			
 
				-        bsf      %ebx, %ebx
			
 
				-        movzbl   (%eax,%ebx), %eax
			
 
				-        movzbl   (%edx,%ebx), %edx
			
 
				-        sub      %edx, %eax
			
 
				-        pop      %ebx
			
 
				-        ret
			
 
				-
			
 
				-        .byte    144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
			
 
				+        .byte    102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
			
 
				 CompareByte_CantOverReadBoth_AVX2:
			
 
				         cmp      $16, %ecx
			
 
				         jb       .LCantOverReadBoth
			
@@ -1410,8 +1391,9 @@ CompareByte_CantOverReadBoth_AVX2:
 
				         jbe      .LLastTwoVectors
			
 
				 
			
 
				         { More than four vectors: aligned loop. }
			
 
				-        lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
			
 
				         sub      %eax, %edx { edx = buf2 - buf1 }
			
 
				+        jz       .LNothing { Exit if buf1 = buf2. }
			
 
				+        lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
			
 
				         and      $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
			
 
				         sub      %eax, %ecx { ecx = count to be handled with loop }
			
 
				 .balign 16 { No-op. }
			
@@ -1444,10 +1426,25 @@ CompareByte_CantOverReadBoth_AVX2:
 
				         pmovmskb %xmm0, %ebx
			
 
				         inc      %bx
			
 
				         jnz      .LVecEm1Differs
			
 
				+.LNothing:
			
 
				         pop      %ebx
			
 
				         xor      %eax, %eax
			
 
				         ret
			
 
				 
			
 
				+.LAligned32xLoop_TwoVectorsDiffer:
			
 
				+        add      %eax, %edx { restore edx = buf2 }
			
 
				+        pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
			
 
				+        inc      %cx
			
 
				+        jz       .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
			
 
				+        mov      %ecx, %ebx
			
 
				+.LVec0Differs:
			
 
				+        bsf      %ebx, %ebx
			
 
				+        movzbl   (%eax,%ebx), %eax
			
 
				+        movzbl   (%edx,%ebx), %edx
			
 
				+        sub      %edx, %eax
			
 
				+        pop      %ebx
			
 
				+        ret
			
 
				+
			
 
				 .LVec1Differs:
			
 
				         xor      %ecx, %ecx
			
 
				 .LVecEm1Differs:
			
@@ -1563,6 +1560,7 @@ asm
 
				         { bzhi      %ecx, %ebx, %ecx }
			
 
				         .byte     0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
			
 
				         jnz       .LVec0Differs
			
 
				+.LNothing:
			
 
				         vzeroupper
			
 
				         pop       %ebx
			
 
				         xor       %eax, %eax
			
@@ -1584,6 +1582,20 @@ asm
 
				         pop       %ebx
			
 
				         ret
			
 
				 
			
 
				+.LVec1Differs:
			
 
				+        xor      %ecx, %ecx
			
 
				+.LVecEm1Differs:
			
 
				+        add      $32, %ecx
			
 
				+.LVecEm2Differs:
			
 
				+        vzeroupper
			
 
				+        tzcnt    %ebx, %ebx
			
 
				+        add      %ecx, %ebx
			
 
				+        movzbl   (%eax,%ebx), %eax
			
 
				+        movzbl   (%edx,%ebx), %edx
			
 
				+        sub      %edx, %eax
			
 
				+        pop      %ebx
			
 
				+        ret
			
 
				+
			
 
				 .LVecOrMore:
			
 
				         { Compare first vectors. }
			
 
				         vmovdqu   (%eax), %ymm0
			
@@ -1606,8 +1618,9 @@ asm
 
				         jbe       .LLastTwoVectors
			
 
				 
			
 
				         { More than four vectors: aligned loop. }
			
 
				-        lea       -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
			
 
				         sub       %eax, %edx { edx = buf2 - buf1 }
			
 
				+        jz        .LNothing { Exit if buf1 = buf2. }
			
 
				+        lea       -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
			
 
				         and       $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
			
 
				         sub       %eax, %ecx { ecx = count to be handled with loop }
			
 
				 .balign 16 { No-op. }
			
@@ -1641,20 +1654,6 @@ asm
 
				         vzeroupper
			
 
				         pop       %ebx
			
 
				         xor       %eax, %eax
			
 
				-        ret
			
 
				-
			
 
				-.LVec1Differs:
			
 
				-        xor      %ecx, %ecx
			
 
				-.LVecEm1Differs:
			
 
				-        add      $32, %ecx
			
 
				-.LVecEm2Differs:
			
 
				-        vzeroupper
			
 
				-        tzcnt    %ebx, %ebx
			
 
				-        add      %ecx, %ebx
			
 
				-        movzbl   (%eax,%ebx), %eax
			
 
				-        movzbl   (%edx,%ebx), %edx
			
 
				-        sub      %edx, %eax
			
 
				-        pop      %ebx
			
 
				 end;
			
 
				 
			
 
				 {$ifndef CPUX86_HAS_BMI2}
			
@@ -1795,6 +1794,7 @@ asm
 
				         pop     %ebx
			
 
				         ret
			
 
				 
			
 
				+        .byte    102,102,102,102,102,102,102,102,102,102,102,144
			
 
				 .LVecOrMore:
			
 
				         movdqu   (%edx,%eax), %xmm0 { Compare first vectors. }
			
 
				         movdqu   (%eax), %xmm1
			
@@ -1807,6 +1807,8 @@ asm
 
				         sub      $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
			
 
				         jle      .LLastVec
			
 
				 
			
 
				+        test     %edx, %edx
			
 
				+        jz       .LNothing { Exit if buf1 = buf2. }
			
 
				         push     %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
			
 
				         add      %eax, %ecx
			
 
				         and      $-16, %eax { align buf1; +16 is performed by the loop. }
			
@@ -1931,6 +1933,8 @@ asm
 
				         sub      $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
			
 
				         jle      .LLastVec
			
 
				 
			
 
				+        test     %edx, %edx
			
 
				+        jz       .LNothing { Exit if buf1 = buf2. }
			
 
				         push     %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
			
 
				         add      %eax, %ecx
			
 
				         and      $-16, %eax { align buf1; +16 is performed by the loop. }
			
@@ -1955,6 +1959,7 @@ asm
 
				         pmovmskb %xmm0, %ebx
			
 
				         inc      %bx
			
 
				         jnz      .LVec0Differs
			
 
				+.LNothing:
			
 
				         pop      %ebx
			
 
				         xor      %eax, %eax
			
 
				         ret
			
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -987,7 +987,7 @@ asm
 
				     movzbl   (%rcx,%rax), %eax
			
 
				     sub      %rdx, %rax
			
 
				     ret
			
 
				-    .byte    0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
			
 
				+    .byte    102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
			
 
				 
			
 
				 .LVecOrMore:
			
 
				     { Compare first vectors. }
			
@@ -1013,8 +1013,9 @@ asm
 
				     jbe      .LLastTwoVectors
			
 
				 
			
 
				     { More than four vectors: aligned loop. }
			
 
				-    lea      -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
			
 
				     sub      %rcx, %rdx { rdx = buf2 - buf1 }
			
 
				+    jz       .LNothing { Exit if buf1 = buf2. }
			
 
				+    lea      -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact r8 was still len - 32). }
			
 
				     and      $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
			
 
				     sub      %rcx, %r8 { r8 = count to be handled with loop }
			
 
				 .balign 16 { no-op }
			
@@ -1200,6 +1201,7 @@ asm
 
				     sub      %rdx, %rax
			
 
				     ret
			
 
				 
			
 
				+    .byte    102,102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned8xLoop_Body into a no-op. }
			
 
				 .LVecOrMore:
			
 
				     movdqu   (%rdx,%rcx), %xmm0 { Compare first vectors. }
			
 
				     movdqu   (%rcx), %xmm1
			
@@ -1212,12 +1214,14 @@ asm
 
				     sub      $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
			
 
				     jle      .LLastVec
			
 
				 
			
 
				+    test     %rdx, %rdx
			
 
				+    jz       .LNothing { Exit if buf1 = buf2. }
			
 
				     mov      %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
			
 
				     add      %rcx, %r8
			
 
				     and      $-16, %rcx { align buf1; +16 is performed by the loop. }
			
 
				     sub      %rcx, %r8
			
 
				 
			
 
				-.balign 16
			
 
				+.balign 16 { no-op }
			
 
				 .LAligned8xLoop_Body:
			
 
				     add      $16, %rcx
			
 
				     movdqu   (%rdx,%rcx), %xmm0
			
@@ -1278,6 +1282,8 @@ asm
 
				     sub      $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
			
 
				     jle      .LLastVec
			
 
				 
			
 
				+    test     %rdx, %rdx
			
 
				+    jz       .LNothing { Exit if buf1 = buf2. }
			
 
				     mov      %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
			
 
				     add      %rcx, %r8
			
 
				     and      $-16, %rcx { align buf1; +16 is performed by the loop. }
			
@@ -1301,6 +1307,7 @@ asm
 
				     pmovmskb %xmm0, %eax
			
 
				     inc      %ax
			
 
				     jnz      .LVec0Differs
			
 
				+.LNothing:
			
 
				     xor      %eax, %eax
			
 
				     ret