@@ -1187,20 +1187,20 @@ function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}
 asm
     cmp     $6, len
     jle     IndexQWord_Plain
-    movddup 4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
+    {$ifndef OLD_ASSEMBLER} movddup 4(%esp), %xmm0 {$else} .byte 0xF2,0x0F,0x12,0x44,0x24,0x04 {$endif} { xmm0 = pattern of 'b's. }
     mov     %eax, %ecx { ecx = original buf }
     sub     $6, len
     .balign 16
 .L6x_Loop:
     movdqu  (%eax), %xmm1
-    pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
+    {$ifndef OLD_ASSEMBLER} pcmpeqq %xmm0, %xmm1 {$else} .byte 0x66,0x0F,0x38,0x29,0xC8 {$endif} { xmm1 = cmpeq(vec 0, pattern) }
     movdqu  16(%eax), %xmm2
-    pcmpeqq %xmm0, %xmm2
+    {$ifndef OLD_ASSEMBLER} pcmpeqq %xmm0, %xmm2 {$else} .byte 0x66,0x0F,0x38,0x29,0xD0 {$endif}
     por     %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
     movdqu  32(%eax), %xmm3
-    pcmpeqq %xmm0, %xmm3
+    {$ifndef OLD_ASSEMBLER} pcmpeqq %xmm0, %xmm3 {$else} .byte 0x66,0x0F,0x38,0x29,0xD8 {$endif}
     por     %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
-    ptest   %xmm3, %xmm3
+    {$ifndef OLD_ASSEMBLER} ptest %xmm3, %xmm3 {$else} .byte 0x66,0x0F,0x38,0x17,0xDB {$endif}
     jnz     .LFound
     add     $48, %eax
     sub     $6, len
@@ -1213,9 +1213,9 @@ asm

 .LFound:
     sub     %ecx, %eax
-    ptest   %xmm1, %xmm1
+    {$ifndef OLD_ASSEMBLER} ptest %xmm1, %xmm1 {$else} .byte 0x66,0x0F,0x38,0x17,0xC9 {$endif}
     jnz     .LFoundAtXmm1
-    ptest   %xmm2, %xmm2
+    {$ifndef OLD_ASSEMBLER} ptest %xmm2, %xmm2 {$else} .byte 0x66,0x0F,0x38,0x17,0xD2 {$endif}
     jnz     .LFoundAtXmm2
     add     $16, %eax
     movdqa  %xmm3, %xmm2
@@ -1553,15 +1553,14 @@ asm
     ja      CompareByte_CantOverReadBoth_AVX2

     { Over-read both as YMMs. }
-    vmovdqu (%eax), %ymm0
-    vpcmpeqb (%edx), %ymm0, %ymm0
-    vpmovmskb %ymm0, %ebx
+    {$ifndef OLD_ASSEMBLER} vmovdqu (%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x00 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpcmpeqb (%edx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x02 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
     inc     %ebx
-    { bzhi %ecx, %ebx, %ecx }
-    .byte   0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
+    {$if not defined(OLD_ASSEMBLER) and not defined(VER3_2)} bzhi %ecx, %ebx, %ecx {$else} .byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi } {$endif}
     jnz     .LVec0Differs
 .LNothing:
-    vzeroupper
+    {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
     pop     %ebx
     xor     %eax, %eax
     ret
@@ -1569,13 +1568,13 @@ asm
     .byte 144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
 .LAligned64xLoop_TwoVectorsDiffer:
     add     %eax, %edx { restore edx = buf2 }
-    vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
+    {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ecx {$else} .byte 0xC5,0xFD,0xD7,0xC8 {$endif} { Is there a difference in the first vector? }
     inc     %ecx
     jz      .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) from the loop body. }
     mov     %ecx, %ebx
 .LVec0Differs:
-    vzeroupper
-    tzcnt   %ebx, %ebx
+    {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
+    {$ifndef OLD_ASSEMBLER} tzcnt %ebx, %ebx {$else} .byte 0xF3,0x0F,0xBC,0xDB {$endif}
     movzbl  (%eax,%ebx), %eax
     movzbl  (%edx,%ebx), %edx
     sub     %edx, %eax
@@ -1587,8 +1586,8 @@ asm
 .LVecEm1Differs:
     add     $32, %ecx
 .LVecEm2Differs:
-    vzeroupper
-    tzcnt   %ebx, %ebx
+    {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
+    {$ifndef OLD_ASSEMBLER} tzcnt %ebx, %ebx {$else} .byte 0xF3,0x0F,0xBC,0xDB {$endif}
     add     %ecx, %ebx
     movzbl  (%eax,%ebx), %eax
     movzbl  (%edx,%ebx), %edx
@@ -1598,9 +1597,9 @@ asm

 .LVecOrMore:
     { Compare first vectors. }
-    vmovdqu (%eax), %ymm0
-    vpcmpeqb (%edx), %ymm0, %ymm0
-    vpmovmskb %ymm0, %ebx
+    {$ifndef OLD_ASSEMBLER} vmovdqu (%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x00 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpcmpeqb (%edx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x02 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
     inc     %ebx
     jnz     .LVec0Differs

@@ -1608,9 +1607,9 @@ asm
     jbe     .LLastVec

     { Compare second vectors. }
-    vmovdqu 32(%eax), %ymm0
-    vpcmpeqb 32(%edx), %ymm0, %ymm0
-    vpmovmskb %ymm0, %ebx
+    {$ifndef OLD_ASSEMBLER} vmovdqu 32(%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x40,0x20 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpcmpeqb 32(%edx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x42,0x20 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
     inc     %ebx
     jnz     .LVec1Differs

@@ -1627,12 +1626,12 @@ asm
 .LAligned64xLoop_Body:
     add     $64, %eax
     { Compare two YMMs, reduce the result with 'and'. }
-    vmovdqu (%edx,%eax), %ymm0
-    vpcmpeqb (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
-    vmovdqu 32(%edx,%eax), %ymm1
-    vpcmpeqb 32(%eax), %ymm1, %ymm1
-    vpand   %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
-    vpmovmskb %ymm1, %ebx
+    {$ifndef OLD_ASSEMBLER} vmovdqu (%edx,%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x04,0x02 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpcmpeqb (%eax), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x00 {$endif} { ymm0 = vpcmpeqb(buf1, buf2) }
+    {$ifndef OLD_ASSEMBLER} vmovdqu 32(%edx,%eax), %ymm1 {$else} .byte 0xC5,0xFE,0x6F,0x4C,0x02,0x20 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpcmpeqb 32(%eax), %ymm1, %ymm1 {$else} .byte 0xC5,0xF5,0x74,0x48,0x20 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpand %ymm0, %ymm1, %ymm1 {$else} .byte 0xC5,0xF5,0xDB,0xC8 {$endif} { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
+    {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm1, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD9 {$endif}
     inc     %ebx
     jnz     .LAligned64xLoop_TwoVectorsDiffer
     sub     $64, %ecx
@@ -1640,18 +1639,18 @@ asm
     add     %eax, %edx { restore edx = buf2 }
     add     $64, %ecx
 .LLastTwoVectors:
-    vmovdqu (%eax,%ecx), %ymm0
-    vpcmpeqb (%edx,%ecx), %ymm0, %ymm0
-    vpmovmskb %ymm0, %ebx
+    {$ifndef OLD_ASSEMBLER} vmovdqu (%eax,%ecx), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x04,0x08 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpcmpeqb (%edx,%ecx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x04,0x0A {$endif}
+    {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
     inc     %ebx
     jnz     .LVecEm2Differs
 .LLastVec:
-    vmovdqu 32(%eax,%ecx), %ymm0
-    vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0
-    vpmovmskb %ymm0, %ebx
+    {$ifndef OLD_ASSEMBLER} vmovdqu 32(%eax,%ecx), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x44,0x08,0x20 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x44,0x0A,0x20 {$endif}
+    {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
     inc     %ebx
     jnz     .LVecEm1Differs
-    vzeroupper
+    {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
     pop     %ebx
     xor     %eax, %eax
 end;
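
Each fallback byte sequence in this patch encodes exactly the instruction in its {$ifndef} branch, so both branches assemble to identical machine code; only the bzhi case additionally tests VER3_2, because there the limitation is in the bootstrap compiler's built-in assembler rather than the external one. A minimal sketch of the idiom outside this patch, with a hypothetical helper not taken from the source (FPC i386 AT&T syntax; 0x66,0x0F,0x38,0x17,0xC0 is the encoding of ptest %xmm0, %xmm0):

{ Hypothetical illustration of the OLD_ASSEMBLER fallback idiom:
  new toolchains assemble the SSE4.1 mnemonic, old ones get its raw bytes. }
function IsZero16(p: pointer): boolean; assembler; nostackframe;
asm
    movdqu  (%eax), %xmm0   { p arrives in eax under the register convention. }
{$ifndef OLD_ASSEMBLER}
    ptest   %xmm0, %xmm0    { SSE4.1: ZF := ((xmm0 and xmm0) = 0). }
{$else}
    .byte   0x66,0x0F,0x38,0x17,0xC0 { ptest %xmm0, %xmm0 }
{$endif}
    setz    %al             { Result := true iff all 16 bytes were zero. }
end;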