|
@@ -678,132 +678,87 @@ end;
|
|
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
|
|
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
|
|
{$define FPC_SYSTEM_HAS_INDEXBYTE}
|
|
{$define FPC_SYSTEM_HAS_INDEXBYTE}
|
|
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
|
|
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
|
|
|
|
+{ eax = buf, edx = len, cl = b. Returns in eax the offset from buf of the first byte equal to b, or -1 if no such byte is found within len bytes. }
|
|
asm
|
|
asm
|
|
- push %esi
|
|
|
|
- push %edi
|
|
|
|
|
|
+ test %edx,%edx { len = 0: nothing to search }
|
|
|
|
+ jz .Lnothing0
|
|
push %eax { save initial value of 'buf' }
|
|
push %eax { save initial value of 'buf' }
|
|
|
|
|
|
- cmp $4,%edx { less than 4 bytes, just test byte by byte. }
|
|
|
|
- jb .Ltail
|
|
|
|
|
|
+ test $3,%al { is eax already 4-byte aligned? }
|
|
|
|
+ jz .Laligned4
|
|
|
|
+.Lalignloop: { align to 4 bytes, comparing one byte at a time }
|
|
|
|
+ cmp %cl,(%eax)
|
|
|
|
+ je .Lfoundateax
|
|
|
|
+ inc %eax
|
|
|
|
+ dec %edx
|
|
|
|
+ jz .Lnothing1 { len exhausted before alignment was reached }
|
|
|
|
+ test $3,%al
|
|
|
|
+ jnz .Lalignloop
|
|
|
|
+
|
|
|
|
+.Laligned4: { eax is 4-byte aligned here; next, align to 8 bytes }
|
|
|
|
+ push %esi { esi and edi are scratch below; popped again on the exit paths that follow }
|
|
|
|
+ push %edi
|
|
|
|
|
|
mov %cl,%ch { prepare pattern }
|
|
mov %cl,%ch { prepare pattern }
|
|
movzwl %cx,%esi
|
|
movzwl %cx,%esi
|
|
shl $16,%ecx
|
|
shl $16,%ecx
|
|
or %esi,%ecx { ecx = b replicated into all four bytes }
|
|
or %esi,%ecx { ecx = b replicated into all four bytes }
|
|
|
|
|
|
-.Lalignloop:
|
|
|
|
- test $3,%al { align to 4 bytes if necessary }
|
|
|
|
- je .Laligned
|
|
|
|
- cmp %cl,(%eax)
|
|
|
|
- je .Lexit
|
|
|
|
- inc %eax
|
|
|
|
- dec %edx
|
|
|
|
- jmp .Lalignloop
|
|
|
|
-
|
|
|
|
-.balign 16 { Main loop, unrolled 4 times for speed }
|
|
|
|
|
|
+ test $7,%al { is eax already 8-byte aligned? }
|
|
|
|
+ jz .Lloop
|
|
|
|
+ test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = −1). }
|
|
|
|
+ jl .Ldontfixuplen
|
|
|
|
+ add $4,%edx
|
|
|
|
+.Ldontfixuplen:
|
|
|
|
+ sub $4,%eax { so that 4(%eax) at .Lalignfrom4to8 addresses the current dword }
|
|
|
|
+ jmp .Lalignfrom4to8
|
|
|
|
|
|
-.Lloop:
|
|
|
|
|
|
+.balign 16
|
|
|
|
+.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
|
|
mov (%eax),%esi { load dword }
|
|
mov (%eax),%esi { load dword }
|
|
xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
|
|
xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
|
|
lea -0x01010101(%esi),%edi
|
|
lea -0x01010101(%esi),%edi
|
|
- xor %esi,%edi { (x-0x01010101) xor x }
|
|
|
|
not %esi
|
|
not %esi
|
|
and $0x80808080,%esi
|
|
and $0x80808080,%esi
|
|
- and %edi,%esi { ((x-0x01010101) xor x) and (not x) and 0x80808080 }
|
|
|
|
- jnz .Lfound { one of the bytes matches }
|
|
|
|
|
|
+ and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080; nonzero iff some byte of x is 0 }
|
|
|
|
+ jnz .Lfound0 { one of the bytes matches }
|
|
|
|
|
|
|
|
|
+.Lalignfrom4to8: { second dword of the 8-byte step; also entered directly when eax was only 4-byte aligned }
|
|
mov 4(%eax),%esi
|
|
mov 4(%eax),%esi
|
|
xor %ecx,%esi
|
|
xor %ecx,%esi
|
|
lea -0x01010101(%esi),%edi
|
|
lea -0x01010101(%esi),%edi
|
|
- xor %esi,%edi
|
|
|
|
not %esi
|
|
not %esi
|
|
and $0x80808080,%esi
|
|
and $0x80808080,%esi
|
|
and %edi,%esi
|
|
and %edi,%esi
|
|
- jnz .Lfound4
|
|
|
|
|
|
+ jnz .Lfound1
|
|
|
|
|
|
- mov 8(%eax),%esi
|
|
|
|
- xor %ecx,%esi
|
|
|
|
- lea -0x01010101(%esi),%edi
|
|
|
|
- xor %esi,%edi
|
|
|
|
- not %esi
|
|
|
|
- and $0x80808080,%esi
|
|
|
|
- and %edi,%esi
|
|
|
|
- jnz .Lfound8
|
|
|
|
-
|
|
|
|
- mov 12(%eax),%esi
|
|
|
|
- xor %ecx,%esi
|
|
|
|
- lea -0x01010101(%esi),%edi
|
|
|
|
- xor %esi,%edi
|
|
|
|
- not %esi
|
|
|
|
- and $0x80808080,%esi
|
|
|
|
- and %edi,%esi
|
|
|
|
- jnz .Lfound12
|
|
|
|
-
|
|
|
|
- add $16,%eax
|
|
|
|
-.Laligned:
|
|
|
|
- sub $16,%edx
|
|
|
|
- jae .Lloop { Still more than 16 bytes remaining }
|
|
|
|
-
|
|
|
|
-{ Process remaining bytes (<16 left at this point) }
|
|
|
|
-{ length is offset by -16 at this point }
|
|
|
|
-.Lloop2:
|
|
|
|
- cmp $4-16,%edx { < 4 bytes left? }
|
|
|
|
- jb .Ltail
|
|
|
|
-
|
|
|
|
- mov (%eax),%esi
|
|
|
|
- xor %ecx,%esi
|
|
|
|
- lea -0x01010101(%esi),%edi
|
|
|
|
- xor %esi,%edi
|
|
|
|
- not %esi
|
|
|
|
- and $0x80808080,%esi
|
|
|
|
- and %edi,%esi
|
|
|
|
- jne .Lfound
|
|
|
|
-
|
|
|
|
- add $4,%eax
|
|
|
|
- sub $4,%edx
|
|
|
|
- jmp .Lloop2
|
|
|
|
-
|
|
|
|
-.Ltail: { Less than 4 bytes remaining, check one by one }
|
|
|
|
- and $3, %edx
|
|
|
|
- jz .Lnotfound
|
|
|
|
-.Lloop3:
|
|
|
|
- cmp %cl,(%eax)
|
|
|
|
- je .Lexit
|
|
|
|
- inc %eax
|
|
|
|
- dec %edx
|
|
|
|
- jnz .Lloop3
|
|
|
|
-
|
|
|
|
-.Lnotfound:
|
|
|
|
|
|
+ add $8,%eax
|
|
|
|
+ sub $8,%edx
|
|
|
|
+ ja .Lloop { unsigned: loop again only if more than 8 bytes were remaining }
|
|
|
|
+.Lnothing3: { not found; edi, esi and the saved buf are still on the stack }
|
|
|
|
+ pop %edi
|
|
|
|
+ pop %esi
|
|
|
|
+.Lnothing1: { not found; only the saved buf is on the stack }
|
|
|
|
+ pop %edx { discard saved buf }
|
|
|
|
+.Lnothing0: { not found; nothing was pushed }
|
|
or $-1,%eax { result = -1 }
|
|
or $-1,%eax { result = -1 }
|
|
- jmp .Lexit1
|
|
|
|
|
|
+ ret
|
|
|
|
|
|
-{ add missing source pointer increments }
|
|
|
|
-.Lfound12:
|
|
|
|
- add $4,%eax
|
|
|
|
-.Lfound8:
|
|
|
|
- add $4,%eax
|
|
|
|
-.Lfound4:
|
|
|
|
|
|
+.Lfound1: { candidate match in the dword at 4(%eax) }
|
|
|
|
+ sub $4,%edx
|
|
|
|
+ jbe .Lnothing3 { that dword starts at or past the end of the remaining length }
|
|
add $4,%eax
|
|
add $4,%eax
|
|
-
|
|
|
|
-.Lfound:
|
|
|
|
- test $0xff,%esi
|
|
|
|
- jnz .Lexit
|
|
|
|
- inc %eax
|
|
|
|
-
|
|
|
|
- test $0xff00,%esi
|
|
|
|
- jnz .Lexit
|
|
|
|
- inc %eax
|
|
|
|
-
|
|
|
|
- test $0xff0000,%esi
|
|
|
|
- jnz .Lexit
|
|
|
|
- inc %eax
|
|
|
|
-
|
|
|
|
-.Lexit:
|
|
|
|
- sub (%esp),%eax
|
|
|
|
-.Lexit1:
|
|
|
|
- pop %ecx { removes initial 'buf' value }
|
|
|
|
|
|
+.Lfound0: { candidate match in the dword at (%eax); edx = bytes remaining from eax }
|
|
|
|
+ bsf %esi,%esi { bit index of the lowest set flag bit }
|
|
|
|
+ shr $3,%esi { bit index -> byte index within the dword }
|
|
|
|
+ cmp %edx,%esi { Garbage after remaining length? }
|
|
|
|
+ jae .Lnothing3
|
|
|
|
+ add %esi,%eax { eax = address of the matching byte }
|
|
pop %edi
|
|
pop %edi
|
|
pop %esi
|
|
pop %esi
|
|
|
|
+.Lfoundateax: { eax = address of the matching byte; only the saved buf is on the stack }
|
|
|
|
+ pop %ecx { ecx = initial buf }
|
|
|
|
+ sub %ecx,%eax { result = offset of the match from buf }
|
|
end;
|
|
end;
|
|
|
|
|
|
function IndexByte_SSE2(const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
|
|
function IndexByte_SSE2(const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
|