123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478 |
- {
- This file is part of the Free Pascal run time library.
- Copyright (c) 1999-2000 by the Free Pascal development team
- Include file with set operations called by the compiler
- See the file COPYING.FPC, included in this distribution,
- for details about the copyright.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- **********************************************************************}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
- {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
- procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
- { Set union without SSE: stores set1 OR set2 into dest for 'size' bytes.
-   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
-   size >= 4: 4-byte words are processed from offset size - 4 downwards, then one more word at offset 0.
-   That last word may overlap the previous one; repeating OR on already combined data gives the same result.
-   size < 4: single bytes are processed from offset size - 1 down to 0.
-   NOTE(review): size = 0 would make the bytewise loop start at offset -1; presumably never passed - confirm against callers. }
- asm
- push %ebx { ebx and esi are used as scratch and restored before returning. }
- push %esi
- mov 12(%esp), %esi { esi = size; the two pushes above moved it from 4(%esp) to 12(%esp). }
- sub $4, %esi { esi = offset of the last 4-byte word. }
- jl .LBytewise_Prepare { size < 4; probably dead branch... }
- .L4x_Loop:
- mov (%eax,%esi), %ebx
- or (%edx,%esi), %ebx
- mov %ebx, (%ecx,%esi)
- sub $4, %esi
- ja .L4x_Loop { Continue only while the new offset is above 0 and the subtraction did not borrow. }
- mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
- or (%edx), %ebx
- mov %ebx, (%ecx)
- pop %esi
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LBytewise_Prepare:
- add $3, %esi { esi = size - 1, offset of the last byte. }
- .LBytewise_Loop:
- movzbl (%eax,%esi), %ebx
- or (%edx,%esi), %bl
- mov %bl, (%ecx,%esi)
- sub $1, %esi
- jae .LBytewise_Loop { Leaves the loop when the decrement borrows, i.e. after offset 0 was processed. }
- pop %esi
- pop %ebx
- end; { No explicit ret on this path; presumably the compiler-generated exit code returns and pops size - confirm. }
- procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_add_sets {$else} fpc_varset_add_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
- { SSE set union: stores set1 OR set2 into dest for 'size' bytes, 16 bytes at a time with unaligned loads and stores.
-   With CPUX86_HAS_SSEUNIT defined this routine is the fpc_varset_add_sets compilerproc itself, otherwise it is
-   named fpc_varset_add_sets_sse.
-   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
-   size >= 16: chunks are processed from offset size - 16 downwards, then one more chunk at offset 0 that may
-   overlap the previous one. size < 16: control is handed to fpc_varset_add_sets_plain. }
- asm
- push %ebx
- mov 8(%esp), %ebx { ebx = size; the push above moved it from 4(%esp) to 8(%esp). }
- sub $16, %ebx { ebx = position of the last 16-byte chunk }
- jl .LFallback { size < 16; hopefully dead branch... }
- .L16x_Loop:
- movups (%eax,%ebx), %xmm0
- movups (%edx,%ebx), %xmm1
- orps %xmm1, %xmm0
- movups %xmm0, (%ecx,%ebx)
- sub $16, %ebx
- ja .L16x_Loop { Continue only while the new position is above 0 and the subtraction did not borrow. }
- movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
- movups (%edx), %xmm1
- orps %xmm1, %xmm0
- movups %xmm0, (%ecx)
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LFallback:
- pop %ebx { Restore ebx so that registers and stack look exactly as on entry. }
- jmp fpc_varset_add_sets_plain { Tail jump: eax, edx, ecx and the stacked size are untouched, the plain routine returns to our caller. }
- end;
- {$ifndef CPUX86_HAS_SSEUNIT}
- { Run-time selection of the set union implementation (only compiled when CPUX86_HAS_SSEUNIT is not defined). }
- procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
- var
-   { Routine used by fpc_varset_add_sets. Initially the dispatcher, which replaces it on the first call. }
-   fpc_varset_add_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_add_sets_dispatch;
- { First-call handler: binds fpc_varset_add_sets_impl to the plain or the SSE routine depending on
-   has_sse_support, then performs the requested operation through the freshly bound pointer. }
- procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint);
- begin
-   if not has_sse_support then
-     fpc_varset_add_sets_impl:=@fpc_varset_add_sets_plain
-   else
-     fpc_varset_add_sets_impl:=@fpc_varset_add_sets_sse;
-   fpc_varset_add_sets_impl(set1,set2,dest,size);
- end;
- { Compiler-called entry point: forwards to whatever fpc_varset_add_sets_impl currently points at. }
- procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
- begin
-   fpc_varset_add_sets_impl(set1,set2,dest,size);
- end;
- {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_add_sets dispatcher)}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
- {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
- procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
- { Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'.
-   Set intersection without SSE: stores set1 AND set2 into dest for 'size' bytes.
-   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
-   size >= 4: 4-byte words are processed from offset size - 4 downwards, then one more word at offset 0.
-   That last word may overlap the previous one; repeating AND on already combined data gives the same result.
-   size < 4: single bytes are processed from offset size - 1 down to 0.
-   NOTE(review): size = 0 would make the bytewise loop start at offset -1; presumably never passed - confirm against callers. }
- asm
- push %ebx { ebx and esi are used as scratch and restored before returning. }
- push %esi
- mov 12(%esp), %esi { esi = size; the two pushes above moved it from 4(%esp) to 12(%esp). }
- sub $4, %esi { esi = offset of the last 4-byte word. }
- jl .LBytewise_Prepare { size < 4; probably dead branch... }
- .L4x_Loop:
- mov (%eax,%esi), %ebx
- and (%edx,%esi), %ebx
- mov %ebx, (%ecx,%esi)
- sub $4, %esi
- ja .L4x_Loop { Continue only while the new offset is above 0 and the subtraction did not borrow. }
- mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
- and (%edx), %ebx
- mov %ebx, (%ecx)
- pop %esi
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LBytewise_Prepare:
- add $3, %esi { esi = size - 1, offset of the last byte. }
- .LBytewise_Loop:
- movzbl (%eax,%esi), %ebx
- and (%edx,%esi), %bl
- mov %bl, (%ecx,%esi)
- sub $1, %esi
- jae .LBytewise_Loop { Leaves the loop when the decrement borrows, i.e. after offset 0 was processed. }
- pop %esi
- pop %ebx
- end; { No explicit ret on this path; presumably the compiler-generated exit code returns and pops size - confirm. }
- procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_mul_sets {$else} fpc_varset_mul_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
- { Same as fpc_varset_add_sets_sse but with 'and' instead of 'or'.
-   SSE set intersection: stores set1 AND set2 into dest for 'size' bytes, 16 bytes at a time with unaligned
-   loads and stores. With CPUX86_HAS_SSEUNIT defined this routine is the fpc_varset_mul_sets compilerproc itself,
-   otherwise it is named fpc_varset_mul_sets_sse.
-   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
-   size >= 16: chunks are processed from offset size - 16 downwards, then one more chunk at offset 0 that may
-   overlap the previous one. size < 16: control is handed to fpc_varset_mul_sets_plain. }
- asm
- push %ebx
- mov 8(%esp), %ebx { ebx = size; the push above moved it from 4(%esp) to 8(%esp). }
- sub $16, %ebx { ebx = position of the last 16-byte chunk }
- jl .LFallback { size < 16; hopefully dead branch... }
- .L16x_Loop:
- movups (%eax,%ebx), %xmm0
- movups (%edx,%ebx), %xmm1
- andps %xmm1, %xmm0
- movups %xmm0, (%ecx,%ebx)
- sub $16, %ebx
- ja .L16x_Loop { Continue only while the new position is above 0 and the subtraction did not borrow. }
- movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
- movups (%edx), %xmm1
- andps %xmm1, %xmm0
- movups %xmm0, (%ecx)
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LFallback:
- pop %ebx { Restore ebx so that registers and stack look exactly as on entry. }
- jmp fpc_varset_mul_sets_plain { Tail jump: eax, edx, ecx and the stacked size are untouched, the plain routine returns to our caller. }
- end;
- {$ifndef CPUX86_HAS_SSEUNIT}
- { Run-time selection of the set intersection implementation (only compiled when CPUX86_HAS_SSEUNIT is not defined). }
- procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
- var
-   { Routine used by fpc_varset_mul_sets. Initially the dispatcher, which replaces it on the first call. }
-   fpc_varset_mul_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_mul_sets_dispatch;
- { First-call handler: binds fpc_varset_mul_sets_impl to the plain or the SSE routine depending on
-   has_sse_support, then performs the requested operation through the freshly bound pointer. }
- procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint);
- begin
-   if not has_sse_support then
-     fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_plain
-   else
-     fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_sse;
-   fpc_varset_mul_sets_impl(set1,set2,dest,size);
- end;
- { Compiler-called entry point: forwards to whatever fpc_varset_mul_sets_impl currently points at. }
- procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
- begin
-   fpc_varset_mul_sets_impl(set1,set2,dest,size);
- end;
- {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_mul_sets dispatcher)}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
- {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
- procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
- { Set difference without SSE: stores set1 AND NOT set2 into dest for 'size' bytes.
-   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
-   size >= 4: the word at offset 0 is computed first and parked on the stack, then 4-byte words are processed
-   from offset size - 4 downwards, and finally the parked word is written to offset 0. Computing it up front
-   keeps it correct when the loop has already overwritten overlapping bytes of an aliased input.
-   size < 4: single bytes are processed from offset size - 1 down to 0.
-   NOTE(review): size = 0 would make the bytewise loop start at offset -1; presumably never passed - confirm against callers. }
- asm
- push %ebx { ebx and esi are used as scratch and restored before returning. }
- push %esi
- mov 12(%esp), %esi { esi = size; the two pushes above moved it from 4(%esp) to 12(%esp). }
- sub $4, %esi { esi = offset of the last 4-byte word. }
- jl .LBytewise_Prepare { size < 4; probably dead branch... }
- mov (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
- not %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
- and (%eax), %ebx
- push %ebx { Park the tail result on the stack; ebx is reused by the loop. }
- .L4x_Loop:
- mov (%edx,%esi), %ebx
- not %ebx
- and (%eax,%esi), %ebx
- mov %ebx, (%ecx,%esi)
- sub $4, %esi
- ja .L4x_Loop { Continue only while the new offset is above 0 and the subtraction did not borrow. }
- pop %ebx { Fetch the parked tail result. }
- mov %ebx, (%ecx) { Write precalculated tail. }
- pop %esi
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LBytewise_Prepare:
- add $3, %esi { esi = size - 1, offset of the last byte. }
- .LBytewise_Loop:
- movzbl (%edx,%esi), %ebx
- not %ebx
- and (%eax,%esi), %bl
- mov %bl, (%ecx,%esi)
- sub $1, %esi
- jae .LBytewise_Loop { Leaves the loop when the decrement borrows, i.e. after offset 0 was processed. }
- pop %esi
- pop %ebx
- end; { No explicit ret on this path; presumably the compiler-generated exit code returns and pops size - confirm. }
- procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_sub_sets {$else} fpc_varset_sub_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
- { SSE set difference: stores set1 AND NOT set2 into dest for 'size' bytes, 16 bytes at a time with unaligned
-   loads and stores. With CPUX86_HAS_SSEUNIT defined this routine is the fpc_varset_sub_sets compilerproc itself,
-   otherwise it is named fpc_varset_sub_sets_sse.
-   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
-   size >= 16: the chunk at offset 0 is computed first and kept in xmm2, then chunks are processed from offset
-   size - 16 downwards, and finally xmm2 is written to offset 0.
-   size < 16: control is handed to fpc_varset_sub_sets_plain. }
- asm
- push %ebx
- mov 8(%esp), %ebx { ebx = size; the push above moved it from 4(%esp) to 8(%esp). }
- sub $16, %ebx { ebx = position of the last 16-byte chunk }
- jl .LFallback { size < 16; hopefully dead branch... }
- movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
- movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
- andnps %xmm1, %xmm2 { xmm2 = (not xmm2) and xmm1 = set1 and not set2 for the first 16 bytes. }
- .L16x_Loop:
- movups (%eax,%ebx), %xmm1
- movups (%edx,%ebx), %xmm0
- andnps %xmm1, %xmm0 { xmm0 = (not set2 chunk) and set1 chunk. }
- movups %xmm0, (%ecx,%ebx)
- sub $16, %ebx
- ja .L16x_Loop { Continue only while the new position is above 0 and the subtraction did not borrow. }
- movups %xmm2, (%ecx) { Write precalculated tail. }
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LFallback:
- pop %ebx { Restore ebx so that registers and stack look exactly as on entry. }
- jmp fpc_varset_sub_sets_plain { Tail jump: eax, edx, ecx and the stacked size are untouched, the plain routine returns to our caller. }
- end;
- {$ifndef CPUX86_HAS_SSEUNIT}
- { Run-time selection of the set difference implementation (only compiled when CPUX86_HAS_SSEUNIT is not defined). }
- procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
- var
-   { Routine used by fpc_varset_sub_sets. Initially the dispatcher, which replaces it on the first call. }
-   fpc_varset_sub_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_sub_sets_dispatch;
- { First-call handler: binds fpc_varset_sub_sets_impl to the plain or the SSE routine depending on
-   has_sse_support, then performs the requested operation through the freshly bound pointer. }
- procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint);
- begin
-   if not has_sse_support then
-     fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_plain
-   else
-     fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_sse;
-   fpc_varset_sub_sets_impl(set1,set2,dest,size);
- end;
- { Compiler-called entry point: forwards to whatever fpc_varset_sub_sets_impl currently points at. }
- procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
- begin
-   fpc_varset_sub_sets_impl(set1,set2,dest,size);
- end;
- {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_sub_sets dispatcher)}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
- {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
- procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
- { Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
- eax = set1, edx = set2, ecx = dest, [esp + 4] = size
-   Symmetric difference without SSE: stores set1 XOR set2 into dest for 'size' bytes.
-   size >= 4: the word at offset 0 is computed first and parked on the stack, then 4-byte words are processed
-   from offset size - 4 downwards, and finally the parked word is written to offset 0. Computing it up front
-   keeps it correct when the loop has already overwritten overlapping bytes of an aliased input.
-   size < 4: single bytes are processed from offset size - 1 down to 0.
-   NOTE(review): size = 0 would make the bytewise loop start at offset -1; presumably never passed - confirm against callers. }
- asm
- push %ebx { ebx and esi are used as scratch and restored before returning. }
- push %esi
- mov 12(%esp), %esi { esi = size; the two pushes above moved it from 4(%esp) to 12(%esp). }
- sub $4, %esi { esi = offset of the last 4-byte word. }
- jl .LBytewise_Prepare { size < 4; probably dead branch... }
- mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
- xor (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
- push %ebx { Park the tail result on the stack; ebx is reused by the loop. }
- .L4x_Loop:
- mov (%eax,%esi), %ebx
- xor (%edx,%esi), %ebx
- mov %ebx, (%ecx,%esi)
- sub $4, %esi
- ja .L4x_Loop { Continue only while the new offset is above 0 and the subtraction did not borrow. }
- pop %ebx { Fetch the parked tail result. }
- mov %ebx, (%ecx) { Write precalculated tail. }
- pop %esi
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LBytewise_Prepare:
- add $3, %esi { esi = size - 1, offset of the last byte. }
- .LBytewise_Loop:
- movzbl (%eax,%esi), %ebx
- xor (%edx,%esi), %bl
- mov %bl, (%ecx,%esi)
- sub $1, %esi
- jae .LBytewise_Loop { Leaves the loop when the decrement borrows, i.e. after offset 0 was processed. }
- pop %esi
- pop %ebx
- end; { No explicit ret on this path; presumably the compiler-generated exit code returns and pops size - confirm. }
- procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_symdif_sets {$else} fpc_varset_symdif_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
- { Same as fpc_varset_sub_sets_sse but with 'xor' instead of 'and not'.
- eax = set1, edx = set2, ecx = dest, [esp + 4] = size
-   SSE symmetric difference: stores set1 XOR set2 into dest for 'size' bytes, 16 bytes at a time with unaligned
-   loads and stores. With CPUX86_HAS_SSEUNIT defined this routine is the fpc_varset_symdif_sets compilerproc
-   itself, otherwise it is named fpc_varset_symdif_sets_sse.
-   size >= 16: the chunk at offset 0 is computed first and kept in xmm2, then chunks are processed from offset
-   size - 16 downwards, and finally xmm2 is written to offset 0.
-   size < 16: control is handed to fpc_varset_symdif_sets_plain. }
- asm
- push %ebx
- mov 8(%esp), %ebx { ebx = size; the push above moved it from 4(%esp) to 8(%esp). }
- sub $16, %ebx { ebx = position of the last 16-byte chunk }
- jl .LFallback { size < 16; hopefully dead branch... }
- movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
- movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
- xorps %xmm1, %xmm2 { xmm2 = set1 xor set2 for the first 16 bytes. }
- .L16x_Loop:
- movups (%eax,%ebx), %xmm1
- movups (%edx,%ebx), %xmm0
- xorps %xmm1, %xmm0
- movups %xmm0, (%ecx,%ebx)
- sub $16, %ebx
- ja .L16x_Loop { Continue only while the new position is above 0 and the subtraction did not borrow. }
- movups %xmm2, (%ecx) { Write precalculated tail. }
- pop %ebx
- ret $4 { Also removes the stack-passed size. }
- .LFallback:
- pop %ebx { Restore ebx so that registers and stack look exactly as on entry. }
- jmp fpc_varset_symdif_sets_plain { Tail jump: eax, edx, ecx and the stacked size are untouched, the plain routine returns to our caller. }
- end;
- {$ifndef CPUX86_HAS_SSEUNIT}
- { Run-time selection of the symmetric difference implementation (only compiled when CPUX86_HAS_SSEUNIT is not defined). }
- procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
- var
-   { Routine used by fpc_varset_symdif_sets. Initially the dispatcher, which replaces it on the first call. }
-   fpc_varset_symdif_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_symdif_sets_dispatch;
- { First-call handler: binds fpc_varset_symdif_sets_impl to the plain or the SSE routine depending on
-   has_sse_support, then performs the requested operation through the freshly bound pointer. }
- procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint);
- begin
-   if not has_sse_support then
-     fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_plain
-   else
-     fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_sse;
-   fpc_varset_symdif_sets_impl(set1,set2,dest,size);
- end;
- { Compiler-called entry point: forwards to whatever fpc_varset_symdif_sets_impl currently points at. }
- procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
- begin
-   fpc_varset_symdif_sets_impl(set1,set2,dest,size);
- end;
- {$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_symdif_sets dispatcher)}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
- {$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
- function fpc_varset_contains_sets_plain(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
- { Returns true when 'set1 and not set2' is zero over all 'size' bytes, i.e. set1 has no bit that set2 lacks.
-   eax = set1, edx = set2, ecx = size; the result is produced in eax (0 or 1).
-   size >= 4: 4-byte words are tested from the lowest address upwards using a negative index, followed by one
-   more word covering the last 4 bytes. size < 4: bytes are tested one at a time.
-   The first word or byte that has a bit of set1 outside set2 ends the scan with result false.
-   NOTE(review): with size = 0 the bytewise loop would start with ecx = 0 and not stop at the end of the data;
-   presumably never passed - confirm against callers. }
- asm
- push %ebx { ebx is used as scratch and restored before returning. }
- sub $4, %ecx
- jl .LBytewise_Prepare { size < 4; probably dead branch... }
- add %ecx, %eax
- add %ecx, %edx
- neg %ecx { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on ecx >= 0, leaving up to 4 tail bytes. }
- .L4x_Loop:
- mov (%edx,%ecx), %ebx
- not %ebx
- test %ebx, (%eax,%ecx) { Non-zero means set1 has a bit that is not in set2. }
- jnz .LNo
- add $4, %ecx
- js .L4x_Loop { Keep going while the index is still negative. }
- mov (%edx), %ebx { Tail. }
- not %ebx
- mov %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
- xor %eax, %eax
- test %ebx, (%ecx)
- setz %al { Result = 1 if the tail word had no offending bit, else 0. }
- pop %ebx
- ret
- .LNo:
- xor %eax, %eax { Result = false. }
- pop %ebx
- ret
- .LBytewise_Prepare:
- add $4, %ecx { ecx = size again. }
- neg %ecx { ecx = -size, used as a negative index that counts up to 0. }
- sub %ecx, %eax { eax = set1 + size. }
- sub %ecx, %edx { edx = set2 + size. }
- .LBytewise_Loop:
- movzbl (%edx,%ecx), %ebx
- not %ebx
- test %bl, (%eax,%ecx)
- jnz .LNo
- inc %ecx
- jnz .LBytewise_Loop { Stop once the index reaches 0, i.e. past the last byte. }
- mov $1, %eax { No offending bit found: result = true. }
- pop %ebx
- end; { No explicit ret on this path; presumably the compiler-generated exit code returns - confirm. }
- function {$ifdef CPUX86_HAS_SSE2} fpc_varset_contains_sets {$else} fpc_varset_contains_sets_sse2 {$endif} (const set1,set2;size : ptrint):boolean; assembler; nostackframe; {$ifdef CPUX86_HAS_SSE2} compilerproc; {$endif}
- { SSE2 variant of the subset test: returns true when 'set1 and not set2' is zero over all 'size' bytes.
-   With CPUX86_HAS_SSE2 defined this routine is the fpc_varset_contains_sets compilerproc itself, otherwise it is
-   named fpc_varset_contains_sets_sse2.
-   eax = set1, edx = set2, ecx = size; the result is produced in eax (0 or 1).
-   size < 16 is handed to fpc_varset_contains_sets_plain. Unlike the plain routine there is no early exit:
-   every chunk is read and the decision is made once at the end. }
- asm
- sub $16, %ecx { ecx = position of the last 16-byte chunk. }
- jl .LFallback { size < 16; probably dead branch... }
- { Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
- movdqu (%eax), %xmm1
- movdqu (%edx), %xmm2
- pandn %xmm1, %xmm2 { xmm2 = (not set2) and set1 for the first 16 bytes; serves as the accumulator. }
- .L16x_Loop:
- movdqu (%eax,%ecx), %xmm1
- movdqu (%edx,%ecx), %xmm0
- pandn %xmm1, %xmm0
- por %xmm0, %xmm2 { Accumulate offending bits. }
- sub $16, %ecx
- ja .L16x_Loop { Continue only while the new position is above 0 and the subtraction did not borrow. }
- pxor %xmm0, %xmm0
- pcmpeqb %xmm2,%xmm0 { Each byte of xmm0 becomes $FF where the accumulator byte is zero. }
- pmovmskb %xmm0, %ecx { ecx = 16-bit mask, $FFFF exactly when the whole accumulator is zero. }
- xor %eax, %eax
- inc %cx { $FFFF + 1 wraps the 16-bit register to 0 and sets ZF; any other mask leaves ZF clear. }
- setz %al
- ret
- .LFallback:
- add $16, %ecx { Undo the subtraction so that ecx = size again. }
- jmp fpc_varset_contains_sets_plain { Tail jump with the original register arguments; the plain routine returns to our caller. }
- end;
- {$ifndef CPUX86_HAS_SSE2}
- { Run-time selection of the subset test implementation (only compiled when CPUX86_HAS_SSE2 is not defined). }
- function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean; forward;
- var
-   { Routine used by fpc_varset_contains_sets. Initially the dispatcher, which replaces it on the first call. }
-   fpc_varset_contains_sets_impl: function(const set1,set2;size : ptrint):boolean = @fpc_varset_contains_sets_dispatch;
- { First-call handler: binds fpc_varset_contains_sets_impl to the plain or the SSE2 routine depending on
-   has_sse2_support, then answers the query through the freshly bound pointer. }
- function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean;
- begin
-   if not has_sse2_support then
-     fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_plain
-   else
-     fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_sse2;
-   result:=fpc_varset_contains_sets_impl(set1,set2,size);
- end;
- { Compiler-called entry point: forwards to whatever fpc_varset_contains_sets_impl currently points at. }
- function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; inline;
- begin
-   result:=fpc_varset_contains_sets_impl(set1,set2,size);
- end;
- {$endif ndef CPUX86_HAS_SSE2 (need fpc_varset_contains_sets dispatcher)}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
|