{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team

    Include file with set operations called by the compiler

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
    push    %ebx
    push    %esi
    mov     12(%esp), %esi { esi = size }
    sub     $4, %esi
    jl      .LBytewise_Prepare { probably dead branch... }
.L4x_Loop:
    mov     (%eax,%esi), %ebx
    or      (%edx,%esi), %ebx
    mov     %ebx, (%ecx,%esi)
    sub     $4, %esi
    ja      .L4x_Loop
    mov     (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
    or      (%edx), %ebx
    mov     %ebx, (%ecx)
    pop     %esi
    pop     %ebx
    ret     $4
.LBytewise_Prepare:
    add     $3, %esi
.LBytewise_Loop:
    movzbl  (%eax,%esi), %ebx
    or      (%edx,%esi), %bl
    mov     %bl, (%ecx,%esi)
    sub     $1, %esi
    jae     .LBytewise_Loop
    pop     %esi
    pop     %ebx
end;

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_add_sets {$else} fpc_varset_add_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
    push    %ebx
    mov     8(%esp), %ebx
    sub     $16, %ebx { ebx = position }
    jl      .LFallback { Hopefully dead branch... }
.L16x_Loop:
    movups  (%eax,%ebx), %xmm0
    movups  (%edx,%ebx), %xmm1
    orps    %xmm1, %xmm0
    movups  %xmm0, (%ecx,%ebx)
    sub     $16, %ebx
    ja      .L16x_Loop
    movups  (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movups  (%edx), %xmm1
    orps    %xmm1, %xmm0
    movups  %xmm0, (%ecx)
    pop     %ebx
    ret     $4
.LFallback:
    pop     %ebx
    jmp     fpc_varset_add_sets_plain
end;

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_add_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_add_sets_dispatch;

procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_sse
  else
    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_plain;
  fpc_varset_add_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_add_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_add_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
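{ For reference, a plain Pascal sketch of what the fpc_varset_add_sets family
  computes: a bytewise union, dest := set1 + set2, over buffers of `size'
  bytes. RefVarSetAdd is illustrative only and not part of this unit; the
  PtrUInt casts keep the sketch valid regardless of syntax mode.

    procedure RefVarSetAdd(const set1,set2; var dest; size: ptrint);
      var
        i: ptrint;
      begin
        for i:=0 to size-1 do
          PByte(PtrUInt(@dest)+PtrUInt(i))^:=
            PByte(PtrUInt(@set1)+PtrUInt(i))^ or PByte(PtrUInt(@set2)+PtrUInt(i))^;
      end;

  The assembler versions above produce the same result a dword or 16 bytes at
  a time; letting the separate tail chunk overlap bytes the loop already wrote
  is safe here because OR can be re-applied without changing the result. }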
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
asm
    push    %ebx
    push    %esi
    mov     12(%esp), %esi { esi = size }
    sub     $4, %esi
    jl      .LBytewise_Prepare { probably dead branch... }
.L4x_Loop:
    mov     (%eax,%esi), %ebx
    and     (%edx,%esi), %ebx
    mov     %ebx, (%ecx,%esi)
    sub     $4, %esi
    ja      .L4x_Loop
    mov     (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
    and     (%edx), %ebx
    mov     %ebx, (%ecx)
    pop     %esi
    pop     %ebx
    ret     $4
.LBytewise_Prepare:
    add     $3, %esi
.LBytewise_Loop:
    movzbl  (%eax,%esi), %ebx
    and     (%edx,%esi), %bl
    mov     %bl, (%ecx,%esi)
    sub     $1, %esi
    jae     .LBytewise_Loop
    pop     %esi
    pop     %ebx
end;

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_mul_sets {$else} fpc_varset_mul_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ Same as fpc_varset_add_sets_sse but with 'and' instead of 'or'. }
asm
    push    %ebx
    mov     8(%esp), %ebx
    sub     $16, %ebx { ebx = position }
    jl      .LFallback { Hopefully dead branch... }
.L16x_Loop:
    movups  (%eax,%ebx), %xmm0
    movups  (%edx,%ebx), %xmm1
    andps   %xmm1, %xmm0
    movups  %xmm0, (%ecx,%ebx)
    sub     $16, %ebx
    ja      .L16x_Loop
    movups  (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movups  (%edx), %xmm1
    andps   %xmm1, %xmm0
    movups  %xmm0, (%ecx)
    pop     %ebx
    ret     $4
.LFallback:
    pop     %ebx
    jmp     fpc_varset_mul_sets_plain
end;

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_mul_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_mul_sets_dispatch;

procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_sse
  else
    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_plain;
  fpc_varset_mul_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_mul_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_mul_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
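{ Usage note (illustrative): the compiler lowers set operators on large sets
  to these compilerprocs, e.g. for a hypothetical "type TBig = set of 0..255"
  and variables a, b, c of that type:

      c := a + b;   (* fpc_varset_add_sets(a, b, c, sizeof(TBig)) *)
      c := a * b;   (* fpc_varset_mul_sets(a, b, c, sizeof(TBig)) *)

  Intersection is the same loop as union with AND in place of OR, so all of
  the reasoning above about tails and overlap carries over unchanged. }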
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
    push    %ebx
    push    %esi
    mov     12(%esp), %esi { esi = size }
    sub     $4, %esi
    jl      .LBytewise_Prepare { probably dead branch... }
    mov     (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
    not     %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    and     (%eax), %ebx
    push    %ebx
.L4x_Loop:
    mov     (%edx,%esi), %ebx
    not     %ebx
    and     (%eax,%esi), %ebx
    mov     %ebx, (%ecx,%esi)
    sub     $4, %esi
    ja      .L4x_Loop
    pop     %ebx
    mov     %ebx, (%ecx) { Write precalculated tail. }
    pop     %esi
    pop     %ebx
    ret     $4
.LBytewise_Prepare:
    add     $3, %esi
.LBytewise_Loop:
    movzbl  (%edx,%esi), %ebx
    not     %ebx
    and     (%eax,%esi), %bl
    mov     %bl, (%ecx,%esi)
    sub     $1, %esi
    jae     .LBytewise_Loop
    pop     %esi
    pop     %ebx
end;

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_sub_sets {$else} fpc_varset_sub_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
    push    %ebx
    mov     8(%esp), %ebx
    sub     $16, %ebx { ebx = position }
    jl      .LFallback { Hopefully dead branch... }
    movups  (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movups  (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    andnps  %xmm1, %xmm2
.L16x_Loop:
    movups  (%eax,%ebx), %xmm1
    movups  (%edx,%ebx), %xmm0
    andnps  %xmm1, %xmm0
    movups  %xmm0, (%ecx,%ebx)
    sub     $16, %ebx
    ja      .L16x_Loop
    movups  %xmm2, (%ecx) { Write precalculated tail. }
    pop     %ebx
    ret     $4
.LFallback:
    pop     %ebx
    jmp     fpc_varset_sub_sets_plain
end;

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_sub_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_sub_sets_dispatch;

procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_sse
  else
    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_plain;
  fpc_varset_sub_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_sub_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_sub_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
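{ Why the subtraction routines precompute the tail: the 4x/16x loops handle
  offsets counting down and the final chunk at offset 0 is handled outside
  the loop, so that chunk can overlap bytes the loop already wrote when dest
  aliases set1 or set2. "x and not y" does not survive such re-application
  when dest = set2, hence the tail value is computed from the untouched
  inputs before the loop runs (OR and AND above tolerate the overlap, so
  they need no such step). A bytewise Pascal sketch of the operation itself;
  RefVarSetSub is illustrative, not part of this unit:

    procedure RefVarSetSub(const set1,set2; var dest; size: ptrint);
      var
        i: ptrint;
      begin
        for i:=0 to size-1 do
          PByte(PtrUInt(@dest)+PtrUInt(i))^:=
            PByte(PtrUInt(@set1)+PtrUInt(i))^ and not PByte(PtrUInt(@set2)+PtrUInt(i))^;
      end;
}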
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
    push    %ebx
    push    %esi
    mov     12(%esp), %esi { esi = size }
    sub     $4, %esi
    jl      .LBytewise_Prepare { probably dead branch... }
    mov     (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
    xor     (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    push    %ebx
.L4x_Loop:
    mov     (%eax,%esi), %ebx
    xor     (%edx,%esi), %ebx
    mov     %ebx, (%ecx,%esi)
    sub     $4, %esi
    ja      .L4x_Loop
    pop     %ebx
    mov     %ebx, (%ecx) { Write precalculated tail. }
    pop     %esi
    pop     %ebx
    ret     $4
.LBytewise_Prepare:
    add     $3, %esi
.LBytewise_Loop:
    movzbl  (%eax,%esi), %ebx
    xor     (%edx,%esi), %bl
    mov     %bl, (%ecx,%esi)
    sub     $1, %esi
    jae     .LBytewise_Loop
    pop     %esi
    pop     %ebx
end;

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_symdif_sets {$else} fpc_varset_symdif_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ Same as fpc_varset_sub_sets_sse but with 'xor' instead of 'and not'.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
    push    %ebx
    mov     8(%esp), %ebx
    sub     $16, %ebx { ebx = position }
    jl      .LFallback { Hopefully dead branch... }
    movups  (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movups  (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    xorps   %xmm1, %xmm2
.L16x_Loop:
    movups  (%eax,%ebx), %xmm1
    movups  (%edx,%ebx), %xmm0
    xorps   %xmm1, %xmm0
    movups  %xmm0, (%ecx,%ebx)
    sub     $16, %ebx
    ja      .L16x_Loop
    movups  %xmm2, (%ecx) { Write precalculated tail. }
    pop     %ebx
    ret     $4
.LFallback:
    pop     %ebx
    jmp     fpc_varset_symdif_sets_plain
end;

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_symdif_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_symdif_sets_dispatch;

procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_sse
  else
    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_plain;
  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_symdif_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
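{ Note on the *_dispatch/*_impl pattern used throughout this file when the
  target CPU is not guaranteed to have SSE: each _impl variable initially
  points at its _dispatch routine; the first call through it tests
  has_sse_support (or has_sse2_support below), overwrites _impl with the
  address of the best implementation, then completes that first call through
  the new pointer. Every later call goes through the patched pointer
  directly, so the CPU check is paid only once. The symmetric-difference
  routines above are otherwise the subtraction routines with XOR in place of
  AND-NOT; on Pascal sets this is the "><" operator, e.g. c := a >< b. }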
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
function fpc_varset_contains_sets_plain(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
{ eax = set1, edx = set2, ecx = size }
asm
    push    %ebx
    sub     $4, %ecx
    jl      .LBytewise_Prepare { probably dead branch... }
    add     %ecx, %eax
    add     %ecx, %edx
    neg     %ecx
    { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4.
      The loop ends once ecx reaches >= 0, leaving up to 4 tail bytes. }
.L4x_Loop:
    mov     (%edx,%ecx), %ebx
    not     %ebx
    test    %ebx, (%eax,%ecx)
    jnz     .LNo
    add     $4, %ecx
    js      .L4x_Loop
    mov     (%edx), %ebx { Tail. }
    not     %ebx
    mov     %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
    xor     %eax, %eax
    test    %ebx, (%ecx)
    setz    %al
    pop     %ebx
    ret
.LNo:
    xor     %eax, %eax
    pop     %ebx
    ret
.LBytewise_Prepare:
    add     $4, %ecx
    neg     %ecx
    sub     %ecx, %eax
    sub     %ecx, %edx
.LBytewise_Loop:
    movzbl  (%edx,%ecx), %ebx
    not     %ebx
    test    %bl, (%eax,%ecx)
    jnz     .LNo
    inc     %ecx
    jnz     .LBytewise_Loop
    mov     $1, %eax
    pop     %ebx
end;

function {$ifdef CPUX86_HAS_SSE2} fpc_varset_contains_sets {$else} fpc_varset_contains_sets_sse2 {$endif} (const set1,set2;size : ptrint):boolean; assembler; nostackframe; {$ifdef CPUX86_HAS_SSE2} compilerproc; {$endif}
{ eax = set1, edx = set2, ecx = size }
asm
    sub     $16, %ecx
    jl      .LFallback { probably dead branch... }
    { Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero.
      Better for small enough sets. }
    movdqu  (%eax), %xmm1
    movdqu  (%edx), %xmm2
    pandn   %xmm1, %xmm2
.L16x_Loop:
    movdqu  (%eax,%ecx), %xmm1
    movdqu  (%edx,%ecx), %xmm0
    pandn   %xmm1, %xmm0
    por     %xmm0, %xmm2
    sub     $16, %ecx
    ja      .L16x_Loop
    pxor    %xmm0, %xmm0
    pcmpeqb %xmm2, %xmm0
    pmovmskb %xmm0, %ecx
    xor     %eax, %eax
    inc     %cx
    setz    %al
    ret
.LFallback:
    add     $16, %ecx
    jmp     fpc_varset_contains_sets_plain
end;

{$ifndef CPUX86_HAS_SSE2}
function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean; forward;

var
  fpc_varset_contains_sets_impl: function(const set1,set2;size : ptrint):boolean = @fpc_varset_contains_sets_dispatch;

function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean;
begin
  if has_sse2_support then
    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_sse2
  else
    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_plain;
  result:=fpc_varset_contains_sets_impl(set1,set2,size);
end;

function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; inline;
begin
  result:=fpc_varset_contains_sets_impl(set1,set2,size);
end;
{$endif ndef CPUX86_HAS_SSE2 (need fpc_varset_contains_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
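{ For reference: fpc_varset_contains_sets implements the subset test
  "set1 <= set2", i.e. it returns true iff "set1 and not set2" has no bit
  set, which is exactly what the chunked NOT/TEST and PANDN/POR code above
  checks. A bytewise Pascal sketch; RefVarSetContains is illustrative, not
  part of this unit:

    function RefVarSetContains(const set1,set2; size: ptrint): boolean;
      var
        i: ptrint;
      begin
        for i:=0 to size-1 do
          if (PByte(PtrUInt(@set1)+PtrUInt(i))^ and
              not PByte(PtrUInt(@set2)+PtrUInt(i))^)<>0 then
            exit(false);
        result:=true;
      end;
}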