123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
- {
- This file is part of the Free Pascal run time library.
- Copyright (c) 2002 by the Free Pascal development team
- Include file with set operations called by the compiler
- See the file COPYING.FPC, included in this distribution,
- for details about the copyright.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- **********************************************************************}
- {$asmmode intel}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
- procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
- { Set union: dest := set1 or set2 over "size" bytes.
-   Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
-   Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size
-   size >= 16: unaligned 16-byte chunks, walked from the highest offset down, followed by one chunk at offset 0 that may
-   overlap the previous one. Redoing the overlapped bytes is harmless because 'or' is idempotent, also when dest is set1 or set2.
-   size in 1..15: byte by byte, last byte first.
-   size <= 0: returns without reading or writing anything.
-   Scratch registers: eax (byte path), xmm0 and xmm1 (16-byte path); "size" itself is consumed as the running offset. }
- asm
-     sub    size, 16
-     jl     @Bytewise_Prepare { probably dead branch... }
- @16x_Loop: { "size" = offset of the chunk to process, >= 0 here. }
-     movdqu xmm0, xmmword ptr [set1 + size]
-     movdqu xmm1, xmmword ptr [set2 + size]
-     por    xmm0, xmm1
-     movdqu xmmword ptr [dest + size], xmm0
-     sub    size, 16
-     ja     @16x_Loop { Continue while the next offset is > 0; offset 0 is left to the tail below. }
-     movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
-     movdqu xmm1, xmmword ptr [set2]
-     por    xmm0, xmm1
-     movdqu xmmword ptr [dest], xmm0
- @Done:
-     ret
- @Bytewise_Prepare:
-     add    size, 15 { "size" = orig.size - 1, the index of the last byte. }
-     js     @Done { orig.size <= 0: nothing to do; without this check the loop would index in front of the buffers. }
- @Bytewise_Loop:
-     movzx  eax, byte ptr [set1 + size]
-     or     al, byte ptr [set2 + size]
-     mov    byte ptr [dest + size], al
-     sub    size, 1
-     jae    @Bytewise_Loop { No borrow means the index is still >= 0. }
- end;
- {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
- procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
- { Set intersection: dest := set1 and set2 over "size" bytes.
-   Same as fpc_varset_add_sets but with 'and' instead of 'or'.
-   Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
-   Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size
-   size >= 16: unaligned 16-byte chunks from the highest offset down, then one possibly overlapping chunk at offset 0;
-   'and' is idempotent, so redoing the overlapped bytes is harmless, also when dest is set1 or set2.
-   size in 1..15: byte by byte, last byte first.
-   size <= 0: returns without reading or writing anything. }
- asm
-     sub    size, 16
-     jl     @Bytewise_Prepare { probably dead branch... }
- @16x_Loop: { "size" = offset of the chunk to process, >= 0 here. }
-     movdqu xmm0, xmmword ptr [set1 + size]
-     movdqu xmm1, xmmword ptr [set2 + size]
-     pand   xmm0, xmm1
-     movdqu xmmword ptr [dest + size], xmm0
-     sub    size, 16
-     ja     @16x_Loop { Continue while the next offset is > 0; offset 0 is left to the tail below. }
-     movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
-     movdqu xmm1, xmmword ptr [set2]
-     pand   xmm0, xmm1
-     movdqu xmmword ptr [dest], xmm0
- @Done:
-     ret
- @Bytewise_Prepare:
-     add    size, 15 { "size" = orig.size - 1, the index of the last byte. }
-     js     @Done { orig.size <= 0: nothing to do; without this check the loop would index in front of the buffers. }
- @Bytewise_Loop:
-     movzx  eax, byte ptr [set1 + size]
-     and    al, byte ptr [set2 + size]
-     mov    byte ptr [dest + size], al
-     sub    size, 1
-     jae    @Bytewise_Loop { No borrow means the index is still >= 0. }
- end;
- {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
- procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
- { Set difference: dest := set1 and not set2 over "size" bytes.
-   Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
-   Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size
-   size >= 16: the result for the chunk at offset 0 is computed first and kept in xmm2, then unaligned 16-byte chunks are
-   processed from the highest offset down, and finally the kept chunk is stored. The early computation matters because the
-   last loop chunk may overlap offset 0 and dest may be set1 or set2: applying 'and not' twice to the same bytes would be wrong.
-   size in 1..15: byte by byte, last byte first.
-   size <= 0: returns without reading or writing anything. }
- asm
-     sub    size, 16
-     jl     @Bytewise_Prepare { probably dead branch... }
-     movdqu xmm1, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
-     movdqu xmm2, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
-     pandn  xmm2, xmm1 { xmm2 = set1 and not set2 for bytes 0..15. }
- @16x_Loop: { "size" = offset of the chunk to process, >= 0 here. }
-     movdqu xmm1, xmmword ptr [set1 + size]
-     movdqu xmm0, xmmword ptr [set2 + size]
-     pandn  xmm0, xmm1 { pandn inverts its destination operand: xmm0 = (not set2 chunk) and set1 chunk. }
-     movdqu xmmword ptr [dest + size], xmm0
-     sub    size, 16
-     ja     @16x_Loop { Continue while the next offset is > 0. }
-     movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
- @Done:
-     ret
- @Bytewise_Prepare:
-     add    size, 15 { "size" = orig.size - 1, the index of the last byte. }
-     js     @Done { orig.size <= 0: nothing to do; without this check the loop would index in front of the buffers. }
- @Bytewise_Loop:
-     movzx  eax, byte ptr [set2 + size]
-     not    eax
-     and    al, byte ptr [set1 + size]
-     mov    byte ptr [dest + size], al
-     sub    size, 1
-     jae    @Bytewise_Loop { No borrow means the index is still >= 0. }
- end;
- {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
- procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
- { Symmetric difference: dest := set1 xor set2 over "size" bytes.
-   Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.
-   Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
-   Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size
-   size >= 16: the result for the chunk at offset 0 is computed first and kept in xmm2, then unaligned 16-byte chunks are
-   processed from the highest offset down, and finally the kept chunk is stored. The early computation matters because the
-   last loop chunk may overlap offset 0 and dest may be set1 or set2: applying 'xor' twice to the same bytes would undo it.
-   size in 1..15: byte by byte, last byte first.
-   size <= 0: returns without reading or writing anything. }
- asm
-     sub    size, 16
-     jl     @Bytewise_Prepare { probably dead branch... }
-     movdqu xmm2, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
-     movdqu xmm1, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
-     pxor   xmm2, xmm1 { xmm2 = set1 xor set2 for bytes 0..15. }
- @16x_Loop: { "size" = offset of the chunk to process, >= 0 here. }
-     movdqu xmm0, xmmword ptr [set1 + size]
-     movdqu xmm1, xmmword ptr [set2 + size]
-     pxor   xmm0, xmm1
-     movdqu xmmword ptr [dest + size], xmm0
-     sub    size, 16
-     ja     @16x_Loop { Continue while the next offset is > 0. }
-     movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
- @Done:
-     ret
- @Bytewise_Prepare:
-     add    size, 15 { "size" = orig.size - 1, the index of the last byte. }
-     js     @Done { orig.size <= 0: nothing to do; without this check the loop would index in front of the buffers. }
- @Bytewise_Loop:
-     movzx  eax, byte ptr [set2 + size]
-     xor    al, byte ptr [set1 + size]
-     mov    byte ptr [dest + size], al
-     sub    size, 1
-     jae    @Bytewise_Loop { No borrow means the index is still >= 0. }
- end;
- {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
- {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
- function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
- { Returns True when "set1 and not set2" is zero over "size" bytes, that is, when every bit set in set1 is also set in set2.
-   Windows: rcx = set1, rdx = set2, r8 = size
-   Linux:   rdi = set1, rsi = set2, rdx = size
-   size >= 16: 16-byte chunks of "set1 and not set2" are OR-ed together in xmm2 and the sum is tested once at the end.
-   size in 1..15: byte by byte with an early exit on the first offending byte.
-   size <= 0: returns True without reading anything (an empty range has no offending bits).
-   ecx is used as scratch in the final test; on Windows it holds set1, which is no longer needed at that point. }
- asm
-     sub    size, 16
-     jl     @Bytewise_Prepare { probably dead branch... }
- {$if false}
- { Scans 16 bytes at a time left to right with early exits.
- Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
- Kept for the future. }
-     pxor   xmm2, xmm2 { xmm2 = 0 }
-     add    set1, size
-     add    set2, size
-     neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
- Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
- @16x_Loop:
-     movdqu xmm1, xmmword ptr [set1 + size]
-     movdqu xmm0, xmmword ptr [set2 + size]
-     pandn  xmm0, xmm1
-     pcmpeqb xmm0, xmm2
-     pmovmskb eax, xmm0
-     inc    ax
-     jnz    @No
-     add    size, 16
-     js     @16x_Loop
-     movdqu xmm1, xmmword ptr [set1]
-     movdqu xmm0, xmmword ptr [set2]
-     pandn  xmm0, xmm1
- {$else}
- { Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
-     movdqu xmm1, xmmword ptr [set1]
-     movdqu xmm2, xmmword ptr [set2]
-     pandn  xmm2, xmm1 { xmm2 = set1 and not set2 for bytes 0..15; the accumulator. }
- @16x_Loop: { "size" = offset of the chunk to process, >= 0 here. }
-     movdqu xmm1, xmmword ptr [set1 + size]
-     movdqu xmm0, xmmword ptr [set2 + size]
-     pandn  xmm0, xmm1
-     por    xmm2, xmm0
-     sub    size, 16
-     ja     @16x_Loop { Continue while the next offset is > 0; offset 0 is already in the accumulator. }
-     pxor   xmm0, xmm0
- {$endif}
-     pcmpeqb xmm0, xmm2 { Each byte of xmm0 becomes 0xFF where the two operands are equal bytes. }
-     pmovmskb ecx, xmm0 { ecx = 0xFFFF exactly when all 16 bytes compared equal. }
-     xor    eax, eax
-     inc    cx { 0xFFFF + 1 wraps to 0 in 16 bits and sets ZF. }
-     setz   al
-     ret
- @No:
-     xor    eax, eax
-     ret
- @Bytewise_Prepare:
-     add    size, 16 { Restore orig.size. }
-     jle    @Yes { orig.size <= 0: nothing to compare; without this check the loop below would not stop at the buffer end. }
-     neg    size
-     sub    set1, size { set1 and set2 now point just past the data, "size" runs from -orig.size up to 0. }
-     sub    set2, size
- @Bytewise_Loop:
-     movzx  eax, byte ptr [set2 + size]
-     not    eax
-     test   byte ptr [set1 + size], al
-     jnz    @No
-     inc    size
-     jnz    @Bytewise_Loop
- @Yes:
-     mov    eax, $1
- end;
- {$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
- {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
- {$asmmode att}
|