|
@@ -13,3 +13,222 @@
|
|
|
|
|
|
**********************************************************************}
|
|
|
|
|
|
+{$asmmode intel}
|
|
|
+
|
|
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Set union: dest := set1 + set2, i.e. bytewise OR over "size" bytes.
  dest may alias set1/set2; OR is idempotent, so the overlapping tail chunk below is harmless.
  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux: rdi = set1, rsi = set2, rdx = dest, rcx = size }
asm
    { "size" doubles as the descending chunk offset for the loops below. }
    sub size, 16
    jl @Bytewise_Prepare { probably dead branch... (only taken when size < 16) }

@16x_Loop:
    { Process 16-byte chunks back to front; the chunk at offset 0 (which may overlap the
      last one handled here when size is not a multiple of 16) is done after the loop. }
    movdqu xmm0, xmmword ptr [set1 + size]
    movdqu xmm1, xmmword ptr [set2 + size]
    por xmm0, xmm1
    movdqu xmmword ptr [dest + size], xmm0
    sub size, 16
    ja @16x_Loop { continue while the offset is still > 0. }

    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm1, xmmword ptr [set2]
    por xmm0, xmm1
    movdqu xmmword ptr [dest], xmm0
    ret

@Bytewise_Prepare:
    add size, 15 { Undo the -16 above: size = index of the last byte. }
@Bytewise_Loop:
    movzx eax, byte ptr [set1 + size]
    or al, byte ptr [set2 + size]
    mov byte ptr [dest + size], al
    sub size, 1
    jae @Bytewise_Loop { Unsigned: the borrow after index 0 ends the loop.
                         NOTE(review): assumes size >= 1 on entry — confirm the compiler never emits size = 0. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
|
|
|
+
|
|
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Set intersection: dest := set1 * set2, i.e. bytewise AND over "size" bytes.
  Same as fpc_varset_add_sets but with 'and' instead of 'or' (AND is likewise idempotent,
  so the overlapping tail chunk is safe even when dest aliases set1/set2).
  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux: rdi = set1, rsi = set2, rdx = dest, rcx = size }
asm
    sub size, 16
    jl @Bytewise_Prepare { probably dead branch... (only taken when size < 16) }

@16x_Loop:
    { 16-byte chunks back to front; offset-0 chunk handled after the loop. }
    movdqu xmm0, xmmword ptr [set1 + size]
    movdqu xmm1, xmmword ptr [set2 + size]
    pand xmm0, xmm1
    movdqu xmmword ptr [dest + size], xmm0
    sub size, 16
    ja @16x_Loop

    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm1, xmmword ptr [set2]
    pand xmm0, xmm1
    movdqu xmmword ptr [dest], xmm0
    ret

@Bytewise_Prepare:
    add size, 15 { Undo the -16 above: size = index of the last byte. }
@Bytewise_Loop:
    movzx eax, byte ptr [set1 + size]
    and al, byte ptr [set2 + size]
    mov byte ptr [dest + size], al
    sub size, 1
    jae @Bytewise_Loop { Unsigned loop exit; NOTE(review): assumes size >= 1 on entry — confirm. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
|
|
|
+
|
|
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Set difference: dest := set1 - set2, i.e. bytewise "set1 and not set2" over "size" bytes.
  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux: rdi = set1, rsi = set2, rdx = dest, rcx = size }
asm
    sub size, 16
    jl @Bytewise_Prepare { probably dead branch... (only taken when size < 16) }

    movdqu xmm1, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm2, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    pandn xmm2, xmm1 { pandn: xmm2 := (not xmm2) and xmm1 = set1 and not set2. }

@16x_Loop:
    movdqu xmm1, xmmword ptr [set1 + size]
    movdqu xmm0, xmmword ptr [set2 + size]
    pandn xmm0, xmm1 { xmm0 := set1 chunk and not set2 chunk. }
    movdqu xmmword ptr [dest + size], xmm0
    sub size, 16
    ja @16x_Loop

    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
    ret

@Bytewise_Prepare:
    add size, 15 { Undo the -16 above: size = index of the last byte. }
@Bytewise_Loop:
    movzx eax, byte ptr [set2 + size]
    not eax
    and al, byte ptr [set1 + size] { al = set1[size] and not set2[size]. }
    mov byte ptr [dest + size], al
    sub size, 1
    jae @Bytewise_Loop { Unsigned loop exit; NOTE(review): assumes size >= 1 on entry — confirm. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
|
|
|
+
|
|
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Symmetric difference: dest := set1 >< set2, i.e. bytewise XOR over "size" bytes.
  Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'
  (note: sub_sets, not mul_sets — it shares the precalculated-tail structure because
  XOR, like 'and not', is not idempotent and dest can alias set1/set2).

  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux: rdi = set1, rsi = set2, rdx = dest, rcx = size }
asm
    sub size, 16
    jl @Bytewise_Prepare { probably dead branch... (only taken when size < 16) }

    movdqu xmm2, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm1, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    pxor xmm2, xmm1

@16x_Loop:
    movdqu xmm0, xmmword ptr [set1 + size]
    movdqu xmm1, xmmword ptr [set2 + size]
    pxor xmm0, xmm1
    movdqu xmmword ptr [dest + size], xmm0
    sub size, 16
    ja @16x_Loop

    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
    ret

@Bytewise_Prepare:
    add size, 15 { Undo the -16 above: size = index of the last byte. }
@Bytewise_Loop:
    movzx eax, byte ptr [set2 + size]
    xor al, byte ptr [set1 + size]
    mov byte ptr [dest + size], al
    sub size, 1
    jae @Bytewise_Loop { Unsigned loop exit; NOTE(review): assumes size >= 1 on entry — confirm. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
|
|
|
+
|
|
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
{ NOTE(review): the guard is CONTAINS_SET (singular) while the routine is contains_sets —
  verify this matches the define checked by the generic Pascal implementation. }
function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
{ Subset test: returns true iff set1 <= set2, i.e. "set1 and not set2" is zero
  over all "size" bytes.
  Windows: rcx = set1, rdx = set2, r8 = size
  Linux: rdi = set1, rsi = set2, rdx = size }
asm
    sub size, 16
    jl @Bytewise_Prepare { probably dead branch... (only taken when size < 16) }

{$if false}
{ Scans 16 bytes at a time left to right with early exits.
  Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
  Kept for the future. }
    pxor xmm2, xmm2 { xmm2 = 0 }
    add set1, size
    add set2, size
    neg size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
               Loop ends on "size" >= 0, leaving up to 16 tail bytes. }

@16x_Loop:
    movdqu xmm1, xmmword ptr [set1 + size]
    movdqu xmm0, xmmword ptr [set2 + size]
    pandn xmm0, xmm1 { xmm0 := set1 chunk and not set2 chunk. }
    pcmpeqb xmm0, xmm2 { Bytewise compare against zero: $FF per zero byte. }
    pmovmskb eax, xmm0
    inc ax { A full mask of $FFFF (all 16 bytes zero) wraps ax to 0 and sets ZF. }
    jnz @No { Some byte of "set1 and not set2" was nonzero: not a subset. }
    add size, 16
    js @16x_Loop

    movdqu xmm1, xmmword ptr [set1]
    movdqu xmm0, xmmword ptr [set2]
    pandn xmm0, xmm1
{$else}
{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
    movdqu xmm1, xmmword ptr [set1]
    movdqu xmm2, xmmword ptr [set2]
    pandn xmm2, xmm1 { xmm2 accumulates "set1 and not set2", seeded with the [0..15] tail chunk. }

@16x_Loop:
    movdqu xmm1, xmmword ptr [set1 + size]
    movdqu xmm0, xmmword ptr [set2 + size]
    pandn xmm0, xmm1
    por xmm2, xmm0
    sub size, 16
    ja @16x_Loop

    pxor xmm0, xmm0
{$endif}
    pcmpeqb xmm0, xmm2 { Bytewise compare accumulator against zero. }
    pmovmskb ecx, xmm0
    xor eax, eax
    inc cx { cx wraps to 0 (ZF set) iff every accumulator byte was zero. }
    setz al { Result true iff "set1 and not set2" folded to zero. }
    ret

@No:
    xor eax, eax { Result := false. }
    ret

@Bytewise_Prepare:
    { size < 16 here. Advance both pointers past the end and count a negative index up to 0. }
    add size, 16 { Restore the original size... }
    neg size { ...and negate it. }
    sub set1, size { set1 := set1 + orig.size }
    sub set2, size { set2 := set2 + orig.size }
@Bytewise_Loop:
    movzx eax, byte ptr [set2 + size]
    not eax
    test byte ptr [set1 + size], al { Any bit of set1 outside set2? }
    jnz @No
    inc size
    jnz @Bytewise_Loop { NOTE(review): assumes size >= 1 on entry — size = 0 would loop wrongly; confirm unreachable. }
    mov eax, $1 { Result := true. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
|
|
|
+
|
|
|
+{$asmmode att}
|