
Specialized fpc_varset_OP_sets for i386 and x86-64.

Rika Ichinose, 2 years ago
commit 2dca69f2ac
2 changed files with 430 additions and 0 deletions:
  1. rtl/i386/set.inc   (+211, -0)
  2. rtl/x86_64/set.inc (+219, -0)
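
For orientation (editor's annotation, not part of the commit): these compilerprocs back the operators the compiler emits for large "variable" sets, where the set no longer fits the inline register-sized code path. The operator-to-helper mapping below is inferred from the helper names and bodies; the program itself is a hypothetical illustration:

program SetOpsDemo;  { hypothetical demo, not part of the RTL }
type
  TBig = set of 0..255;  { a 32-byte varset }
var
  a, b, c: TBig;
  ok: Boolean;
begin
  a := [0..15];
  b := [8..23];
  c := a + b;    { union                -> fpc_varset_add_sets    (or)      }
  c := a * b;    { intersection         -> fpc_varset_mul_sets    (and)     }
  c := a - b;    { difference           -> fpc_varset_sub_sets    (and not) }
  c := a >< b;   { symmetric difference -> fpc_varset_symdif_sets (xor)     }
  ok := a <= b;  { inclusion            -> fpc_varset_contains_sets         }
end.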

rtl/i386/set.inc (+211, -0)

@@ -13,6 +13,217 @@
 
  **********************************************************************}
 
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    or     (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    or     (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    or     (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
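Editor's note, not part of the commit: in Pascal terms the routine above behaves roughly like the hypothetical reference model below (the name VarSetAddRef is made up). Unaligned dword loads and stores are fine on x86, and the final dword at offset 0 may re-cover up to 3 bytes of the last loop chunk when size is not a multiple of 4, which is harmless because 'or' is idempotent. fpc_varset_mul_sets is the same model with 'and' in place of 'or'.

{$pointermath on}
procedure VarSetAddRef(const set1, set2; var dest; size: PtrInt);
{ Hypothetical reference model of fpc_varset_add_sets, not part of the RTL. }
var
  i: PtrInt;
begin
  i := size - 4;
  if i < 0 then
  begin
    { Bytewise fallback for sets smaller than one dword. }
    for i := 0 to size - 1 do
      PByte(@dest)[i] := PByte(@set1)[i] or PByte(@set2)[i];
    exit;
  end;
  repeat
    { Dword chunks, walking down from the top of the set. }
    PCardinal(PByte(@dest) + i)^ :=
      PCardinal(PByte(@set1) + i)^ or PCardinal(PByte(@set2) + i)^;
    dec(i, 4);
  until i <= 0;
  { Final, possibly overlapping, dword at offset 0. }
  PCardinal(@dest)^ := PCardinal(@set1)^ or PCardinal(@set2)^;
end;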
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    and    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    and    (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    and    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+    mov    (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    not    %ebx         { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    and    (%eax), %ebx
+    push   %ebx
+.L4x_Loop:
+    mov    (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
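Editor's note: the precalculated tail is the main subtlety in this routine and the next. The dword at offset 0 can overlap the last chunk written by .L4x_Loop whenever size is not a multiple of 4, and dest is allowed to alias set1 or set2. 'or' and 'and' are idempotent, so recomputing overlapping bytes from an already-updated dest still gives the right answer; 'and not' and 'xor' are not. Concretely: with dest = set1 and size = 6, the loop writes dest[2..5], clobbering set1[2..3], so a tail computed afterwards from set1[0..3] would use the wrong inputs. In sketch form (hypothetical Pascal mirroring the asm):

  tail := PCardinal(@set1)^ and not PCardinal(@set2)^;  { read before any store }
  { ... dword loop as in fpc_varset_add_sets, with 'and not' ... }
  PCardinal(@dest)^ := tail;                            { write precalculated tail last }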
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.
+  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    xor    (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    push   %ebx
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    xor    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    xor    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = size }
+asm
+    push   %ebx
+    sub    $4, %ecx
+    jl     .LBytewise_Prepare { probably dead branch... }
+    add    %ecx, %eax
+    add    %ecx, %edx
+    neg    %ecx { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on size >= 0, leaving up to 4 tail bytes. }
+
+.L4x_Loop:
+    mov    (%edx,%ecx), %ebx
+    not    %ebx
+    test   %ebx, (%eax,%ecx)
+    jnz    .LNo
+    add    $4, %ecx
+    js     .L4x_Loop
+
+    mov    (%edx), %ebx { Tail. }
+    not    %ebx
+    mov    %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
+    xor    %eax, %eax
+    test   %ebx, (%ecx)
+    setz   %al
+    pop    %ebx
+    ret
+
+.LNo:
+    xor    %eax, %eax
+    pop    %ebx
+    ret
+
+.LBytewise_Prepare:
+    add    $4, %ecx
+    neg    %ecx
+    sub    %ecx, %eax
+    sub    %ecx, %edx
+.LBytewise_Loop:
+    movzbl (%edx,%ecx), %ebx
+    not    %ebx
+    test   %bl, (%eax,%ecx)
+    jnz    .LNo
+    inc    %ecx
+    jnz    .LBytewise_Loop
+    mov    $1, %eax
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
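Editor's note: the predicate computed above is "set1 and not set2 is empty", i.e. every element of set1 is also in set2 (the set1 <= set2 test). A byte-level reference, for illustration only (VarSetContainsRef is a hypothetical name):

{$pointermath on}
function VarSetContainsRef(const set1, set2; size: PtrInt): Boolean;
{ Hypothetical reference model of fpc_varset_contains_sets, not part of the RTL. }
var
  i: PtrInt;
begin
  for i := 0 to size - 1 do
    if (PByte(@set1)[i] and not PByte(@set2)[i]) <> 0 then
      exit(False);
  Result := True;
end;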
 { the following code is exactly big endian set-related, but specific to the old
   scheme whereby sets were either 4 or 32 bytes. I've left the routines here
   so if someone wants to, they can create equivalents of the new varset helpers

rtl/x86_64/set.inc (+219, -0)

@@ -13,3 +13,222 @@
 
  **********************************************************************}
 
+{$asmmode intel}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    or     al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
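Editor's note: the x86-64 variants are the i386 ones widened from 4-byte to 16-byte chunks. movdqu tolerates unaligned addresses, and the unconditional 16-byte operation at offset 0 may re-cover up to 15 bytes of the last loop chunk when size is not a multiple of 16; as before this is safe for 'or' and 'and' because they are idempotent, and the guard at the top guarantees size >= 16 on this path. For example, with size = 40 the loop handles offsets 24 and 8, and the tail at offset 0 re-processes bytes 8..15 of the chunk written at offset 8.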
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    and    al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+    movdqu xmm1, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm2, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pandn  xmm2, xmm1
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    and    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
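Editor's note: pandn computes dest := (not dest) and src, so the operand order is deliberately inverted relative to the i386 code: set2 goes into the destination register and set1 into the source, which yields set1 and not set2. The tail is again computed into xmm2 before the loop, for the same aliasing reason as on i386.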
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.
+
+  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+    movdqu xmm2, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pxor   xmm2, xmm1
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pxor   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    xor    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = size
+  Linux:   rdi = set1, rsi = set2, rdx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+{$if false}
+{ Scans 16 bytes at a time left to right with early exits.
+  Would be better for sufficiently large sets (perhaps 64 bytes or more), if such sets existed, but worse for the 32-byte sets that actually occur.
+  Kept for the future. }
+    pxor   xmm2, xmm2 { xmm2 = 0 }
+    add    set1, size
+    add    set2, size
+    neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
+                  Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    pcmpeqb xmm0, xmm2
+    pmovmskb eax, xmm0
+    inc    ax
+    jnz    @No
+    add    size, 16
+    js     @16x_Loop
+
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm0, xmmword ptr [set2]
+    pandn  xmm0, xmm1
+{$else}
+{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm2, xmmword ptr [set2]
+    pandn  xmm2, xmm1
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    por    xmm2, xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    pxor   xmm0, xmm0
+{$endif}
+    pcmpeqb xmm0, xmm2
+    pmovmskb ecx, xmm0
+    xor    eax, eax
+    inc    cx
+    setz   al
+    ret
+
+@No:
+    xor    eax, eax
+    ret
+
+@Bytewise_Prepare:
+    add    size, 16
+    neg    size
+    sub    set1, size
+    sub    set2, size
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    test   byte ptr [set1 + size], al
+    jnz    @No
+    inc    size
+    jnz    @Bytewise_Loop
+    mov    eax, $1
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
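Editor's note on two idioms above: after pcmpeqb against an all-zero register, each byte of the "set1 and not set2" chunk that was zero becomes $FF and each non-zero byte becomes $00; pmovmskb then collects the byte sign bits into the low 16 bits of a general register, so the mask is $FFFF exactly when the whole chunk was zero. 'inc ax' / 'inc cx' wraps $FFFF to 0 and sets ZF precisely in that case, which the disabled early-exit variant consumes with 'jnz @No' and the live variant with 'setz al'. The '{$if false}' branch scans left to right and can bail out on the first offending chunk, which would pay off for much larger sets; the enabled code instead ORs all chunks together and tests once, which is cheaper for the 32-byte sets that occur in practice.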
+{$asmmode att}