Browse Source

Specialized fpc_varset_OP_sets for i386 and x86-64.

Rika Ichinose 2 years ago
parent
commit
2dca69f2ac
2 changed files with 430 additions and 0 deletions
  1. 211 0
      rtl/i386/set.inc
  2. 219 0
      rtl/x86_64/set.inc

+ 211 - 0
rtl/i386/set.inc

@@ -13,6 +13,217 @@
 
 
  **********************************************************************}
 
 
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{ Set union: dest[i] := set1[i] or set2[i] for each of the 'size' bytes.
+  dest may alias set1/set2; since 'or' is idempotent, the overlapping 4-byte
+  tail recomputation below is harmless.
+  i386 register convention: first three args in eax/edx/ecx, size on the stack
+  (popped by the callee via 'ret $4').  ebx/esi are callee-saved, hence the
+  push/pop pairs. }
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = size - 4 = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+{ Dword loop, walking offsets from size - 4 down towards (not including) 0. }
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    or     (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    or     (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0. }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 (it was size - 4) }
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    or     (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 (CF set) }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{ Set intersection: dest[i] := set1[i] and set2[i] for each of the 'size'
+  bytes.  dest may alias set1/set2; 'and' is idempotent, so the overlapping
+  4-byte tail recomputation is harmless.
+  i386 register convention: eax = set1, edx = set2, ecx = dest, size on the
+  stack (popped via 'ret $4').  ebx/esi are callee-saved. }
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+{ Dword loop, walking offsets from size - 4 down towards (not including) 0. }
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    and    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    and    (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0. }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 }
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    and    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{ Set difference: dest[i] := set1[i] and not set2[i] for each of the 'size'
+  bytes.  Unlike add/mul this operation is NOT idempotent, so the 4-byte tail
+  at offset 0 is computed BEFORE the loop (which may overwrite it when dest
+  aliases set1/set2) and stored afterwards.
+  i386 register convention: eax = set1, edx = set2, ecx = dest, size on the
+  stack (popped via 'ret $4').  ebx/esi are callee-saved. }
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+    mov    (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    not    %ebx         { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    and    (%eax), %ebx
+    push   %ebx         { park the precomputed tail on the stack }
+.L4x_Loop:
+    mov    (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %ebx { ebx = set1 and not set2 for this dword }
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0 (no aliasing issue:
+  each byte is read and written exactly once). }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 }
+.LBytewise_Loop:
+    movzbl (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{ Symmetric difference: dest[i] := set1[i] xor set2[i] for each of the 'size'
+  bytes.  'xor' is not idempotent, so (as in fpc_varset_sub_sets) the 4-byte
+  tail at offset 0 is computed before the loop and stored after it, making the
+  routine safe when dest aliases set1/set2.
+  i386 register convention: eax = set1, edx = set2, ecx = dest, size on the
+  stack (popped via 'ret $4').  ebx/esi are callee-saved. }
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
+  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    xor    (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    push   %ebx         { park the precomputed tail on the stack }
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    xor    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0. }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 }
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    xor    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{ Subset test: returns True iff every bit of set1 is also present in set2,
+  i.e. (set1 and not set2) = 0 over all 'size' bytes.  The loops bail out
+  early via .LNo on the first dword/byte of set1 with a bit outside set2.
+  All three args fit in registers, so the epilogue needs no 'ret $N'.
+  Result: boolean in al.  ebx is callee-saved. }
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = size }
+asm
+    push   %ebx
+    sub    $4, %ecx
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+    add    %ecx, %eax
+    add    %ecx, %edx
+    neg    %ecx { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on size >= 0, leaving up to 4 tail bytes. }
+
+.L4x_Loop:
+    mov    (%edx,%ecx), %ebx
+    not    %ebx
+    test   %ebx, (%eax,%ecx) { any bit of set1 outside set2 in this dword? }
+    jnz    .LNo
+    add    $4, %ecx
+    js     .L4x_Loop      { keep going while the (negative) offset has not reached 0 }
+
+    mov    (%edx), %ebx { Tail. }
+    not    %ebx
+    mov    %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
+    xor    %eax, %eax
+    test   %ebx, (%ecx)
+    setz   %al            { result = (last dword of set1 and not set2) = 0 }
+    pop    %ebx
+    ret
+
+.LNo:
+    xor    %eax, %eax     { found a bit of set1 missing from set2 -> False }
+    pop    %ebx
+    ret
+
+{ size < 4: byte loop using a negative offset from set + size. }
+.LBytewise_Prepare:
+    add    $4, %ecx       { restore ecx = size }
+    neg    %ecx           { ecx = -size }
+    sub    %ecx, %eax     { eax = set1 + size }
+    sub    %ecx, %edx     { edx = set2 + size }
+.LBytewise_Loop:
+    movzbl (%edx,%ecx), %ebx
+    not    %ebx
+    test   %bl, (%eax,%ecx)
+    jnz    .LNo
+    inc    %ecx
+    jnz    .LBytewise_Loop { offset climbs from -size up to 0 }
+    mov    $1, %eax       { all bytes checked -> True }
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
 { the following code is exactly big endian set-related, but specific to the old
   scheme whereby sets were either 4 or 32 bytes. I've left the routines here
   so if someone wants to, they can create equivalents of the new varset helpers

+ 219 - 0
rtl/x86_64/set.inc

@@ -13,3 +13,222 @@
 
 
  **********************************************************************}
 
 
+{$asmmode intel}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{ Set union: dest[i] := set1[i] or set2[i] for each of the 'size' bytes,
+  16 bytes at a time with SSE2.  dest may alias set1/set2; 'por' is
+  idempotent, so the overlapping 16-byte tail recomputation is harmless.
+  'set1'/'set2'/'dest'/'size' are symbolic parameter names the compiler maps
+  to the ABI registers listed below; eax and xmm0/xmm1 are volatile scratch
+  in both ABIs, so nothing needs saving. }
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+{ 16-byte chunks from the end down towards (not including) offset 0. }
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0. }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 (it was size - 16) }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    or     al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{ Set intersection: dest[i] := set1[i] and set2[i] for each of the 'size'
+  bytes, 16 at a time with SSE2.  'pand' is idempotent, so the overlapping
+  16-byte tail recomputation is safe even when dest aliases set1/set2.
+  Same symbolic-register scheme as fpc_varset_add_sets. }
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+{ 16-byte chunks from the end down towards (not including) offset 0. }
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0. }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    and    al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{ Set difference: dest[i] := set1[i] and not set2[i] for each of the 'size'
+  bytes, via SSE2 'pandn' (dst := not dst and src).  Not idempotent, so the
+  16-byte tail at offset 0 is computed into xmm2 BEFORE the loop (which may
+  overwrite those bytes when dest aliases set1/set2) and stored afterwards.
+  Same symbolic-register scheme as fpc_varset_add_sets. }
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+    movdqu xmm1, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm2, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pandn  xmm2, xmm1     { xmm2 := set1 and not set2 (first 16 bytes) }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1     { xmm0 := set1 and not set2 for this chunk }
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0 (each byte is read and
+  written exactly once, so aliasing is not an issue here). }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    and    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{ Symmetric difference: dest[i] := set1[i] xor set2[i] for each of the 'size'
+  bytes, via SSE2 'pxor'.  'xor' is not idempotent, so the 16-byte tail at
+  offset 0 is precomputed into xmm2 before the loop and stored after it,
+  keeping the routine correct when dest aliases set1/set2.
+  Same symbolic-register scheme as fpc_varset_add_sets. }
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
+
+  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+    movdqu xmm2, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pxor   xmm2, xmm1     { xmm2 := set1 xor set2 (first 16 bytes) }
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pxor   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0. }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    xor    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{ Subset test: returns True iff every bit of set1 is also present in set2,
+  i.e. (set1 and not set2) = 0 over all 'size' bytes.  Result: boolean in al.
+  The active SSE2 path OR-folds all 16-byte 'set1 and not set2' chunks into
+  xmm2 (seeded with the tail chunk at offset 0, which also handles the
+  unaligned remainder) and tests the accumulated value for all-zero once at
+  the end via pcmpeqb + pmovmskb (mask = $FFFF iff every byte was zero). }
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = size
+  Linux:   rdi = set1, rsi = set2, rdx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+{$if false}
+{ Scans 16 bytes at a time left to right with early exits.
+  Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
+  Kept for the future. }
+    pxor   xmm2, xmm2 { xmm2 = 0 }
+    add    set1, size
+    add    set2, size
+    neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
+                  Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1     { xmm0 := set1 and not set2 for this chunk }
+    pcmpeqb xmm0, xmm2    { per-byte compare against zero }
+    pmovmskb eax, xmm0
+    inc    ax             { ax was $FFFF iff all 16 bytes were zero }
+    jnz    @No
+    add    size, 16
+    js     @16x_Loop
+
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm0, xmmword ptr [set2]
+    pandn  xmm0, xmm1
+{$else}
+{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm2, xmmword ptr [set2]
+    pandn  xmm2, xmm1     { seed accumulator with the offset-0 (tail) chunk }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    por    xmm2, xmm0     { accumulate stray bits of set1 outside set2 }
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    pxor   xmm0, xmm0
+{$endif}
+    pcmpeqb xmm0, xmm2    { per-byte compare of the accumulator against zero }
+    pmovmskb ecx, xmm0
+    xor    eax, eax
+    inc    cx             { cx was $FFFF iff the accumulator was all zero }
+    setz   al             { True iff no bit of set1 lies outside set2 }
+    ret
+
+@No:
+    xor    eax, eax       { found a bit of set1 missing from set2 -> False }
+    ret
+
+{ size < 16: byte loop using a negative offset from set + size. }
+@Bytewise_Prepare:
+    add    size, 16       { restore the original size }
+    neg    size           { size := -size }
+    sub    set1, size     { set1 := set1 + orig.size }
+    sub    set2, size     { set2 := set2 + orig.size }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    test   byte ptr [set1 + size], al
+    jnz    @No
+    inc    size
+    jnz    @Bytewise_Loop { offset climbs from -size up to 0 }
+    mov    eax, $1        { all bytes checked -> True }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
+{$asmmode att}