
Specialized fpc_varset_OP_sets for i386 and x86-64.

Rika Ichinose, 2 years ago
commit 2dca69f2ac
2 changed files with 430 additions and 0 deletions:
  1. rtl/i386/set.inc   (+211, -0)
  2. rtl/x86_64/set.inc (+219, -0)
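
For orientation (editor's annotation, not part of the commit): these compilerprocs back the operators the compiler emits for large "variable" sets, where the set no longer fits the inline register-sized code path. The operator-to-helper mapping below is inferred from the helper names and bodies; the program itself is a hypothetical illustration:

program SetOpsDemo;  { hypothetical demo, not part of the RTL }
type
  TBig = set of 0..255;  { a 32-byte varset }
var
  a, b, c: TBig;
  ok: Boolean;
begin
  a := [0..15];
  b := [8..23];
  c := a + b;    { union                -> fpc_varset_add_sets    (or)      }
  c := a * b;    { intersection         -> fpc_varset_mul_sets    (and)     }
  c := a - b;    { difference           -> fpc_varset_sub_sets    (and not) }
  c := a >< b;   { symmetric difference -> fpc_varset_symdif_sets (xor)     }
  ok := a <= b;  { inclusion            -> fpc_varset_contains_sets         }
end.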

rtl/i386/set.inc (+211, -0)

@@ -13,6 +13,217 @@
 
  **********************************************************************}
 
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    or     (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    or     (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    or     (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
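Editor's note, not part of the commit: in Pascal terms the routine above behaves roughly like the hypothetical reference model below (the name VarSetAddRef is made up). Unaligned dword loads and stores are fine on x86, and the final dword at offset 0 may re-cover up to 3 bytes of the last loop chunk when size is not a multiple of 4, which is harmless because 'or' is idempotent. fpc_varset_mul_sets is the same model with 'and' in place of 'or'.

{$pointermath on}
procedure VarSetAddRef(const set1, set2; var dest; size: PtrInt);
{ Hypothetical reference model of fpc_varset_add_sets, not part of the RTL. }
var
  i: PtrInt;
begin
  i := size - 4;
  if i < 0 then
  begin
    { Bytewise fallback for sets smaller than one dword. }
    for i := 0 to size - 1 do
      PByte(@dest)[i] := PByte(@set1)[i] or PByte(@set2)[i];
    exit;
  end;
  repeat
    { Dword chunks, walking down from the top of the set. }
    PCardinal(PByte(@dest) + i)^ :=
      PCardinal(PByte(@set1) + i)^ or PCardinal(PByte(@set2) + i)^;
    dec(i, 4);
  until i <= 0;
  { Final, possibly overlapping, dword at offset 0. }
  PCardinal(@dest)^ := PCardinal(@set1)^ or PCardinal(@set2)^;
end;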
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    and    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    and    (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    and    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+    mov    (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    not    %ebx         { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    and    (%eax), %ebx
+    push   %ebx
+.L4x_Loop:
+    mov    (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
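Editor's note: the precalculated tail is the main subtlety in this routine and the next. The dword at offset 0 can overlap the last chunk written by .L4x_Loop whenever size is not a multiple of 4, and dest is allowed to alias set1 or set2. 'or' and 'and' are idempotent, so recomputing overlapping bytes from an already-updated dest still gives the right answer; 'and not' and 'xor' are not. Concretely: with dest = set1 and size = 6, the loop writes dest[2..5], clobbering set1[2..3], so a tail computed afterwards from set1[0..3] would use the wrong inputs. In sketch form (hypothetical Pascal mirroring the asm):

  tail := PCardinal(@set1)^ and not PCardinal(@set2)^;  { read before any store }
  { ... dword loop as in fpc_varset_add_sets, with 'and not' ... }
  PCardinal(@dest)^ := tail;                            { write precalculated tail last }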
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.
+  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size }
+    sub    $4, %esi
+    jl     .LBytewise_Prepare { probably dead branch... }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    xor    (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    push   %ebx
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    xor    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4
+
+.LBytewise_Prepare:
+    add    $3, %esi
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    xor    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = size }
+asm
+    push   %ebx
+    sub    $4, %ecx
+    jl     .LBytewise_Prepare { probably dead branch... }
+    add    %ecx, %eax
+    add    %ecx, %edx
+    neg    %ecx { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on size >= 0, leaving up to 4 tail bytes. }
+
+.L4x_Loop:
+    mov    (%edx,%ecx), %ebx
+    not    %ebx
+    test   %ebx, (%eax,%ecx)
+    jnz    .LNo
+    add    $4, %ecx
+    js     .L4x_Loop
+
+    mov    (%edx), %ebx { Tail. }
+    not    %ebx
+    mov    %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
+    xor    %eax, %eax
+    test   %ebx, (%ecx)
+    setz   %al
+    pop    %ebx
+    ret
+
+.LNo:
+    xor    %eax, %eax
+    pop    %ebx
+    ret
+
+.LBytewise_Prepare:
+    add    $4, %ecx
+    neg    %ecx
+    sub    %ecx, %eax
+    sub    %ecx, %edx
+.LBytewise_Loop:
+    movzbl (%edx,%ecx), %ebx
+    not    %ebx
+    test   %bl, (%eax,%ecx)
+    jnz    .LNo
+    inc    %ecx
+    jnz    .LBytewise_Loop
+    mov    $1, %eax
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
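Editor's note: the predicate computed above is "set1 and not set2 is empty", i.e. every element of set1 is also in set2 (the set1 <= set2 test). A byte-level reference, for illustration only (VarSetContainsRef is a hypothetical name):

{$pointermath on}
function VarSetContainsRef(const set1, set2; size: PtrInt): Boolean;
{ Hypothetical reference model of fpc_varset_contains_sets, not part of the RTL. }
var
  i: PtrInt;
begin
  for i := 0 to size - 1 do
    if (PByte(@set1)[i] and not PByte(@set2)[i]) <> 0 then
      exit(False);
  Result := True;
end;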
 { the following code is exactly big endian set-related, but specific to the old
   scheme whereby sets were either 4 or 32 bytes. I've left the routines here
   so if someone wants to, they can create equivalents of the new varset helpers

rtl/x86_64/set.inc (+219, -0)

@@ -13,3 +13,222 @@
 
  **********************************************************************}
 
+{$asmmode intel}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    or     al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
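Editor's note: the x86-64 variants are the i386 ones widened from 4-byte to 16-byte chunks. movdqu tolerates unaligned addresses, and the unconditional 16-byte operation at offset 0 may re-cover up to 15 bytes of the last loop chunk when size is not a multiple of 16; as before this is safe for 'or' and 'and' because they are idempotent, and the guard at the top guarantees size >= 16 on this path. For example, with size = 40 the loop handles offsets 24 and 8, and the tail at offset 0 re-processes bytes 8..15 of the chunk written at offset 8.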
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    and    al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+    movdqu xmm1, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm2, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pandn  xmm2, xmm1
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    and    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
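Editor's note: pandn computes dest := (not dest) and src, so the operand order is deliberately inverted relative to the i386 code: set2 goes into the destination register and set1 into the source, which yields set1 and not set2. The tail is again computed into xmm2 before the loop, for the same aliasing reason as on i386.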
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.
+
+  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+    movdqu xmm2, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pxor   xmm2, xmm1
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pxor   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+@Bytewise_Prepare:
+    add    size, 15
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    xor    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = size
+  Linux:   rdi = set1, rsi = set2, rdx = size }
+asm
+    sub    size, 16
+    jl     @Bytewise_Prepare { probably dead branch... }
+
+{$if false}
+{ Scans 16 bytes at a time left to right with early exits.
+  Would be better for sufficiently large sets (perhaps 64 bytes or more), if such sets existed, but worse for the 32-byte sets that actually occur.
+  Kept for the future. }
+    pxor   xmm2, xmm2 { xmm2 = 0 }
+    add    set1, size
+    add    set2, size
+    neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
+                  Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    pcmpeqb xmm0, xmm2
+    pmovmskb eax, xmm0
+    inc    ax
+    jnz    @No
+    add    size, 16
+    js     @16x_Loop
+
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm0, xmmword ptr [set2]
+    pandn  xmm0, xmm1
+{$else}
+{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm2, xmmword ptr [set2]
+    pandn  xmm2, xmm1
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    por    xmm2, xmm0
+    sub    size, 16
+    ja     @16x_Loop
+
+    pxor   xmm0, xmm0
+{$endif}
+    pcmpeqb xmm0, xmm2
+    pmovmskb ecx, xmm0
+    xor    eax, eax
+    inc    cx
+    setz   al
+    ret
+
+@No:
+    xor    eax, eax
+    ret
+
+@Bytewise_Prepare:
+    add    size, 16
+    neg    size
+    sub    set1, size
+    sub    set2, size
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    test   byte ptr [set1 + size], al
+    jnz    @No
+    inc    size
+    jnz    @Bytewise_Loop
+    mov    eax, $1
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
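Editor's note on two idioms above: after pcmpeqb against an all-zero register, each byte of the "set1 and not set2" chunk that was zero becomes $FF and each non-zero byte becomes $00; pmovmskb then collects the byte sign bits into the low 16 bits of a general register, so the mask is $FFFF exactly when the whole chunk was zero. 'inc ax' / 'inc cx' wraps $FFFF to 0 and sets ZF precisely in that case, which the disabled early-exit variant consumes with 'jnz @No' and the live variant with 'setz al'. The '{$if false}' branch scans left to right and can bail out on the first offending chunk, which would pay off for much larger sets; the enabled code instead ORs all chunks together and tests once, which is cheaper for the 32-byte sets that occur in practice.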
+{$asmmode att}