Browse Source

Specialized fpc_varset_OP_sets for i386 and x86-64.

Rika Ichinose 2 years ago
parent
commit
2dca69f2ac
2 changed files with 430 additions and 0 deletions
  1. 211 0
      rtl/i386/set.inc
  2. 219 0
      rtl/x86_64/set.inc

+ 211 - 0
rtl/i386/set.inc

@@ -13,6 +13,217 @@
 
 
  **********************************************************************}
 
 
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{ Set union: dest[i] := set1[i] or set2[i] for each of the 'size' bytes.
+  dest may alias set1/set2; since 'or' is idempotent, the overlapping 4-byte
+  tail recomputation below is harmless.
+  i386 register convention: first three args in eax/edx/ecx, size on the stack
+  (popped by the callee via 'ret $4').  ebx/esi are callee-saved, hence the
+  push/pop pairs. }
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = size - 4 = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+{ Dword loop, walking offsets from size - 4 down towards (not including) 0. }
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    or     (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    or     (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0. }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 (it was size - 4) }
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    or     (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 (CF set) }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{ Set intersection: dest[i] := set1[i] and set2[i] for each of the 'size'
+  bytes.  dest may alias set1/set2; 'and' is idempotent, so the overlapping
+  4-byte tail recomputation is harmless.
+  i386 register convention: eax = set1, edx = set2, ecx = dest, size on the
+  stack (popped via 'ret $4').  ebx/esi are callee-saved. }
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+{ Dword loop, walking offsets from size - 4 down towards (not including) 0. }
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    and    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    and    (%edx), %ebx
+    mov    %ebx, (%ecx)
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0. }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 }
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    and    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{ Set difference: dest[i] := set1[i] and not set2[i] for each of the 'size'
+  bytes.  Unlike add/mul this operation is NOT idempotent, so the 4-byte tail
+  at offset 0 is computed BEFORE the loop (which may overwrite it when dest
+  aliases set1/set2) and stored afterwards.
+  i386 register convention: eax = set1, edx = set2, ecx = dest, size on the
+  stack (popped via 'ret $4').  ebx/esi are callee-saved. }
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+    mov    (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    not    %ebx         { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    and    (%eax), %ebx
+    push   %ebx         { park the precomputed tail on the stack }
+.L4x_Loop:
+    mov    (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %ebx { ebx = set1 and not set2 for this dword }
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0 (no aliasing issue:
+  each byte is read and written exactly once). }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 }
+.LBytewise_Loop:
+    movzbl (%edx,%esi), %ebx
+    not    %ebx
+    and    (%eax,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{ Symmetric difference: dest[i] := set1[i] xor set2[i] for each of the 'size'
+  bytes.  'xor' is not idempotent, so (as in fpc_varset_sub_sets) the 4-byte
+  tail at offset 0 is computed before the loop and stored after it, making the
+  routine safe when dest aliases set1/set2.
+  i386 register convention: eax = set1, edx = set2, ecx = dest, size on the
+  stack (popped via 'ret $4').  ebx/esi are callee-saved. }
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
+  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    push   %esi
+    mov    12(%esp), %esi { esi = size; [esp + 4] shifted by the two pushes above }
+    sub    $4, %esi       { esi = offset of the last full dword }
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+
+    mov    (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
+    xor    (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    push   %ebx         { park the precomputed tail on the stack }
+.L4x_Loop:
+    mov    (%eax,%esi), %ebx
+    xor    (%edx,%esi), %ebx
+    mov    %ebx, (%ecx,%esi)
+    sub    $4, %esi
+    ja     .L4x_Loop      { unsigned: exit once esi hits 0 or borrows below it }
+
+    pop    %ebx
+    mov    %ebx, (%ecx) { Write precalculated tail. }
+    pop    %esi
+    pop    %ebx
+    ret    $4             { pop the stack-passed 'size' argument }
+
+{ size < 4: byte loop from offset size - 1 down to 0. }
+.LBytewise_Prepare:
+    add    $3, %esi       { esi = size - 1 }
+.LBytewise_Loop:
+    movzbl (%eax,%esi), %ebx
+    xor    (%edx,%esi), %bl
+    mov    %bl, (%ecx,%esi)
+    sub    $1, %esi
+    jae    .LBytewise_Loop { continue until esi wraps below 0 }
+    pop    %esi
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{ Subset test: returns True iff every bit of set1 is also present in set2,
+  i.e. (set1 and not set2) = 0 over all 'size' bytes.  The loops bail out
+  early via .LNo on the first dword/byte of set1 with a bit outside set2.
+  All three args fit in registers, so the epilogue needs no 'ret $N'.
+  Result: boolean in al.  ebx is callee-saved. }
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = size }
+asm
+    push   %ebx
+    sub    $4, %ecx
+    jl     .LBytewise_Prepare { size < 4; probably dead branch... }
+    add    %ecx, %eax
+    add    %ecx, %edx
+    neg    %ecx { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on size >= 0, leaving up to 4 tail bytes. }
+
+.L4x_Loop:
+    mov    (%edx,%ecx), %ebx
+    not    %ebx
+    test   %ebx, (%eax,%ecx) { any bit of set1 outside set2 in this dword? }
+    jnz    .LNo
+    add    $4, %ecx
+    js     .L4x_Loop      { keep going while the (negative) offset has not reached 0 }
+
+    mov    (%edx), %ebx { Tail. }
+    not    %ebx
+    mov    %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
+    xor    %eax, %eax
+    test   %ebx, (%ecx)
+    setz   %al            { result = (last dword of set1 and not set2) = 0 }
+    pop    %ebx
+    ret
+
+.LNo:
+    xor    %eax, %eax     { found a bit of set1 missing from set2 -> False }
+    pop    %ebx
+    ret
+
+{ size < 4: byte loop using a negative offset from set + size. }
+.LBytewise_Prepare:
+    add    $4, %ecx       { restore ecx = size }
+    neg    %ecx           { ecx = -size }
+    sub    %ecx, %eax     { eax = set1 + size }
+    sub    %ecx, %edx     { edx = set2 + size }
+.LBytewise_Loop:
+    movzbl (%edx,%ecx), %ebx
+    not    %ebx
+    test   %bl, (%eax,%ecx)
+    jnz    .LNo
+    inc    %ecx
+    jnz    .LBytewise_Loop { offset climbs from -size up to 0 }
+    mov    $1, %eax       { all bytes checked -> True }
+    pop    %ebx
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
 { the following code is exactly big endian set-related, but specific to the old
   scheme whereby sets were either 4 or 32 bytes. I've left the routines here
   so if someone wants to, they can create equivalents of the new varset helpers

+ 219 - 0
rtl/x86_64/set.inc

@@ -13,3 +13,222 @@
 
 
  **********************************************************************}
 
 
+{$asmmode intel}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{ Set union: dest[i] := set1[i] or set2[i] for each of the 'size' bytes,
+  16 bytes at a time with SSE2.  dest may alias set1/set2; 'por' is
+  idempotent, so the overlapping 16-byte tail recomputation is harmless.
+  'set1'/'set2'/'dest'/'size' are symbolic parameter names the compiler maps
+  to the ABI registers listed below; eax and xmm0/xmm1 are volatile scratch
+  in both ABIs, so nothing needs saving. }
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+{ 16-byte chunks from the end down towards (not including) offset 0. }
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    por    xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0. }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 (it was size - 16) }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    or     al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{ Set intersection: dest[i] := set1[i] and set2[i] for each of the 'size'
+  bytes, 16 at a time with SSE2.  'pand' is idempotent, so the overlapping
+  16-byte tail recomputation is safe even when dest aliases set1/set2.
+  Same symbolic-register scheme as fpc_varset_add_sets. }
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+{ 16-byte chunks from the end down towards (not including) offset 0. }
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2]
+    pand   xmm0, xmm1
+    movdqu xmmword ptr [dest], xmm0
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0. }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set1 + size]
+    and    al, byte ptr [set2 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{ Set difference: dest[i] := set1[i] and not set2[i] for each of the 'size'
+  bytes, via SSE2 'pandn' (dst := not dst and src).  Not idempotent, so the
+  16-byte tail at offset 0 is computed into xmm2 BEFORE the loop (which may
+  overwrite those bytes when dest aliases set1/set2) and stored afterwards.
+  Same symbolic-register scheme as fpc_varset_add_sets. }
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+    movdqu xmm1, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm2, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pandn  xmm2, xmm1     { xmm2 := set1 and not set2 (first 16 bytes) }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1     { xmm0 := set1 and not set2 for this chunk }
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0 (each byte is read and
+  written exactly once, so aliasing is not an issue here). }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    and    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{ Symmetric difference: dest[i] := set1[i] xor set2[i] for each of the 'size'
+  bytes, via SSE2 'pxor'.  'xor' is not idempotent, so the 16-byte tail at
+  offset 0 is precomputed into xmm2 before the loop and stored after it,
+  keeping the routine correct when dest aliases set1/set2.
+  Same symbolic-register scheme as fpc_varset_add_sets. }
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
+
+  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
+  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+    movdqu xmm2, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movdqu xmm1, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    pxor   xmm2, xmm1     { xmm2 := set1 xor set2 (first 16 bytes) }
+
+@16x_Loop:
+    movdqu xmm0, xmmword ptr [set1 + size]
+    movdqu xmm1, xmmword ptr [set2 + size]
+    pxor   xmm0, xmm1
+    movdqu xmmword ptr [dest + size], xmm0
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
+    ret
+
+{ size < 16: byte loop from offset size - 1 down to 0. }
+@Bytewise_Prepare:
+    add    size, 15       { size := size - 1 }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    xor    al, byte ptr [set1 + size]
+    mov    byte ptr [dest + size], al
+    sub    size, 1
+    jae    @Bytewise_Loop { continue until the offset wraps below 0 }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
+
+{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{ Subset test: returns True iff every bit of set1 is also present in set2,
+  i.e. (set1 and not set2) = 0 over all 'size' bytes.  Result: boolean in al.
+  The active SSE2 path OR-folds all 16-byte 'set1 and not set2' chunks into
+  xmm2 (seeded with the tail chunk at offset 0, which also handles the
+  unaligned remainder) and tests the accumulated value for all-zero once at
+  the end via pcmpeqb + pmovmskb (mask = $FFFF iff every byte was zero). }
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+{ Windows: rcx = set1, rdx = set2, r8 = size
+  Linux:   rdi = set1, rsi = set2, rdx = size }
+asm
+    sub    size, 16       { size := offset of the last full 16-byte chunk }
+    jl     @Bytewise_Prepare { size < 16; probably dead branch... }
+
+{$if false}
+{ Scans 16 bytes at a time left to right with early exits.
+  Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
+  Kept for the future. }
+    pxor   xmm2, xmm2 { xmm2 = 0 }
+    add    set1, size
+    add    set2, size
+    neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
+                  Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1     { xmm0 := set1 and not set2 for this chunk }
+    pcmpeqb xmm0, xmm2    { per-byte compare against zero }
+    pmovmskb eax, xmm0
+    inc    ax             { ax was $FFFF iff all 16 bytes were zero }
+    jnz    @No
+    add    size, 16
+    js     @16x_Loop
+
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm0, xmmword ptr [set2]
+    pandn  xmm0, xmm1
+{$else}
+{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
+    movdqu xmm1, xmmword ptr [set1]
+    movdqu xmm2, xmmword ptr [set2]
+    pandn  xmm2, xmm1     { seed accumulator with the offset-0 (tail) chunk }
+
+@16x_Loop:
+    movdqu xmm1, xmmword ptr [set1 + size]
+    movdqu xmm0, xmmword ptr [set2 + size]
+    pandn  xmm0, xmm1
+    por    xmm2, xmm0     { accumulate stray bits of set1 outside set2 }
+    sub    size, 16
+    ja     @16x_Loop      { unsigned: exit once the offset hits 0 or borrows }
+
+    pxor   xmm0, xmm0
+{$endif}
+    pcmpeqb xmm0, xmm2    { per-byte compare of the accumulator against zero }
+    pmovmskb ecx, xmm0
+    xor    eax, eax
+    inc    cx             { cx was $FFFF iff the accumulator was all zero }
+    setz   al             { True iff no bit of set1 lies outside set2 }
+    ret
+
+@No:
+    xor    eax, eax       { found a bit of set1 missing from set2 -> False }
+    ret
+
+{ size < 16: byte loop using a negative offset from set + size. }
+@Bytewise_Prepare:
+    add    size, 16       { restore the original size }
+    neg    size           { size := -size }
+    sub    set1, size     { set1 := set1 + orig.size }
+    sub    set2, size     { set2 := set2 + orig.size }
+@Bytewise_Loop:
+    movzx  eax, byte ptr [set2 + size]
+    not    eax
+    test   byte ptr [set1 + size], al
+    jnz    @No
+    inc    size
+    jnz    @Bytewise_Loop { offset climbs from -size up to 0 }
+    mov    eax, $1        { all bytes checked -> True }
+end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+
+{$asmmode att}