Browse Source

SSE set operations (i386).

Rika Ichinose 1 year ago
parent
commit
aed4292017
2 changed files with 267 additions and 283 deletions
  1. 266 281
      rtl/i386/set.inc
  2. 1 2
      rtl/x86_64/set.inc

+ 266 - 281
rtl/i386/set.inc

@@ -14,10 +14,14 @@
  **********************************************************************}
  **********************************************************************}
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
-procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+label
+  fpc_varset_add_sets_plain_fallback;
+
+procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 asm
 asm
     push   %ebx
     push   %ebx
+fpc_varset_add_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -48,14 +52,60 @@ asm
     pop    %esi
     pop    %esi
     pop    %ebx
     pop    %ebx
 end;
 end;
+
+procedure fpc_varset_add_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
+{ SSE variant of set union: stores (set1 or set2) into dest, 16 bytes per step, walking from the end of the sets towards the start.
+  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    mov    8(%esp), %ebx { ebx = size; the argument sits at 8(%esp) here because of the push above. }
+    sub    $16, %ebx { ebx = position }
+    jl     fpc_varset_add_sets_plain_fallback { probably dead branch... Taken when size < 16; the fallback label is placed after the plain routine's own "push %ebx", so the stack layout matches. }
+
+.L16x_Loop:
+    movups (%eax,%ebx), %xmm0
+    movups (%edx,%ebx), %xmm1
+    orps   %xmm1, %xmm0
+    movups %xmm0, (%ecx,%ebx)
+    sub    $16, %ebx
+    ja     .L16x_Loop { Repeat while the new position is above 0 (no borrow and not zero). }
+
+    movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). Handles bytes 0..15, possibly overlapping bytes the loop has already written. }
+    movups (%edx), %xmm1
+    orps   %xmm1, %xmm0
+    movups %xmm0, (%ecx)
+    pop    %ebx
+end;
+
+procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint); forward; { Declared ahead so the initializer below can take its address. }
+
+var { Implementation pointer used by fpc_varset_add_sets; it starts out pointing at the dispatcher. }
+  fpc_varset_add_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_add_sets_dispatch;
+
+procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint);
+begin { Replaces the implementation pointer with the SSE or the plain routine, then serves the current call through it. }
+  if has_sse_support then
+    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_sse
+  else
+    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_plain;
+  fpc_varset_add_sets_impl(set1,set2,dest,size); { The pointer no longer refers to this dispatcher, so this calls the selected routine. }
+end;
+
+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
+begin { Compilerproc entry point: forwards all arguments through the fpc_varset_add_sets_impl pointer. }
+  fpc_varset_add_sets_impl(set1,set2,dest,size);
+end;
 {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
-procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
-{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
+label
+  fpc_varset_mul_sets_plain_fallback;
+
+procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
+{ Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
 asm
 asm
     push   %ebx
     push   %ebx
+fpc_varset_mul_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -86,14 +136,60 @@ asm
     pop    %esi
     pop    %esi
     pop    %ebx
     pop    %ebx
 end;
 end;
+
+procedure fpc_varset_mul_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
+{ SSE variant of set intersection: stores (set1 and set2) into dest, 16 bytes per step.
+  Same as fpc_varset_add_sets_sse but with 'and' instead of 'or'. }
+asm
+    push   %ebx
+    mov    8(%esp), %ebx { ebx = size; read at 8(%esp) because ebx was just pushed. }
+    sub    $16, %ebx { ebx = position }
+    jl     fpc_varset_mul_sets_plain_fallback { probably dead branch... Taken when size < 16; the fallback label is placed after the plain routine's own "push %ebx". }
+
+.L16x_Loop:
+    movups (%eax,%ebx), %xmm0
+    movups (%edx,%ebx), %xmm1
+    andps  %xmm1, %xmm0
+    movups %xmm0, (%ecx,%ebx)
+    sub    $16, %ebx
+    ja     .L16x_Loop { Repeat while the new position is above 0 (no borrow and not zero). }
+
+    movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). Handles bytes 0..15, possibly overlapping bytes the loop has already written. }
+    movups (%edx), %xmm1
+    andps   %xmm1, %xmm0
+    movups %xmm0, (%ecx)
+    pop    %ebx
+end;
+
+procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint); forward; { Declared ahead so the initializer below can take its address. }
+
+var { Implementation pointer used by fpc_varset_mul_sets; it starts out pointing at the dispatcher. }
+  fpc_varset_mul_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_mul_sets_dispatch;
+
+procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint);
+begin { Replaces the implementation pointer with the SSE or the plain routine, then serves the current call through it. }
+  if has_sse_support then
+    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_sse
+  else
+    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_plain;
+  fpc_varset_mul_sets_impl(set1,set2,dest,size); { The pointer no longer refers to this dispatcher, so this calls the selected routine. }
+end;
+
+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
+begin { Compilerproc entry point: forwards all arguments through the fpc_varset_mul_sets_impl pointer. }
+  fpc_varset_mul_sets_impl(set1,set2,dest,size);
+end;
 {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
-procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
+label
+  fpc_varset_sub_sets_plain_fallback;
+
+procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 asm
 asm
     push   %ebx
     push   %ebx
+fpc_varset_sub_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -129,15 +225,62 @@ asm
     pop    %esi
     pop    %esi
     pop    %ebx
     pop    %ebx
 end;
 end;
+
+procedure fpc_varset_sub_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
+{ SSE variant of set difference: stores (set1 and not set2) into dest, 16 bytes per step.
+  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    mov    8(%esp), %ebx { ebx = size; read at 8(%esp) because ebx was just pushed. }
+    sub    $16, %ebx { ebx = position }
+    jl     fpc_varset_sub_sets_plain_fallback { probably dead branch... Taken when size < 16; the fallback label is placed after the plain routine's own "push %ebx". }
+
+    movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    andnps %xmm1, %xmm2 { xmm2 = (not set2[0..15]) and set1[0..15]; kept untouched until the final store. }
+
+.L16x_Loop:
+    movups (%eax,%ebx), %xmm1
+    movups (%edx,%ebx), %xmm0
+    andnps %xmm1, %xmm0 { xmm0 = (not set2 chunk) and set1 chunk. }
+    movups %xmm0, (%ecx,%ebx)
+    sub    $16, %ebx
+    ja     .L16x_Loop { Repeat while the new position is above 0 (no borrow and not zero). }
+
+    movups %xmm2, (%ecx) { Write precalculated tail. }
+    pop    %ebx
+end;
+
+procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint); forward; { Declared ahead so the initializer below can take its address. }
+
+var { Implementation pointer used by fpc_varset_sub_sets; it starts out pointing at the dispatcher. }
+  fpc_varset_sub_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_sub_sets_dispatch;
+
+procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint);
+begin { Replaces the implementation pointer with the SSE or the plain routine, then serves the current call through it. }
+  if has_sse_support then
+    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_sse
+  else
+    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_plain;
+  fpc_varset_sub_sets_impl(set1,set2,dest,size); { The pointer no longer refers to this dispatcher, so this calls the selected routine. }
+end;
+
+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
+begin { Compilerproc entry point: forwards all arguments through the fpc_varset_sub_sets_impl pointer. }
+  fpc_varset_sub_sets_impl(set1,set2,dest,size);
+end;
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
-procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
-{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
+label
+  fpc_varset_symdif_sets_plain_fallback;
+
+procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
+{ Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
   eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
   eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 asm
 asm
     push   %ebx
     push   %ebx
+fpc_varset_symdif_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -170,11 +313,55 @@ asm
     pop    %esi
     pop    %esi
     pop    %ebx
     pop    %ebx
 end;
 end;
+
+procedure fpc_varset_symdif_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
+{ SSE variant of set symmetric difference: stores (set1 xor set2) into dest, 16 bytes per step.
+  Same as fpc_varset_sub_sets_sse but with 'xor' instead of 'and not'.
+  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
+asm
+    push   %ebx
+    mov    8(%esp), %ebx { ebx = size; read at 8(%esp) because ebx was just pushed. }
+    sub    $16, %ebx { ebx = position }
+    jl     fpc_varset_symdif_sets_plain_fallback { probably dead branch... Taken when size < 16; the fallback label is placed after the plain routine's own "push %ebx". }
+
+    movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
+    movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
+    xorps  %xmm1, %xmm2 { xmm2 = set1[0..15] xor set2[0..15]; kept untouched until the final store. }
+
+.L16x_Loop:
+    movups (%eax,%ebx), %xmm1
+    movups (%edx,%ebx), %xmm0
+    xorps  %xmm1, %xmm0
+    movups %xmm0, (%ecx,%ebx)
+    sub    $16, %ebx
+    ja     .L16x_Loop { Repeat while the new position is above 0 (no borrow and not zero). }
+
+    movups %xmm2, (%ecx) { Write precalculated tail. }
+    pop    %ebx
+end;
+
+procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint); forward; { Declared ahead so the initializer below can take its address. }
+
+var { Implementation pointer used by fpc_varset_symdif_sets; it starts out pointing at the dispatcher. }
+  fpc_varset_symdif_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_symdif_sets_dispatch;
+
+procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint);
+begin { Replaces the implementation pointer with the SSE or the plain routine, then serves the current call through it. }
+  if has_sse_support then
+    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_sse
+  else
+    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_plain;
+  fpc_varset_symdif_sets_impl(set1,set2,dest,size); { The pointer no longer refers to this dispatcher, so this calls the selected routine. }
+end;
+
+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
+begin { Compilerproc entry point: forwards all arguments through the fpc_varset_symdif_sets_impl pointer. }
+  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
+end;
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
-function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
+function fpc_varset_contains_sets_plain(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
 { eax = set1, edx = set2, ecx = size }
 { eax = set1, edx = set2, ecx = size }
 asm
 asm
     push   %ebx
     push   %ebx
@@ -221,292 +408,90 @@ asm
     mov    $1, %eax
     mov    $1, %eax
     pop    %ebx
     pop    %ebx
 end;
 end;
-{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
-{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
-
-{ the following code is exactly big endian set-related, but specific to the old
-  scheme whereby sets were either 4 or 32 bytes. I've left the routines here
-  so if someone wants to, they can create equivalents of the new varset helpers
-  from rtl/inc/genset.inc
-}
-
-{$ifdef FPC_OLD_BIGENDIAN_SETS}
 
 
-{$define FPC_SYSTEM_HAS_FPC_SET_LOAD_SMALL}
-function fpc_set_load_small(l: fpc_small_set): fpc_normal_set;assembler;[public,alias:'FPC_SET_LOAD_SMALL']; compilerproc;
-{
-  load a normal set p from a smallset l
-}
-var
-  saveedi : longint;
-asm
-        movl    %edi,saveedi
-        movl    __RESULT,%edi
-        movl    l,%eax
-{$ifdef FPC_ENABLED_CLD}
-        cld
-{$endif FPC_ENABLED_CLD}
-        stosl
-        xorl    %eax,%eax
-        movl    $7,%ecx
-        rep
-        stosl
-        movl    saveedi,%edi
-end;
-
-{$define FPC_SYSTEM_HAS_FPC_SET_CREATE_ELEMENT}
-
-function fpc_set_create_element(b : byte): fpc_normal_set;assembler;[public,alias:'FPC_SET_CREATE_ELEMENT']; compilerproc;
-{
-  create a new set in p from an element b
-}
-var
-  saveedi : longint;
-asm
-        movl    %edi,saveedi
-        movl    __RESULT,%edi
-        movzbl  b,%edx
-        xorl    %eax,%eax
-        movl    $8,%ecx
-{$ifdef FPC_ENABLED_CLD}
-        cld
-{$endif FPC_ENABLED_CLD}
-        rep
-        stosl
-        leal    -32(%edi),%eax
-        btsl    %edx,(%eax)
-        movl    saveedi,%edi
-end;
-
-
-{$define FPC_SYSTEM_HAS_FPC_SET_SET_BYTE}
-function fpc_set_set_byte(const source: fpc_normal_set; b : byte): fpc_normal_set;assembler; compilerproc;
-{
-  add the element b to the set pointed by source
-}
-var
-  saveesi,saveedi : longint;
-asm
-        movl    %edi,saveedi
-        movl    %esi,saveesi
-        movl    source,%esi
-        movl    __RESULT,%edi
-        movzbl  b,%edx
-        movl    $8,%ecx
-{$ifdef FPC_ENABLED_CLD}
-        cld
-{$endif FPC_ENABLED_CLD}
-        rep
-        movsl
-        leal    -32(%edi),%eax
-        btsl    %edx,(%eax)
-        movl    saveedi,%edi
-        movl    saveesi,%esi
-end;
-
-{$define FPC_SYSTEM_HAS_FPC_SET_UNSET_BYTE}
-function fpc_set_unset_byte(const source: fpc_normal_set; b : byte): fpc_normal_set;assembler; compilerproc;
-{
-  add the element b to the set pointed by source
-}
-var
-  saveesi,saveedi : longint;
-asm
-        movl    %edi,saveedi
-        movl    %esi,saveesi
-        movl    source,%esi
-        movl    __RESULT,%edi
-        movzbl  b,%edx
-        movl    $8,%ecx
-{$ifdef FPC_ENABLED_CLD}
-        cld
-{$endif FPC_ENABLED_CLD}
-        rep
-        movsl
-        leal    -32(%edi),%eax
-        btrl    %edx,(%eax)
-        movl    saveedi,%edi
-        movl    saveesi,%esi
-end;
-
-{$define FPC_SYSTEM_HAS_FPC_SET_SET_RANGE}
-
-function fpc_set_set_range(const orgset: fpc_normal_set; l,h : byte): fpc_normal_set;assembler; compilerproc;
-{
-  adds the range [l..h] to the set pointed to by p
-}
-var
-  saveh : byte;
-  saveesi,saveedi,saveebx : longint;
+function fpc_varset_contains_sets_sse2(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
+{ eax = set1, edx = set2, ecx = size }
 asm
 asm
-        movl    %edi,saveedi
-        movl    %esi,saveesi
-        movl    %ebx,saveebx
-        movl   __RESULT,%edi        // target set address in edi
-        movl   orgset, %esi         // source set address in esi
-        movzbl l,%eax               // lowest bit to be set in eax
-        movzbl h,%ebx               // highest in ebx
-        movb   %bl,saveh
-        movl   $8,%ecx              // we have to copy 32 bytes
-        cmpl   %eax,%ebx            // high < low?
-{$ifdef FPC_ENABLED_CLD}
-        cld
-{$endif FPC_ENABLED_CLD}
-        rep                         // copy source to dest (it's possible to do the range
-        movsl                       // setting and copying simultanuously of course, but
-                                    // that would result in many more jumps and code)
-        movl   %eax,%ecx            // lowest also in ecx
-        jb     .Lset_range_done     // if high > low, then dest := source
-        shrl   $3,%eax              // divide by 8 to get starting and ending byte
-        shrl   $3,%ebx              // address
-        andb   $31,%cl              // low five bits of lo determine start of bit mask
-        andl   $0x0fffffffc,%eax    // clear two lowest bits to get start/end longint
-        subl   $32,%edi             // get back to start of dest
-        andl   $0x0fffffffc,%ebx    // address * 4
-        movl   $0x0ffffffff,%edx    // edx = bitmask to be inserted
-        shll   %cl,%edx             // shift bitmask to clear bits below lo
-        addl   %eax,%edi            // go to starting pos in set
-        subl   %eax,%ebx            // are bit lo and hi in the same longint?
-        jz     .Lset_range_hi       // yes, keep current mask and adjust for hi bit
-        orl    %edx,(%edi)          // no, store current mask
-        movl   $0x0ffffffff,%edx    // new mask
-        addl   $4,%edi              // next longint of set
-        subl   $4,%ebx              // bit hi in this longint?
-        jz     .Lset_range_hi       // yes, keep full mask and adjust for hi bit
-.Lset_range_loop:
-        movl   %edx,(%edi)          // no, fill longints in between with full mask
-        addl   $4,%edi
-        subl   $4,%ebx
-        jnz    .Lset_range_loop
-.Lset_range_hi:
-        movb   saveh,%cl                // this is ok, h is on the stack
-        movl   %edx,%ebx            // save current bitmask
-        andb   $31,%cl
-        subb   $31,%cl              // cl := (31 - (hi and 31)) = shift count to
-        negb   %cl                  // adjust bitmask for hi bit
-        shrl   %cl,%edx             // shift bitmask to clear bits higher than hi
-        andl   %edx,%ebx            // combine both bitmasks
-        orl    %ebx,(%edi)          // store to set
-.Lset_range_done:
-        movl    saveedi,%edi
-        movl    saveesi,%esi
-        movl    saveebx,%ebx
-end;
+    sub    $16, %ecx
+    jl     .LFallback  { probably dead branch... }
 
 
-{$define FPC_SYSTEM_HAS_FPC_SET_IN_BYTE}
+{$if false}
+{ Scans 16 bytes at a time left to right with early exits.
+  Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
+  Kept for the future. }
+    push   %ebx
+    pxor   %xmm2, %xmm2 { xmm2 = 0 }
+    add    %ecx, %eax { set1 += size }
+    add    %ecx, %edx { set2 += size }
+    neg    %ecx { Now "size" (ecx) = -(orig.size - 16), "set1" (eax) points to orig.set1 + orig.size - 16, "set2" (edx) points to orig.set2 + orig.size - 16.
+                 Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
+.L16x_Loop:
+    movdqu (%eax,%ecx), %xmm1
+    movdqu (%edx,%ecx), %xmm0
+    pandn  %xmm1, %xmm0
+    pcmpeqb %xmm2, %xmm0
+    pmovmskb %xmm0, %ebx
+    inc    %bx
+    jnz    .LNo
+    add    $16, %ecx
+    js     .L16x_Loop
+    pop    %ebx
 
 
-function fpc_set_in_byte(const p: fpc_normal_set; b: byte): boolean; assembler; [public,alias:'FPC_SET_IN_BYTE']; compilerproc;
-{
-  tests if the element b is in the set p the carryflag is set if it present
-}
-asm
-{$ifdef REGCALL}
-        xchgl %edx,%eax
-        andl $0xff,%eax
+    movdqu (%eax), %xmm1
+    movdqu (%edx), %xmm0
+    pandn  %xmm1, %xmm0
 {$else}
 {$else}
-       movl   p,%edx
-       movzbl b,%eax
+{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
+    movdqu (%eax), %xmm1
+    movdqu (%edx), %xmm2
+    pandn  %xmm1, %xmm2
+
+.L16x_Loop:
+    movdqu (%eax,%ecx), %xmm1
+    movdqu (%edx,%ecx), %xmm0
+    pandn  %xmm1, %xmm0
+    por    %xmm0, %xmm2
+    sub    $16, %ecx
+    ja     .L16x_Loop
+
+    pxor   %xmm0, %xmm0
 {$endif}
 {$endif}
-       btl %eax,(%edx)
-end;
-
-
-{$define FPC_SYSTEM_HAS_FPC_SET_COMP_SETS}
-
-function fpc_set_comp_sets(const set1,set2: fpc_normal_set): boolean;assembler;[public,alias:'FPC_SET_COMP_SETS']; compilerproc;
-{
-  compares set1 and set2 zeroflag is set if they are equal
-}
-var
-  saveesi,saveedi : longint;
-asm
-        movl    %edi,saveedi
-        movl    %esi,saveesi
-        movl set1,%esi
-        movl set2,%edi
-        movl $8,%ecx
-    .LMCOMPSETS1:
-        movl (%esi),%eax
-        movl (%edi),%edx
-        cmpl %edx,%eax
-        jne  .LMCOMPSETEND
-        addl $4,%esi
-        addl $4,%edi
-        decl %ecx
-        jnz .LMCOMPSETS1
-        { we are here only if the two sets are equal
-          we have zero flag set, and that what is expected }
-    .LMCOMPSETEND:
-        seteb %al
-        movl    saveedi,%edi
-        movl    saveesi,%esi
-end;
-
-
-{$ifdef LARGESETS}
+    pcmpeqb %xmm2,%xmm0
+    pmovmskb %xmm0, %ecx
+    xor    %eax, %eax
+    inc    %cx
+    setz   %al
+    ret
 
 
-{$error Needs to be fixed for register calling first!}
+.LFallback:
+    add    $16, %ecx
+    jmp    fpc_varset_contains_sets_plain
 
 
-procedure fpc_largeset_set_word(p : pointer;b : word);assembler;[public,alias:'FPC_LARGESET_SET_WORD']; compilerproc;
-{
-  sets the element b in set p works for sets larger than 256 elements
-  not yet use by the compiler so
-}
-asm
-       pushl %eax
-       movl p,%edi
-       movw b,%ax
-       andl $0xfff8,%eax
-       shrl $3,%eax
-       addl %eax,%edi
-       movb 12(%ebp),%al
-       andl $7,%eax
-       btsl %eax,(%edi)
-       popl %eax
+{$if false}
+.LNo:
+    xor    %eax, %eax
+    pop    %ebx
+{$endif}
 end;
 end;
 
 
+function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean; forward;
 
 
-procedure fpc_largeset_in_word(p : pointer;b : word);assembler;[public,alias:'FPC_LARGESET_IN_WORD']; compilerproc;
-{
-  tests if the element b is in the set p the carryflag is set if it present
-  works for sets larger than 256 elements
-}
-asm
-        pushl %eax
-        movl p,%edi
-        movw b,%ax
-        andl $0xfff8,%eax
-        shrl $3,%eax
-        addl %eax,%edi
-        movb 12(%ebp),%al
-        andl $7,%eax
-        btl %eax,(%edi)
-        popl %eax
+var
+  fpc_varset_contains_sets_impl: function(const set1,set2;size : ptrint):boolean = @fpc_varset_contains_sets_dispatch;
+
+function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean;
+begin
+  if has_sse2_support then
+    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_sse2
+  else
+    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_plain;
+  result:=fpc_varset_contains_sets_impl(set1,set2,size);
 end;
 end;
 
 
-
-procedure fpc_largeset_comp_sets(set1,set2 : pointer;size : longint);assembler;[public,alias:'FPC_LARGESET_COMP_SETS']; compilerproc;
-asm
-      movl set1,%esi
-      movl set2,%edi
-      movl size,%ecx
-{$ifdef FPC_ENABLED_CLD}
-      cld
-{$endif FPC_ENABLED_CLD}
-  .LMCOMPSETSIZES1:
-      lodsl
-      movl (%edi),%edx
-      cmpl %edx,%eax
-      jne  .LMCOMPSETSIZEEND
-      addl $4,%edi
-      decl %ecx
-      jnz .LMCOMPSETSIZES1
-      { we are here only if the two sets are equal
-        we have zero flag set, and that what is expected }
-  .LMCOMPSETSIZEEND:
+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; inline;
+begin
+  result:=fpc_varset_contains_sets_impl(set1,set2,size);
 end;
 end;
+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
 
 
-
-{$endif LARGESET}
-
-{$endif FPC_OLD_BIGENDIAN_SETS}

+ 1 - 2
rtl/x86_64/set.inc

@@ -120,7 +120,7 @@ end;
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
 procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
-{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
+{ Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.
 
 
   Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
   Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
   Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
   Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
@@ -172,7 +172,6 @@ asm
     add    set2, size
     add    set2, size
     neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
     neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
                   Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
                   Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
-
 @16x_Loop:
 @16x_Loop:
     movdqu xmm1, xmmword ptr [set1 + size]
     movdqu xmm1, xmmword ptr [set1 + size]
     movdqu xmm0, xmmword ptr [set2 + size]
     movdqu xmm0, xmmword ptr [set2 + size]