1 년 전 · aed4292017
--- a/rtl/i386/set.inc
+++ b/rtl/i386/set.inc
@@ -14,10 +14,14 @@
 
															  **********************************************************************}
														
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
														
 
															-procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
														
 
															+label
														
 
															+  fpc_varset_add_sets_plain_fallback;
														
 
															+
														
 
															+procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
														
 
															 asm
														
 
															     push   %ebx
														
 
															+fpc_varset_add_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -48,14 +52,60 @@ asm
 
															     pop    %esi
														
 
															     pop    %ebx
														
 
															 end;
														
 
															+
														
 
															+procedure fpc_varset_add_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
															+{ dest := set1 or set2, combined 16 bytes at a time with unaligned SSE loads and stores.
															+  In:   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
															+  Uses: ebx (saved and restored here), xmm0, xmm1.
															+  Chunks are visited from the highest offset downwards. Bytes 0..15 are always handled
															+  by the code after the loop and may overlap the lowest loop chunk; that is harmless
															+  because applying 'or' twice to the same bytes yields the same result. }
															+asm
															+    push   %ebx
															+    mov    8(%esp), %ebx { size: the push above moved it from 4(%esp) to 8(%esp) }
															+    sub    $16, %ebx { ebx = offset of the topmost 16-byte chunk }
															+    jl     fpc_varset_add_sets_plain_fallback { size < 16: continue inside the plain version, just after its own 'push %ebx' }
															+
															+.LChunkLoop:
															+    movups (%edx,%ebx), %xmm1
															+    movups (%eax,%ebx), %xmm0
															+    orps   %xmm1, %xmm0
															+    movups %xmm0, (%ecx,%ebx)
															+    sub    $16, %ebx
															+    ja     .LChunkLoop { taken only while more than 16 bytes remain below the chunk just processed }
															+
															+    movups (%edx), %xmm1 { Remaining (at most 16) low bytes: process bytes 0..15 unconditionally. }
															+    movups (%eax), %xmm0
															+    orps   %xmm1, %xmm0
															+    movups %xmm0, (%ecx)
															+    pop    %ebx
															+end;
														
 
															+
														
 
															+procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
															+
															+var
															+  { Implementation currently in use. Starts out pointing at the dispatcher, which replaces it on its first call. }
															+  fpc_varset_add_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_add_sets_dispatch;
															+
															+{ Stores either the SSE or the plain implementation in fpc_varset_add_sets_impl,
															+  depending on has_sse_support, then forwards the current call to the chosen routine.
															+  NOTE(review): the choice is made once, on the first call; this presumably relies on
															+  has_sse_support being final by then - confirm. }
															+procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint);
															+begin
															+  if not has_sse_support then
															+    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_plain
															+  else
															+    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_sse;
															+  fpc_varset_add_sets_impl(set1,set2,dest,size);
															+end;
															+
															+{ Compiler helper for set union: calls whatever fpc_varset_add_sets_impl currently points at. }
															+procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
															+begin
															+  fpc_varset_add_sets_impl(set1,set2,dest,size);
															+end;
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
														
 
															 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
														
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
														
 
															-procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
														
 
															-{ Same as fpc_varset_add_sets but with 'and' instead of 'or'. }
														
 
															+label
														
 
															+  fpc_varset_mul_sets_plain_fallback;
														
 
															+
														
 
															+procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															+{ Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
														
 
															 asm
														
 
															     push   %ebx
														
 
															+fpc_varset_mul_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -86,14 +136,60 @@ asm
 
															     pop    %esi
														
 
															     pop    %ebx
														
 
															 end;
														
 
															+
														
 
															+procedure fpc_varset_mul_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
															+{ dest := set1 and set2, combined 16 bytes at a time with unaligned SSE loads and stores.
															+  In:   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
															+  Uses: ebx (saved and restored here), xmm0, xmm1.
															+  Chunks are visited from the highest offset downwards. Bytes 0..15 are always handled
															+  by the code after the loop and may overlap the lowest loop chunk; that is harmless
															+  because applying 'and' twice to the same bytes yields the same result. }
															+asm
															+    push   %ebx
															+    mov    8(%esp), %ebx { size: the push above moved it from 4(%esp) to 8(%esp) }
															+    sub    $16, %ebx { ebx = offset of the topmost 16-byte chunk }
															+    jl     fpc_varset_mul_sets_plain_fallback { size < 16: continue inside the plain version, just after its own 'push %ebx' }
															+
															+.LChunkLoop:
															+    movups (%edx,%ebx), %xmm1
															+    movups (%eax,%ebx), %xmm0
															+    andps  %xmm1, %xmm0
															+    movups %xmm0, (%ecx,%ebx)
															+    sub    $16, %ebx
															+    ja     .LChunkLoop { taken only while more than 16 bytes remain below the chunk just processed }
															+
															+    movups (%edx), %xmm1 { Remaining (at most 16) low bytes: process bytes 0..15 unconditionally. }
															+    movups (%eax), %xmm0
															+    andps  %xmm1, %xmm0
															+    movups %xmm0, (%ecx)
															+    pop    %ebx
															+end;
														
 
															+
														
 
															+procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
															+
															+var
															+  { Implementation currently in use. Starts out pointing at the dispatcher, which replaces it on its first call. }
															+  fpc_varset_mul_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_mul_sets_dispatch;
															+
															+{ Stores either the SSE or the plain implementation in fpc_varset_mul_sets_impl,
															+  depending on has_sse_support, then forwards the current call to the chosen routine.
															+  NOTE(review): the choice is made once, on the first call; this presumably relies on
															+  has_sse_support being final by then - confirm. }
															+procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint);
															+begin
															+  if not has_sse_support then
															+    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_plain
															+  else
															+    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_sse;
															+  fpc_varset_mul_sets_impl(set1,set2,dest,size);
															+end;
															+
															+{ Compiler helper for set intersection: calls whatever fpc_varset_mul_sets_impl currently points at. }
															+procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
															+begin
															+  fpc_varset_mul_sets_impl(set1,set2,dest,size);
															+end;
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
														
 
															 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
														
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
														
 
															-procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
														
 
															+label
														
 
															+  fpc_varset_sub_sets_plain_fallback;
														
 
															+
														
 
															+procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
														
 
															 asm
														
 
															     push   %ebx
														
 
															+fpc_varset_sub_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -129,15 +225,62 @@ asm
 
															     pop    %esi
														
 
															     pop    %ebx
														
 
															 end;
														
 
															+
														
 
															+procedure fpc_varset_sub_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
															+{ dest := set1 and not set2, combined 16 bytes at a time with unaligned SSE loads and stores.
															+  In:   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
															+  Uses: ebx (saved and restored here), xmm0, xmm1, xmm2.
															+  Chunks are visited from the highest offset downwards. The result for bytes 0..15 is
															+  computed up front and stored last: the lowest loop chunk may overlap those bytes, dest
															+  may be the same memory as set1/set2, and 'and not' must not be applied to its own output. }
															+asm
															+    push   %ebx
															+    mov    8(%esp), %ebx { size: the push above moved it from 4(%esp) to 8(%esp) }
															+    sub    $16, %ebx { ebx = offset of the topmost 16-byte chunk }
															+    jl     fpc_varset_sub_sets_plain_fallback { size < 16: continue inside the plain version, just after its own 'push %ebx' }
															+
															+    movups (%edx), %xmm2
															+    movups (%eax), %xmm1
															+    andnps %xmm1, %xmm2 { xmm2 = (not set2) and set1 for bytes 0..15, taken from the still unmodified inputs }
															+
															+.LChunkLoop:
															+    movups (%edx,%ebx), %xmm0
															+    movups (%eax,%ebx), %xmm1
															+    andnps %xmm1, %xmm0 { xmm0 = (not set2 chunk) and set1 chunk }
															+    movups %xmm0, (%ecx,%ebx)
															+    sub    $16, %ebx
															+    ja     .LChunkLoop { taken only while more than 16 bytes remain below the chunk just processed }
															+
															+    movups %xmm2, (%ecx) { Store the result for bytes 0..15 that was prepared before the loop. }
															+    pop    %ebx
															+end;
														
 
															+
														
 
															+procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
															+
															+var
															+  { Implementation currently in use. Starts out pointing at the dispatcher, which replaces it on its first call. }
															+  fpc_varset_sub_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_sub_sets_dispatch;
															+
															+{ Stores either the SSE or the plain implementation in fpc_varset_sub_sets_impl,
															+  depending on has_sse_support, then forwards the current call to the chosen routine.
															+  NOTE(review): the choice is made once, on the first call; this presumably relies on
															+  has_sse_support being final by then - confirm. }
															+procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint);
															+begin
															+  if not has_sse_support then
															+    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_plain
															+  else
															+    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_sse;
															+  fpc_varset_sub_sets_impl(set1,set2,dest,size);
															+end;
															+
															+{ Compiler helper for set difference: calls whatever fpc_varset_sub_sets_impl currently points at. }
															+procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
															+begin
															+  fpc_varset_sub_sets_impl(set1,set2,dest,size);
															+end;
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
														
 
															 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
														
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
														
 
															-procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
														
 
															-{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
														
 
															+label
														
 
															+  fpc_varset_symdif_sets_plain_fallback;
														
 
															+
														
 
															+procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															+{ Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
														
 
															   eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
														
 
															 asm
														
 
															     push   %ebx
														
 
															+fpc_varset_symdif_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -170,11 +313,55 @@ asm
 
															     pop    %esi
														
 
															     pop    %ebx
														
 
															 end;
														
 
															+
														
 
															+procedure fpc_varset_symdif_sets_sse(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
															+{ dest := set1 xor set2, combined 16 bytes at a time with unaligned SSE loads and stores.
															+  In:   eax = set1, edx = set2, ecx = dest, [esp + 4] = size.
															+  Uses: ebx (saved and restored here), xmm0, xmm1, xmm2.
															+  Chunks are visited from the highest offset downwards. The result for bytes 0..15 is
															+  computed up front and stored last: the lowest loop chunk may overlap those bytes, dest
															+  may be the same memory as set1/set2, and 'xor' must not be applied to its own output. }
															+asm
															+    push   %ebx
															+    mov    8(%esp), %ebx { size: the push above moved it from 4(%esp) to 8(%esp) }
															+    sub    $16, %ebx { ebx = offset of the topmost 16-byte chunk }
															+    jl     fpc_varset_symdif_sets_plain_fallback { size < 16: continue inside the plain version, just after its own 'push %ebx' }
															+
															+    movups (%edx), %xmm2
															+    movups (%eax), %xmm1
															+    xorps  %xmm1, %xmm2 { xmm2 = set1 xor set2 for bytes 0..15, taken from the still unmodified inputs }
															+
															+.LChunkLoop:
															+    movups (%edx,%ebx), %xmm0
															+    movups (%eax,%ebx), %xmm1
															+    xorps  %xmm1, %xmm0
															+    movups %xmm0, (%ecx,%ebx)
															+    sub    $16, %ebx
															+    ja     .LChunkLoop { taken only while more than 16 bytes remain below the chunk just processed }
															+
															+    movups %xmm2, (%ecx) { Store the result for bytes 0..15 that was prepared before the loop. }
															+    pop    %ebx
															+end;
														
 
															+
														
 
															+procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;
															+
															+var
															+  { Implementation currently in use. Starts out pointing at the dispatcher, which replaces it on its first call. }
															+  fpc_varset_symdif_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_symdif_sets_dispatch;
															+
															+{ Stores either the SSE or the plain implementation in fpc_varset_symdif_sets_impl,
															+  depending on has_sse_support, then forwards the current call to the chosen routine.
															+  NOTE(review): the choice is made once, on the first call; this presumably relies on
															+  has_sse_support being final by then - confirm. }
															+procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint);
															+begin
															+  if not has_sse_support then
															+    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_plain
															+  else
															+    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_sse;
															+  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
															+end;
															+
															+{ Compiler helper for set symmetric difference: calls whatever fpc_varset_symdif_sets_impl currently points at. }
															+procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
															+begin
															+  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
															+end;
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
														
 
															 {$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
														
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
														
 
															-function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
														
 
															+function fpc_varset_contains_sets_plain(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
														
 
															 { eax = set1, edx = set2, ecx = size }
														
 
															 asm
														
 
															     push   %ebx
														
@@ -221,292 +408,90 @@ asm
 
															     mov    $1, %eax
														
 
															     pop    %ebx
														
 
															 end;
														
 
															-{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
														
 
															-{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
														
 
															-
														
 
															-{ the following code is exactly big endian set-related, but specific to the old
														
 
															-  scheme whereby sets were either 4 or 32 bytes. I've left the routines here
														
 
															-  so if someone wants to, they can create equivalents of the new varset helpers
														
 
															-  from rtl/inc/genset.inc
														
 
															-}
														
 
															-
														
 
															-{$ifdef FPC_OLD_BIGENDIAN_SETS}
														
 
															-{$define FPC_SYSTEM_HAS_FPC_SET_LOAD_SMALL}
														
 
															-function fpc_set_load_small(l: fpc_small_set): fpc_normal_set;assembler;[public,alias:'FPC_SET_LOAD_SMALL']; compilerproc;
														
 
															-{
														
 
															-  load a normal set p from a smallset l
														
 
															-}
														
 
															-var
														
 
															-  saveedi : longint;
														
 
															-asm
														
 
															-        movl    %edi,saveedi
														
 
															-        movl    __RESULT,%edi
														
 
															-        movl    l,%eax
														
 
															-{$ifdef FPC_ENABLED_CLD}
														
 
															-        cld
														
 
															-{$endif FPC_ENABLED_CLD}
														
 
															-        stosl
														
 
															-        xorl    %eax,%eax
														
 
															-        movl    $7,%ecx
														
 
															-        rep
														
 
															-        stosl
														
 
															-        movl    saveedi,%edi
														
 
															-end;
														
 
															-
														
 
															-{$define FPC_SYSTEM_HAS_FPC_SET_CREATE_ELEMENT}
														
 
															-
														
 
															-function fpc_set_create_element(b : byte): fpc_normal_set;assembler;[public,alias:'FPC_SET_CREATE_ELEMENT']; compilerproc;
														
 
															-{
														
 
															-  create a new set in p from an element b
														
 
															-}
														
 
															-var
														
 
															-  saveedi : longint;
														
 
															-asm
														
 
															-        movl    %edi,saveedi
														
 
															-        movl    __RESULT,%edi
														
 
															-        movzbl  b,%edx
														
 
															-        xorl    %eax,%eax
														
 
															-        movl    $8,%ecx
														
 
															-{$ifdef FPC_ENABLED_CLD}
														
 
															-        cld
														
 
															-{$endif FPC_ENABLED_CLD}
														
 
															-        rep
														
 
															-        stosl
														
 
															-        leal    -32(%edi),%eax
														
 
															-        btsl    %edx,(%eax)
														
 
															-        movl    saveedi,%edi
														
 
															-end;
														
 
															-
														
 
															-
														
 
															-{$define FPC_SYSTEM_HAS_FPC_SET_SET_BYTE}
														
 
															-function fpc_set_set_byte(const source: fpc_normal_set; b : byte): fpc_normal_set;assembler; compilerproc;
														
 
															-{
														
 
															-  add the element b to the set pointed by source
														
 
															-}
														
 
															-var
														
 
															-  saveesi,saveedi : longint;
														
 
															-asm
														
 
															-        movl    %edi,saveedi
														
 
															-        movl    %esi,saveesi
														
 
															-        movl    source,%esi
														
 
															-        movl    __RESULT,%edi
														
 
															-        movzbl  b,%edx
														
 
															-        movl    $8,%ecx
														
 
															-{$ifdef FPC_ENABLED_CLD}
														
 
															-        cld
														
 
															-{$endif FPC_ENABLED_CLD}
														
 
															-        rep
														
 
															-        movsl
														
 
															-        leal    -32(%edi),%eax
														
 
															-        btsl    %edx,(%eax)
														
 
															-        movl    saveedi,%edi
														
 
															-        movl    saveesi,%esi
														
 
															-end;
														
 
															-
														
 
															-{$define FPC_SYSTEM_HAS_FPC_SET_UNSET_BYTE}
														
 
															-function fpc_set_unset_byte(const source: fpc_normal_set; b : byte): fpc_normal_set;assembler; compilerproc;
														
 
															-{
														
 
															-  add the element b to the set pointed by source
														
 
															-}
														
 
															-var
														
 
															-  saveesi,saveedi : longint;
														
 
															-asm
														
 
															-        movl    %edi,saveedi
														
 
															-        movl    %esi,saveesi
														
 
															-        movl    source,%esi
														
 
															-        movl    __RESULT,%edi
														
 
															-        movzbl  b,%edx
														
 
															-        movl    $8,%ecx
														
 
															-{$ifdef FPC_ENABLED_CLD}
														
 
															-        cld
														
 
															-{$endif FPC_ENABLED_CLD}
														
 
															-        rep
														
 
															-        movsl
														
 
															-        leal    -32(%edi),%eax
														
 
															-        btrl    %edx,(%eax)
														
 
															-        movl    saveedi,%edi
														
 
															-        movl    saveesi,%esi
														
 
															-end;
														
 
															-
														
 
															-{$define FPC_SYSTEM_HAS_FPC_SET_SET_RANGE}
														
 
															-
														
 
															-function fpc_set_set_range(const orgset: fpc_normal_set; l,h : byte): fpc_normal_set;assembler; compilerproc;
														
 
															-{
														
 
															-  adds the range [l..h] to the set pointed to by p
														
 
															-}
														
 
															-var
														
 
															-  saveh : byte;
														
 
															-  saveesi,saveedi,saveebx : longint;
														
 
															+function fpc_varset_contains_sets_sse2(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
														
 
															+{ eax = set1, edx = set2, ecx = size }
														
 
															 asm
														
 
															-        movl    %edi,saveedi
														
 
															-        movl    %esi,saveesi
														
 
															-        movl    %ebx,saveebx
														
 
															-        movl   __RESULT,%edi        // target set address in edi
														
 
															-        movl   orgset, %esi         // source set address in esi
														
 
															-        movzbl l,%eax               // lowest bit to be set in eax
														
 
															-        movzbl h,%ebx               // highest in ebx
														
 
															-        movb   %bl,saveh
														
 
															-        movl   $8,%ecx              // we have to copy 32 bytes
														
 
															-        cmpl   %eax,%ebx            // high < low?
														
 
															-{$ifdef FPC_ENABLED_CLD}
														
 
															-        cld
														
 
															-{$endif FPC_ENABLED_CLD}
														
 
															-        rep                         // copy source to dest (it's possible to do the range
														
 
															-        movsl                       // setting and copying simultanuously of course, but
														
 
															-                                    // that would result in many more jumps and code)
														
 
															-        movl   %eax,%ecx            // lowest also in ecx
														
 
															-        jb     .Lset_range_done     // if high > low, then dest := source
														
 
															-        shrl   $3,%eax              // divide by 8 to get starting and ending byte
														
 
															-        shrl   $3,%ebx              // address
														
 
															-        andb   $31,%cl              // low five bits of lo determine start of bit mask
														
 
															-        andl   $0x0fffffffc,%eax    // clear two lowest bits to get start/end longint
														
 
															-        subl   $32,%edi             // get back to start of dest
														
 
															-        andl   $0x0fffffffc,%ebx    // address * 4
														
 
															-        movl   $0x0ffffffff,%edx    // edx = bitmask to be inserted
														
 
															-        shll   %cl,%edx             // shift bitmask to clear bits below lo
														
 
															-        addl   %eax,%edi            // go to starting pos in set
														
 
															-        subl   %eax,%ebx            // are bit lo and hi in the same longint?
														
 
															-        jz     .Lset_range_hi       // yes, keep current mask and adjust for hi bit
														
 
															-        orl    %edx,(%edi)          // no, store current mask
														
 
															-        movl   $0x0ffffffff,%edx    // new mask
														
 
															-        addl   $4,%edi              // next longint of set
														
 
															-        subl   $4,%ebx              // bit hi in this longint?
														
 
															-        jz     .Lset_range_hi       // yes, keep full mask and adjust for hi bit
														
 
															-.Lset_range_loop:
														
 
															-        movl   %edx,(%edi)          // no, fill longints in between with full mask
														
 
															-        addl   $4,%edi
														
 
															-        subl   $4,%ebx
														
 
															-        jnz    .Lset_range_loop
														
 
															-.Lset_range_hi:
														
 
															-        movb   saveh,%cl                // this is ok, h is on the stack
														
 
															-        movl   %edx,%ebx            // save current bitmask
														
 
															-        andb   $31,%cl
														
 
															-        subb   $31,%cl              // cl := (31 - (hi and 31)) = shift count to
														
 
															-        negb   %cl                  // adjust bitmask for hi bit
														
 
															-        shrl   %cl,%edx             // shift bitmask to clear bits higher than hi
														
 
															-        andl   %edx,%ebx            // combine both bitmasks
														
 
															-        orl    %ebx,(%edi)          // store to set
														
 
															-.Lset_range_done:
														
 
															-        movl    saveedi,%edi
														
 
															-        movl    saveesi,%esi
														
 
															-        movl    saveebx,%ebx
														
 
															-end;
														
 
															+    sub    $16, %ecx
														
 
															+    jl     .LFallback  { probably dead branch... }
														
 
															-{$define FPC_SYSTEM_HAS_FPC_SET_IN_BYTE}
														
 
															+{$if false}
														
 
															+{ Scans 16 bytes at a time left to right with early exits.
														
 
															+  Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
														
 
															+  Kept for the future. }
														
 
															+    push   %ebx
														
 
															+    pxor   %xmm2, %xmm2 { xmm2 = 0 }
														
 
															+    add    %ecx, %eax { set1 += size }
														
 
															+    add    %ecx, %edx { set2 += size }
														
 
															+    neg    %ecx { Now "size" (ecx) = -(orig.size - 16), "set1" (eax) points to orig.set1 + orig.size - 16, "set2" (edx) points to orig.set2 + orig.size - 16.
														
 
															+                 Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
														
 
															+.L16x_Loop:
														
 
															+    movdqu (%eax,%ecx), %xmm1
														
 
															+    movdqu (%edx,%ecx), %xmm0
														
 
															+    pandn  %xmm1, %xmm0
														
 
															+    pcmpeqb %xmm2, %xmm0
														
 
															+    pmovmskb %xmm0, %ebx
														
 
															+    inc    %bx
														
 
															+    jnz    .LNo
														
 
															+    add    $16, %ecx
														
 
															+    js     .L16x_Loop
														
 
															+    pop    %ebx
														
 
															-function fpc_set_in_byte(const p: fpc_normal_set; b: byte): boolean; assembler; [public,alias:'FPC_SET_IN_BYTE']; compilerproc;
														
 
															-{
														
 
															-  tests if the element b is in the set p the carryflag is set if it present
														
 
															-}
														
 
															-asm
														
 
															-{$ifdef REGCALL}
														
 
															-        xchgl %edx,%eax
														
 
															-        andl $0xff,%eax
														
 
															+    movdqu (%eax), %xmm1
														
 
															+    movdqu (%edx), %xmm0
														
 
															+    pandn  %xmm1, %xmm0
														
 
															 {$else}
														
 
															-       movl   p,%edx
														
 
															-       movzbl b,%eax
														
 
															+{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
														
 
															+    movdqu (%eax), %xmm1
														
 
															+    movdqu (%edx), %xmm2
														
 
															+    pandn  %xmm1, %xmm2
														
 
															+
														
 
															+.L16x_Loop:
														
 
															+    movdqu (%eax,%ecx), %xmm1
														
 
															+    movdqu (%edx,%ecx), %xmm0
														
 
															+    pandn  %xmm1, %xmm0
														
 
															+    por    %xmm0, %xmm2
														
 
															+    sub    $16, %ecx
														
 
															+    ja     .L16x_Loop
														
 
															+
														
 
															+    pxor   %xmm0, %xmm0
														
 
															 {$endif}
														
 
															-       btl %eax,(%edx)
														
 
															-end;
														
 
															-
														
 
															-
														
 
															-{$define FPC_SYSTEM_HAS_FPC_SET_COMP_SETS}
														
 
															-
														
 
															-function fpc_set_comp_sets(const set1,set2: fpc_normal_set): boolean;assembler;[public,alias:'FPC_SET_COMP_SETS']; compilerproc;
														
 
															-{
														
 
															-  compares set1 and set2 zeroflag is set if they are equal
														
 
															-}
														
 
															-var
														
 
															-  saveesi,saveedi : longint;
														
 
															-asm
														
 
															-        movl    %edi,saveedi
														
 
															-        movl    %esi,saveesi
														
 
															-        movl set1,%esi
														
 
															-        movl set2,%edi
														
 
															-        movl $8,%ecx
														
 
															-    .LMCOMPSETS1:
														
 
															-        movl (%esi),%eax
														
 
															-        movl (%edi),%edx
														
 
															-        cmpl %edx,%eax
														
 
															-        jne  .LMCOMPSETEND
														
 
															-        addl $4,%esi
														
 
															-        addl $4,%edi
														
 
															-        decl %ecx
														
 
															-        jnz .LMCOMPSETS1
														
 
															-        { we are here only if the two sets are equal
														
 
															-          we have zero flag set, and that what is expected }
														
 
															-    .LMCOMPSETEND:
														
 
															-        seteb %al
														
 
															-        movl    saveedi,%edi
														
 
															-        movl    saveesi,%esi
														
 
															-end;
														
 
															-
														
 
															-
														
 
															-{$ifdef LARGESETS}
														
 
															+    pcmpeqb %xmm2,%xmm0
														
 
															+    pmovmskb %xmm0, %ecx
														
 
															+    xor    %eax, %eax
														
 
															+    inc    %cx
														
 
															+    setz   %al
														
 
															+    ret
														
 
															-{$error Needs to be fixed for register calling first!}
														
 
															+.LFallback:
														
 
															+    add    $16, %ecx
														
 
															+    jmp    fpc_varset_contains_sets_plain
														
 
															-procedure fpc_largeset_set_word(p : pointer;b : word);assembler;[public,alias:'FPC_LARGESET_SET_WORD']; compilerproc;
														
 
															-{
														
 
															-  sets the element b in set p works for sets larger than 256 elements
														
 
															-  not yet use by the compiler so
														
 
															-}
														
 
															-asm
														
 
															-       pushl %eax
														
 
															-       movl p,%edi
														
 
															-       movw b,%ax
														
 
															-       andl $0xfff8,%eax
														
 
															-       shrl $3,%eax
														
 
															-       addl %eax,%edi
														
 
															-       movb 12(%ebp),%al
														
 
															-       andl $7,%eax
														
 
															-       btsl %eax,(%edi)
														
 
															-       popl %eax
														
 
															+{$if false}
														
 
															+.LNo:
														
 
															+    xor    %eax, %eax
														
 
															+    pop    %ebx
														
 
															+{$endif}
														
 
															 end;
														
 
															+{ Forward declaration: the dispatcher is used as the initial value of fpc_varset_contains_sets_impl before its body is defined. }
														
 
															+function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean; forward;
														
 
															-procedure fpc_largeset_in_word(p : pointer;b : word);assembler;[public,alias:'FPC_LARGESET_IN_WORD']; compilerproc;
														
 
															-{
														
 
															-  tests if the element b is in the set p the carryflag is set if it present
														
 
															-  works for sets larger than 256 elements
														
 
															-}
														
 
															-asm
														
 
															-        pushl %eax
														
 
															-        movl p,%edi
														
 
															-        movw b,%ax
														
 
															-        andl $0xfff8,%eax
														
 
															-        shrl $3,%eax
														
 
															-        addl %eax,%edi
														
 
															-        movb 12(%ebp),%al
														
 
															-        andl $7,%eax
														
 
															-        btl %eax,(%edi)
														
 
															-        popl %eax
														
 
															+var
														
 
															+  { Routine that fpc_varset_contains_sets calls through. It starts out pointing at the dispatcher, which replaces it with the SSE2 or the plain implementation when it is called. }
														
 
															+  fpc_varset_contains_sets_impl: function(const set1,set2;size : ptrint):boolean = @fpc_varset_contains_sets_dispatch;
														
 
															+
														
 
															+function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean;
														
 
															+{ Initial target of fpc_varset_contains_sets_impl: stores the implementation matching has_sse2_support
														
 
															+  into that variable and then forwards the current call to it. }
														
 
															+begin
														
 
															+  if not has_sse2_support then
														
 
															+    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_plain
														
 
															+  else
														
 
															+    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_sse2;
														
 
															+  { Answer this request through the routine that was just installed. }
														
 
															+  fpc_varset_contains_sets_dispatch:=fpc_varset_contains_sets_impl(set1,set2,size);
														
 
															 end;
														
 
															-
														
 
															-procedure fpc_largeset_comp_sets(set1,set2 : pointer;size : longint);assembler;[public,alias:'FPC_LARGESET_COMP_SETS']; compilerproc;
														
 
															-asm
														
 
															-      movl set1,%esi
														
 
															-      movl set2,%edi
														
 
															-      movl size,%ecx
														
 
															-{$ifdef FPC_ENABLED_CLD}
														
 
															-      cld
														
 
															-{$endif FPC_ENABLED_CLD}
														
 
															-  .LMCOMPSETSIZES1:
														
 
															-      lodsl
														
 
															-      movl (%edi),%edx
														
 
															-      cmpl %edx,%eax
														
 
															-      jne  .LMCOMPSETSIZEEND
														
 
															-      addl $4,%edi
														
 
															-      decl %ecx
														
 
															-      jnz .LMCOMPSETSIZES1
														
 
															-      { we are here only if the two sets are equal
														
 
															-        we have zero flag set, and that what is expected }
														
 
															-  .LMCOMPSETSIZEEND:
														
 
															+function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; inline;
														
 
															+{ Compiler helper entry point: passes its arguments unchanged to whatever routine
														
 
															+  fpc_varset_contains_sets_impl currently refers to and returns that routine's result. }
														
 
															+begin
														
 
															+  fpc_varset_contains_sets:=fpc_varset_contains_sets_impl(set1,set2,size);
														
 
															 end;
														
 
															+{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
														
 
															+{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
														
 
															-
														
 
															-{$endif LARGESET}
														
 
															-
														
 
															-{$endif FPC_OLD_BIGENDIAN_SETS}
														
--- a/rtl/x86_64/set.inc
+++ b/rtl/x86_64/set.inc
@@ -120,7 +120,7 @@ end;
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
														
 
															 procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
														
 
															-{ Same as fpc_varset_mul_sets but with 'xor' instead of 'and not'.
														
 
															+{ Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.
														
 
															   Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
														
 
															   Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size }
														
@@ -172,7 +172,6 @@ asm
 
															     add    set2, size
														
 
															     neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
														
 
															                   Loop ends on "size" >= 0, leaving up to 16 tail bytes. }
														
 
															-
														
 
															 @16x_Loop:
														
 
															     movdqu xmm1, xmmword ptr [set1 + size]
														
 
															     movdqu xmm0, xmmword ptr [set2 + size]