1 year ago · d1db5d2104
--- a/rtl/i386/i386.inc
+++ b/rtl/i386/i386.inc
@@ -168,12 +168,14 @@ end;
 
															   Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
														
 
															 }
														
 
															-{$if not defined(darwin) and
														
 
															-  (not defined(FPC_SYSTEM_HAS_FILLCHAR)
														
 
															-   or not defined(FPC_SYSTEM_HAS_FILLWORD)
														
 
															-   or not defined(FPC_SYSTEM_HAS_FILLDWORD)
														
 
															-   or not defined(FPC_SYSTEM_HAS_FILLQWORD)
														
 
															-)}
														
 
															+{$ifndef darwin}
														
 
															+  {$define can_jump_into_the_middle_of_a_procedure}
														
 
															+{$endif darwin}
														
 
															+
														
 
															+{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
														
 
															+  or not defined(FPC_SYSTEM_HAS_FILLWORD)
														
 
															+  or not defined(FPC_SYSTEM_HAS_FILLDWORD)
														
 
															+  or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
														
 
															 {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
														
 
															    or not defined(FPC_SYSTEM_HAS_FILLWORD)
														
@@ -209,13 +211,15 @@ asm
 
															 end;
														
 
															 {$endif FillChar/Word/DWord required.}
														
 
															+{$ifdef can_jump_into_the_middle_of_a_procedure}
														
 
															 label
														
 
															   FillXxxx_MoreThanTwoXMMs;
														
 
															+{$else can_jump_into_the_middle_of_a_procedure}
														
 
															+procedure FillXxxx_MoreThanTwoXMMs; forward;
														
 
															+{$endif can_jump_into_the_middle_of_a_procedure}
														
 
															 procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
														
 
															 { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
														
 
															-const
														
 
															-  NtThreshold = 4 * 1024 * 1024;
														
 
															 asm
														
 
															         movd   %ecx, %xmm0
														
 
															         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
														
@@ -240,10 +244,17 @@ asm
 
															         movd   %esi, %xmm0
														
 
															         pshufd $0, %xmm0, %xmm0
														
 
															         pop    %esi
														
 
															-
														
 
															+{$ifdef can_jump_into_the_middle_of_a_procedure}
														
 
															 { FillChar (to skip the misaligning above) and FillQWord jump here.
														
 
															   eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
														
 
															 FillXxxx_MoreThanTwoXMMs:
														
 
															+{$else can_jump_into_the_middle_of_a_procedure}
														
 
															+        jmp    FillXxxx_MoreThanTwoXMMs
														
 
															+end;
														
 
															+
														
 
															+procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe;
														
 
															+asm
														
 
															+{$endif can_jump_into_the_middle_of_a_procedure}
														
 
															         lea    -65(%eax,%edx), %ecx
														
 
															         and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
														
 
															         mov    %ecx, %edx { Remember T4 to edx. }
														
@@ -259,7 +270,7 @@ FillXxxx_MoreThanTwoXMMs:
 
															         jle    .LFourAlignedTailWrites { ecx was ≤ 96−48 }
														
 
															         add    $48, %eax { eax = H3. }
														
 
															-        cmp    $NtThreshold, %ecx
														
 
															+        cmp    $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. }
														
 
															         jae    .L64xNT_Body
														
 
															 .balign 16 { no-op }
														
@@ -339,8 +350,7 @@ end;
 
															 {$endif FillChar/Word/DWord/QWord required.}
														
 
															-{$if not defined(darwin) and
														
 
															-     not defined(FPC_SYSTEM_HAS_FILLCHAR)}
														
 
															+{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)}
														
 
															 {$define FPC_SYSTEM_HAS_FILLCHAR}
														
 
															 procedure FillChar_3OrLess; assembler; nostackframe;
														
 
															 { cl — x, edx — byte count, Low(int32) <= edx <= 3. }
														
@@ -438,8 +448,7 @@ end;
 
															 {$endif FPC_SYSTEM_HAS_FILLCHAR}
														
 
															-{$if not defined(darwin) and
														
 
															-     not defined(FPC_SYSTEM_HAS_FILLWORD)}
														
 
															+{$if not defined(FPC_SYSTEM_HAS_FILLWORD)}
														
 
															 {$define FPC_SYSTEM_HAS_FILLWORD}
														
 
															 procedure FillWord_3OrLess; assembler; nostackframe;
														
 
															 asm
														
@@ -527,8 +536,7 @@ end;
 
															 {$endif FPC_SYSTEM_HAS_FILLWORD}
														
 
															-{$if not defined(darwin) and
														
 
															-     not defined(FPC_SYSTEM_HAS_FILLDWORD)}
														
 
															+{$if not defined(FPC_SYSTEM_HAS_FILLDWORD)}
														
 
															 {$define FPC_SYSTEM_HAS_FILLDWORD}
														
 
															 procedure FillDWord_4OrLess; assembler; nostackframe;
														
 
															 asm
														
@@ -602,8 +610,7 @@ end;
 
															 {$endif FPC_SYSTEM_HAS_FILLDWORD}
														
 
															-{$if not defined(darwin) and
														
 
															-     not defined(FPC_SYSTEM_HAS_FILLQWORD)}
														
 
															+{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
														
 
															 {$define FPC_SYSTEM_HAS_FILLQWORD}
														
 
															 {$ifndef CPUX86_HAS_SSE2}
														
 
															 procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
														
--- a/rtl/i386/set.inc
+++ b/rtl/i386/set.inc
@@ -15,14 +15,10 @@
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
														
 
															-label
														
 
															-  fpc_varset_add_sets_plain_fallback;
														
 
															-
														
 
															 procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
														
 
															 asm
														
 
															     push   %ebx
														
 
															-fpc_varset_add_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -60,7 +56,7 @@ asm
 
															     push   %ebx
														
 
															     mov    8(%esp), %ebx
														
 
															     sub    $16, %ebx { ebx = position }
														
 
															-    jl     fpc_varset_add_sets_plain_fallback { probably dead branch... }
														
 
															+    jl     .LFallback { Hopefully dead branch... }
														
 
															 .L16x_Loop:
														
 
															     movups (%eax,%ebx), %xmm0
														
@@ -75,6 +71,11 @@ asm
 
															     orps   %xmm1, %xmm0
														
 
															     movups %xmm0, (%ecx)
														
 
															     pop    %ebx
														
 
															+    ret    $4
														
 
															+
														
 
															+.LFallback:
														
 
															+    pop    %ebx
														
 
															+    jmp    fpc_varset_add_sets_plain
														
 
															 end;
														
 
															 {$ifndef CPUX86_HAS_SSEUNIT}
														
@@ -101,14 +102,10 @@ end;
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
														
 
															-label
														
 
															-  fpc_varset_mul_sets_plain_fallback;
														
 
															-
														
 
															 procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															 { Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
														
 
															 asm
														
 
															     push   %ebx
														
 
															-fpc_varset_mul_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -146,7 +143,7 @@ asm
 
															     push   %ebx
														
 
															     mov    8(%esp), %ebx
														
 
															     sub    $16, %ebx { ebx = position }
														
 
															-    jl     fpc_varset_mul_sets_plain_fallback { probably dead branch... }
														
 
															+    jl     .LFallback { Hopefully dead branch... }
														
 
															 .L16x_Loop:
														
 
															     movups (%eax,%ebx), %xmm0
														
@@ -161,6 +158,11 @@ asm
 
															     andps   %xmm1, %xmm0
														
 
															     movups %xmm0, (%ecx)
														
 
															     pop    %ebx
														
 
															+    ret    $4
														
 
															+
														
 
															+.LFallback:
														
 
															+    pop    %ebx
														
 
															+    jmp    fpc_varset_mul_sets_plain
														
 
															 end;
														
 
															 {$ifndef CPUX86_HAS_SSEUNIT}
														
@@ -187,14 +189,10 @@ end;
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
														
 
															-label
														
 
															-  fpc_varset_sub_sets_plain_fallback;
														
 
															-
														
 
															 procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
														
 
															 asm
														
 
															     push   %ebx
														
 
															-fpc_varset_sub_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -237,7 +235,7 @@ asm
 
															     push   %ebx
														
 
															     mov    8(%esp), %ebx
														
 
															     sub    $16, %ebx { ebx = position }
														
 
															-    jl     fpc_varset_sub_sets_plain_fallback { probably dead branch... }
														
 
															+    jl     .LFallback { Hopefully dead branch... }
														
 
															     movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
														
 
															     movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
														
@@ -253,6 +251,11 @@ asm
 
															     movups %xmm2, (%ecx) { Write precalculated tail. }
														
 
															     pop    %ebx
														
 
															+    ret    $4
														
 
															+
														
 
															+.LFallback:
														
 
															+    pop    %ebx
														
 
															+    jmp    fpc_varset_sub_sets_plain
														
 
															 end;
														
 
															 {$ifndef CPUX86_HAS_SSEUNIT}
														
@@ -279,15 +282,11 @@ end;
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
														
 
															 {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
														
 
															-label
														
 
															-  fpc_varset_symdif_sets_plain_fallback;
														
 
															-
														
 
															 procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
														
 
															 { Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
														
 
															   eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
														
 
															 asm
														
 
															     push   %ebx
														
 
															-fpc_varset_symdif_sets_plain_fallback:
														
 
															     push   %esi
														
 
															     mov    12(%esp), %esi { esi = size }
														
 
															     sub    $4, %esi
														
@@ -328,7 +327,7 @@ asm
 
															     push   %ebx
														
 
															     mov    8(%esp), %ebx
														
 
															     sub    $16, %ebx { ebx = position }
														
 
															-    jl     fpc_varset_symdif_sets_plain_fallback { probably dead branch... }
														
 
															+    jl     .LFallback { Hopefully dead branch... }
														
 
															     movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
														
 
															     movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
														
@@ -344,6 +343,11 @@ asm
 
															     movups %xmm2, (%ecx) { Write precalculated tail. }
														
 
															     pop    %ebx
														
 
															+    ret    $4
														
 
															+
														
 
															+.LFallback:
														
 
															+    pop    %ebx
														
 
															+    jmp    fpc_varset_symdif_sets_plain
														
 
															 end;
														
 
															 {$ifndef CPUX86_HAS_SSEUNIT}