Browse Source

Darwin: re-enable new assembler fill*word variants

On Darwin, where jumping into the middle of another procedure is not possible, work around it with an extra jump to an extra function.
Rika Ichinose 8 months ago
parent
commit
d1db5d2104
2 changed files with 49 additions and 38 deletions
  1. 25 18
      rtl/i386/i386.inc
  2. 24 20
      rtl/i386/set.inc

+ 25 - 18
rtl/i386/i386.inc

@@ -168,12 +168,14 @@ end;
 
 
   Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
   Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
 }
 }
-{$if not defined(darwin) and
-  (not defined(FPC_SYSTEM_HAS_FILLCHAR)
-   or not defined(FPC_SYSTEM_HAS_FILLWORD)
-   or not defined(FPC_SYSTEM_HAS_FILLDWORD)
-   or not defined(FPC_SYSTEM_HAS_FILLQWORD)
-)}
+{$ifndef darwin}
+  {$define can_jump_into_the_middle_of_a_procedure}
+{$endif darwin}
+
+{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
+  or not defined(FPC_SYSTEM_HAS_FILLWORD)
+  or not defined(FPC_SYSTEM_HAS_FILLDWORD)
+  or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
 
 
 {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
 {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
    or not defined(FPC_SYSTEM_HAS_FILLWORD)
    or not defined(FPC_SYSTEM_HAS_FILLWORD)
@@ -209,13 +211,15 @@ asm
 end;
 end;
 {$endif FillChar/Word/DWord required.}
 {$endif FillChar/Word/DWord required.}
 
 
+{$ifdef can_jump_into_the_middle_of_a_procedure}
 label
 label
   FillXxxx_MoreThanTwoXMMs;
   FillXxxx_MoreThanTwoXMMs;
+{$else can_jump_into_the_middle_of_a_procedure}
+procedure FillXxxx_MoreThanTwoXMMs; forward;
+{$endif can_jump_into_the_middle_of_a_procedure}
 
 
 procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
 procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
 { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
 { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
-const
-  NtThreshold = 4 * 1024 * 1024;
 asm
 asm
         movd   %ecx, %xmm0
         movd   %ecx, %xmm0
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
         pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
@@ -240,10 +244,17 @@ asm
         movd   %esi, %xmm0
         movd   %esi, %xmm0
         pshufd $0, %xmm0, %xmm0
         pshufd $0, %xmm0, %xmm0
         pop    %esi
         pop    %esi
-
+{$ifdef can_jump_into_the_middle_of_a_procedure}
 { FillChar (to skip the misaligning above) and FillQWord jump here.
 { FillChar (to skip the misaligning above) and FillQWord jump here.
   eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
   eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
 FillXxxx_MoreThanTwoXMMs:
+{$else can_jump_into_the_middle_of_a_procedure}
+        jmp    FillXxxx_MoreThanTwoXMMs
+end;
+
+procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe;
+asm
+{$endif can_jump_into_the_middle_of_a_procedure}
         lea    -65(%eax,%edx), %ecx
         lea    -65(%eax,%edx), %ecx
         and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
         and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
         mov    %ecx, %edx { Remember T4 to edx. }
         mov    %ecx, %edx { Remember T4 to edx. }
@@ -259,7 +270,7 @@ FillXxxx_MoreThanTwoXMMs:
         jle    .LFourAlignedTailWrites { ecx was ≤ 96−48 }
         jle    .LFourAlignedTailWrites { ecx was ≤ 96−48 }
 
 
         add    $48, %eax { eax = H3. }
         add    $48, %eax { eax = H3. }
-        cmp    $NtThreshold, %ecx
+        cmp    $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. }
         jae    .L64xNT_Body
         jae    .L64xNT_Body
 
 
 .balign 16 { no-op }
 .balign 16 { no-op }
@@ -339,8 +350,7 @@ end;
 {$endif FillChar/Word/DWord/QWord required.}
 {$endif FillChar/Word/DWord/QWord required.}
 
 
 
 
-{$if not defined(darwin) and
-     not defined(FPC_SYSTEM_HAS_FILLCHAR)}
+{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)}
 {$define FPC_SYSTEM_HAS_FILLCHAR}
 {$define FPC_SYSTEM_HAS_FILLCHAR}
 procedure FillChar_3OrLess; assembler; nostackframe;
 procedure FillChar_3OrLess; assembler; nostackframe;
 { cl — x, edx — byte count, Low(int32) <= edx <= 3. }
 { cl — x, edx — byte count, Low(int32) <= edx <= 3. }
@@ -438,8 +448,7 @@ end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}
 {$endif FPC_SYSTEM_HAS_FILLCHAR}
 
 
 
 
-{$if not defined(darwin) and
-     not defined(FPC_SYSTEM_HAS_FILLWORD)}
+{$if not defined(FPC_SYSTEM_HAS_FILLWORD)}
 {$define FPC_SYSTEM_HAS_FILLWORD}
 {$define FPC_SYSTEM_HAS_FILLWORD}
 procedure FillWord_3OrLess; assembler; nostackframe;
 procedure FillWord_3OrLess; assembler; nostackframe;
 asm
 asm
@@ -527,8 +536,7 @@ end;
 {$endif FPC_SYSTEM_HAS_FILLWORD}
 {$endif FPC_SYSTEM_HAS_FILLWORD}
 
 
 
 
-{$if not defined(darwin) and
-     not defined(FPC_SYSTEM_HAS_FILLDWORD)}
+{$if not defined(FPC_SYSTEM_HAS_FILLDWORD)}
 {$define FPC_SYSTEM_HAS_FILLDWORD}
 {$define FPC_SYSTEM_HAS_FILLDWORD}
 procedure FillDWord_4OrLess; assembler; nostackframe;
 procedure FillDWord_4OrLess; assembler; nostackframe;
 asm
 asm
@@ -602,8 +610,7 @@ end;
 {$endif FPC_SYSTEM_HAS_FILLDWORD}
 {$endif FPC_SYSTEM_HAS_FILLDWORD}
 
 
 
 
-{$if not defined(darwin) and
-     not defined(FPC_SYSTEM_HAS_FILLQWORD)}
+{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
 {$define FPC_SYSTEM_HAS_FILLQWORD}
 {$define FPC_SYSTEM_HAS_FILLQWORD}
 {$ifndef CPUX86_HAS_SSE2}
 {$ifndef CPUX86_HAS_SSE2}
 procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
 procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;

+ 24 - 20
rtl/i386/set.inc

@@ -15,14 +15,10 @@
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
-label
-  fpc_varset_add_sets_plain_fallback;
-
 procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 asm
 asm
     push   %ebx
     push   %ebx
-fpc_varset_add_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -60,7 +56,7 @@ asm
     push   %ebx
     push   %ebx
     mov    8(%esp), %ebx
     mov    8(%esp), %ebx
     sub    $16, %ebx { ebx = position }
     sub    $16, %ebx { ebx = position }
-    jl     fpc_varset_add_sets_plain_fallback { probably dead branch... }
+    jl     .LFallback { Hopefully dead branch... }
 
 
 .L16x_Loop:
 .L16x_Loop:
     movups (%eax,%ebx), %xmm0
     movups (%eax,%ebx), %xmm0
@@ -75,6 +71,11 @@ asm
     orps   %xmm1, %xmm0
     orps   %xmm1, %xmm0
     movups %xmm0, (%ecx)
     movups %xmm0, (%ecx)
     pop    %ebx
     pop    %ebx
+    ret    $4
+
+.LFallback:
+    pop    %ebx
+    jmp    fpc_varset_add_sets_plain
 end;
 end;
 
 
 {$ifndef CPUX86_HAS_SSEUNIT}
 {$ifndef CPUX86_HAS_SSEUNIT}
@@ -101,14 +102,10 @@ end;
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
-label
-  fpc_varset_mul_sets_plain_fallback;
-
 procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 { Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
 { Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
 asm
 asm
     push   %ebx
     push   %ebx
-fpc_varset_mul_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -146,7 +143,7 @@ asm
     push   %ebx
     push   %ebx
     mov    8(%esp), %ebx
     mov    8(%esp), %ebx
     sub    $16, %ebx { ebx = position }
     sub    $16, %ebx { ebx = position }
-    jl     fpc_varset_mul_sets_plain_fallback { probably dead branch... }
+    jl     .LFallback { Hopefully dead branch... }
 
 
 .L16x_Loop:
 .L16x_Loop:
     movups (%eax,%ebx), %xmm0
     movups (%eax,%ebx), %xmm0
@@ -161,6 +158,11 @@ asm
     andps   %xmm1, %xmm0
     andps   %xmm1, %xmm0
     movups %xmm0, (%ecx)
     movups %xmm0, (%ecx)
     pop    %ebx
     pop    %ebx
+    ret    $4
+
+.LFallback:
+    pop    %ebx
+    jmp    fpc_varset_mul_sets_plain
 end;
 end;
 
 
 {$ifndef CPUX86_HAS_SSEUNIT}
 {$ifndef CPUX86_HAS_SSEUNIT}
@@ -187,14 +189,10 @@ end;
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
-label
-  fpc_varset_sub_sets_plain_fallback;
-
 procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 asm
 asm
     push   %ebx
     push   %ebx
-fpc_varset_sub_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -237,7 +235,7 @@ asm
     push   %ebx
     push   %ebx
     mov    8(%esp), %ebx
     mov    8(%esp), %ebx
     sub    $16, %ebx { ebx = position }
     sub    $16, %ebx { ebx = position }
-    jl     fpc_varset_sub_sets_plain_fallback { probably dead branch... }
+    jl     .LFallback { Hopefully dead branch... }
 
 
     movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
     movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
     movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
     movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
@@ -253,6 +251,11 @@ asm
 
 
     movups %xmm2, (%ecx) { Write precalculated tail. }
     movups %xmm2, (%ecx) { Write precalculated tail. }
     pop    %ebx
     pop    %ebx
+    ret    $4
+
+.LFallback:
+    pop    %ebx
+    jmp    fpc_varset_sub_sets_plain
 end;
 end;
 
 
 {$ifndef CPUX86_HAS_SSEUNIT}
 {$ifndef CPUX86_HAS_SSEUNIT}
@@ -279,15 +282,11 @@ end;
 
 
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
 {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
-label
-  fpc_varset_symdif_sets_plain_fallback;
-
 procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
 { Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
 { Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
   eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
   eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
 asm
 asm
     push   %ebx
     push   %ebx
-fpc_varset_symdif_sets_plain_fallback:
     push   %esi
     push   %esi
     mov    12(%esp), %esi { esi = size }
     mov    12(%esp), %esi { esi = size }
     sub    $4, %esi
     sub    $4, %esi
@@ -328,7 +327,7 @@ asm
     push   %ebx
     push   %ebx
     mov    8(%esp), %ebx
     mov    8(%esp), %ebx
     sub    $16, %ebx { ebx = position }
     sub    $16, %ebx { ebx = position }
-    jl     fpc_varset_symdif_sets_plain_fallback { probably dead branch... }
+    jl     .LFallback { Hopefully dead branch... }
 
 
     movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
     movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
     movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
     movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
@@ -344,6 +343,11 @@ asm
 
 
     movups %xmm2, (%ecx) { Write precalculated tail. }
     movups %xmm2, (%ecx) { Write precalculated tail. }
     pop    %ebx
     pop    %ebx
+    ret    $4
+
+.LFallback:
+    pop    %ebx
+    jmp    fpc_varset_symdif_sets_plain
 end;
 end;
 
 
 {$ifndef CPUX86_HAS_SSEUNIT}
 {$ifndef CPUX86_HAS_SSEUNIT}