
Supposedly better fastmove.inc.

Rika Ichinose · 1 year ago
commit 0750777fc8
3 changed files with 522 additions and 896 deletions:
  1. rtl/i386/fastmove.inc   +497 -871
  2. rtl/i386/i386.inc       +23  -23
  3. rtl/watcom/system.pp    +2   -2

+ 497 - 871
rtl/i386/fastmove.inc

@@ -1,907 +1,533 @@
-{
-  Copyright (c) 2004, John O'Harrow ([email protected])
+{$ifndef FPC_SYSTEM_HAS_MOVE}
+{$define FPC_SYSTEM_HAS_MOVE}
 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the
-use of this software.
+{ at least valgrind up to 3.3 has a bug which prevents the default code from
+  working, so we use a rather simple implementation here }
+procedure Move_8OrMore_Valgrind; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
+asm
+    sub    %edx, %eax
+    jae    .LForward
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .LBack                { if no overlap, still do forward move }
 
-Permission is granted to anyone to use this software for any purpose, including
-commercial applications, and to alter it and redistribute it freely, subject to
-the following restrictions:
+.LForward:
+{$ifdef FPC_ENABLED_CLD}
+    cld
+{$endif FPC_ENABLED_CLD}
+    push   %esi
+    push   %edi
+    lea    (%eax,%edx), %esi
+    mov    %edx, %edi
+    rep movsb
+    pop    %edi
+    pop    %esi
+    pop    %ebx
+    ret
 
-1. The origin of this software must not be misrepresented; you must not claim
-   that you wrote the original software. If you use this software in a product,
-   an acknowledgment in the product documentation would be appreciated but is
-   not required.
+.LBack:
+    add    %ecx, %edx
+.LNextb:
+    dec    %edx
+    mov    (%eax,%edx), %bl
+    mov    %bl, (%edx)
+    dec    %ecx
+    jnz    .LNextb
+    pop    %ebx
+end;
 
-2. Altered source versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+procedure Move_8OrMore_IA32; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
+asm
+    fildq  (%eax)                { First and last 8 bytes, used both in .L33OrMore and in the ladder ending (.L9to16). }
+    fildq  -8(%eax,%ecx)
+    cmp    $16, %ecx
+    jle    .L9to16
+    cmp    $32, %ecx
+    jg     .L33OrMore
+    fildq  8(%eax)
+    fildq  -16(%eax,%ecx)
+    fistpq -16(%edx,%ecx)
+    fistpq 8(%edx)
+.L9to16:
+    fistpq -8(%edx,%ecx)         { 9–16 bytes }
+    fistpq (%edx)
+    pop    %ebx
+    ret
 
-3. This notice may not be removed or altered from any source distribution.
+.Lcancel:
+    fucompp                      { Pop two elements loaded at the beginning. }
+    pop    %ebx
+    ret
+    .byte  0x66,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16f into a no-op. }
 
--------------------------------------------------------------------------------
+.L33OrMore:
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lcancel              { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
 
-Version: 1.40 - 16-SEP-2004
-}
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
 
-{$ifdef USE_FASTMOVE}
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 8 bytes }
+    add    %edx, %ecx            { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
+    add    $8, %edx
+    and    $-8, %edx
+    sub    %edx, %ecx
 
-{$ifndef FPC_SYSTEM_HAS_MOVE}
-{$define FPC_SYSTEM_HAS_MOVE}
+    sub    $16, %ecx
+    jbe    .LPost16f
 
-{$asmmode intel}
+    .balign 16                   { no-op }
+.Lloop16f:
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+    fildq  8(%eax,%edx)
+    fistpq 8(%edx)
+    add    $16, %edx
+    sub    $16, %ecx
+    ja     .Lloop16f
 
-{-------------------------------------------------------------------------}
-(*
-{Just to show that a good Pascal algorithm can beat the default BASM}
-procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
-var
-  S, D       : PtrUInt;
-  Temp, C, I : PtrInt;
-  L          : PPtrInt;
-begin
-  S := Cardinal(@Source);
-  D := Cardinal(@Dest);
-  if S = D then
-    Exit;
-  if Count <= 4 then
-    case Count of
-      1 : PByte(@Dest)^ := PByte(S)^;
-      2 : PWord(@Dest)^ := PWord(S)^;
-      3 : if D > S then
-            begin
-              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
-              PWord(@Dest)^ := PWord(S)^;
-            end
-          else
-            begin
-              PWord(@Dest)^ := PWord(S)^;
-              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
-            end;
-      4 : PInteger(@Dest)^ := PInteger(S)^
-      else Exit; {Count <= 0}
-    end
-  else
-    if D > S then
-      begin
-        Temp := PInteger(S)^;
-        I := Integer(@Dest);
-        C := Count - 4;
-        L := PInteger(Integer(@Dest) + C);
-        Inc(S, C);
-        repeat
-          L^ := PInteger(S)^;
-          if Count <= 8 then
-            Break;
-          Dec(Count, 4);
-          Dec(S, 4);
-          Dec(L);
-        until False;
-        PInteger(I)^ := Temp;
-      end
-    else
-      begin
-        C := Count - 4;
-        Temp := PInteger(S + Cardinal(C))^;
-        I := Integer(@Dest) + C;
-        L := @Dest;
-        repeat
-          L^ := PInteger(S)^;
-          if Count <= 8 then
-            Break;
-          Dec(Count, 4);
-          Inc(S, 4);
-          Inc(L);
-        until False;
-        PInteger(I)^ := Temp;
-      end;
-end; {MoveJOH_PAS}
-*)
+.LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8f
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+.LFirstAndLast8f:
+    fistpq 8(%edx,%ecx)          { Write first and last 8 bytes after everything else. }
+    fistpq (%ebx)                { Important for <8-byte step between src and dest. }
+    pop    %ebx
+    ret
+    .byte  0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
 
-const
-  SMALLMOVESIZE = 36;
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
+    and    $-8, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
 
-{-------------------------------------------------------------------------}
-{Perform Forward Move of 0..36 Bytes}
-{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count.  Destroys ECX}
-procedure SmallForwardMove_3;assembler;nostackframe;
-asm
-  jmp     dword ptr @@FwdJumpTable[ecx*4]
-  align   16
-@@FwdJumpTable:
-  dd      @@Done {Removes need to test for zero size move}
-  dd      @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
-  dd      @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
-  dd      @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
-  dd      @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
-  dd      @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
-@@Fwd36:
-  mov     ecx,[eax-36]
-  mov     [edx-36],ecx
-@@Fwd32:
-  mov     ecx,[eax-32]
-  mov     [edx-32],ecx
-@@Fwd28:
-  mov     ecx,[eax-28]
-  mov     [edx-28],ecx
-@@Fwd24:
-  mov     ecx,[eax-24]
-  mov     [edx-24],ecx
-@@Fwd20:
-  mov     ecx,[eax-20]
-  mov     [edx-20],ecx
-@@Fwd16:
-  mov     ecx,[eax-16]
-  mov     [edx-16],ecx
-@@Fwd12:
-  mov     ecx,[eax-12]
-  mov     [edx-12],ecx
-@@Fwd08:
-  mov     ecx,[eax-8]
-  mov     [edx-8],ecx
-@@Fwd04:
-  mov     ecx,[eax-4]
-  mov     [edx-4],ecx
-  ret
-@@Fwd35:
-  mov     ecx,[eax-35]
-  mov     [edx-35],ecx
-@@Fwd31:
-  mov     ecx,[eax-31]
-  mov     [edx-31],ecx
-@@Fwd27:
-  mov     ecx,[eax-27]
-  mov     [edx-27],ecx
-@@Fwd23:
-  mov     ecx,[eax-23]
-  mov     [edx-23],ecx
-@@Fwd19:
-  mov     ecx,[eax-19]
-  mov     [edx-19],ecx
-@@Fwd15:
-  mov     ecx,[eax-15]
-  mov     [edx-15],ecx
-@@Fwd11:
-  mov     ecx,[eax-11]
-  mov     [edx-11],ecx
-@@Fwd07:
-  mov     ecx,[eax-7]
-  mov     [edx-7],ecx
-  mov     ecx,[eax-4]
-  mov     [edx-4],ecx
-  ret
-@@Fwd03:
-  movzx   ecx, word ptr [eax-3]
-  mov     [edx-3],cx
-  movzx   ecx, byte ptr [eax-1]
-  mov     [edx-1],cl
-  ret
-@@Fwd34:
-  mov     ecx,[eax-34]
-  mov     [edx-34],ecx
-@@Fwd30:
-  mov     ecx,[eax-30]
-  mov     [edx-30],ecx
-@@Fwd26:
-  mov     ecx,[eax-26]
-  mov     [edx-26],ecx
-@@Fwd22:
-  mov     ecx,[eax-22]
-  mov     [edx-22],ecx
-@@Fwd18:
-  mov     ecx,[eax-18]
-  mov     [edx-18],ecx
-@@Fwd14:
-  mov     ecx,[eax-14]
-  mov     [edx-14],ecx
-@@Fwd10:
-  mov     ecx,[eax-10]
-  mov     [edx-10],ecx
-@@Fwd06:
-  mov     ecx,[eax-6]
-  mov     [edx-6],ecx
-@@Fwd02:
-  movzx   ecx, word ptr [eax-2]
-  mov     [edx-2],cx
-  ret
-@@Fwd33:
-  mov     ecx,[eax-33]
-  mov     [edx-33],ecx
-@@Fwd29:
-  mov     ecx,[eax-29]
-  mov     [edx-29],ecx
-@@Fwd25:
-  mov     ecx,[eax-25]
-  mov     [edx-25],ecx
-@@Fwd21:
-  mov     ecx,[eax-21]
-  mov     [edx-21],ecx
-@@Fwd17:
-  mov     ecx,[eax-17]
-  mov     [edx-17],ecx
-@@Fwd13:
-  mov     ecx,[eax-13]
-  mov     [edx-13],ecx
-@@Fwd09:
-  mov     ecx,[eax-9]
-  mov     [edx-9],ecx
-@@Fwd05:
-  mov     ecx,[eax-5]
-  mov     [edx-5],ecx
-@@Fwd01:
-  movzx   ecx, byte ptr [eax-1]
-  mov     [edx-1],cl
-@@Done:
-end; {SmallForwardMove}
-
-{-------------------------------------------------------------------------}
-{Perform Backward Move of 0..36 Bytes}
-{On Entry, ECX = Count, EAX = Source, EDX = Dest.  Destroys ECX}
-procedure SmallBackwardMove_3;assembler;nostackframe;
-asm
-  jmp     dword ptr @@BwdJumpTable[ecx*4]
-  align   16
-@@BwdJumpTable:
-  dd      @@Done {Removes need to test for zero size move}
-  dd      @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
-  dd      @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
-  dd      @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
-  dd      @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
-  dd      @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
-@@Bwd36:
-  mov     ecx,[eax+32]
-  mov     [edx+32],ecx
-@@Bwd32:
-  mov     ecx,[eax+28]
-  mov     [edx+28],ecx
-@@Bwd28:
-  mov     ecx,[eax+24]
-  mov     [edx+24],ecx
-@@Bwd24:
-  mov     ecx,[eax+20]
-  mov     [edx+20],ecx
-@@Bwd20:
-  mov     ecx,[eax+16]
-  mov     [edx+16],ecx
-@@Bwd16:
-  mov     ecx,[eax+12]
-  mov     [edx+12],ecx
-@@Bwd12:
-  mov     ecx,[eax+8]
-  mov     [edx+8],ecx
-@@Bwd08:
-  mov     ecx,[eax+4]
-  mov     [edx+4],ecx
-@@Bwd04:
-  mov     ecx,[eax]
-  mov     [edx],ecx
-  ret
-@@Bwd35:
-  mov     ecx,[eax+31]
-  mov     [edx+31],ecx
-@@Bwd31:
-  mov     ecx,[eax+27]
-  mov     [edx+27],ecx
-@@Bwd27:
-  mov     ecx,[eax+23]
-  mov     [edx+23],ecx
-@@Bwd23:
-  mov     ecx,[eax+19]
-  mov     [edx+19],ecx
-@@Bwd19:
-  mov     ecx,[eax+15]
-  mov     [edx+15],ecx
-@@Bwd15:
-  mov     ecx,[eax+11]
-  mov     [edx+11],ecx
-@@Bwd11:
-  mov     ecx,[eax+7]
-  mov     [edx+7],ecx
-@@Bwd07:
-  mov     ecx,[eax+3]
-  mov     [edx+3],ecx
-  mov     ecx,[eax]
-  mov     [edx],ecx
-  ret
-@@Bwd03:
-  movzx   ecx, word ptr [eax+1]
-  mov     [edx+1],cx
-  movzx   ecx, byte ptr [eax]
-  mov     [edx],cl
-  ret
-@@Bwd34:
-  mov     ecx,[eax+30]
-  mov     [edx+30],ecx
-@@Bwd30:
-  mov     ecx,[eax+26]
-  mov     [edx+26],ecx
-@@Bwd26:
-  mov     ecx,[eax+22]
-  mov     [edx+22],ecx
-@@Bwd22:
-  mov     ecx,[eax+18]
-  mov     [edx+18],ecx
-@@Bwd18:
-  mov     ecx,[eax+14]
-  mov     [edx+14],ecx
-@@Bwd14:
-  mov     ecx,[eax+10]
-  mov     [edx+10],ecx
-@@Bwd10:
-  mov     ecx,[eax+6]
-  mov     [edx+6],ecx
-@@Bwd06:
-  mov     ecx,[eax+2]
-  mov     [edx+2],ecx
-@@Bwd02:
-  movzx   ecx, word ptr [eax]
-  mov     [edx],cx
-  ret
-@@Bwd33:
-  mov     ecx,[eax+29]
-  mov     [edx+29],ecx
-@@Bwd29:
-  mov     ecx,[eax+25]
-  mov     [edx+25],ecx
-@@Bwd25:
-  mov     ecx,[eax+21]
-  mov     [edx+21],ecx
-@@Bwd21:
-  mov     ecx,[eax+17]
-  mov     [edx+17],ecx
-@@Bwd17:
-  mov     ecx,[eax+13]
-  mov     [edx+13],ecx
-@@Bwd13:
-  mov     ecx,[eax+9]
-  mov     [edx+9],ecx
-@@Bwd09:
-  mov     ecx,[eax+5]
-  mov     [edx+5],ecx
-@@Bwd05:
-  mov     ecx,[eax+1]
-  mov     [edx+1],ecx
-@@Bwd01:
-  movzx   ecx, byte ptr[eax]
-  mov     [edx],cl
-@@Done:
-end; {SmallBackwardMove}
+    sub    $16, %ecx
+    jbe    .LPost16b
 
+    .balign 16                   { no-op }
+.Lloop16b:
+    sub    $16, %edx
+    fildq  8(%eax,%edx)
+    fistpq 8(%edx)
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+    sub    $16, %ecx
+    ja     .Lloop16b
 
-{ at least valgrind up to 3.3 has a bug which prevents the default code to
-  work so we use a rather simple implementation here
-}
-procedure Forwards_Valgrind;assembler;nostackframe;
-asm
-{$ifdef FPC_ENABLED_CLD}
-  cld
-{$endif FPC_ENABLED_CLD}
-  push    esi
-  push    edi
-  mov     esi,eax
-  mov     edi,edx
-  rep     movsb
-  pop     edi
-  pop     esi
+.LPost16b:
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8b
+    fildq  -8(%eax,%edx)
+    fistpq -8(%edx)
+.LFirstAndLast8b:
+    sub    %ecx, %edx
+    fistpq -7(%ebx)
+    fistpq -16(%edx)
+    pop    %ebx
 end;
 
-{ at least valgrind up to 3.3 has a bug which prevents the default code to
-  work so we use a rather simple implementation here
-}
-procedure Backwards_Valgrind;assembler;nostackframe;
+procedure Move_8OrMore_MMX; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
 asm
-  push    esi
-  push    edi
-  lea     esi,[eax+ecx-1]
-  lea     edi,[edx+ecx-1]
-@@repeat:
-  mov     al,[esi]
-  mov     [edi],al
-  dec     esi
-  dec     edi
-  dec     ecx
-  jnz     @@repeat
-  pop     edi
-  pop     esi
+    cmp    $72, %ecx             { Size at which using MMX becomes worthwhile. }
+    jl     Move_8OrMore_IA32
+    movq   (%eax), %mm4          { First and last 8 bytes. }
+    movq   -8(%eax,%ecx), %mm5
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lquit                { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
+
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
+
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 8 bytes }
+    add    %edx, %ecx            { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
+    add    $8, %edx
+    and    $-8, %edx
+    sub    %edx, %ecx
+
+    sub    $16, %ecx
+    jbe    .LPost16f
+
+    .balign 16
+.Lloop16f:
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+    movq   8(%eax,%edx), %mm0
+    movq   %mm0, 8(%edx)
+    add    $16, %edx
+    sub    $16, %ecx
+    ja     .Lloop16f
+
+.LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8f
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+.LFirstAndLast8f:
+    movq   %mm5, 8(%edx,%ecx)    { Write first and last 8 bytes after everything else. }
+    movq   %mm4, (%ebx)          { Important for <8-byte step between src and dest. }
+.Lquit:
+    emms
+    pop    %ebx
+    ret
+    .byte  0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
+
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
+    and    $-8, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
+
+    sub    $16, %ecx
+    jbe    .LPost16b
+
+    .balign 16                   { no-op }
+.Lloop16b:
+    sub    $16, %edx
+    movq   8(%eax,%edx), %mm0
+    movq   %mm0, 8(%edx)
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+    sub    $16, %ecx
+    ja     .Lloop16b
+
+.LPost16b:
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8b
+    movq   -8(%eax,%edx), %mm0
+    movq   %mm0, -8(%edx)
+.LFirstAndLast8b:
+    sub    %ecx, %edx
+    movq   %mm4, -16(%edx)
+    movq   %mm5, -7(%ebx)
+    emms
+    pop    %ebx
 end;
 
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Forwards_IA32_3;assembler;nostackframe;
-asm
-  push    ebx
-  mov     ebx,edx
-  fild    qword ptr [eax]
-  add     eax,ecx {QWORD Align Writes}
-  add     ecx,edx
-  add     edx,7
-  and     edx,-8
-  sub     ecx,edx
-  add     edx,ecx {Now QWORD Aligned}
-  sub     ecx,16
-  neg     ecx
-@FwdLoop:
-  fild    qword ptr [eax+ecx-16]
-  fistp   qword ptr [edx+ecx-16]
-  fild    qword ptr [eax+ecx-8]
-  fistp   qword ptr [edx+ecx-8]
-  add     ecx,16
-  jle     @FwdLoop
-  fistp   qword ptr [ebx]
-  neg     ecx
-  add     ecx,16
-  pop     ebx
-  jmp     SmallForwardMove_3
-end; {Forwards_IA32}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Backwards_IA32_3;assembler;nostackframe;
-asm
-  push    ebx
-  fild    qword ptr [eax+ecx-8]
-  lea     ebx,[edx+ecx] {QWORD Align Writes}
-  and     ebx,7
-  sub     ecx,ebx
-  add     ebx,ecx {Now QWORD Aligned, EBX = Original Length}
-  sub     ecx,16
-@BwdLoop:
-  fild    qword ptr [eax+ecx]
-  fild    qword ptr [eax+ecx+8]
-  fistp   qword ptr [edx+ecx+8]
-  fistp   qword ptr [edx+ecx]
-  sub     ecx,16
-  jge     @BwdLoop
-  fistp   qword ptr [edx+ebx-8]
-  add     ecx,16
-  pop     ebx
-  jmp     SmallBackwardMove_3
-end; {Backwards_IA32}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Forwards_MMX_3;assembler;nostackframe;
-const
-  LARGESIZE = 1024;
-asm
-  cmp     ecx,LARGESIZE
-  jge     @FwdLargeMove
-  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
-  jl      Forwards_IA32_3
-  push    ebx
-  mov     ebx,edx
-  movq    mm0,[eax] {First 8 Characters}
-  {QWORD Align Writes}
-  add     eax,ecx
-  add     ecx,edx
-  add     edx,7
-  and     edx,-8
-  sub     ecx,edx
-  add     edx,ecx
-  {Now QWORD Aligned}
-  sub     ecx,32
-  neg     ecx
-@FwdLoopMMX:
-  movq    mm1,[eax+ecx-32]
-  movq    mm2,[eax+ecx-24]
-  movq    mm3,[eax+ecx-16]
-  movq    mm4,[eax+ecx- 8]
-  movq    [edx+ecx-32],mm1
-  movq    [edx+ecx-24],mm2
-  movq    [edx+ecx-16],mm3
-  movq    [edx+ecx- 8],mm4
-  add     ecx,32
-  jle     @FwdLoopMMX
-  movq    [ebx],mm0 {First 8 Characters}
-  emms
-  pop     ebx
-  neg     ecx
-  add     ecx,32
-  jmp     SmallForwardMove_3
-@FwdLargeMove:
-  push    ebx
-  mov     ebx,ecx
-  test    edx,15
-  jz      @FwdAligned
-  {16 byte Align Destination}
-  mov     ecx,edx
-  add     ecx,15
-  and     ecx,-16
-  sub     ecx,edx
-  add     eax,ecx
-  add     edx,ecx
-  sub     ebx,ecx
-  {Destination now 16 Byte Aligned}
-  call    SmallForwardMove_3
-@FwdAligned:
-  mov     ecx,ebx
-  and     ecx,-16
-  sub     ebx,ecx {EBX = Remainder}
-  push    esi
-  push    edi
-  mov     esi,eax          {ESI = Source}
-  mov     edi,edx          {EDI = Dest}
-  mov     eax,ecx          {EAX = Count}
-  and     eax,-64          {EAX = No of Bytes to Blocks Moves}
-  and     ecx,$3F          {ECX = Remaining Bytes to Move (0..63)}
-  add     esi,eax
-  add     edi,eax
-  shr     eax,3            {EAX = No of QWORD's to Block Move}
-  neg     eax
-@MMXcopyloop:
-  movq    mm0,[esi+eax*8   ]
-  movq    mm1,[esi+eax*8+ 8]
-  movq    mm2,[esi+eax*8+16]
-  movq    mm3,[esi+eax*8+24]
-  movq    mm4,[esi+eax*8+32]
-  movq    mm5,[esi+eax*8+40]
-  movq    mm6,[esi+eax*8+48]
-  movq    mm7,[esi+eax*8+56]
-  movq    [edi+eax*8   ],mm0
-  movq    [edi+eax*8+ 8],mm1
-  movq    [edi+eax*8+16],mm2
-  movq    [edi+eax*8+24],mm3
-  movq    [edi+eax*8+32],mm4
-  movq    [edi+eax*8+40],mm5
-  movq    [edi+eax*8+48],mm6
-  movq    [edi+eax*8+56],mm7
-  add     eax,8
-  jnz     @MMXcopyloop
-  emms                   {Empty MMX State}
-{$ifdef FPC_ENABLED_CLD}
-  cld
-{$endif FPC_ENABLED_CLD}
-  add     ecx,ebx
-  shr     ecx,2
-  rep     movsd
-  mov     ecx,ebx
-  and     ecx,3
-  rep     movsb
-  pop     edi
-  pop     esi
-  pop     ebx
-end; {Forwards_MMX}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Backwards_MMX_3;assembler;nostackframe;
-asm
-  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
-  jl      Backwards_IA32_3
-  push    ebx
-  movq    mm0,[eax+ecx-8] {Get Last QWORD}
-  {QWORD Align Writes}
-  lea     ebx,[edx+ecx]
-  and     ebx,7
-  sub     ecx,ebx
-  add     ebx,ecx
-  {Now QWORD Aligned}
-  sub     ecx,32
-@BwdLoopMMX:
-  movq    mm1,[eax+ecx   ]
-  movq    mm2,[eax+ecx+ 8]
-  movq    mm3,[eax+ecx+16]
-  movq    mm4,[eax+ecx+24]
-  movq    [edx+ecx+24],mm4
-  movq    [edx+ecx+16],mm3
-  movq    [edx+ecx+ 8],mm2
-  movq    [edx+ecx   ],mm1
-  sub     ecx,32
-  jge     @BwdLoopMMX
-  movq    [edx+ebx-8], mm0 {Last QWORD}
-  emms
-  add     ecx,32
-  pop     ebx
-  jmp     SmallBackwardMove_3
-end; {Backwards_MMX}
-
-{$ifndef FASTMOVE_DISABLE_SSE3}
-{-------------------------------------------------------------------------}
-{Dest MUST be 16-Byes Aligned, Count MUST be multiple of 16 }
-procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
-const
-  Prefetch = 512;
-asm
-  push    esi
-  mov     esi,eax             {ESI = Source}
-  mov     eax,ecx             {EAX = Count}
-  and     eax,-128            {EAX = No of Bytes to Block Move}
-  add     esi,eax
-  add     edx,eax
-  shr     eax,3               {EAX = No of QWORD's to Block Move}
-  neg     eax
-  cmp     eax, -(32*1024)     {Count > 256K}
-  jl      @Large
-@Small: {Count<=256K}
-  test    esi,15              {Check if Both Source/Dest Aligned}
-  jnz     @SmallUnaligned
-@SmallAligned:                {Both Source and Dest 16-Byte Aligned}
-@SmallAlignedLoop:
-  movaps  xmm0,[esi+8*eax]
-  movaps  xmm1,[esi+8*eax+16]
-  movaps  xmm2,[esi+8*eax+32]
-  movaps  xmm3,[esi+8*eax+48]
-  movaps  [edx+8*eax],xmm0
-  movaps  [edx+8*eax+16],xmm1
-  movaps  [edx+8*eax+32],xmm2
-  movaps  [edx+8*eax+48],xmm3
-  movaps  xmm4,[esi+8*eax+64]
-  movaps  xmm5,[esi+8*eax+80]
-  movaps  xmm6,[esi+8*eax+96]
-  movaps  xmm7,[esi+8*eax+112]
-  movaps  [edx+8*eax+64],xmm4
-  movaps  [edx+8*eax+80],xmm5
-  movaps  [edx+8*eax+96],xmm6
-  movaps  [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @SmallAlignedLoop
-  jmp     @Remainder
-@SmallUnaligned:              {Source Not 16-Byte Aligned}
-@SmallUnalignedLoop:
-  movups  xmm0,[esi+8*eax]
-  movups  xmm1,[esi+8*eax+16]
-  movups  xmm2,[esi+8*eax+32]
-  movups  xmm3,[esi+8*eax+48]
-  movaps  [edx+8*eax],xmm0
-  movaps  [edx+8*eax+16],xmm1
-  movaps  [edx+8*eax+32],xmm2
-  movaps  [edx+8*eax+48],xmm3
-  movups  xmm4,[esi+8*eax+64]
-  movups  xmm5,[esi+8*eax+80]
-  movups  xmm6,[esi+8*eax+96]
-  movups  xmm7,[esi+8*eax+112]
-  movaps  [edx+8*eax+64],xmm4
-  movaps  [edx+8*eax+80],xmm5
-  movaps  [edx+8*eax+96],xmm6
-  movaps  [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @SmallUnalignedLoop
-  jmp     @Remainder
-@Large: {Count>256K}
-  test    esi,15              {Check if Both Source/Dest Aligned}
-  jnz     @LargeUnaligned
-@LargeAligned:                {Both Source and Dest 16-Byte Aligned}
-@LargeAlignedLoop:
-  prefetchnta  [esi+8*eax+Prefetch]
-  prefetchnta  [esi+8*eax+Prefetch+64]
-  movaps  xmm0,[esi+8*eax]
-  movaps  xmm1,[esi+8*eax+16]
-  movaps  xmm2,[esi+8*eax+32]
-  movaps  xmm3,[esi+8*eax+48]
-  movntps [edx+8*eax],xmm0
-  movntps [edx+8*eax+16],xmm1
-  movntps [edx+8*eax+32],xmm2
-  movntps [edx+8*eax+48],xmm3
-  movaps  xmm4,[esi+8*eax+64]
-  movaps  xmm5,[esi+8*eax+80]
-  movaps  xmm6,[esi+8*eax+96]
-  movaps  xmm7,[esi+8*eax+112]
-  movntps [edx+8*eax+64],xmm4
-  movntps [edx+8*eax+80],xmm5
-  movntps [edx+8*eax+96],xmm6
-  movntps [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @LargeAlignedLoop
-  sfence
-  jmp     @Remainder
-@LargeUnaligned:              {Source Not 16-Byte Aligned}
-@LargeUnalignedLoop:
-  prefetchnta  [esi+8*eax+Prefetch]
-  prefetchnta  [esi+8*eax+Prefetch+64]
-  movups  xmm0,[esi+8*eax]
-  movups  xmm1,[esi+8*eax+16]
-  movups  xmm2,[esi+8*eax+32]
-  movups  xmm3,[esi+8*eax+48]
-  movntps [edx+8*eax],xmm0
-  movntps [edx+8*eax+16],xmm1
-  movntps [edx+8*eax+32],xmm2
-  movntps [edx+8*eax+48],xmm3
-  movups  xmm4,[esi+8*eax+64]
-  movups  xmm5,[esi+8*eax+80]
-  movups  xmm6,[esi+8*eax+96]
-  movups  xmm7,[esi+8*eax+112]
-  movntps [edx+8*eax+64],xmm4
-  movntps [edx+8*eax+80],xmm5
-  movntps [edx+8*eax+96],xmm6
-  movntps [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @LargeUnalignedLoop
-  sfence
-@Remainder:
-  and     ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
-  jz      @Done
-  add     esi,ecx
-  add     edx,ecx
-  neg     ecx
-@RemainderLoop:
-  movups  xmm0,[esi+ecx]
-  movaps  [edx+ecx],xmm0
-  add     ecx,16
-  jnz     @RemainderLoop
-@Done:
-  pop     esi
-end; {AlignedFwdMoveSSE}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Forwards_SSE_3;assembler;nostackframe;
+{$ifndef FASTMOVE_DISABLE_SSE}
+procedure Move_8OrMore_SSE; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
 const
-  LARGESIZE = 2048;
+  ErmsThreshold = 1536;
+  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
+  PrefetchDistance = 512;
 asm
-  cmp     ecx,LARGESIZE
-  jge     @FwdLargeMove
-  cmp     ecx,SMALLMOVESIZE+32
-  movups  xmm0,[eax]
-  jg      @FwdMoveSSE
-  movups  xmm1,[eax+16]
-  movups  [edx],xmm0
-  movups  [edx+16],xmm1
-  add     eax,ecx
-  add     edx,ecx
-  sub     ecx,32
-  jmp     SmallForwardMove_3
-@FwdMoveSSE:
-  push    ebx
-  mov     ebx,edx
-  {Align Writes}
-  add     eax,ecx
-  add     ecx,edx
-  add     edx,15
-  and     edx,-16
-  sub     ecx,edx
-  add     edx,ecx
-  {Now Aligned}
-  sub     ecx,32
-  neg     ecx
-@FwdLoopSSE:
-  movups  xmm1,[eax+ecx-32]
-  movups  xmm2,[eax+ecx-16]
-  movaps  [edx+ecx-32],xmm1
-  movaps  [edx+ecx-16],xmm2
-  add     ecx,32
-  jle     @FwdLoopSSE
-  movups  [ebx],xmm0 {First 16 Bytes}
-  neg     ecx
-  add     ecx,32
-  pop     ebx
-  jmp     SmallForwardMove_3
-@FwdLargeMove:
-  push    ebx
-  mov     ebx,ecx
-  test    edx,15
-  jz      @FwdLargeAligned
-  {16 byte Align Destination}
-  mov     ecx,edx
-  add     ecx,15
-  and     ecx,-16
-  sub     ecx,edx
-  add     eax,ecx
-  add     edx,ecx
-  sub     ebx,ecx
-  {Destination now 16 Byte Aligned}
-  call    SmallForwardMove_3
-  mov     ecx,ebx
-@FwdLargeAligned:
-  and     ecx,-16
-  sub     ebx,ecx {EBX = Remainder}
-  push    edx
-  push    eax
-  push    ecx
-  call    AlignedFwdMoveSSE_3
-  pop     ecx
-  pop     eax
-  pop     edx
-  add     ecx,ebx
-  add     eax,ecx
-  add     edx,ecx
-  mov     ecx,ebx
-  pop     ebx
-  jmp     SmallForwardMove_3
-end; {Forwards_SSE}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Backwards_SSE_3;assembler;nostackframe;
-asm
-  cmp     ecx,SMALLMOVESIZE+32
-  jg      @BwdMoveSSE
-  sub     ecx,32
-  movups  xmm1,[eax+ecx]
-  movups  xmm2,[eax+ecx+16]
-  movups  [edx+ecx],xmm1
-  movups  [edx+ecx+16],xmm2
-  jmp     SmallBackwardMove_3
-@BwdMoveSSE:
-  push    ebx
-  movups  xmm0,[eax+ecx-16] {Last 16 Bytes}
-  {Align Writes}
-  lea     ebx,[edx+ecx]
-  and     ebx,15
-  sub     ecx,ebx
-  add     ebx,ecx
-  {Now Aligned}
-  sub     ecx,32
-@BwdLoop:
-  movups  xmm1,[eax+ecx]
-  movups  xmm2,[eax+ecx+16]
-  movaps  [edx+ecx],xmm1
-  movaps  [edx+ecx+16],xmm2
-  sub     ecx,32
-  jge     @BwdLoop
-  movups  [edx+ebx-16],xmm0  {Last 16 Bytes}
-  add     ecx,32
-  pop     ebx
-  jmp     SmallBackwardMove_3
-end; {Backwards_SSE}
-{$endif ndef FASTMOVE_DISABLE_SSE3}
+    cmp    $16, %ecx
+    jle    .L9to16
+    movups (%eax), %xmm4         { First and last 16 bytes, used both in .L33OrMore and in the 17–32 branch. }
+    movups -16(%eax,%ecx), %xmm5
+    cmp    $32, %ecx
+    jg     .L33OrMore
+    movups %xmm4, (%edx)         { 17–32 bytes }
+    movups %xmm5, -16(%edx,%ecx)
+    pop    %ebx
+    ret
 
-const
-   fastmoveproc_forward : pointer = @Forwards_IA32_3;
-   fastmoveproc_backward : pointer = @Backwards_IA32_3;
+.L9to16:
+    movq   (%eax), %xmm0
+    movq   -8(%eax,%ecx), %xmm1
+    movq   %xmm0, (%edx)
+    movq   %xmm1, -8(%edx,%ecx)
+.Lquit:
+    pop    %ebx
+    ret
+    .byte  0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop32f into a no-op. }
 
-procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
-asm
-  cmp     ecx,SMALLMOVESIZE
-  ja      @Large
-  cmp     eax,edx
-  lea     eax,[eax+ecx]
-  jle     @SmallCheck
-@SmallForward:
-  add     edx,ecx
-  jmp     SmallForwardMove_3
-@SmallCheck:
-  je      @Done {For Compatibility with Delphi's move for Source = Dest}
-  sub     eax,ecx
-  jmp     SmallBackwardMove_3
-@Large:
-  jng     @Done {For Compatibility with Delphi's move for Count < 0}
-  cmp     eax,edx
-  jg      @moveforward
-  je      @Done {For Compatibility with Delphi's move for Source = Dest}
-  push    eax
-  add     eax,ecx
-  cmp     eax,edx
-  pop     eax
-  jg      @movebackward
-@moveforward:
-  jmp     dword ptr fastmoveproc_forward
-@movebackward:
-  jmp     dword ptr fastmoveproc_backward {Source/Dest Overlap}
-@Done:
+.L33OrMore:
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lquit                { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
+
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
+
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 16 bytes }
+    add    %edx, %ecx            { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
+    add    $16, %edx
+    and    $-16, %edx
+    sub    %edx, %ecx
+
+.LRestAfterNTf:
+    sub    $32, %ecx             { During the N× loop, ecx is N bytes less than what actually remains, allowing sub N + jae .LLoop instead of sub N + cmp N + jae .LLoop. }
+    jbe    .LPost32f
+    cmp    $NtThreshold-32, %ecx
+    jae    .Lntf                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
+.LNtIsNotBetter:
+    cmp    $ErmsThreshold-32, %ecx { Even enhanced REP MOVSB does not seem to use NT stores, so it falls behind on huge moves; prioritize NT. }
+    jae    .LRepMovsF
+.LRepMovsIsNotBetter:
+    test   $15, %eax
+    jz     .Lalignedloop32f
+
+    .balign 16                   { no-op }
+.Lloop32f:
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    movups 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    add    $32, %edx
+    sub    $32, %ecx
+    ja     .Lloop32f
+
+.LPost32f:                       { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
+    cmp    $-16, %ecx
+    jle    .LFirstAndLast16f
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+.LFirstAndLast16f:
+    movups %xmm5, 16(%edx,%ecx)  { Write first and last 16 bytes after everything else. }
+    movups %xmm4, (%ebx)         { Important for <16-byte step between src and dest. }
+    pop    %ebx
+    ret
+
+    .balign 16
+.Lalignedloop32f:                { Same as above starting from .Lloop32f but with MOVAPSes. }
+    movaps (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    movaps 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    add    $32, %edx
+    sub    $32, %ecx
+    ja     .Lalignedloop32f
+
+.LalignedPost32f:
+    cmp    $-16, %ecx
+    jle    .LalignedFirstAndLast16f
+    movaps (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+.LalignedFirstAndLast16f:
+    movups %xmm5, 16(%edx,%ecx)
+    movups %xmm4, (%ebx)
+    pop    %ebx
+    ret
+
+.LRepMovsF:
+{$ifdef FPC_PIC}
+    push   %ebx
+    call   fpc_geteipasebx
+    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
+    movl   fast_large_repmovstosb@GOT(%ebx), %ebx
+    cmpb   $1, (%ebx)
+    pop    %ebx
+{$else FPC_PIC}
+    cmpb   $1, fast_large_repmovstosb
+{$endif FPC_PIC}
+    jne    .LRepMovsIsNotBetter
+    push   %esi
+    push   %edi
+    lea    (%eax,%edx), %esi
+    mov    %edx, %edi
+    add    $32, %ecx
+    rep movsb
+    movups %xmm4, (%ebx)         { last 16 aren't required }
+    pop    %edi
+    pop    %esi
+    pop    %ebx
+    ret
+
+.Lntf:
+    cmp    $NtThreshold, %eax    { Maybe change our mind: don't bother bypassing the cache if src and dest are close to each other }
+    jb     .LNtIsNotBetter       { (this check is done here so it doesn't stand in the way of smaller counts) }
+    sub    $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
+    test   $15, %eax
+    jz     .Lalignedntloop64f
+
+    .balign 16
+.Lntloop64f:
+    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
+    movups (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    movups 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movups 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movups 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    add    $64, %edx
+    sub    $64, %ecx
+    jae    .Lntloop64f
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTf        { go handle remaining bytes }
+
+    .balign 16
+.Lalignedntloop64f:              { Same as above starting from .Lntloop64f but with MOVAPSes. }
+    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
+    movaps (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    movaps 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movaps 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movaps 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    add    $64, %edx
+    sub    $64, %ecx
+    jae    .Lalignedntloop64f
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTf
+    .byte  0x66,0x0F,0x1F,0x44,0,0 { Turns .balign 16 before .Lloop32b into a no-op. }
+
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 16 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 16-byte boundary... }
+    and    $-16, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
+
+.LRestAfterNTb:
+    sub    $32, %ecx
+    jbe    .LPost32b
+    cmp    $NtThreshold-32, %ecx
+    jae    .Lntb
+
+    .balign 16                   { no-op }
+.Lloop32b:
+    sub    $32, %edx
+    movups 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    sub    $32, %ecx
+    ja     .Lloop32b
+
+.LPost32b:
+    cmp    $-16, %ecx
+    jle    .LFirstAndLast16b
+    movups -16(%eax,%edx), %xmm0
+    movaps %xmm0, -16(%edx)
+.LFirstAndLast16b:
+    sub    %ecx, %edx
+    movups %xmm4, -32(%edx)
+    movups %xmm5, -15(%ebx)
+    pop    %ebx
+    ret
+
+.Lntb:
+    cmp    $-NtThreshold, %eax
+    jnb    .Lloop32b
+    sub    $PrefetchDistance+32, %ecx
+
+    .balign 16
+.Lntloop64b:
+    prefetchnta -PrefetchDistance(%eax,%edx,1)
+    sub    $64, %edx
+    movups 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    movups 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movups 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movups (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    sub    $64, %ecx
+    jae    .Lntloop64b
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTb
 end;
+{$endif ndef FASTMOVE_DISABLE_SSE}
+
+procedure Move_8OrMore_Dispatch; forward;
 
-{$asmmode att}
-{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
 var
+  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
+{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
   valgrind_used : boolean;external name '__fpc_valgrind';
 {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
 
-procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
-  begin
-    { workaround valgrind bug }
+function Move_8OrMore_HumanFriendlyDispatch: pointer;
+begin
+  { workaround valgrind bug }
 {$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-    if EntryInformation.valgrind_used then
+  if EntryInformation.valgrind_used then
 {$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-    if valgrind_used then
+  if valgrind_used then
 {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-      begin
-        fastmoveproc_forward:=@Forwards_Valgrind;
-        fastmoveproc_backward:=@Backwards_Valgrind;
-      end
-{$ifndef FASTMOVE_DISABLE_SSE3}
-    else if has_sse_support then
-      begin
-        fastmoveproc_forward:=@Forwards_SSE_3;
-        fastmoveproc_backward:=@Backwards_SSE_3;
-      end
-{$endif ndef FASTMOVE_DISABLE_SSE3}
-   else if has_mmx_support then
-      begin
-        fastmoveproc_forward:=@Forwards_MMX_3;
-        fastmoveproc_backward:=@Backwards_MMX_3;
-      end;
-  end;
+    result:=@Move_8OrMore_Valgrind
+{$ifndef FASTMOVE_DISABLE_SSE}
+  else if has_sse_support then
+    result:=@Move_8OrMore_SSE
+{$endif ndef FASTMOVE_DISABLE_SSE}
+  else if has_mmx_support then
+    result:=@Move_8OrMore_MMX
+  else
+    result:=@Move_8OrMore_IA32;
+  if fpc_cpucodeinit_performed then
+    fastmoveproc:=result;
+end;
 
-{$endif  FPC_SYSTEM_HAS_MOVE}
+procedure Move_8OrMore_Dispatch; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
+asm
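+    { Trampoline: save the argument registers, resolve the best implementation, then tail-jump to it. ebx is free as scratch here because Move has already pushed it. }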
+    push %eax
+    push %edx
+    push %ecx
+    call Move_8OrMore_HumanFriendlyDispatch
+    mov  %eax, %ebx
+    pop  %ecx
+    pop  %edx
+    pop  %eax
+    jmp  %ebx
+end;
 
+procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
+asm
+    push   %ebx
+    cmp    $8, %ecx
+    jle    .L8OrLess
+{$ifdef FPC_PIC}
+    call   fpc_geteipasebx
+    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
+    movl   fastmoveproc@GOT(%ebx), %ebx
+    jmp    (%ebx)
+{$else}
+    jmp    fastmoveproc
 {$endif}
+
+.L8OrLess:
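+    { Count in 4..8: two possibly overlapping 4-byte loads/stores cover the whole range. }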
+    cmp    $3, %ecx
+    jle    .L3OrLess
+    mov    (%eax), %ebx
+    mov    -4(%eax,%ecx), %eax
+    mov    %ebx, (%edx)
+    mov    %eax, -4(%edx,%ecx)
+    pop    %ebx
+    ret
+
+.L3OrLess:
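+    { Count in 0..3: first byte, then (for counts 2 and 3) the last two bytes, which may overlap it. }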
+    cmp    $1, %ecx
+    jl     .LZero
+    movzbl (%eax), %ebx
+    je     .LOne
+    movzwl -2(%eax,%ecx), %eax
+    mov    %ax, -2(%edx,%ecx)
+.LOne:
+    mov    %bl, (%edx)
+.LZero:
+    pop    %ebx
+end;
+
+{$endif  FPC_SYSTEM_HAS_MOVE}

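The forward/backward decision shared by all four implementations above comes down to one subtraction and one unsigned comparison: after eax := src - dest, a backward copy is only needed when dest lies inside [src, src+count), i.e. when the (negative) difference plus count is still positive. A minimal Pascal sketch of the same rule, assuming {$mode objfpc}; MoveSketch and its per-byte loops are illustrative only, not RTL code:

    { Pick the copy direction the way .L33OrMore does: overlap only matters
      when dest sits less than count bytes above src. }
    procedure MoveSketch(const source; var dest; count: SizeInt);
    var
      s, d: PtrUInt;
      i: SizeInt;
    begin
      s:=PtrUInt(@source);
      d:=PtrUInt(@dest);
      if (s=d) or (count<=0) then
        exit;
      if (d>s) and (d-s<PtrUInt(count)) then
        { a forward copy would clobber the source tail => copy backwards }
        for i:=count-1 downto 0 do
          PByte(d+PtrUInt(i))^:=PByte(s+PtrUInt(i))^
      else
        for i:=0 to count-1 do
          PByte(d+PtrUInt(i))^:=PByte(s+PtrUInt(i))^;
    end;

The non-Valgrind routines avoid per-byte work by loading the first and last 8 or 16 bytes up front and storing them after the main loop, which also absorbs any unaligned head and tail without extra branches.
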
+ 23 - 23
rtl/i386/i386.inc

@@ -25,6 +25,8 @@ var
   os_supports_sse : boolean;
   { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
   sse_check : boolean;
+  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
+  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
 
 {$asmmode ATT}
 
@@ -47,15 +49,6 @@ function cpuid_support : boolean;assembler;nostackframe;
     setnz   %al
   end;
 
-{$ifndef FPC_PIC}
-{$ifndef FPC_SYSTEM_HAS_MOVE}
-{$ifndef OLD_ASSEMBLER}
-{$define USE_FASTMOVE}
-{$i fastmove.inc}
-{$endif not OLD_ASSEMBLER}
-{$endif FPC_SYSTEM_HAS_MOVE}
-{$endif FPC_PIC}
-
 {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
 procedure fpc_cpuinit;
   begin
@@ -63,7 +56,6 @@ procedure fpc_cpuinit;
       must be implemented OS dependend (FK)
     has_sse_support:=sse_support;
     has_mmx_support:=mmx_support;
-    setup_fastmove;
     }
   end;
 
@@ -80,6 +72,12 @@ asm
 end;
 {$endif}
 
+{$if not defined(FPC_SYSTEM_HAS_MOVE)
+ and not defined(OLD_ASSEMBLER)
+ and not defined(darwin)}
+{$i fastmove.inc}
+{$endif}
+
 {$ifndef FPC_SYSTEM_HAS_MOVE}
 {$define FPC_SYSTEM_HAS_MOVE}
 
@@ -2027,7 +2025,7 @@ Procedure SysResetFPU;
 { because of the brain dead sse detection on x86, this test is post poned }
 procedure fpc_cpucodeinit;
   var
-    _eax,_ecx_cpuid1,_edx_cpuid1,_ebx : longint;
+    _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
   begin
     if cpuid_support then
       begin
@@ -2067,23 +2065,27 @@ procedure fpc_cpucodeinit;
               cpuid
               movl %eax,_eax
             end;
-            if (_eax>=7) and (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
+            if _eax>=7 then
               begin
                 asm
+                  movl $7,%eax
                   xorl %ecx,%ecx
-                  .byte   0x0f,0x01,0xd0 { xgetbv }
-                  movl %eax,_eax
+                  cpuid
+                  movl %ebx,_ebx_cpuid7
                 end;
-                if (_eax and 6)=6 then
+                fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
+                if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                   begin
-                    has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
                     asm
-                      movl $7,%eax
                       xorl %ecx,%ecx
-                      cpuid
-                      movl %ebx,_ebx
+                      .byte   0x0f,0x01,0xd0 { xgetbv }
+                      movl %eax,_eax
                     end;
-                    has_avx2_support:=(_ebx and $20)<>0;
+                    if (_eax and 6)=6 then
+                      begin
+                        has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
+                        has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
+                      end;
                   end;
               end;
           end;
@@ -2098,9 +2100,7 @@ procedure fpc_cpucodeinit;
       end;
 
     SysResetFPU;
-{$ifdef USE_FASTMOVE}
-    setup_fastmove;
-{$endif}
+    fpc_cpucodeinit_performed:=true;
   end;
 
 

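For reference, the bit tested above is the architecturally defined ERMS flag: CPUID leaf 7, sub-leaf 0, EBX bit 9 ("Enhanced REP MOVSB/STOSB"). A standalone sketch of the same detection, assuming {$asmmode ATT}, that cpuid_support has already been checked, and ignoring the PIC case; HasERMSB is an illustrative name, not an RTL symbol:

    function HasERMSB: boolean;
      var
        _eax, _ebx_cpuid7: longint;
      begin
        asm            { highest supported basic leaf }
          movl $0, %eax
          cpuid
          movl %eax, _eax
        end ['eax','ebx','ecx','edx'];
        result:=false;
        if _eax>=7 then
          begin
            asm        { leaf 7, sub-leaf 0 }
              movl $7, %eax
              xorl %ecx, %ecx
              cpuid
              movl %ebx, _ebx_cpuid7
            end ['eax','ebx','ecx','edx'];
            result:=(_ebx_cpuid7 and (1 shl 9))<>0; { EBX.ERMS[bit 9] }
          end;
      end;
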
+ 2 - 2
rtl/watcom/system.pp

@@ -25,8 +25,8 @@ INTERFACE
 {$define FPC_ANSI_TEXTFILEREC}
 { include system-independent routine headers }
 
-{ wasm does not support SSE3 instructions }
-{$define FASTMOVE_DISABLE_SSE3}
+{ wasm does not support SSE instructions }
+{$define FASTMOVE_DISABLE_SSE}
 
 {$include systemh.inc}
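
Any other target whose assembler lacks SSE support can opt out the same way; the define just has to be in effect before the shared RTL headers are included, as watcom's system.pp does here:

    { in the target's system.pp, before the shared headers }
    {$define FASTMOVE_DISABLE_SSE}
    {$include systemh.inc}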