
Supposedly better fastmove.inc.

Rika Ichinose · 1 year ago
commit 0750777fc8
3 changed files with 522 additions and 896 deletions:
  1. rtl/i386/fastmove.inc   +497 -871
  2. rtl/i386/i386.inc       +23  -23
  3. rtl/watcom/system.pp    +2   -2

+ 497 - 871
rtl/i386/fastmove.inc

@@ -1,907 +1,533 @@
-{
-  Copyright (c) 2004, John O'Harrow ([email protected])
+{$ifndef FPC_SYSTEM_HAS_MOVE}
+{$define FPC_SYSTEM_HAS_MOVE}
 
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the
-use of this software.
+{ at least valgrind up to 3.3 has a bug which prevents the default code from
+  working, so we use a rather simple implementation here }
+procedure Move_8OrMore_Valgrind; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
+asm
+    sub    %edx, %eax
+    jae    .LForward
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .LBack                { if no overlap, still do forward move }
 
-Permission is granted to anyone to use this software for any purpose, including
-commercial applications, and to alter it and redistribute it freely, subject to
-the following restrictions:
+.LForward:
+{$ifdef FPC_ENABLED_CLD}
+    cld
+{$endif FPC_ENABLED_CLD}
+    push   %esi
+    push   %edi
+    lea    (%eax,%edx), %esi
+    mov    %edx, %edi
+    rep movsb
+    pop    %edi
+    pop    %esi
+    pop    %ebx
+    ret
 
-1. The origin of this software must not be misrepresented; you must not claim
-   that you wrote the original software. If you use this software in a product,
-   an acknowledgment in the product documentation would be appreciated but is
-   not required.
+.LBack:
+    add    %ecx, %edx
+.LNextb:
+    dec    %edx
+    mov    (%eax,%edx), %bl
+    mov    %bl, (%edx)
+    dec    %ecx
+    jnz    .LNextb
+    pop    %ebx
+end;
 
-2. Altered source versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
+procedure Move_8OrMore_IA32; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
+asm
+    fildq  (%eax)                { First and last 8 bytes, used both in .L33OrMore and in the ladder ending (.L9to16). }
+    fildq  -8(%eax,%ecx)
+    cmp    $16, %ecx
+    jle    .L9to16
+    cmp    $32, %ecx
+    jg     .L33OrMore
+    fildq  8(%eax)
+    fildq  -16(%eax,%ecx)
+    fistpq -16(%edx,%ecx)
+    fistpq 8(%edx)
+.L9to16:
+    fistpq -8(%edx,%ecx)         { 9–16 bytes }
+    fistpq (%edx)
+    pop    %ebx
+    ret
 
-3. This notice may not be removed or altered from any source distribution.
+.Lcancel:
+    fucompp                      { Pop two elements loaded at the beginning. }
+    pop    %ebx
+    ret
+    .byte  0x66,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16f into a no-op. }
 
--------------------------------------------------------------------------------
+.L33OrMore:
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lcancel              { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
 
-Version: 1.40 - 16-SEP-2004
-}
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
 
-{$ifdef USE_FASTMOVE}
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 8 bytes }
+    add    %edx, %ecx            { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
+    add    $8, %edx
+    and    $-8, %edx
+    sub    %edx, %ecx
 
-{$ifndef FPC_SYSTEM_HAS_MOVE}
-{$define FPC_SYSTEM_HAS_MOVE}
+    sub    $16, %ecx
+    jbe    .LPost16f
 
-{$asmmode intel}
+    .balign 16                   { no-op }
+.Lloop16f:
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+    fildq  8(%eax,%edx)
+    fistpq 8(%edx)
+    add    $16, %edx
+    sub    $16, %ecx
+    ja     .Lloop16f
 
-{-------------------------------------------------------------------------}
-(*
-{Just to show that a good Pascal algorithm can beat the default BASM}
-procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
-var
-  S, D       : PtrUInt;
-  Temp, C, I : PtrInt;
-  L          : PPtrInt;
-begin
-  S := Cardinal(@Source);
-  D := Cardinal(@Dest);
-  if S = D then
-    Exit;
-  if Count <= 4 then
-    case Count of
-      1 : PByte(@Dest)^ := PByte(S)^;
-      2 : PWord(@Dest)^ := PWord(S)^;
-      3 : if D > S then
-            begin
-              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
-              PWord(@Dest)^ := PWord(S)^;
-            end
-          else
-            begin
-              PWord(@Dest)^ := PWord(S)^;
-              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
-            end;
-      4 : PInteger(@Dest)^ := PInteger(S)^
-      else Exit; {Count <= 0}
-    end
-  else
-    if D > S then
-      begin
-        Temp := PInteger(S)^;
-        I := Integer(@Dest);
-        C := Count - 4;
-        L := PInteger(Integer(@Dest) + C);
-        Inc(S, C);
-        repeat
-          L^ := PInteger(S)^;
-          if Count <= 8 then
-            Break;
-          Dec(Count, 4);
-          Dec(S, 4);
-          Dec(L);
-        until False;
-        PInteger(I)^ := Temp;
-      end
-    else
-      begin
-        C := Count - 4;
-        Temp := PInteger(S + Cardinal(C))^;
-        I := Integer(@Dest) + C;
-        L := @Dest;
-        repeat
-          L^ := PInteger(S)^;
-          if Count <= 8 then
-            Break;
-          Dec(Count, 4);
-          Inc(S, 4);
-          Inc(L);
-        until False;
-        PInteger(I)^ := Temp;
-      end;
-end; {MoveJOH_PAS}
-*)
+.LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8f
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+.LFirstAndLast8f:
+    fistpq 8(%edx,%ecx)          { Write first and last 8 bytes after everything else. }
+    fistpq (%ebx)                { Important for <8-byte step between src and dest. }
+    pop    %ebx
+    ret
+    .byte  0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
 
-const
-  SMALLMOVESIZE = 36;
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
+    and    $-8, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
 
-{-------------------------------------------------------------------------}
-{Perform Forward Move of 0..36 Bytes}
-{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count.  Destroys ECX}
-procedure SmallForwardMove_3;assembler;nostackframe;
-asm
-  jmp     dword ptr @@FwdJumpTable[ecx*4]
-  align   16
-@@FwdJumpTable:
-  dd      @@Done {Removes need to test for zero size move}
-  dd      @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
-  dd      @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
-  dd      @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
-  dd      @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
-  dd      @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
-@@Fwd36:
-  mov     ecx,[eax-36]
-  mov     [edx-36],ecx
-@@Fwd32:
-  mov     ecx,[eax-32]
-  mov     [edx-32],ecx
-@@Fwd28:
-  mov     ecx,[eax-28]
-  mov     [edx-28],ecx
-@@Fwd24:
-  mov     ecx,[eax-24]
-  mov     [edx-24],ecx
-@@Fwd20:
-  mov     ecx,[eax-20]
-  mov     [edx-20],ecx
-@@Fwd16:
-  mov     ecx,[eax-16]
-  mov     [edx-16],ecx
-@@Fwd12:
-  mov     ecx,[eax-12]
-  mov     [edx-12],ecx
-@@Fwd08:
-  mov     ecx,[eax-8]
-  mov     [edx-8],ecx
-@@Fwd04:
-  mov     ecx,[eax-4]
-  mov     [edx-4],ecx
-  ret
-@@Fwd35:
-  mov     ecx,[eax-35]
-  mov     [edx-35],ecx
-@@Fwd31:
-  mov     ecx,[eax-31]
-  mov     [edx-31],ecx
-@@Fwd27:
-  mov     ecx,[eax-27]
-  mov     [edx-27],ecx
-@@Fwd23:
-  mov     ecx,[eax-23]
-  mov     [edx-23],ecx
-@@Fwd19:
-  mov     ecx,[eax-19]
-  mov     [edx-19],ecx
-@@Fwd15:
-  mov     ecx,[eax-15]
-  mov     [edx-15],ecx
-@@Fwd11:
-  mov     ecx,[eax-11]
-  mov     [edx-11],ecx
-@@Fwd07:
-  mov     ecx,[eax-7]
-  mov     [edx-7],ecx
-  mov     ecx,[eax-4]
-  mov     [edx-4],ecx
-  ret
-@@Fwd03:
-  movzx   ecx, word ptr [eax-3]
-  mov     [edx-3],cx
-  movzx   ecx, byte ptr [eax-1]
-  mov     [edx-1],cl
-  ret
-@@Fwd34:
-  mov     ecx,[eax-34]
-  mov     [edx-34],ecx
-@@Fwd30:
-  mov     ecx,[eax-30]
-  mov     [edx-30],ecx
-@@Fwd26:
-  mov     ecx,[eax-26]
-  mov     [edx-26],ecx
-@@Fwd22:
-  mov     ecx,[eax-22]
-  mov     [edx-22],ecx
-@@Fwd18:
-  mov     ecx,[eax-18]
-  mov     [edx-18],ecx
-@@Fwd14:
-  mov     ecx,[eax-14]
-  mov     [edx-14],ecx
-@@Fwd10:
-  mov     ecx,[eax-10]
-  mov     [edx-10],ecx
-@@Fwd06:
-  mov     ecx,[eax-6]
-  mov     [edx-6],ecx
-@@Fwd02:
-  movzx   ecx, word ptr [eax-2]
-  mov     [edx-2],cx
-  ret
-@@Fwd33:
-  mov     ecx,[eax-33]
-  mov     [edx-33],ecx
-@@Fwd29:
-  mov     ecx,[eax-29]
-  mov     [edx-29],ecx
-@@Fwd25:
-  mov     ecx,[eax-25]
-  mov     [edx-25],ecx
-@@Fwd21:
-  mov     ecx,[eax-21]
-  mov     [edx-21],ecx
-@@Fwd17:
-  mov     ecx,[eax-17]
-  mov     [edx-17],ecx
-@@Fwd13:
-  mov     ecx,[eax-13]
-  mov     [edx-13],ecx
-@@Fwd09:
-  mov     ecx,[eax-9]
-  mov     [edx-9],ecx
-@@Fwd05:
-  mov     ecx,[eax-5]
-  mov     [edx-5],ecx
-@@Fwd01:
-  movzx   ecx, byte ptr [eax-1]
-  mov     [edx-1],cl
-@@Done:
-end; {SmallForwardMove}
-
-{-------------------------------------------------------------------------}
-{Perform Backward Move of 0..36 Bytes}
-{On Entry, ECX = Count, EAX = Source, EDX = Dest.  Destroys ECX}
-procedure SmallBackwardMove_3;assembler;nostackframe;
-asm
-  jmp     dword ptr @@BwdJumpTable[ecx*4]
-  align   16
-@@BwdJumpTable:
-  dd      @@Done {Removes need to test for zero size move}
-  dd      @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
-  dd      @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
-  dd      @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
-  dd      @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
-  dd      @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
-@@Bwd36:
-  mov     ecx,[eax+32]
-  mov     [edx+32],ecx
-@@Bwd32:
-  mov     ecx,[eax+28]
-  mov     [edx+28],ecx
-@@Bwd28:
-  mov     ecx,[eax+24]
-  mov     [edx+24],ecx
-@@Bwd24:
-  mov     ecx,[eax+20]
-  mov     [edx+20],ecx
-@@Bwd20:
-  mov     ecx,[eax+16]
-  mov     [edx+16],ecx
-@@Bwd16:
-  mov     ecx,[eax+12]
-  mov     [edx+12],ecx
-@@Bwd12:
-  mov     ecx,[eax+8]
-  mov     [edx+8],ecx
-@@Bwd08:
-  mov     ecx,[eax+4]
-  mov     [edx+4],ecx
-@@Bwd04:
-  mov     ecx,[eax]
-  mov     [edx],ecx
-  ret
-@@Bwd35:
-  mov     ecx,[eax+31]
-  mov     [edx+31],ecx
-@@Bwd31:
-  mov     ecx,[eax+27]
-  mov     [edx+27],ecx
-@@Bwd27:
-  mov     ecx,[eax+23]
-  mov     [edx+23],ecx
-@@Bwd23:
-  mov     ecx,[eax+19]
-  mov     [edx+19],ecx
-@@Bwd19:
-  mov     ecx,[eax+15]
-  mov     [edx+15],ecx
-@@Bwd15:
-  mov     ecx,[eax+11]
-  mov     [edx+11],ecx
-@@Bwd11:
-  mov     ecx,[eax+7]
-  mov     [edx+7],ecx
-@@Bwd07:
-  mov     ecx,[eax+3]
-  mov     [edx+3],ecx
-  mov     ecx,[eax]
-  mov     [edx],ecx
-  ret
-@@Bwd03:
-  movzx   ecx, word ptr [eax+1]
-  mov     [edx+1],cx
-  movzx   ecx, byte ptr [eax]
-  mov     [edx],cl
-  ret
-@@Bwd34:
-  mov     ecx,[eax+30]
-  mov     [edx+30],ecx
-@@Bwd30:
-  mov     ecx,[eax+26]
-  mov     [edx+26],ecx
-@@Bwd26:
-  mov     ecx,[eax+22]
-  mov     [edx+22],ecx
-@@Bwd22:
-  mov     ecx,[eax+18]
-  mov     [edx+18],ecx
-@@Bwd18:
-  mov     ecx,[eax+14]
-  mov     [edx+14],ecx
-@@Bwd14:
-  mov     ecx,[eax+10]
-  mov     [edx+10],ecx
-@@Bwd10:
-  mov     ecx,[eax+6]
-  mov     [edx+6],ecx
-@@Bwd06:
-  mov     ecx,[eax+2]
-  mov     [edx+2],ecx
-@@Bwd02:
-  movzx   ecx, word ptr [eax]
-  mov     [edx],cx
-  ret
-@@Bwd33:
-  mov     ecx,[eax+29]
-  mov     [edx+29],ecx
-@@Bwd29:
-  mov     ecx,[eax+25]
-  mov     [edx+25],ecx
-@@Bwd25:
-  mov     ecx,[eax+21]
-  mov     [edx+21],ecx
-@@Bwd21:
-  mov     ecx,[eax+17]
-  mov     [edx+17],ecx
-@@Bwd17:
-  mov     ecx,[eax+13]
-  mov     [edx+13],ecx
-@@Bwd13:
-  mov     ecx,[eax+9]
-  mov     [edx+9],ecx
-@@Bwd09:
-  mov     ecx,[eax+5]
-  mov     [edx+5],ecx
-@@Bwd05:
-  mov     ecx,[eax+1]
-  mov     [edx+1],ecx
-@@Bwd01:
-  movzx   ecx, byte ptr[eax]
-  mov     [edx],cl
-@@Done:
-end; {SmallBackwardMove}
+    sub    $16, %ecx
+    jbe    .LPost16b
 
+    .balign 16                   { no-op }
+.Lloop16b:
+    sub    $16, %edx
+    fildq  8(%eax,%edx)
+    fistpq 8(%edx)
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+    sub    $16, %ecx
+    ja     .Lloop16b
 
-{ at least valgrind up to 3.3 has a bug which prevents the default code to
-  work so we use a rather simple implementation here
-}
-procedure Forwards_Valgrind;assembler;nostackframe;
-asm
-{$ifdef FPC_ENABLED_CLD}
-  cld
-{$endif FPC_ENABLED_CLD}
-  push    esi
-  push    edi
-  mov     esi,eax
-  mov     edi,edx
-  rep     movsb
-  pop     edi
-  pop     esi
+.LPost16b:
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8b
+    fildq  -8(%eax,%edx)
+    fistpq -8(%edx)
+.LFirstAndLast8b:
+    sub    %ecx, %edx
+    fistpq -7(%ebx)
+    fistpq -16(%edx)
+    pop    %ebx
 end;
 
-{ at least valgrind up to 3.3 has a bug which prevents the default code to
-  work so we use a rather simple implementation here
-}
-procedure Backwards_Valgrind;assembler;nostackframe;
+procedure Move_8OrMore_MMX; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
 asm
-  push    esi
-  push    edi
-  lea     esi,[eax+ecx-1]
-  lea     edi,[edx+ecx-1]
-@@repeat:
-  mov     al,[esi]
-  mov     [edi],al
-  dec     esi
-  dec     edi
-  dec     ecx
-  jnz     @@repeat
-  pop     edi
-  pop     esi
+    cmp    $72, %ecx             { Size at which using MMX becomes worthwhile. }
+    jl     Move_8OrMore_IA32
+    movq   (%eax), %mm4          { First and last 8 bytes. }
+    movq   -8(%eax,%ecx), %mm5
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lquit                { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
+
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
+
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 8 bytes }
+    add    %edx, %ecx            { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
+    add    $8, %edx
+    and    $-8, %edx
+    sub    %edx, %ecx
+
+    sub    $16, %ecx
+    jbe    .LPost16f
+
+    .balign 16
+.Lloop16f:
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+    movq   8(%eax,%edx), %mm0
+    movq   %mm0, 8(%edx)
+    add    $16, %edx
+    sub    $16, %ecx
+    ja     .Lloop16f
+
+.LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8f
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+.LFirstAndLast8f:
+    movq   %mm5, 8(%edx,%ecx)    { Write first and last 8 bytes after everything else. }
+    movq   %mm4, (%ebx)          { Important for <8-byte step between src and dest. }
+.Lquit:
+    emms
+    pop    %ebx
+    ret
+    .byte  0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
+
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
+    and    $-8, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
+
+    sub    $16, %ecx
+    jbe    .LPost16b
+
+    .balign 16                   { no-op }
+.Lloop16b:
+    sub    $16, %edx
+    movq   8(%eax,%edx), %mm0
+    movq   %mm0, 8(%edx)
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+    sub    $16, %ecx
+    ja     .Lloop16b
+
+.LPost16b:
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8b
+    movq   -8(%eax,%edx), %mm0
+    movq   %mm0, -8(%edx)
+.LFirstAndLast8b:
+    sub    %ecx, %edx
+    movq   %mm4, -16(%edx)
+    movq   %mm5, -7(%ebx)
+    emms
+    pop    %ebx
 end;
 
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Forwards_IA32_3;assembler;nostackframe;
-asm
-  push    ebx
-  mov     ebx,edx
-  fild    qword ptr [eax]
-  add     eax,ecx {QWORD Align Writes}
-  add     ecx,edx
-  add     edx,7
-  and     edx,-8
-  sub     ecx,edx
-  add     edx,ecx {Now QWORD Aligned}
-  sub     ecx,16
-  neg     ecx
-@FwdLoop:
-  fild    qword ptr [eax+ecx-16]
-  fistp   qword ptr [edx+ecx-16]
-  fild    qword ptr [eax+ecx-8]
-  fistp   qword ptr [edx+ecx-8]
-  add     ecx,16
-  jle     @FwdLoop
-  fistp   qword ptr [ebx]
-  neg     ecx
-  add     ecx,16
-  pop     ebx
-  jmp     SmallForwardMove_3
-end; {Forwards_IA32}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Backwards_IA32_3;assembler;nostackframe;
-asm
-  push    ebx
-  fild    qword ptr [eax+ecx-8]
-  lea     ebx,[edx+ecx] {QWORD Align Writes}
-  and     ebx,7
-  sub     ecx,ebx
-  add     ebx,ecx {Now QWORD Aligned, EBX = Original Length}
-  sub     ecx,16
-@BwdLoop:
-  fild    qword ptr [eax+ecx]
-  fild    qword ptr [eax+ecx+8]
-  fistp   qword ptr [edx+ecx+8]
-  fistp   qword ptr [edx+ecx]
-  sub     ecx,16
-  jge     @BwdLoop
-  fistp   qword ptr [edx+ebx-8]
-  add     ecx,16
-  pop     ebx
-  jmp     SmallBackwardMove_3
-end; {Backwards_IA32}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Forwards_MMX_3;assembler;nostackframe;
-const
-  LARGESIZE = 1024;
-asm
-  cmp     ecx,LARGESIZE
-  jge     @FwdLargeMove
-  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
-  jl      Forwards_IA32_3
-  push    ebx
-  mov     ebx,edx
-  movq    mm0,[eax] {First 8 Characters}
-  {QWORD Align Writes}
-  add     eax,ecx
-  add     ecx,edx
-  add     edx,7
-  and     edx,-8
-  sub     ecx,edx
-  add     edx,ecx
-  {Now QWORD Aligned}
-  sub     ecx,32
-  neg     ecx
-@FwdLoopMMX:
-  movq    mm1,[eax+ecx-32]
-  movq    mm2,[eax+ecx-24]
-  movq    mm3,[eax+ecx-16]
-  movq    mm4,[eax+ecx- 8]
-  movq    [edx+ecx-32],mm1
-  movq    [edx+ecx-24],mm2
-  movq    [edx+ecx-16],mm3
-  movq    [edx+ecx- 8],mm4
-  add     ecx,32
-  jle     @FwdLoopMMX
-  movq    [ebx],mm0 {First 8 Characters}
-  emms
-  pop     ebx
-  neg     ecx
-  add     ecx,32
-  jmp     SmallForwardMove_3
-@FwdLargeMove:
-  push    ebx
-  mov     ebx,ecx
-  test    edx,15
-  jz      @FwdAligned
-  {16 byte Align Destination}
-  mov     ecx,edx
-  add     ecx,15
-  and     ecx,-16
-  sub     ecx,edx
-  add     eax,ecx
-  add     edx,ecx
-  sub     ebx,ecx
-  {Destination now 16 Byte Aligned}
-  call    SmallForwardMove_3
-@FwdAligned:
-  mov     ecx,ebx
-  and     ecx,-16
-  sub     ebx,ecx {EBX = Remainder}
-  push    esi
-  push    edi
-  mov     esi,eax          {ESI = Source}
-  mov     edi,edx          {EDI = Dest}
-  mov     eax,ecx          {EAX = Count}
-  and     eax,-64          {EAX = No of Bytes to Blocks Moves}
-  and     ecx,$3F          {ECX = Remaining Bytes to Move (0..63)}
-  add     esi,eax
-  add     edi,eax
-  shr     eax,3            {EAX = No of QWORD's to Block Move}
-  neg     eax
-@MMXcopyloop:
-  movq    mm0,[esi+eax*8   ]
-  movq    mm1,[esi+eax*8+ 8]
-  movq    mm2,[esi+eax*8+16]
-  movq    mm3,[esi+eax*8+24]
-  movq    mm4,[esi+eax*8+32]
-  movq    mm5,[esi+eax*8+40]
-  movq    mm6,[esi+eax*8+48]
-  movq    mm7,[esi+eax*8+56]
-  movq    [edi+eax*8   ],mm0
-  movq    [edi+eax*8+ 8],mm1
-  movq    [edi+eax*8+16],mm2
-  movq    [edi+eax*8+24],mm3
-  movq    [edi+eax*8+32],mm4
-  movq    [edi+eax*8+40],mm5
-  movq    [edi+eax*8+48],mm6
-  movq    [edi+eax*8+56],mm7
-  add     eax,8
-  jnz     @MMXcopyloop
-  emms                   {Empty MMX State}
-{$ifdef FPC_ENABLED_CLD}
-  cld
-{$endif FPC_ENABLED_CLD}
-  add     ecx,ebx
-  shr     ecx,2
-  rep     movsd
-  mov     ecx,ebx
-  and     ecx,3
-  rep     movsb
-  pop     edi
-  pop     esi
-  pop     ebx
-end; {Forwards_MMX}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Backwards_MMX_3;assembler;nostackframe;
-asm
-  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
-  jl      Backwards_IA32_3
-  push    ebx
-  movq    mm0,[eax+ecx-8] {Get Last QWORD}
-  {QWORD Align Writes}
-  lea     ebx,[edx+ecx]
-  and     ebx,7
-  sub     ecx,ebx
-  add     ebx,ecx
-  {Now QWORD Aligned}
-  sub     ecx,32
-@BwdLoopMMX:
-  movq    mm1,[eax+ecx   ]
-  movq    mm2,[eax+ecx+ 8]
-  movq    mm3,[eax+ecx+16]
-  movq    mm4,[eax+ecx+24]
-  movq    [edx+ecx+24],mm4
-  movq    [edx+ecx+16],mm3
-  movq    [edx+ecx+ 8],mm2
-  movq    [edx+ecx   ],mm1
-  sub     ecx,32
-  jge     @BwdLoopMMX
-  movq    [edx+ebx-8], mm0 {Last QWORD}
-  emms
-  add     ecx,32
-  pop     ebx
-  jmp     SmallBackwardMove_3
-end; {Backwards_MMX}
-
-{$ifndef FASTMOVE_DISABLE_SSE3}
-{-------------------------------------------------------------------------}
-{Dest MUST be 16-Byes Aligned, Count MUST be multiple of 16 }
-procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
-const
-  Prefetch = 512;
-asm
-  push    esi
-  mov     esi,eax             {ESI = Source}
-  mov     eax,ecx             {EAX = Count}
-  and     eax,-128            {EAX = No of Bytes to Block Move}
-  add     esi,eax
-  add     edx,eax
-  shr     eax,3               {EAX = No of QWORD's to Block Move}
-  neg     eax
-  cmp     eax, -(32*1024)     {Count > 256K}
-  jl      @Large
-@Small: {Count<=256K}
-  test    esi,15              {Check if Both Source/Dest Aligned}
-  jnz     @SmallUnaligned
-@SmallAligned:                {Both Source and Dest 16-Byte Aligned}
-@SmallAlignedLoop:
-  movaps  xmm0,[esi+8*eax]
-  movaps  xmm1,[esi+8*eax+16]
-  movaps  xmm2,[esi+8*eax+32]
-  movaps  xmm3,[esi+8*eax+48]
-  movaps  [edx+8*eax],xmm0
-  movaps  [edx+8*eax+16],xmm1
-  movaps  [edx+8*eax+32],xmm2
-  movaps  [edx+8*eax+48],xmm3
-  movaps  xmm4,[esi+8*eax+64]
-  movaps  xmm5,[esi+8*eax+80]
-  movaps  xmm6,[esi+8*eax+96]
-  movaps  xmm7,[esi+8*eax+112]
-  movaps  [edx+8*eax+64],xmm4
-  movaps  [edx+8*eax+80],xmm5
-  movaps  [edx+8*eax+96],xmm6
-  movaps  [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @SmallAlignedLoop
-  jmp     @Remainder
-@SmallUnaligned:              {Source Not 16-Byte Aligned}
-@SmallUnalignedLoop:
-  movups  xmm0,[esi+8*eax]
-  movups  xmm1,[esi+8*eax+16]
-  movups  xmm2,[esi+8*eax+32]
-  movups  xmm3,[esi+8*eax+48]
-  movaps  [edx+8*eax],xmm0
-  movaps  [edx+8*eax+16],xmm1
-  movaps  [edx+8*eax+32],xmm2
-  movaps  [edx+8*eax+48],xmm3
-  movups  xmm4,[esi+8*eax+64]
-  movups  xmm5,[esi+8*eax+80]
-  movups  xmm6,[esi+8*eax+96]
-  movups  xmm7,[esi+8*eax+112]
-  movaps  [edx+8*eax+64],xmm4
-  movaps  [edx+8*eax+80],xmm5
-  movaps  [edx+8*eax+96],xmm6
-  movaps  [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @SmallUnalignedLoop
-  jmp     @Remainder
-@Large: {Count>256K}
-  test    esi,15              {Check if Both Source/Dest Aligned}
-  jnz     @LargeUnaligned
-@LargeAligned:                {Both Source and Dest 16-Byte Aligned}
-@LargeAlignedLoop:
-  prefetchnta  [esi+8*eax+Prefetch]
-  prefetchnta  [esi+8*eax+Prefetch+64]
-  movaps  xmm0,[esi+8*eax]
-  movaps  xmm1,[esi+8*eax+16]
-  movaps  xmm2,[esi+8*eax+32]
-  movaps  xmm3,[esi+8*eax+48]
-  movntps [edx+8*eax],xmm0
-  movntps [edx+8*eax+16],xmm1
-  movntps [edx+8*eax+32],xmm2
-  movntps [edx+8*eax+48],xmm3
-  movaps  xmm4,[esi+8*eax+64]
-  movaps  xmm5,[esi+8*eax+80]
-  movaps  xmm6,[esi+8*eax+96]
-  movaps  xmm7,[esi+8*eax+112]
-  movntps [edx+8*eax+64],xmm4
-  movntps [edx+8*eax+80],xmm5
-  movntps [edx+8*eax+96],xmm6
-  movntps [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @LargeAlignedLoop
-  sfence
-  jmp     @Remainder
-@LargeUnaligned:              {Source Not 16-Byte Aligned}
-@LargeUnalignedLoop:
-  prefetchnta  [esi+8*eax+Prefetch]
-  prefetchnta  [esi+8*eax+Prefetch+64]
-  movups  xmm0,[esi+8*eax]
-  movups  xmm1,[esi+8*eax+16]
-  movups  xmm2,[esi+8*eax+32]
-  movups  xmm3,[esi+8*eax+48]
-  movntps [edx+8*eax],xmm0
-  movntps [edx+8*eax+16],xmm1
-  movntps [edx+8*eax+32],xmm2
-  movntps [edx+8*eax+48],xmm3
-  movups  xmm4,[esi+8*eax+64]
-  movups  xmm5,[esi+8*eax+80]
-  movups  xmm6,[esi+8*eax+96]
-  movups  xmm7,[esi+8*eax+112]
-  movntps [edx+8*eax+64],xmm4
-  movntps [edx+8*eax+80],xmm5
-  movntps [edx+8*eax+96],xmm6
-  movntps [edx+8*eax+112],xmm7
-  add     eax,16
-  js      @LargeUnalignedLoop
-  sfence
-@Remainder:
-  and     ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
-  jz      @Done
-  add     esi,ecx
-  add     edx,ecx
-  neg     ecx
-@RemainderLoop:
-  movups  xmm0,[esi+ecx]
-  movaps  [edx+ecx],xmm0
-  add     ecx,16
-  jnz     @RemainderLoop
-@Done:
-  pop     esi
-end; {AlignedFwdMoveSSE}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Forwards_SSE_3;assembler;nostackframe;
+{$ifndef FASTMOVE_DISABLE_SSE}
+procedure Move_8OrMore_SSE; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
 const
-  LARGESIZE = 2048;
+  ErmsThreshold = 1536;
+  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
+  PrefetchDistance = 512;
 asm
-  cmp     ecx,LARGESIZE
-  jge     @FwdLargeMove
-  cmp     ecx,SMALLMOVESIZE+32
-  movups  xmm0,[eax]
-  jg      @FwdMoveSSE
-  movups  xmm1,[eax+16]
-  movups  [edx],xmm0
-  movups  [edx+16],xmm1
-  add     eax,ecx
-  add     edx,ecx
-  sub     ecx,32
-  jmp     SmallForwardMove_3
-@FwdMoveSSE:
-  push    ebx
-  mov     ebx,edx
-  {Align Writes}
-  add     eax,ecx
-  add     ecx,edx
-  add     edx,15
-  and     edx,-16
-  sub     ecx,edx
-  add     edx,ecx
-  {Now Aligned}
-  sub     ecx,32
-  neg     ecx
-@FwdLoopSSE:
-  movups  xmm1,[eax+ecx-32]
-  movups  xmm2,[eax+ecx-16]
-  movaps  [edx+ecx-32],xmm1
-  movaps  [edx+ecx-16],xmm2
-  add     ecx,32
-  jle     @FwdLoopSSE
-  movups  [ebx],xmm0 {First 16 Bytes}
-  neg     ecx
-  add     ecx,32
-  pop     ebx
-  jmp     SmallForwardMove_3
-@FwdLargeMove:
-  push    ebx
-  mov     ebx,ecx
-  test    edx,15
-  jz      @FwdLargeAligned
-  {16 byte Align Destination}
-  mov     ecx,edx
-  add     ecx,15
-  and     ecx,-16
-  sub     ecx,edx
-  add     eax,ecx
-  add     edx,ecx
-  sub     ebx,ecx
-  {Destination now 16 Byte Aligned}
-  call    SmallForwardMove_3
-  mov     ecx,ebx
-@FwdLargeAligned:
-  and     ecx,-16
-  sub     ebx,ecx {EBX = Remainder}
-  push    edx
-  push    eax
-  push    ecx
-  call    AlignedFwdMoveSSE_3
-  pop     ecx
-  pop     eax
-  pop     edx
-  add     ecx,ebx
-  add     eax,ecx
-  add     edx,ecx
-  mov     ecx,ebx
-  pop     ebx
-  jmp     SmallForwardMove_3
-end; {Forwards_SSE}
-
-{-------------------------------------------------------------------------}
-{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
-procedure Backwards_SSE_3;assembler;nostackframe;
-asm
-  cmp     ecx,SMALLMOVESIZE+32
-  jg      @BwdMoveSSE
-  sub     ecx,32
-  movups  xmm1,[eax+ecx]
-  movups  xmm2,[eax+ecx+16]
-  movups  [edx+ecx],xmm1
-  movups  [edx+ecx+16],xmm2
-  jmp     SmallBackwardMove_3
-@BwdMoveSSE:
-  push    ebx
-  movups  xmm0,[eax+ecx-16] {Last 16 Bytes}
-  {Align Writes}
-  lea     ebx,[edx+ecx]
-  and     ebx,15
-  sub     ecx,ebx
-  add     ebx,ecx
-  {Now Aligned}
-  sub     ecx,32
-@BwdLoop:
-  movups  xmm1,[eax+ecx]
-  movups  xmm2,[eax+ecx+16]
-  movaps  [edx+ecx],xmm1
-  movaps  [edx+ecx+16],xmm2
-  sub     ecx,32
-  jge     @BwdLoop
-  movups  [edx+ebx-16],xmm0  {Last 16 Bytes}
-  add     ecx,32
-  pop     ebx
-  jmp     SmallBackwardMove_3
-end; {Backwards_SSE}
-{$endif ndef FASTMOVE_DISABLE_SSE3}
+    cmp    $16, %ecx
+    jle    .L9to16
+    movups (%eax), %xmm4         { First and last 16 bytes, used both in .L33OrMore and in the 17–32 branch. }
+    movups -16(%eax,%ecx), %xmm5
+    cmp    $32, %ecx
+    jg     .L33OrMore
+    movups %xmm4, (%edx)         { 17–32 bytes }
+    movups %xmm5, -16(%edx,%ecx)
+    pop    %ebx
+    ret
 
-const
-   fastmoveproc_forward : pointer = @Forwards_IA32_3;
-   fastmoveproc_backward : pointer = @Backwards_IA32_3;
+.L9to16:
+    movq   (%eax), %xmm0
+    movq   -8(%eax,%ecx), %xmm1
+    movq   %xmm0, (%edx)
+    movq   %xmm1, -8(%edx,%ecx)
+.Lquit:
+    pop    %ebx
+    ret
+    .byte  0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop32f into a no-op. }
 
-procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
-asm
-  cmp     ecx,SMALLMOVESIZE
-  ja      @Large
-  cmp     eax,edx
-  lea     eax,[eax+ecx]
-  jle     @SmallCheck
-@SmallForward:
-  add     edx,ecx
-  jmp     SmallForwardMove_3
-@SmallCheck:
-  je      @Done {For Compatibility with Delphi's move for Source = Dest}
-  sub     eax,ecx
-  jmp     SmallBackwardMove_3
-@Large:
-  jng     @Done {For Compatibility with Delphi's move for Count < 0}
-  cmp     eax,edx
-  jg      @moveforward
-  je      @Done {For Compatibility with Delphi's move for Source = Dest}
-  push    eax
-  add     eax,ecx
-  cmp     eax,edx
-  pop     eax
-  jg      @movebackward
-@moveforward:
-  jmp     dword ptr fastmoveproc_forward
-@movebackward:
-  jmp     dword ptr fastmoveproc_backward {Source/Dest Overlap}
-@Done:
+.L33OrMore:
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lquit                { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
+
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
+
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 16 bytes }
+    add    %edx, %ecx            { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
+    add    $16, %edx
+    and    $-16, %edx
+    sub    %edx, %ecx
+
+.LRestAfterNTf:
+    sub    $32, %ecx             { During the N× loop, ecx is N bytes less than what actually remains, allowing sub N + jae .LLoop instead of sub N + cmp N + jae .LLoop. }
+    jbe    .LPost32f
+    cmp    $NtThreshold-32, %ecx
+    jae    .Lntf                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
+.LNtIsNotBetter:
+    cmp    $ErmsThreshold-32, %ecx { Even enhanced REP MOVSB does not seem to use NT stores, so it falls behind on huge moves; prioritize NT. }
+    jae    .LRepMovsF
+.LRepMovsIsNotBetter:
+    test   $15, %eax
+    jz     .Lalignedloop32f
+
+    .balign 16                   { no-op }
+.Lloop32f:
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    movups 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    add    $32, %edx
+    sub    $32, %ecx
+    ja     .Lloop32f
+
+.LPost32f:                       { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
+    cmp    $-16, %ecx
+    jle    .LFirstAndLast16f
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+.LFirstAndLast16f:
+    movups %xmm5, 16(%edx,%ecx)  { Write first and last 16 bytes after everything else. }
+    movups %xmm4, (%ebx)         { Important for <16-byte step between src and dest. }
+    pop    %ebx
+    ret
+
+    .balign 16
+.Lalignedloop32f:                { Same as above starting from .Lloop32f but with MOVAPSes. }
+    movaps (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    movaps 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    add    $32, %edx
+    sub    $32, %ecx
+    ja     .Lalignedloop32f
+
+.LalignedPost32f:
+    cmp    $-16, %ecx
+    jle    .LalignedFirstAndLast16f
+    movaps (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+.LalignedFirstAndLast16f:
+    movups %xmm5, 16(%edx,%ecx)
+    movups %xmm4, (%ebx)
+    pop    %ebx
+    ret
+
+.LRepMovsF:
+{$ifdef FPC_PIC}
+    push   %ebx
+    call   fpc_geteipasebx
+    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
+    movl   fast_large_repmovstosb@GOT(%ebx), %ebx
+    cmpb   $1, (%ebx)
+    pop    %ebx
+{$else FPC_PIC}
+    cmpb   $1, fast_large_repmovstosb
+{$endif FPC_PIC}
+    jne    .LRepMovsIsNotBetter
+    push   %esi
+    push   %edi
+    lea    (%eax,%edx), %esi
+    mov    %edx, %edi
+    add    $32, %ecx
+    rep movsb
+    movups %xmm4, (%ebx)         { last 16 aren't required }
+    pop    %edi
+    pop    %esi
+    pop    %ebx
+    ret
+
+.Lntf:
+    cmp    $NtThreshold, %eax    { Maybe change our mind: don't bother bypassing the cache if src and dest are close to each other }
+    jb     .LNtIsNotBetter       { (this check is done here so it doesn't stand in the way of smaller counts) }
+    sub    $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
+    test   $15, %eax
+    jz     .Lalignedntloop64f
+
+    .balign 16
+.Lntloop64f:
+    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
+    movups (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    movups 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movups 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movups 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    add    $64, %edx
+    sub    $64, %ecx
+    jae    .Lntloop64f
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTf        { go handle remaining bytes }
+
+    .balign 16
+.Lalignedntloop64f:              { Same as above starting from .Lntloop64f but with MOVAPSes. }
+    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
+    movaps (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    movaps 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movaps 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movaps 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    add    $64, %edx
+    sub    $64, %ecx
+    jae    .Lalignedntloop64f
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTf
+    .byte  0x66,0x0F,0x1F,0x44,0,0 { Turns .balign 16 before .Lloop32b into a no-op. }
+
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 16 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 16-byte boundary... }
+    and    $-16, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
+
+.LRestAfterNTb:
+    sub    $32, %ecx
+    jbe    .LPost32b
+    cmp    $NtThreshold-32, %ecx
+    jae    .Lntb
+
+    .balign 16                   { no-op }
+.Lloop32b:
+    sub    $32, %edx
+    movups 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    sub    $32, %ecx
+    ja     .Lloop32b
+
+.LPost32b:
+    cmp    $-16, %ecx
+    jle    .LFirstAndLast16b
+    movups -16(%eax,%edx), %xmm0
+    movaps %xmm0, -16(%edx)
+.LFirstAndLast16b:
+    sub    %ecx, %edx
+    movups %xmm4, -32(%edx)
+    movups %xmm5, -15(%ebx)
+    pop    %ebx
+    ret
+
+.Lntb:
+    cmp    $-NtThreshold, %eax
+    jnb    .Lloop32b
+    sub    $PrefetchDistance+32, %ecx
+
+    .balign 16
+.Lntloop64b:
+    prefetchnta -PrefetchDistance(%eax,%edx,1)
+    sub    $64, %edx
+    movups 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    movups 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movups 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movups (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    sub    $64, %ecx
+    jae    .Lntloop64b
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTb
 end;
+{$endif ndef FASTMOVE_DISABLE_SSE}
+
+procedure Move_8OrMore_Dispatch; forward;
 
-{$asmmode att}
-{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
 var
+  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
+{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
   valgrind_used : boolean;external name '__fpc_valgrind';
 {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
 
-procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
-  begin
-    { workaround valgrind bug }
+function Move_8OrMore_HumanFriendlyDispatch: pointer;
+begin
+  { workaround valgrind bug }
 {$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-    if EntryInformation.valgrind_used then
+  if EntryInformation.valgrind_used then
 {$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-    if valgrind_used then
+  if valgrind_used then
 {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-      begin
-        fastmoveproc_forward:=@Forwards_Valgrind;
-        fastmoveproc_backward:=@Backwards_Valgrind;
-      end
-{$ifndef FASTMOVE_DISABLE_SSE3}
-    else if has_sse_support then
-      begin
-        fastmoveproc_forward:=@Forwards_SSE_3;
-        fastmoveproc_backward:=@Backwards_SSE_3;
-      end
-{$endif ndef FASTMOVE_DISABLE_SSE3}
-   else if has_mmx_support then
-      begin
-        fastmoveproc_forward:=@Forwards_MMX_3;
-        fastmoveproc_backward:=@Backwards_MMX_3;
-      end;
-  end;
+    result:=@Move_8OrMore_Valgrind
+{$ifndef FASTMOVE_DISABLE_SSE}
+  else if has_sse_support then
+    result:=@Move_8OrMore_SSE
+{$endif ndef FASTMOVE_DISABLE_SSE}
+  else if has_mmx_support then
+    result:=@Move_8OrMore_MMX
+  else
+    result:=@Move_8OrMore_IA32;
+  if fpc_cpucodeinit_performed then
+    fastmoveproc:=result;
+end;
 
-{$endif  FPC_SYSTEM_HAS_MOVE}
+procedure Move_8OrMore_Dispatch; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). }
+asm
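+    { Trampoline: save the argument registers, resolve the best implementation, then tail-jump to it. ebx is free as scratch here because Move has already pushed it. }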
+    push %eax
+    push %edx
+    push %ecx
+    call Move_8OrMore_HumanFriendlyDispatch
+    mov  %eax, %ebx
+    pop  %ecx
+    pop  %edx
+    pop  %eax
+    jmp  %ebx
+end;
 
+procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
+asm
+    push   %ebx
+    cmp    $8, %ecx
+    jle    .L8OrLess
+{$ifdef FPC_PIC}
+    call   fpc_geteipasebx
+    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
+    movl   fastmoveproc@GOT(%ebx), %ebx
+    jmp    (%ebx)
+{$else}
+    jmp    fastmoveproc
 {$endif}
+
+.L8OrLess:
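+    { Count in 4..8: two possibly overlapping 4-byte loads/stores cover the whole range. }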
+    cmp    $3, %ecx
+    jle    .L3OrLess
+    mov    (%eax), %ebx
+    mov    -4(%eax,%ecx), %eax
+    mov    %ebx, (%edx)
+    mov    %eax, -4(%edx,%ecx)
+    pop    %ebx
+    ret
+
+.L3OrLess:
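+    { Count in 0..3: first byte, then (for counts 2 and 3) the last two bytes, which may overlap it. }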
+    cmp    $1, %ecx
+    jl     .LZero
+    movzbl (%eax), %ebx
+    je     .LOne
+    movzwl -2(%eax,%ecx), %eax
+    mov    %ax, -2(%edx,%ecx)
+.LOne:
+    mov    %bl, (%edx)
+.LZero:
+    pop    %ebx
+end;
+
+{$endif  FPC_SYSTEM_HAS_MOVE}

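The forward/backward decision shared by all four implementations above comes down to one subtraction and one unsigned comparison: after eax := src - dest, a backward copy is only needed when dest lies inside [src, src+count), i.e. when the (negative) difference plus count is still positive. A minimal Pascal sketch of the same rule, assuming {$mode objfpc}; MoveSketch and its per-byte loops are illustrative only, not RTL code:

    { Pick the copy direction the way .L33OrMore does: overlap only matters
      when dest sits less than count bytes above src. }
    procedure MoveSketch(const source; var dest; count: SizeInt);
    var
      s, d: PtrUInt;
      i: SizeInt;
    begin
      s:=PtrUInt(@source);
      d:=PtrUInt(@dest);
      if (s=d) or (count<=0) then
        exit;
      if (d>s) and (d-s<PtrUInt(count)) then
        { a forward copy would clobber the source tail => copy backwards }
        for i:=count-1 downto 0 do
          PByte(d+PtrUInt(i))^:=PByte(s+PtrUInt(i))^
      else
        for i:=0 to count-1 do
          PByte(d+PtrUInt(i))^:=PByte(s+PtrUInt(i))^;
    end;

The non-Valgrind routines avoid per-byte work by loading the first and last 8 or 16 bytes up front and storing them after the main loop, which also absorbs any unaligned head and tail without extra branches.
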
+ 23 - 23
rtl/i386/i386.inc

@@ -25,6 +25,8 @@ var
   os_supports_sse : boolean;
   { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
   sse_check : boolean;
+  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
+  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
 
 {$asmmode ATT}
 
@@ -47,15 +49,6 @@ function cpuid_support : boolean;assembler;nostackframe;
     setnz   %al
   end;
 
-{$ifndef FPC_PIC}
-{$ifndef FPC_SYSTEM_HAS_MOVE}
-{$ifndef OLD_ASSEMBLER}
-{$define USE_FASTMOVE}
-{$i fastmove.inc}
-{$endif not OLD_ASSEMBLER}
-{$endif FPC_SYSTEM_HAS_MOVE}
-{$endif FPC_PIC}
-
 {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
 procedure fpc_cpuinit;
   begin
@@ -63,7 +56,6 @@ procedure fpc_cpuinit;
       must be implemented OS dependend (FK)
     has_sse_support:=sse_support;
     has_mmx_support:=mmx_support;
-    setup_fastmove;
     }
   end;
 
@@ -80,6 +72,12 @@ asm
 end;
 {$endif}
 
+{$if not defined(FPC_SYSTEM_HAS_MOVE)
+ and not defined(OLD_ASSEMBLER)
+ and not defined(darwin)}
+{$i fastmove.inc}
+{$endif}
+
 {$ifndef FPC_SYSTEM_HAS_MOVE}
 {$define FPC_SYSTEM_HAS_MOVE}
 
@@ -2027,7 +2025,7 @@ Procedure SysResetFPU;
 { because of the brain dead sse detection on x86, this test is post poned }
 procedure fpc_cpucodeinit;
   var
-    _eax,_ecx_cpuid1,_edx_cpuid1,_ebx : longint;
+    _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
   begin
     if cpuid_support then
       begin
@@ -2067,23 +2065,27 @@ procedure fpc_cpucodeinit;
               cpuid
               movl %eax,_eax
             end;
-            if (_eax>=7) and (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
+            if _eax>=7 then
               begin
                 asm
+                  movl $7,%eax
                   xorl %ecx,%ecx
-                  .byte   0x0f,0x01,0xd0 { xgetbv }
-                  movl %eax,_eax
+                  cpuid
+                  movl %ebx,_ebx_cpuid7
                 end;
-                if (_eax and 6)=6 then
+                fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
+                if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                   begin
-                    has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
                     asm
-                      movl $7,%eax
                       xorl %ecx,%ecx
-                      cpuid
-                      movl %ebx,_ebx
+                      .byte   0x0f,0x01,0xd0 { xgetbv }
+                      movl %eax,_eax
                     end;
-                    has_avx2_support:=(_ebx and $20)<>0;
+                    if (_eax and 6)=6 then
+                      begin
+                        has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
+                        has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
+                      end;
                   end;
               end;
           end;
@@ -2098,9 +2100,7 @@ procedure fpc_cpucodeinit;
       end;
 
     SysResetFPU;
-{$ifdef USE_FASTMOVE}
-    setup_fastmove;
-{$endif}
+    fpc_cpucodeinit_performed:=true;
   end;
 
 

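For reference, the bit tested above is the architecturally defined ERMS flag: CPUID leaf 7, sub-leaf 0, EBX bit 9 ("Enhanced REP MOVSB/STOSB"). A standalone sketch of the same detection, assuming {$asmmode ATT}, that cpuid_support has already been checked, and ignoring the PIC case; HasERMSB is an illustrative name, not an RTL symbol:

    function HasERMSB: boolean;
      var
        _eax, _ebx_cpuid7: longint;
      begin
        asm            { highest supported basic leaf }
          movl $0, %eax
          cpuid
          movl %eax, _eax
        end ['eax','ebx','ecx','edx'];
        result:=false;
        if _eax>=7 then
          begin
            asm        { leaf 7, sub-leaf 0 }
              movl $7, %eax
              xorl %ecx, %ecx
              cpuid
              movl %ebx, _ebx_cpuid7
            end ['eax','ebx','ecx','edx'];
            result:=(_ebx_cpuid7 and (1 shl 9))<>0; { EBX.ERMS[bit 9] }
          end;
      end;
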
+ 2 - 2
rtl/watcom/system.pp

@@ -25,8 +25,8 @@ INTERFACE
 {$define FPC_ANSI_TEXTFILEREC}
 { include system-independent routine headers }
 
-{ wasm does not support SSE3 instructions }
-{$define FASTMOVE_DISABLE_SSE3}
+{ wasm does not support SSE instructions }
+{$define FASTMOVE_DISABLE_SSE}
 
 {$include systemh.inc}
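
Any other target whose assembler lacks SSE support can opt out the same way; the define just has to be in effect before the shared RTL headers are included, as watcom's system.pp does here:

    { in the target's system.pp, before the shared headers }
    {$define FASTMOVE_DISABLE_SSE}
    {$include systemh.inc}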