Browse Source

+ fastmove from John O'Harrow integrated

florian 20 years ago
parent
commit
3600b51d32
2 changed files with 931 additions and 4 deletions
  1. 854 0
      rtl/i386/fastmove.inc
  2. 77 4
      rtl/i386/i386.inc

+ 854 - 0
rtl/i386/fastmove.inc

@@ -0,0 +1,854 @@
+{
+  $Id$
+  Copyright (c) 2004, John O'Harrow ([email protected])
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the
+use of this software.
+
+Permission is granted to anyone to use this software for any purpose, including
+commercial applications, and to alter it and redistribute it freely, subject to
+the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim
+   that you wrote the original software. If you use this software in a product,
+   an acknowledgment in the product documentation would be appreciated but is
+   not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source distribution.
+
+-------------------------------------------------------------------------------
+
+Version: 1.40 - 16-SEP-2004
+}
+
+
+{$if (FPC_VERSION>1) or ((FPC_RELEASE>=9) and (FPC_PATCH>6))}
+
+{$ifndef FPC_SYSTEM_HAS_MOVE}
+{$define FPC_SYSTEM_HAS_MOVE}
+
+{$asmmode intel}
+
+{-------------------------------------------------------------------------}
+{Pure Pascal move routine, kept to show that a good Pascal algorithm can
+ beat the default BASM.
+ Copies Count bytes from Source to Dest. Nothing is copied when Count <= 0 or
+ when Source and Dest are the same address. Counts of 1..4 are done with
+ single byte/word/dword stores. Larger counts are copied one dword at a
+ time, starting at the end when Dest lies above Source and at the start
+ otherwise; one dword saved before the loop is written last to cover the
+ end that the dword stride does not reach exactly.}
+procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
+var
+  SrcAddr, DstAddr            : PtrUInt;
+  SavedDWord, Span, FixupAddr : PtrInt;
+  Cursor                      : PPtrInt;
+begin
+  SrcAddr := Cardinal(@Source);
+  DstAddr := Cardinal(@Dest);
+  if SrcAddr = DstAddr then
+    Exit;
+  if Count <= 4 then
+    begin
+      case Count of
+        1 : PByte(@Dest)^ := PByte(SrcAddr)^;
+        2 : PWord(@Dest)^ := PWord(SrcAddr)^;
+        3 : if DstAddr > SrcAddr then
+              begin {Dest above Source: top byte first, then the low word}
+                PByte(Integer(@Dest)+2)^ := PByte(SrcAddr+2)^;
+                PWord(@Dest)^ := PWord(SrcAddr)^;
+              end
+            else
+              begin {Dest below Source: low word first, then the top byte}
+                PWord(@Dest)^ := PWord(SrcAddr)^;
+                PByte(Integer(@Dest)+2)^ := PByte(SrcAddr+2)^;
+              end;
+        4 : PInteger(@Dest)^ := PInteger(SrcAddr)^;
+      end; {Count <= 0 matches no label, so nothing is copied}
+      Exit;
+    end;
+  {From here on Count > 4}
+  Span := Count - 4;
+  if DstAddr > SrcAddr then
+    begin
+      {Walk downwards from the last dword; the first source dword is read
+       before anything is written and stored after the loop}
+      SavedDWord := PInteger(SrcAddr)^;
+      FixupAddr := Integer(@Dest);
+      Cursor := PInteger(Integer(@Dest) + Span);
+      Inc(SrcAddr, Span);
+      Cursor^ := PInteger(SrcAddr)^;
+      while Count > 8 do
+        begin
+          Dec(Count, 4);
+          Dec(SrcAddr, 4);
+          Dec(Cursor);
+          Cursor^ := PInteger(SrcAddr)^;
+        end;
+    end
+  else
+    begin
+      {Walk upwards from the first dword; the last source dword is read
+       before anything is written and stored after the loop}
+      SavedDWord := PInteger(SrcAddr + Cardinal(Span))^;
+      FixupAddr := Integer(@Dest) + Span;
+      Cursor := @Dest;
+      Cursor^ := PInteger(SrcAddr)^;
+      while Count > 8 do
+        begin
+          Dec(Count, 4);
+          Inc(SrcAddr, 4);
+          Inc(Cursor);
+          Cursor^ := PInteger(SrcAddr)^;
+        end;
+    end;
+  PInteger(FixupAddr)^ := SavedDWord;
+end; {MoveJOH_PAS}
+
+const
+  {Largest byte count handled directly by the SmallForwardMove_3 and
+   SmallBackwardMove_3 jump tables, which have one entry per count 0..36}
+  SMALLMOVESIZE = 36;
+
+{-------------------------------------------------------------------------}
+{Perform Forward Move of 0..36 Bytes}
+{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count.  Destroys ECX}
+{Dispatches through a 37-entry jump table indexed by ECX. Every @@FwdNN
+ label copies one dword at offset -NN and falls through to the label for
+ NN-4, so the bytes are moved from the lowest address upwards. Each chain
+ ends in a tail that copies the last 1..4 bytes (7 bytes for the @@Fwd07
+ chain). ECX is not range checked here: callers must pass 0..36.}
+procedure SmallForwardMove_3;assembler;nostackframe;
+asm
+  jmp     dword ptr @@FwdJumpTable[ecx*4]
+  align   16
+@@FwdJumpTable:
+  dd      @@Done {Removes need to test for zero size move}
+  dd      @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
+  dd      @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
+  dd      @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
+  dd      @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
+  dd      @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
+  {Counts that are a multiple of 4: dwords only}
+@@Fwd36:
+  mov     ecx,[eax-36]
+  mov     [edx-36],ecx
+@@Fwd32:
+  mov     ecx,[eax-32]
+  mov     [edx-32],ecx
+@@Fwd28:
+  mov     ecx,[eax-28]
+  mov     [edx-28],ecx
+@@Fwd24:
+  mov     ecx,[eax-24]
+  mov     [edx-24],ecx
+@@Fwd20:
+  mov     ecx,[eax-20]
+  mov     [edx-20],ecx
+@@Fwd16:
+  mov     ecx,[eax-16]
+  mov     [edx-16],ecx
+@@Fwd12:
+  mov     ecx,[eax-12]
+  mov     [edx-12],ecx
+@@Fwd08:
+  mov     ecx,[eax-8]
+  mov     [edx-8],ecx
+@@Fwd04:
+  mov     ecx,[eax-4]
+  mov     [edx-4],ecx
+  ret
+  {Counts of the form 4n+3 (7 and up): the chain ends with two dwords that overlap by one byte}
+@@Fwd35:
+  mov     ecx,[eax-35]
+  mov     [edx-35],ecx
+@@Fwd31:
+  mov     ecx,[eax-31]
+  mov     [edx-31],ecx
+@@Fwd27:
+  mov     ecx,[eax-27]
+  mov     [edx-27],ecx
+@@Fwd23:
+  mov     ecx,[eax-23]
+  mov     [edx-23],ecx
+@@Fwd19:
+  mov     ecx,[eax-19]
+  mov     [edx-19],ecx
+@@Fwd15:
+  mov     ecx,[eax-15]
+  mov     [edx-15],ecx
+@@Fwd11:
+  mov     ecx,[eax-11]
+  mov     [edx-11],ecx
+@@Fwd07:
+  mov     ecx,[eax-7]
+  mov     [edx-7],ecx
+  mov     ecx,[eax-4]
+  mov     [edx-4],ecx
+  ret
+  {Exactly 3 bytes: one word followed by one byte}
+@@Fwd03:
+  movzx   ecx, word ptr [eax-3]
+  mov     [edx-3],cx
+  movzx   ecx, byte ptr [eax-1]
+  mov     [edx-1],cl
+  ret
+  {Counts of the form 4n+2: dwords followed by a final word}
+@@Fwd34:
+  mov     ecx,[eax-34]
+  mov     [edx-34],ecx
+@@Fwd30:
+  mov     ecx,[eax-30]
+  mov     [edx-30],ecx
+@@Fwd26:
+  mov     ecx,[eax-26]
+  mov     [edx-26],ecx
+@@Fwd22:
+  mov     ecx,[eax-22]
+  mov     [edx-22],ecx
+@@Fwd18:
+  mov     ecx,[eax-18]
+  mov     [edx-18],ecx
+@@Fwd14:
+  mov     ecx,[eax-14]
+  mov     [edx-14],ecx
+@@Fwd10:
+  mov     ecx,[eax-10]
+  mov     [edx-10],ecx
+@@Fwd06:
+  mov     ecx,[eax-6]
+  mov     [edx-6],ecx
+@@Fwd02:
+  movzx   ecx, word ptr [eax-2]
+  mov     [edx-2],cx
+  ret
+  {Counts of the form 4n+1: dwords followed by a final byte}
+@@Fwd33:
+  mov     ecx,[eax-33]
+  mov     [edx-33],ecx
+@@Fwd29:
+  mov     ecx,[eax-29]
+  mov     [edx-29],ecx
+@@Fwd25:
+  mov     ecx,[eax-25]
+  mov     [edx-25],ecx
+@@Fwd21:
+  mov     ecx,[eax-21]
+  mov     [edx-21],ecx
+@@Fwd17:
+  mov     ecx,[eax-17]
+  mov     [edx-17],ecx
+@@Fwd13:
+  mov     ecx,[eax-13]
+  mov     [edx-13],ecx
+@@Fwd09:
+  mov     ecx,[eax-9]
+  mov     [edx-9],ecx
+@@Fwd05:
+  mov     ecx,[eax-5]
+  mov     [edx-5],ecx
+@@Fwd01:
+  movzx   ecx, byte ptr [eax-1]
+  mov     [edx-1],cl
+  {No explicit RET on this path: it relies on the exit code the compiler
+   emits at the end of the procedure - presumably a plain RET, to confirm}
+@@Done:
+end; {SmallForwardMove}
+
+{-------------------------------------------------------------------------}
+{Perform Backward Move of 0..36 Bytes}
+{On Entry, ECX = Count, EAX = Source, EDX = Dest.  Destroys ECX}
+{Dispatches through a 37-entry jump table indexed by ECX. Every @@BwdNN
+ label copies the dword at offset NN-4 and falls through to the label for
+ NN-4, so the bytes are moved from the highest address downwards. Each
+ chain ends in a tail that copies the first 1..4 bytes (7 bytes for the
+ @@Bwd07 chain). ECX is not range checked here: callers must pass 0..36.}
+procedure SmallBackwardMove_3;assembler;nostackframe;
+asm
+  jmp     dword ptr @@BwdJumpTable[ecx*4]
+  align   16
+@@BwdJumpTable:
+  dd      @@Done {Removes need to test for zero size move}
+  dd      @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
+  dd      @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
+  dd      @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
+  dd      @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
+  dd      @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
+  {Counts that are a multiple of 4: dwords only}
+@@Bwd36:
+  mov     ecx,[eax+32]
+  mov     [edx+32],ecx
+@@Bwd32:
+  mov     ecx,[eax+28]
+  mov     [edx+28],ecx
+@@Bwd28:
+  mov     ecx,[eax+24]
+  mov     [edx+24],ecx
+@@Bwd24:
+  mov     ecx,[eax+20]
+  mov     [edx+20],ecx
+@@Bwd20:
+  mov     ecx,[eax+16]
+  mov     [edx+16],ecx
+@@Bwd16:
+  mov     ecx,[eax+12]
+  mov     [edx+12],ecx
+@@Bwd12:
+  mov     ecx,[eax+8]
+  mov     [edx+8],ecx
+@@Bwd08:
+  mov     ecx,[eax+4]
+  mov     [edx+4],ecx
+@@Bwd04:
+  mov     ecx,[eax]
+  mov     [edx],ecx
+  ret
+  {Counts of the form 4n+3 (7 and up): the chain ends with two dwords that overlap by one byte}
+@@Bwd35:
+  mov     ecx,[eax+31]
+  mov     [edx+31],ecx
+@@Bwd31:
+  mov     ecx,[eax+27]
+  mov     [edx+27],ecx
+@@Bwd27:
+  mov     ecx,[eax+23]
+  mov     [edx+23],ecx
+@@Bwd23:
+  mov     ecx,[eax+19]
+  mov     [edx+19],ecx
+@@Bwd19:
+  mov     ecx,[eax+15]
+  mov     [edx+15],ecx
+@@Bwd15:
+  mov     ecx,[eax+11]
+  mov     [edx+11],ecx
+@@Bwd11:
+  mov     ecx,[eax+7]
+  mov     [edx+7],ecx
+@@Bwd07:
+  mov     ecx,[eax+3]
+  mov     [edx+3],ecx
+  mov     ecx,[eax]
+  mov     [edx],ecx
+  ret
+  {Exactly 3 bytes: the upper word followed by the first byte}
+@@Bwd03:
+  movzx   ecx, word ptr [eax+1]
+  mov     [edx+1],cx
+  movzx   ecx, byte ptr [eax]
+  mov     [edx],cl
+  ret
+  {Counts of the form 4n+2: dwords followed by the first word}
+@@Bwd34:
+  mov     ecx,[eax+30]
+  mov     [edx+30],ecx
+@@Bwd30:
+  mov     ecx,[eax+26]
+  mov     [edx+26],ecx
+@@Bwd26:
+  mov     ecx,[eax+22]
+  mov     [edx+22],ecx
+@@Bwd22:
+  mov     ecx,[eax+18]
+  mov     [edx+18],ecx
+@@Bwd18:
+  mov     ecx,[eax+14]
+  mov     [edx+14],ecx
+@@Bwd14:
+  mov     ecx,[eax+10]
+  mov     [edx+10],ecx
+@@Bwd10:
+  mov     ecx,[eax+6]
+  mov     [edx+6],ecx
+@@Bwd06:
+  mov     ecx,[eax+2]
+  mov     [edx+2],ecx
+@@Bwd02:
+  movzx   ecx, word ptr [eax]
+  mov     [edx],cx
+  ret
+  {Counts of the form 4n+1: dwords followed by the first byte}
+@@Bwd33:
+  mov     ecx,[eax+29]
+  mov     [edx+29],ecx
+@@Bwd29:
+  mov     ecx,[eax+25]
+  mov     [edx+25],ecx
+@@Bwd25:
+  mov     ecx,[eax+21]
+  mov     [edx+21],ecx
+@@Bwd21:
+  mov     ecx,[eax+17]
+  mov     [edx+17],ecx
+@@Bwd17:
+  mov     ecx,[eax+13]
+  mov     [edx+13],ecx
+@@Bwd13:
+  mov     ecx,[eax+9]
+  mov     [edx+9],ecx
+@@Bwd09:
+  mov     ecx,[eax+5]
+  mov     [edx+5],ecx
+@@Bwd05:
+  mov     ecx,[eax+1]
+  mov     [edx+1],ecx
+@@Bwd01:
+  movzx   ecx, byte ptr[eax]
+  mov     [edx],cl
+  {No explicit RET on this path: it relies on the exit code the compiler
+   emits at the end of the procedure - presumably a plain RET, to confirm}
+@@Done:
+end; {SmallBackwardMove}
+
+{-------------------------------------------------------------------------}
+{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
+{Ascending copy in 8-byte units through the x87 stack (FILD/FISTP qword).
+ The first source qword is loaded before anything is written and stored
+ last, which lets the loop start at the next 8-byte aligned destination
+ address. The loop moves 16 bytes per pass; the 0..15 bytes left at the end
+ are finished by SmallForwardMove_3. Needs two free x87 stack slots.
+ EBX is saved and restored.}
+procedure Forwards_IA32_3;assembler;nostackframe;
+asm
+  push    ebx
+  mov     ebx,edx
+  {EBX = original Dest; the first 8 source bytes stay on the x87 stack until after the loop}
+  fild    qword ptr [eax]
+  add     eax,ecx {QWORD Align Writes}
+  add     ecx,edx
+  add     edx,7
+  and     edx,-8
+  sub     ecx,edx
+  add     edx,ecx {Now QWORD Aligned}
+  {Here EAX = Source end, EDX = Dest end, ECX = bytes from the aligned Dest to the end}
+  sub     ecx,16
+  neg     ecx
+  {ECX is now a non-positive offset that climbs towards zero}
+@FwdLoop:
+  fild    qword ptr [eax+ecx-16]
+  fistp   qword ptr [edx+ecx-16]
+  fild    qword ptr [eax+ecx-8]
+  fistp   qword ptr [edx+ecx-8]
+  add     ecx,16
+  jle     @FwdLoop
+  {Store the saved first qword at the original (possibly unaligned) Dest}
+  fistp   qword ptr [ebx]
+  {Turn the loop offset (1..16) into the number of bytes still to copy (15..0)}
+  neg     ecx
+  add     ecx,16
+  pop     ebx
+  jmp     SmallForwardMove_3
+end; {Forwards_IA32}
+
+{-------------------------------------------------------------------------}
+{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
+{Descending copy in 8-byte units through the x87 stack (FILD/FISTP qword).
+ The last source qword is loaded before anything is written and stored
+ after the loop, which lets the loop work on 8-byte aligned destination
+ addresses. The loop moves 16 bytes per pass, loading both qwords before
+ storing either; the 0..15 bytes left at the start are finished by
+ SmallBackwardMove_3. Needs three free x87 stack slots. EBX is saved and
+ restored.}
+procedure Backwards_IA32_3;assembler;nostackframe;
+asm
+  push    ebx
+  {The last 8 source bytes stay on the x87 stack until after the loop}
+  fild    qword ptr [eax+ecx-8]
+  lea     ebx,[edx+ecx] {QWORD Align Writes}
+  and     ebx,7
+  sub     ecx,ebx
+  add     ebx,ecx {Now QWORD Aligned, EBX = Original Length}
+  sub     ecx,16
+  {ECX = offset of the current 16-byte chunk, counting down}
+@BwdLoop:
+  fild    qword ptr [eax+ecx]
+  fild    qword ptr [eax+ecx+8]
+  fistp   qword ptr [edx+ecx+8]
+  fistp   qword ptr [edx+ecx]
+  sub     ecx,16
+  jge     @BwdLoop
+  {Store the saved last qword at the original (possibly unaligned) end of Dest}
+  fistp   qword ptr [edx+ebx-8]
+  {Turn the loop offset (-16..-1) into the number of leading bytes still to copy (0..15)}
+  add     ecx,16
+  pop     ebx
+  jmp     SmallBackwardMove_3
+end; {Backwards_IA32}
+
+{-------------------------------------------------------------------------}
+{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
+{Ascending copy using MMX registers, with three size classes:
+   ECX < 72          tail-jumps to Forwards_IA32_3;
+   72 <= ECX < 1024  32 bytes per pass to 8-byte aligned destination
+                     addresses, then SmallForwardMove_3 for the last 0..31;
+   ECX >= LARGESIZE  destination is aligned to 16 bytes, the bulk is moved
+                     in 64-byte blocks, the rest with REP MOVSD/MOVSB.
+ EBX, ESI and EDI are saved and restored; EMMS is executed before leaving
+ the MMX code. The REP MOVS tail relies on the direction flag being clear
+ - presumably guaranteed by the calling convention, to confirm.}
+procedure Forwards_MMX_3;assembler;nostackframe;
+const
+  LARGESIZE = 1024;
+asm
+  cmp     ecx,LARGESIZE
+  jge     @FwdLargeMove
+  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
+  jl      Forwards_IA32_3
+  push    ebx
+  mov     ebx,edx
+  movq    mm0,[eax] {First 8 Characters}
+  {QWORD Align Writes}
+  add     eax,ecx
+  add     ecx,edx
+  add     edx,7
+  and     edx,-8
+  sub     ecx,edx
+  add     edx,ecx
+  {Now QWORD Aligned}
+  {Here EAX = Source end, EDX = Dest end, ECX = bytes from the aligned Dest to the end}
+  sub     ecx,32
+  neg     ecx
+@FwdLoopMMX:
+  movq    mm1,[eax+ecx-32]
+  movq    mm2,[eax+ecx-24]
+  movq    mm3,[eax+ecx-16]
+  movq    mm4,[eax+ecx- 8]
+  movq    [edx+ecx-32],mm1
+  movq    [edx+ecx-24],mm2
+  movq    [edx+ecx-16],mm3
+  movq    [edx+ecx- 8],mm4
+  add     ecx,32
+  jle     @FwdLoopMMX
+  movq    [ebx],mm0 {First 8 Characters}
+  emms
+  pop     ebx
+  {Turn the loop offset (1..32) into the number of bytes still to copy (31..0)}
+  neg     ecx
+  add     ecx,32
+  jmp     SmallForwardMove_3
+@FwdLargeMove:
+  push    ebx
+  mov     ebx,ecx
+  {EBX = bytes still to move}
+  test    edx,15
+  jz      @FwdAligned
+  {16 byte Align Destination}
+  mov     ecx,edx
+  add     ecx,15
+  and     ecx,-16
+  sub     ecx,edx
+  {ECX = 1..15 bytes needed to reach the next 16-byte boundary}
+  add     eax,ecx
+  add     edx,ecx
+  sub     ebx,ecx
+  {Destination now 16 Byte Aligned}
+  call    SmallForwardMove_3
+@FwdAligned:
+  mov     ecx,ebx
+  and     ecx,-16
+  sub     ebx,ecx {EBX = Remainder}
+  push    esi
+  push    edi
+  mov     esi,eax          {ESI = Source}
+  mov     edi,edx          {EDI = Dest}
+  mov     eax,ecx          {EAX = Count}
+  and     eax,-64          {EAX = No of Bytes to Blocks Moves}
+  and     ecx,$3F          {ECX = Remaining Bytes to Move (0..63)}
+  add     esi,eax
+  add     edi,eax
+  shr     eax,3            {EAX = No of QWORD's to Block Move}
+  neg     eax
+  {EAX is a negative qword index that climbs to zero; ESI/EDI point just past the block area}
+@MMXcopyloop:
+  movq    mm0,[esi+eax*8   ]
+  movq    mm1,[esi+eax*8+ 8]
+  movq    mm2,[esi+eax*8+16]
+  movq    mm3,[esi+eax*8+24]
+  movq    mm4,[esi+eax*8+32]
+  movq    mm5,[esi+eax*8+40]
+  movq    mm6,[esi+eax*8+48]
+  movq    mm7,[esi+eax*8+56]
+  movq    [edi+eax*8   ],mm0
+  movq    [edi+eax*8+ 8],mm1
+  movq    [edi+eax*8+16],mm2
+  movq    [edi+eax*8+24],mm3
+  movq    [edi+eax*8+32],mm4
+  movq    [edi+eax*8+40],mm5
+  movq    [edi+eax*8+48],mm6
+  movq    [edi+eax*8+56],mm7
+  add     eax,8
+  jnz     @MMXcopyloop
+  emms                   {Empty MMX State}
+  {Left over: ECX (0..48, a multiple of 16) plus EBX (0..15) bytes}
+  add     ecx,ebx
+  shr     ecx,2
+  rep     movsd
+  mov     ecx,ebx
+  and     ecx,3
+  rep     movsb
+  pop     edi
+  pop     esi
+  pop     ebx
+end; {Forwards_MMX}
+
+{-------------------------------------------------------------------------}
+{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
+{Descending copy using MMX registers. Counts below 72 tail-jump to
+ Backwards_IA32_3. Otherwise the last source qword is saved in MM0, the loop
+ moves 32 bytes per pass to 8-byte aligned destination addresses (all four
+ loads happen before the first store), the saved qword is written to the
+ end of Dest, and the 0..31 leading bytes are finished by
+ SmallBackwardMove_3. EBX is saved and restored; EMMS is executed before
+ leaving.}
+procedure Backwards_MMX_3;assembler;nostackframe;
+asm
+  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
+  jl      Backwards_IA32_3
+  push    ebx
+  movq    mm0,[eax+ecx-8] {Get Last QWORD}
+  {QWORD Align Writes}
+  lea     ebx,[edx+ecx]
+  and     ebx,7
+  sub     ecx,ebx
+  add     ebx,ecx
+  {Now QWORD Aligned}
+  {EBX = original length, ECX = length rounded so that Dest+ECX is 8-byte aligned}
+  sub     ecx,32
+@BwdLoopMMX:
+  movq    mm1,[eax+ecx   ]
+  movq    mm2,[eax+ecx+ 8]
+  movq    mm3,[eax+ecx+16]
+  movq    mm4,[eax+ecx+24]
+  movq    [edx+ecx+24],mm4
+  movq    [edx+ecx+16],mm3
+  movq    [edx+ecx+ 8],mm2
+  movq    [edx+ecx   ],mm1
+  sub     ecx,32
+  jge     @BwdLoopMMX
+  movq    [edx+ebx-8], mm0 {Last QWORD}
+  emms
+  {Turn the loop offset (-32..-1) into the number of leading bytes still to copy (0..31)}
+  add     ecx,32
+  pop     ebx
+  jmp     SmallBackwardMove_3
+end; {Backwards_MMX}
+
+{-------------------------------------------------------------------------}
+{Dest MUST be 16-Bytes Aligned, Count MUST be multiple of 16 }
+{Ascending SSE copy of Count bytes from Source (EAX) to Dest (EDX), with
+ Count in ECX.
+ The bulk is moved in 128-byte blocks by one of four loops, chosen by block
+ size and source alignment:
+   up to 256K, source aligned      MOVAPS loads, MOVAPS stores
+   up to 256K, source unaligned    MOVUPS loads, MOVAPS stores
+   above 256K, source aligned      PREFETCHNTA, MOVAPS loads, MOVNTPS stores
+   above 256K, source unaligned    PREFETCHNTA, MOVUPS loads, MOVNTPS stores
+ The non-temporal loops end with SFENCE. The last 0..112 bytes are moved
+ 16 at a time. Counts below 128 go straight to that remainder loop.
+ Changes EAX, ECX, EDX and XMM0..XMM7; ESI is saved and restored.}
+procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
+const
+  Prefetch = 512;
+asm
+  push    esi
+  mov     esi,eax             {ESI = Source}
+  mov     eax,ecx             {EAX = Count}
+  and     eax,-128            {EAX = No of Bytes to Block Move}
+  {The block loops test their counter only after a full pass, so with no
+   complete 128-byte block they must be skipped, or 128 bytes too many
+   would be copied}
+  jz      @Remainder
+  add     esi,eax
+  add     edx,eax
+  shr     eax,3               {EAX = No of QWORD's to Block Move}
+  neg     eax
+  {EAX is a negative qword index that climbs to zero; ESI/EDX point just past the block area}
+  cmp     eax, -(32*1024)     {Count > 256K}
+  jl      @Large
+@Small: {Count<=256K}
+  test    esi,15              {Check if Both Source/Dest Aligned}
+  jnz     @SmallUnaligned
+@SmallAligned:                {Both Source and Dest 16-Byte Aligned}
+@SmallAlignedLoop:
+  movaps  xmm0,[esi+8*eax]
+  movaps  xmm1,[esi+8*eax+16]
+  movaps  xmm2,[esi+8*eax+32]
+  movaps  xmm3,[esi+8*eax+48]
+  movaps  [edx+8*eax],xmm0
+  movaps  [edx+8*eax+16],xmm1
+  movaps  [edx+8*eax+32],xmm2
+  movaps  [edx+8*eax+48],xmm3
+  movaps  xmm4,[esi+8*eax+64]
+  movaps  xmm5,[esi+8*eax+80]
+  movaps  xmm6,[esi+8*eax+96]
+  movaps  xmm7,[esi+8*eax+112]
+  movaps  [edx+8*eax+64],xmm4
+  movaps  [edx+8*eax+80],xmm5
+  movaps  [edx+8*eax+96],xmm6
+  movaps  [edx+8*eax+112],xmm7
+  add     eax,16
+  js      @SmallAlignedLoop
+  jmp     @Remainder
+@SmallUnaligned:              {Source Not 16-Byte Aligned}
+@SmallUnalignedLoop:
+  movups  xmm0,[esi+8*eax]
+  movups  xmm1,[esi+8*eax+16]
+  movups  xmm2,[esi+8*eax+32]
+  movups  xmm3,[esi+8*eax+48]
+  movaps  [edx+8*eax],xmm0
+  movaps  [edx+8*eax+16],xmm1
+  movaps  [edx+8*eax+32],xmm2
+  movaps  [edx+8*eax+48],xmm3
+  movups  xmm4,[esi+8*eax+64]
+  movups  xmm5,[esi+8*eax+80]
+  movups  xmm6,[esi+8*eax+96]
+  movups  xmm7,[esi+8*eax+112]
+  movaps  [edx+8*eax+64],xmm4
+  movaps  [edx+8*eax+80],xmm5
+  movaps  [edx+8*eax+96],xmm6
+  movaps  [edx+8*eax+112],xmm7
+  add     eax,16
+  js      @SmallUnalignedLoop
+  jmp     @Remainder
+@Large: {Count>256K}
+  test    esi,15              {Check if Both Source/Dest Aligned}
+  jnz     @LargeUnaligned
+@LargeAligned:                {Both Source and Dest 16-Byte Aligned}
+@LargeAlignedLoop:
+  prefetchnta  [esi+8*eax+Prefetch]
+  prefetchnta  [esi+8*eax+Prefetch+64]
+  movaps  xmm0,[esi+8*eax]
+  movaps  xmm1,[esi+8*eax+16]
+  movaps  xmm2,[esi+8*eax+32]
+  movaps  xmm3,[esi+8*eax+48]
+  movntps [edx+8*eax],xmm0
+  movntps [edx+8*eax+16],xmm1
+  movntps [edx+8*eax+32],xmm2
+  movntps [edx+8*eax+48],xmm3
+  movaps  xmm4,[esi+8*eax+64]
+  movaps  xmm5,[esi+8*eax+80]
+  movaps  xmm6,[esi+8*eax+96]
+  movaps  xmm7,[esi+8*eax+112]
+  movntps [edx+8*eax+64],xmm4
+  movntps [edx+8*eax+80],xmm5
+  movntps [edx+8*eax+96],xmm6
+  movntps [edx+8*eax+112],xmm7
+  add     eax,16
+  js      @LargeAlignedLoop
+  sfence
+  jmp     @Remainder
+@LargeUnaligned:              {Source Not 16-Byte Aligned}
+@LargeUnalignedLoop:
+  prefetchnta  [esi+8*eax+Prefetch]
+  prefetchnta  [esi+8*eax+Prefetch+64]
+  movups  xmm0,[esi+8*eax]
+  movups  xmm1,[esi+8*eax+16]
+  movups  xmm2,[esi+8*eax+32]
+  movups  xmm3,[esi+8*eax+48]
+  movntps [edx+8*eax],xmm0
+  movntps [edx+8*eax+16],xmm1
+  movntps [edx+8*eax+32],xmm2
+  movntps [edx+8*eax+48],xmm3
+  movups  xmm4,[esi+8*eax+64]
+  movups  xmm5,[esi+8*eax+80]
+  movups  xmm6,[esi+8*eax+96]
+  movups  xmm7,[esi+8*eax+112]
+  movntps [edx+8*eax+64],xmm4
+  movntps [edx+8*eax+80],xmm5
+  movntps [edx+8*eax+96],xmm6
+  movntps [edx+8*eax+112],xmm7
+  add     eax,16
+  js      @LargeUnalignedLoop
+  sfence
+@Remainder:
+  and     ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
+  jz      @Done
+  add     esi,ecx
+  add     edx,ecx
+  neg     ecx
+  {ECX is a negative byte offset that climbs to zero}
+@RemainderLoop:
+  movups  xmm0,[esi+ecx]
+  movaps  [edx+ecx],xmm0
+  add     ecx,16
+  jnz     @RemainderLoop
+@Done:
+  pop     esi
+end; {AlignedFwdMoveSSE}
+
+{-------------------------------------------------------------------------}
+{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
+{Ascending copy using SSE registers, with three size classes:
+   ECX <= 68         first 32 bytes with two unaligned 16-byte moves, the
+                     rest (ECX-32 bytes) through SmallForwardMove_3;
+   68 < ECX < 2048   first 16 bytes saved in XMM0 and stored last, 32 bytes
+                     per pass to 16-byte aligned destination addresses,
+                     then SmallForwardMove_3 for the last 0..31 bytes;
+   ECX >= LARGESIZE  destination aligned to 16 bytes with
+                     SmallForwardMove_3, bulk through AlignedFwdMoveSSE_3,
+                     last 0..15 bytes through SmallForwardMove_3.
+ EBX is saved and restored.}
+procedure Forwards_SSE_3;assembler;nostackframe;
+const
+  LARGESIZE = 2048;
+asm
+  cmp     ecx,LARGESIZE
+  jge     @FwdLargeMove
+  cmp     ecx,SMALLMOVESIZE+32
+  {The load sits between CMP and JG; MOVUPS leaves the flags untouched}
+  movups  xmm0,[eax]
+  jg      @FwdMoveSSE
+  movups  xmm1,[eax+16]
+  movups  [edx],xmm0
+  movups  [edx+16],xmm1
+  add     eax,ecx
+  add     edx,ecx
+  sub     ecx,32
+  jmp     SmallForwardMove_3
+@FwdMoveSSE:
+  push    ebx
+  mov     ebx,edx
+  {Align Writes}
+  add     eax,ecx
+  add     ecx,edx
+  add     edx,15
+  and     edx,-16
+  sub     ecx,edx
+  add     edx,ecx
+  {Now Aligned}
+  {Here EAX = Source end, EDX = Dest end, ECX = bytes from the aligned Dest to the end}
+  sub     ecx,32
+  neg     ecx
+@FwdLoopSSE:
+  movups  xmm1,[eax+ecx-32]
+  movups  xmm2,[eax+ecx-16]
+  movaps  [edx+ecx-32],xmm1
+  movaps  [edx+ecx-16],xmm2
+  add     ecx,32
+  jle     @FwdLoopSSE
+  movups  [ebx],xmm0 {First 16 Bytes}
+  {Turn the loop offset (1..32) into the number of bytes still to copy (31..0)}
+  neg     ecx
+  add     ecx,32
+  pop     ebx
+  jmp     SmallForwardMove_3
+@FwdLargeMove:
+  push    ebx
+  mov     ebx,ecx
+  {EBX = bytes still to move}
+  test    edx,15
+  jz      @FwdLargeAligned
+  {16 byte Align Destination}
+  mov     ecx,edx
+  add     ecx,15
+  and     ecx,-16
+  sub     ecx,edx
+  {ECX = 1..15 bytes needed to reach the next 16-byte boundary}
+  add     eax,ecx
+  add     edx,ecx
+  sub     ebx,ecx
+  {Destination now 16 Byte Aligned}
+  call    SmallForwardMove_3
+  mov     ecx,ebx
+@FwdLargeAligned:
+  and     ecx,-16
+  sub     ebx,ecx {EBX = Remainder}
+  {Keep Source, Dest and the block size across the call}
+  push    edx
+  push    eax
+  push    ecx
+  call    AlignedFwdMoveSSE_3
+  pop     ecx
+  pop     eax
+  pop     edx
+  {Advance both pointers to the very end and finish the last EBX (0..15) bytes}
+  add     ecx,ebx
+  add     eax,ecx
+  add     edx,ecx
+  mov     ecx,ebx
+  pop     ebx
+  jmp     SmallForwardMove_3
+end; {Forwards_SSE}
+
+{-------------------------------------------------------------------------}
+{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
+{Descending copy using SSE registers.
+   ECX <= 68  the last 32 bytes are moved with two unaligned 16-byte moves
+              (both loaded before either is stored), the first ECX-32
+              bytes through SmallBackwardMove_3;
+   ECX > 68   the last 16 source bytes are saved in XMM0, the loop moves 32
+              bytes per pass to 16-byte aligned destination addresses, the
+              saved bytes are written to the end of Dest, and the 0..31
+              leading bytes are finished by SmallBackwardMove_3.
+ EBX is saved and restored.}
+procedure Backwards_SSE_3;assembler;nostackframe;
+asm
+  cmp     ecx,SMALLMOVESIZE+32
+  jg      @BwdMoveSSE
+  sub     ecx,32
+  movups  xmm1,[eax+ecx]
+  movups  xmm2,[eax+ecx+16]
+  movups  [edx+ecx],xmm1
+  movups  [edx+ecx+16],xmm2
+  jmp     SmallBackwardMove_3
+@BwdMoveSSE:
+  push    ebx
+  movups  xmm0,[eax+ecx-16] {Last 16 Bytes}
+  {Align Writes}
+  lea     ebx,[edx+ecx]
+  and     ebx,15
+  sub     ecx,ebx
+  add     ebx,ecx
+  {Now Aligned}
+  {EBX = original length, ECX = length rounded so that Dest+ECX is 16-byte aligned}
+  sub     ecx,32
+@BwdLoop:
+  movups  xmm1,[eax+ecx]
+  movups  xmm2,[eax+ecx+16]
+  movaps  [edx+ecx],xmm1
+  movaps  [edx+ecx+16],xmm2
+  sub     ecx,32
+  jge     @BwdLoop
+  movups  [edx+ebx-16],xmm0  {Last 16 Bytes}
+  {Turn the loop offset (-32..-1) into the number of leading bytes still to copy (0..31)}
+  add     ecx,32
+  pop     ebx
+  jmp     SmallBackwardMove_3
+end; {Backwards_SSE}
+
+const
+   { Routines that Move jumps to for counts above SMALLMOVESIZE.
+     They start out as the plain IA32 versions, which run on every i386, so
+     that Move is safe before setup_fastmove has run and on CPUs that have
+     neither SSE nor MMX. setup_fastmove replaces them with the MMX or SSE
+     versions when the CPU supports those. }
+   fastmoveproc_forward : pointer = @Forwards_IA32_3;
+   fastmoveproc_backward : pointer = @Backwards_IA32_3;
+
+{ Copies count bytes from source to dest; the two ranges may overlap.
+  On entry EAX = address of source, EDX = address of dest, ECX = count.
+  Nothing is copied when count <= 0 or when source and dest are the same
+  address. Counts up to SMALLMOVESIZE go straight to the small-move jump
+  tables; larger counts go through fastmoveproc_forward, or through
+  fastmoveproc_backward when dest lies inside the source range.
+  Addresses are compared as unsigned values, so that buffers lying above or
+  across the 2 GB boundary are ordered correctly. }
+procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
+asm
+  cmp     ecx,SMALLMOVESIZE
+  ja      @Large {Unsigned compare, so negative counts end up here as well}
+  cmp     eax,edx
+  lea     eax,[eax+ecx] {LEA keeps the flags of the CMP above}
+  jbe     @SmallCheck {Source <= Dest (unsigned)}
+@SmallForward:
+  add     edx,ecx
+  jmp     SmallForwardMove_3
+@SmallCheck:
+  je      @Done {For Compatibility with Delphi's move for Source = Dest}
+  sub     eax,ecx
+  jmp     SmallBackwardMove_3
+@Large:
+  {Flags still come from "cmp ecx,SMALLMOVESIZE"}
+  jng     @Done {For Compatibility with Delphi's move for Count < 0}
+  cmp     eax,edx
+  ja      @moveforward {Source > Dest (unsigned): ascending copy is safe}
+  je      @Done {For Compatibility with Delphi's move for Source = Dest}
+  {Source < Dest: descend only when Dest starts inside the source range}
+  push    eax
+  add     eax,ecx
+  cmp     eax,edx
+  pop     eax
+  ja      @movebackward {Source+Count > Dest (unsigned)}
+@moveforward:
+  jmp     dword ptr fastmoveproc_forward
+@movebackward:
+  jmp     dword ptr fastmoveproc_backward {Source/Dest Overlap}
+@Done:
+end;
+
+{$asmmode att}
+
+{ Selects the routines used by Move for counts above SMALLMOVESIZE from the
+  has_sse_support and has_mmx_support flags: SSE when available, otherwise
+  MMX, otherwise the plain IA32 versions that run on every i386. }
+procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
+  begin
+    if has_sse_support then
+      begin
+        fastmoveproc_forward:=@Forwards_SSE_3;
+        fastmoveproc_backward:=@Backwards_SSE_3;
+      end
+    else if has_mmx_support then
+      begin
+        fastmoveproc_forward:=@Forwards_MMX_3;
+        fastmoveproc_backward:=@Backwards_MMX_3;
+      end
+    else
+      begin
+        { neither SSE nor MMX: never leave an SSE/MMX routine selected }
+        fastmoveproc_forward:=@Forwards_IA32_3;
+        fastmoveproc_backward:=@Backwards_IA32_3;
+      end;
+  end;
+
+{$endif  FPC_SYSTEM_HAS_MOVE}
+
+{$else}
+
+{ Stub used when the compiler version check at the top of this file fails:
+  the fast move routines are not compiled in, so there is nothing to set up. }
+procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
+begin
+  { intentionally empty }
+end;
+
+{$endif}

+ 77 - 4
rtl/i386/i386.inc

@@ -15,14 +15,85 @@
 
  **********************************************************************}
 
-{$asmmode ATT}
-
 {****************************************************************************
                                Primitives
 ****************************************************************************}
+var
+  { CPU feature flags: assigned in fpc_cpuinit from sse_support and
+    mmx_support, read by setup_fastmove }
+  has_sse_support,has_mmx_support : boolean;
+
+{$asmmode intel}
+
+function cpuid_support : boolean;assembler;
+  {
+    Check if the ID-flag can be changed, if changed then CpuID is supported.
+    Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
+
+    Returns true in AL when bit 21 (200000h) of EFLAGS could be toggled.
+    The original EFLAGS value is restored before returning. Only the
+    scratch registers EAX and ECX are used, so no callee-saved register
+    (EBX in particular) has to be preserved.
+  }
+  asm
+     pushf                      { saved copy, restored by the last popf }
+     pushf
+     pop     eax
+     mov     ecx,eax            { ECX = original flags }
+     xor     eax,200000h        { flip the ID flag }
+     push    eax
+     popf
+     pushf
+     pop     eax                { EAX = flags as the CPU accepted them }
+     popf
+     and     eax,200000h
+     and     ecx,200000h
+     cmp     eax,ecx
+     setnz   al                 { ID flag changed => CPUID available }
+  end;
+
+{$asmmode ATT}
+
+{ returns true, if the processor supports the sse instructions }
+function sse_support : boolean;
+  var
+     _edx : longint;
+  begin
+    if cpuid_support then
+     begin
+        { CPUID function 1 returns the feature flags in EDX. CPUID overwrites
+          EAX, EBX, ECX and EDX, so all four are declared as modified. }
+        asm
+           movl $1,%eax
+           cpuid
+           movl %edx,_edx
+        end ['EAX','EBX','ECX','EDX'];
+        { bit 25 of the feature flags: SSE }
+        sse_support:=(_edx and $2000000)<>0;
+     end
+    else
+     { a cpu without the cpuid instruction never supports sse }
+     sse_support:=false;
+  end;
+
+
+{ returns true, if the processor supports the mmx instructions }
+function mmx_support : boolean;
+
+  var
+     _edx : longint;
+
+  begin
+     if cpuid_support then
+       begin
+          { CPUID function 1 returns the feature flags in EDX. CPUID overwrites
+            EAX, EBX, ECX and EDX, so all four are declared as modified. }
+          asm
+             movl $1,%eax
+             cpuid
+             movl %edx,_edx
+          end ['EAX','EBX','ECX','EDX'];
+          { bit 23 of the feature flags: MMX }
+          mmx_support:=(_edx and $800000)<>0;
+       end
+     else
+       { a cpu without the cpuid instruction never supports mmx }
+       mmx_support:=false;
+  end;
+
+{$i fastmove.inc}
 
 procedure fpc_cpuinit;
 begin
+  { Probe the CPU first: setup_fastmove reads both flags to choose the
+    routines used by Move }
+  has_sse_support:=sse_support;
+  has_mmx_support:=mmx_support;
+  setup_fastmove;
 end;
 
 
@@ -32,7 +103,6 @@ asm
   ret
 end;
 
-
 {$ifndef FPC_SYSTEM_HAS_MOVE}
 {$define FPC_SYSTEM_HAS_MOVE}
 procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
@@ -1510,7 +1580,10 @@ end;
 
 {
   $Log$
-  Revision 1.66  2004-11-17 22:19:04  peter
+  Revision 1.67  2005-01-23 20:03:23  florian
+    + fastmove from John O'Harrow integrated
+
+  Revision 1.66  2004/11/17 22:19:04  peter
   internconst, internproc and some external declarations moved to interface
 
   Revision 1.65  2004/11/01 12:43:29  peter