12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907 |
- {
- Copyright (c) 2004, John O'Harrow ([email protected])
- This software is provided 'as-is', without any express or implied warranty.
- In no event will the authors be held liable for any damages arising from the
- use of this software.
- Permission is granted to anyone to use this software for any purpose, including
- commercial applications, and to alter it and redistribute it freely, subject to
- the following restrictions:
- 1. The origin of this software must not be misrepresented; you must not claim
- that you wrote the original software. If you use this software in a product,
- an acknowledgment in the product documentation would be appreciated but is
- not required.
- 2. Altered source versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source distribution.
- -------------------------------------------------------------------------------
- Version: 1.40 - 16-SEP-2004
- }
- {$ifdef USE_FASTMOVE}
- {$ifndef FPC_SYSTEM_HAS_MOVE}
- {$define FPC_SYSTEM_HAS_MOVE}
- {$asmmode intel}
- {-------------------------------------------------------------------------}
- (*
- {Just to show that a good Pascal algorithm can beat the default BASM}
- procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
- var
- S, D : PtrUInt;
- Temp, C, I : PtrInt;
- L : PPtrInt;
- begin
- S := Cardinal(@Source);
- D := Cardinal(@Dest);
- if S = D then
- Exit;
- if Count <= 4 then
- case Count of
- 1 : PByte(@Dest)^ := PByte(S)^;
- 2 : PWord(@Dest)^ := PWord(S)^;
- 3 : if D > S then
- begin
- PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
- PWord(@Dest)^ := PWord(S)^;
- end
- else
- begin
- PWord(@Dest)^ := PWord(S)^;
- PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
- end;
- 4 : PInteger(@Dest)^ := PInteger(S)^
- else Exit; {Count <= 0}
- end
- else
- if D > S then
- begin
- Temp := PInteger(S)^;
- I := Integer(@Dest);
- C := Count - 4;
- L := PInteger(Integer(@Dest) + C);
- Inc(S, C);
- repeat
- L^ := PInteger(S)^;
- if Count <= 8 then
- Break;
- Dec(Count, 4);
- Dec(S, 4);
- Dec(L);
- until False;
- PInteger(I)^ := Temp;
- end
- else
- begin
- C := Count - 4;
- Temp := PInteger(S + Cardinal(C))^;
- I := Integer(@Dest) + C;
- L := @Dest;
- repeat
- L^ := PInteger(S)^;
- if Count <= 8 then
- Break;
- Dec(Count, 4);
- Inc(S, 4);
- Inc(L);
- until False;
- PInteger(I)^ := Temp;
- end;
- end; {MoveJOH_PAS}
- *)
- const
- {Largest byte count served by the SmallForwardMove_3 / SmallBackwardMove_3 jump tables (entries 0..36).}
- {Move routes any larger count to the routines selected through fastmoveproc_forward / fastmoveproc_backward.}
- SMALLMOVESIZE = 36;
- {-------------------------------------------------------------------------}
- {Perform Forward Move of 0..36 Bytes}
- {On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX}
- {Dispatches through a 37-entry jump table indexed by ECX. ECX is not range checked here, so the caller must guarantee 0 <= ECX <= 36.}
- {Each entry label copies one dword and falls through to the label that is four bytes smaller, so every chain is fully unrolled.}
- {All offsets are negative because EAX and EDX point just past the end of the data; bytes are transferred lowest address first.}
- procedure SmallForwardMove_3;assembler;nostackframe;
- asm
- jmp dword ptr @@FwdJumpTable[ecx*4]
- align 16
- @@FwdJumpTable:
- dd @@Done {Removes need to test for zero size move}
- dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
- dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
- dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
- dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
- dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
- {Counts 36,32,..,4: whole dwords only}
- @@Fwd36:
- mov ecx,[eax-36]
- mov [edx-36],ecx
- @@Fwd32:
- mov ecx,[eax-32]
- mov [edx-32],ecx
- @@Fwd28:
- mov ecx,[eax-28]
- mov [edx-28],ecx
- @@Fwd24:
- mov ecx,[eax-24]
- mov [edx-24],ecx
- @@Fwd20:
- mov ecx,[eax-20]
- mov [edx-20],ecx
- @@Fwd16:
- mov ecx,[eax-16]
- mov [edx-16],ecx
- @@Fwd12:
- mov ecx,[eax-12]
- mov [edx-12],ecx
- @@Fwd08:
- mov ecx,[eax-8]
- mov [edx-8],ecx
- @@Fwd04:
- mov ecx,[eax-4]
- mov [edx-4],ecx
- ret
- {Counts 35,31,..,7: dwords, then a final dword that overlaps the previous one by one byte}
- @@Fwd35:
- mov ecx,[eax-35]
- mov [edx-35],ecx
- @@Fwd31:
- mov ecx,[eax-31]
- mov [edx-31],ecx
- @@Fwd27:
- mov ecx,[eax-27]
- mov [edx-27],ecx
- @@Fwd23:
- mov ecx,[eax-23]
- mov [edx-23],ecx
- @@Fwd19:
- mov ecx,[eax-19]
- mov [edx-19],ecx
- @@Fwd15:
- mov ecx,[eax-15]
- mov [edx-15],ecx
- @@Fwd11:
- mov ecx,[eax-11]
- mov [edx-11],ecx
- @@Fwd07:
- mov ecx,[eax-7]
- mov [edx-7],ecx
- mov ecx,[eax-4]
- mov [edx-4],ecx
- ret
- {Count 3: one word followed by one byte}
- @@Fwd03:
- movzx ecx, word ptr [eax-3]
- mov [edx-3],cx
- movzx ecx, byte ptr [eax-1]
- mov [edx-1],cl
- ret
- {Counts 34,30,..,6,2: dwords followed by a trailing word}
- @@Fwd34:
- mov ecx,[eax-34]
- mov [edx-34],ecx
- @@Fwd30:
- mov ecx,[eax-30]
- mov [edx-30],ecx
- @@Fwd26:
- mov ecx,[eax-26]
- mov [edx-26],ecx
- @@Fwd22:
- mov ecx,[eax-22]
- mov [edx-22],ecx
- @@Fwd18:
- mov ecx,[eax-18]
- mov [edx-18],ecx
- @@Fwd14:
- mov ecx,[eax-14]
- mov [edx-14],ecx
- @@Fwd10:
- mov ecx,[eax-10]
- mov [edx-10],ecx
- @@Fwd06:
- mov ecx,[eax-6]
- mov [edx-6],ecx
- @@Fwd02:
- movzx ecx, word ptr [eax-2]
- mov [edx-2],cx
- ret
- {Counts 33,29,..,5,1: dwords followed by a trailing byte}
- @@Fwd33:
- mov ecx,[eax-33]
- mov [edx-33],ecx
- @@Fwd29:
- mov ecx,[eax-29]
- mov [edx-29],ecx
- @@Fwd25:
- mov ecx,[eax-25]
- mov [edx-25],ecx
- @@Fwd21:
- mov ecx,[eax-21]
- mov [edx-21],ecx
- @@Fwd17:
- mov ecx,[eax-17]
- mov [edx-17],ecx
- @@Fwd13:
- mov ecx,[eax-13]
- mov [edx-13],ecx
- @@Fwd09:
- mov ecx,[eax-9]
- mov [edx-9],ecx
- @@Fwd05:
- mov ecx,[eax-5]
- mov [edx-5],ecx
- @@Fwd01:
- movzx ecx, byte ptr [eax-1]
- mov [edx-1],cl
- @@Done:
- end; {SmallForwardMove}
- {-------------------------------------------------------------------------}
- {Perform Backward Move of 0..36 Bytes}
- {On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX}
- {Dispatches through a 37-entry jump table indexed by ECX. ECX is not range checked here, so the caller must guarantee 0 <= ECX <= 36.}
- {Each entry label copies the highest remaining dword and falls through to the label that is four bytes smaller.}
- {Offsets are non-negative because EAX and EDX point at the start of the data; bytes are transferred highest address first.}
- procedure SmallBackwardMove_3;assembler;nostackframe;
- asm
- jmp dword ptr @@BwdJumpTable[ecx*4]
- align 16
- @@BwdJumpTable:
- dd @@Done {Removes need to test for zero size move}
- dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
- dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
- dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
- dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
- dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
- {Counts 36,32,..,4: whole dwords only}
- @@Bwd36:
- mov ecx,[eax+32]
- mov [edx+32],ecx
- @@Bwd32:
- mov ecx,[eax+28]
- mov [edx+28],ecx
- @@Bwd28:
- mov ecx,[eax+24]
- mov [edx+24],ecx
- @@Bwd24:
- mov ecx,[eax+20]
- mov [edx+20],ecx
- @@Bwd20:
- mov ecx,[eax+16]
- mov [edx+16],ecx
- @@Bwd16:
- mov ecx,[eax+12]
- mov [edx+12],ecx
- @@Bwd12:
- mov ecx,[eax+8]
- mov [edx+8],ecx
- @@Bwd08:
- mov ecx,[eax+4]
- mov [edx+4],ecx
- @@Bwd04:
- mov ecx,[eax]
- mov [edx],ecx
- ret
- {Counts 35,31,..,7: dwords, then a final dword at offset 0 that overlaps the previous one by one byte}
- @@Bwd35:
- mov ecx,[eax+31]
- mov [edx+31],ecx
- @@Bwd31:
- mov ecx,[eax+27]
- mov [edx+27],ecx
- @@Bwd27:
- mov ecx,[eax+23]
- mov [edx+23],ecx
- @@Bwd23:
- mov ecx,[eax+19]
- mov [edx+19],ecx
- @@Bwd19:
- mov ecx,[eax+15]
- mov [edx+15],ecx
- @@Bwd15:
- mov ecx,[eax+11]
- mov [edx+11],ecx
- @@Bwd11:
- mov ecx,[eax+7]
- mov [edx+7],ecx
- @@Bwd07:
- mov ecx,[eax+3]
- mov [edx+3],ecx
- mov ecx,[eax]
- mov [edx],ecx
- ret
- {Count 3: the upper word first, then the first byte}
- @@Bwd03:
- movzx ecx, word ptr [eax+1]
- mov [edx+1],cx
- movzx ecx, byte ptr [eax]
- mov [edx],cl
- ret
- {Counts 34,30,..,6,2: dwords followed by the leading word}
- @@Bwd34:
- mov ecx,[eax+30]
- mov [edx+30],ecx
- @@Bwd30:
- mov ecx,[eax+26]
- mov [edx+26],ecx
- @@Bwd26:
- mov ecx,[eax+22]
- mov [edx+22],ecx
- @@Bwd22:
- mov ecx,[eax+18]
- mov [edx+18],ecx
- @@Bwd18:
- mov ecx,[eax+14]
- mov [edx+14],ecx
- @@Bwd14:
- mov ecx,[eax+10]
- mov [edx+10],ecx
- @@Bwd10:
- mov ecx,[eax+6]
- mov [edx+6],ecx
- @@Bwd06:
- mov ecx,[eax+2]
- mov [edx+2],ecx
- @@Bwd02:
- movzx ecx, word ptr [eax]
- mov [edx],cx
- ret
- {Counts 33,29,..,5,1: dwords followed by the leading byte}
- @@Bwd33:
- mov ecx,[eax+29]
- mov [edx+29],ecx
- @@Bwd29:
- mov ecx,[eax+25]
- mov [edx+25],ecx
- @@Bwd25:
- mov ecx,[eax+21]
- mov [edx+21],ecx
- @@Bwd21:
- mov ecx,[eax+17]
- mov [edx+17],ecx
- @@Bwd17:
- mov ecx,[eax+13]
- mov [edx+13],ecx
- @@Bwd13:
- mov ecx,[eax+9]
- mov [edx+9],ecx
- @@Bwd09:
- mov ecx,[eax+5]
- mov [edx+5],ecx
- @@Bwd05:
- mov ecx,[eax+1]
- mov [edx+1],ecx
- @@Bwd01:
- movzx ecx, byte ptr[eax]
- mov [edx],cl
- @@Done:
- end; {SmallBackwardMove}
- { at least valgrind up to 3.3 has a bug which prevents the default code from
- working, so we use a rather simple implementation here
- }
- {Copies ECX bytes from EAX to EDX with a single rep movsb. ESI and EDI are saved and restored; ECX is consumed by the rep prefix.}
- {cld is only issued when FPC_ENABLED_CLD is defined; otherwise the routine assumes the direction flag is already clear - confirm against the RTL conventions.}
- procedure Forwards_Valgrind;assembler;nostackframe;
- asm
- {$ifdef FPC_ENABLED_CLD}
- cld
- {$endif FPC_ENABLED_CLD}
- push esi
- push edi
- mov esi,eax {ESI = Source}
- mov edi,edx {EDI = Dest}
- rep movsb
- pop edi
- pop esi
- end;
- { at least valgrind up to 3.3 has a bug which prevents the default code from
- working, so we use a rather simple implementation here
- }
- {Copies ECX bytes from EAX to EDX one byte at a time, starting at the last byte and walking down. Destroys AL and ECX.}
- {The loop decrements ECX before testing it, so ECX must be non-zero on entry; Move only dispatches here for counts above SMALLMOVESIZE.}
- procedure Backwards_Valgrind;assembler;nostackframe;
- asm
- push esi
- push edi
- lea esi,[eax+ecx-1] {ESI = last source byte}
- lea edi,[edx+ecx-1] {EDI = last destination byte}
- @@repeat:
- mov al,[esi]
- mov [edi],al
- dec esi
- dec edi
- dec ecx
- jnz @@repeat
- pop edi
- pop esi
- end;
- {-------------------------------------------------------------------------}
- {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
- {Move also dispatches here when the source lies below a destination that it does not overlap.}
- {Copies 8 bytes at a time through the x87 stack (fild/fistp qword), lowest address first, 16 bytes per loop iteration.}
- {The first source qword is loaded up front and stored last, which lets the loop start at the first 8-byte aligned destination address.}
- {The 0..15 bytes left after the loop are finished by SmallForwardMove_3.}
- procedure Forwards_IA32_3;assembler;nostackframe;
- asm
- push ebx
- mov ebx,edx {EBX = original Dest, target of the deferred first qword}
- fild qword ptr [eax] {Load first 8 source bytes, kept on the FPU stack}
- add eax,ecx {QWORD Align Writes}
- add ecx,edx
- add edx,7
- and edx,-8
- sub ecx,edx
- add edx,ecx {Now QWORD Aligned}
- sub ecx,16
- neg ecx {ECX = negative offset from the end, counts up towards zero}
- @FwdLoop:
- fild qword ptr [eax+ecx-16]
- fistp qword ptr [edx+ecx-16]
- fild qword ptr [eax+ecx-8]
- fistp qword ptr [edx+ecx-8]
- add ecx,16
- jle @FwdLoop
- fistp qword ptr [ebx] {Store the first 8 bytes saved at entry}
- neg ecx
- add ecx,16 {ECX = bytes still to copy (0..15); EAX/EDX already point past the end}
- pop ebx
- jmp SmallForwardMove_3
- end; {Forwards_IA32}
- {-------------------------------------------------------------------------}
- {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
- {Copies 8 bytes at a time through the x87 stack (fild/fistp qword), highest address first, 16 bytes per loop iteration.}
- {The last source qword is loaded up front and stored after the loop, which lets the loop work on an 8-byte aligned destination end.}
- {Within an iteration both qwords are loaded before either is stored. The 0..15 leading bytes are finished by SmallBackwardMove_3.}
- procedure Backwards_IA32_3;assembler;nostackframe;
- asm
- push ebx
- fild qword ptr [eax+ecx-8] {Load last 8 source bytes, kept on the FPU stack}
- lea ebx,[edx+ecx] {QWORD Align Writes}
- and ebx,7
- sub ecx,ebx
- add ebx,ecx {Now QWORD Aligned, EBX = Original Length}
- sub ecx,16
- @BwdLoop:
- fild qword ptr [eax+ecx]
- fild qword ptr [eax+ecx+8]
- fistp qword ptr [edx+ecx+8]
- fistp qword ptr [edx+ecx]
- sub ecx,16
- jge @BwdLoop
- fistp qword ptr [edx+ebx-8] {Store the last 8 bytes saved at entry}
- add ecx,16 {ECX = bytes still to copy at the start (0..15)}
- pop ebx
- jmp SmallBackwardMove_3
- end; {Backwards_IA32}
- {-------------------------------------------------------------------------}
- {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
- {Move also dispatches here when the source lies below a destination that it does not overlap.}
- {Three size classes:}
- {  ECX < 72             : handed to Forwards_IA32_3}
- {  72 <= ECX < LARGESIZE: 32 bytes per iteration through MM1..MM4 with 8-byte aligned stores, tail via SmallForwardMove_3}
- {  ECX >= LARGESIZE     : destination aligned to 16 bytes, then 64 bytes per iteration through MM0..MM7, tail via rep movsd / rep movsb}
- {emms is executed on every MMX path before returning.}
- procedure Forwards_MMX_3;assembler;nostackframe;
- const
- LARGESIZE = 1024;
- asm
- cmp ecx,LARGESIZE
- jge @FwdLargeMove
- cmp ecx,72 {Size at which using MMX becomes worthwhile}
- jl Forwards_IA32_3
- push ebx
- mov ebx,edx {EBX = original Dest, target of the deferred first qword}
- movq mm0,[eax] {First 8 Characters}
- {QWORD Align Writes}
- add eax,ecx
- add ecx,edx
- add edx,7
- and edx,-8
- sub ecx,edx
- add edx,ecx
- {Now QWORD Aligned}
- sub ecx,32
- neg ecx {ECX = negative offset from the end, counts up towards zero}
- @FwdLoopMMX:
- movq mm1,[eax+ecx-32]
- movq mm2,[eax+ecx-24]
- movq mm3,[eax+ecx-16]
- movq mm4,[eax+ecx- 8]
- movq [edx+ecx-32],mm1
- movq [edx+ecx-24],mm2
- movq [edx+ecx-16],mm3
- movq [edx+ecx- 8],mm4
- add ecx,32
- jle @FwdLoopMMX
- movq [ebx],mm0 {First 8 Characters}
- emms
- pop ebx
- neg ecx
- add ecx,32 {ECX = bytes still to copy (0..31)}
- jmp SmallForwardMove_3
- @FwdLargeMove:
- push ebx
- mov ebx,ecx {EBX = Count}
- test edx,15
- jz @FwdAligned
- {16 byte Align Destination}
- mov ecx,edx
- add ecx,15
- and ecx,-16
- sub ecx,edx {ECX = bytes needed to reach the next 16-byte boundary (1..15)}
- add eax,ecx
- add edx,ecx
- sub ebx,ecx
- {Destination now 16 Byte Aligned}
- call SmallForwardMove_3 {Copies the ECX bytes that end at the advanced EAX/EDX}
- @FwdAligned:
- mov ecx,ebx
- and ecx,-16
- sub ebx,ecx {EBX = Remainder}
- push esi
- push edi
- mov esi,eax {ESI = Source}
- mov edi,edx {EDI = Dest}
- mov eax,ecx {EAX = Count}
- and eax,-64 {EAX = No of Bytes to Blocks Moves}
- and ecx,$3F {ECX = Remaining Bytes to Move (0..63)}
- add esi,eax
- add edi,eax
- shr eax,3 {EAX = No of QWORD's to Block Move}
- neg eax {Negative qword index, counts up to zero}
- @MMXcopyloop:
- movq mm0,[esi+eax*8 ]
- movq mm1,[esi+eax*8+ 8]
- movq mm2,[esi+eax*8+16]
- movq mm3,[esi+eax*8+24]
- movq mm4,[esi+eax*8+32]
- movq mm5,[esi+eax*8+40]
- movq mm6,[esi+eax*8+48]
- movq mm7,[esi+eax*8+56]
- movq [edi+eax*8 ],mm0
- movq [edi+eax*8+ 8],mm1
- movq [edi+eax*8+16],mm2
- movq [edi+eax*8+24],mm3
- movq [edi+eax*8+32],mm4
- movq [edi+eax*8+40],mm5
- movq [edi+eax*8+48],mm6
- movq [edi+eax*8+56],mm7
- add eax,8
- jnz @MMXcopyloop
- emms {Empty MMX State}
- {$ifdef FPC_ENABLED_CLD}
- cld
- {$endif FPC_ENABLED_CLD}
- {ESI/EDI already point past the block-copied region, so the string moves continue from there}
- add ecx,ebx
- shr ecx,2
- rep movsd
- mov ecx,ebx
- and ecx,3 {ECX before the add was a multiple of 16, so the low two bits of EBX are the byte tail}
- rep movsb
- pop edi
- pop esi
- pop ebx
- end; {Forwards_MMX}
- {-------------------------------------------------------------------------}
- {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
- {Counts below 72 are handed to Backwards_IA32_3. Otherwise 32 bytes per iteration are copied through MM1..MM4, highest address first.}
- {The last source qword is read into MM0 at entry and written after the loop, which lets the loop work on an 8-byte aligned destination end.}
- {Each iteration loads all four qwords before storing any. The 0..31 leading bytes are finished by SmallBackwardMove_3.}
- procedure Backwards_MMX_3;assembler;nostackframe;
- asm
- cmp ecx,72 {Size at which using MMX becomes worthwhile}
- jl Backwards_IA32_3
- push ebx
- movq mm0,[eax+ecx-8] {Get Last QWORD}
- {QWORD Align Writes}
- lea ebx,[edx+ecx]
- and ebx,7
- sub ecx,ebx
- add ebx,ecx {EBX = original Count}
- {Now QWORD Aligned}
- sub ecx,32
- @BwdLoopMMX:
- movq mm1,[eax+ecx ]
- movq mm2,[eax+ecx+ 8]
- movq mm3,[eax+ecx+16]
- movq mm4,[eax+ecx+24]
- movq [edx+ecx+24],mm4
- movq [edx+ecx+16],mm3
- movq [edx+ecx+ 8],mm2
- movq [edx+ecx ],mm1
- sub ecx,32
- jge @BwdLoopMMX
- movq [edx+ebx-8], mm0 {Last QWORD}
- emms
- add ecx,32 {ECX = bytes still to copy at the start (0..31)}
- pop ebx
- jmp SmallBackwardMove_3
- end; {Backwards_MMX}
- {$ifndef FASTMOVE_DISABLE_SSE3}
- {-------------------------------------------------------------------------}
- {Dest MUST be 16-Byte Aligned, Count MUST be a multiple of 16 }
- {On entry EAX = Source, EDX = Dest, ECX = Count. ESI is preserved; EAX, ECX, EDX and XMM0..XMM7 are modified.}
- {Copies 128-byte blocks, lowest address first, then finishes the remaining 0..112 bytes 16 at a time.}
- {Four block loops are selected by block size (above 256K or not) and by whether the source is 16-byte aligned:}
- {  up to 256K: movaps stores; above 256K: prefetchnta plus movntps stores, followed by sfence.}
- {  aligned source: movaps loads; unaligned source: movups loads.}
- {NOTE: every block loop runs its body before testing the counter, so Count must be at least 128.}
- procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
- const
- Prefetch = 512;
- asm
- push esi
- mov esi,eax {ESI = Source}
- mov eax,ecx {EAX = Count}
- and eax,-128 {EAX = No of Bytes to Block Move}
- add esi,eax
- add edx,eax
- shr eax,3 {EAX = No of QWORD's to Block Move}
- neg eax {Negative qword index, counts up to zero}
- cmp eax, -(32*1024) {Count > 256K}
- jl @Large
- @Small: {Count<=256K}
- test esi,15 {Check if Both Source/Dest Aligned}
- jnz @SmallUnaligned
- @SmallAligned: {Both Source and Dest 16-Byte Aligned}
- @SmallAlignedLoop:
- movaps xmm0,[esi+8*eax]
- movaps xmm1,[esi+8*eax+16]
- movaps xmm2,[esi+8*eax+32]
- movaps xmm3,[esi+8*eax+48]
- movaps [edx+8*eax],xmm0
- movaps [edx+8*eax+16],xmm1
- movaps [edx+8*eax+32],xmm2
- movaps [edx+8*eax+48],xmm3
- movaps xmm4,[esi+8*eax+64]
- movaps xmm5,[esi+8*eax+80]
- movaps xmm6,[esi+8*eax+96]
- movaps xmm7,[esi+8*eax+112]
- movaps [edx+8*eax+64],xmm4
- movaps [edx+8*eax+80],xmm5
- movaps [edx+8*eax+96],xmm6
- movaps [edx+8*eax+112],xmm7
- add eax,16
- js @SmallAlignedLoop
- jmp @Remainder
- @SmallUnaligned: {Source Not 16-Byte Aligned}
- @SmallUnalignedLoop:
- movups xmm0,[esi+8*eax]
- movups xmm1,[esi+8*eax+16]
- movups xmm2,[esi+8*eax+32]
- movups xmm3,[esi+8*eax+48]
- movaps [edx+8*eax],xmm0
- movaps [edx+8*eax+16],xmm1
- movaps [edx+8*eax+32],xmm2
- movaps [edx+8*eax+48],xmm3
- movups xmm4,[esi+8*eax+64]
- movups xmm5,[esi+8*eax+80]
- movups xmm6,[esi+8*eax+96]
- movups xmm7,[esi+8*eax+112]
- movaps [edx+8*eax+64],xmm4
- movaps [edx+8*eax+80],xmm5
- movaps [edx+8*eax+96],xmm6
- movaps [edx+8*eax+112],xmm7
- add eax,16
- js @SmallUnalignedLoop
- jmp @Remainder
- @Large: {Count>256K}
- test esi,15 {Check if Both Source/Dest Aligned}
- jnz @LargeUnaligned
- @LargeAligned: {Both Source and Dest 16-Byte Aligned}
- @LargeAlignedLoop:
- prefetchnta [esi+8*eax+Prefetch]
- prefetchnta [esi+8*eax+Prefetch+64]
- movaps xmm0,[esi+8*eax]
- movaps xmm1,[esi+8*eax+16]
- movaps xmm2,[esi+8*eax+32]
- movaps xmm3,[esi+8*eax+48]
- movntps [edx+8*eax],xmm0
- movntps [edx+8*eax+16],xmm1
- movntps [edx+8*eax+32],xmm2
- movntps [edx+8*eax+48],xmm3
- movaps xmm4,[esi+8*eax+64]
- movaps xmm5,[esi+8*eax+80]
- movaps xmm6,[esi+8*eax+96]
- movaps xmm7,[esi+8*eax+112]
- movntps [edx+8*eax+64],xmm4
- movntps [edx+8*eax+80],xmm5
- movntps [edx+8*eax+96],xmm6
- movntps [edx+8*eax+112],xmm7
- add eax,16
- js @LargeAlignedLoop
- sfence
- jmp @Remainder
- @LargeUnaligned: {Source Not 16-Byte Aligned}
- @LargeUnalignedLoop:
- prefetchnta [esi+8*eax+Prefetch]
- prefetchnta [esi+8*eax+Prefetch+64]
- movups xmm0,[esi+8*eax]
- movups xmm1,[esi+8*eax+16]
- movups xmm2,[esi+8*eax+32]
- movups xmm3,[esi+8*eax+48]
- movntps [edx+8*eax],xmm0
- movntps [edx+8*eax+16],xmm1
- movntps [edx+8*eax+32],xmm2
- movntps [edx+8*eax+48],xmm3
- movups xmm4,[esi+8*eax+64]
- movups xmm5,[esi+8*eax+80]
- movups xmm6,[esi+8*eax+96]
- movups xmm7,[esi+8*eax+112]
- movntps [edx+8*eax+64],xmm4
- movntps [edx+8*eax+80],xmm5
- movntps [edx+8*eax+96],xmm6
- movntps [edx+8*eax+112],xmm7
- add eax,16
- js @LargeUnalignedLoop
- sfence
- @Remainder:
- and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
- jz @Done
- add esi,ecx
- add edx,ecx
- neg ecx {Negative byte offset, counts up to zero}
- @RemainderLoop:
- movups xmm0,[esi+ecx]
- movaps [edx+ecx],xmm0
- add ecx,16
- jnz @RemainderLoop
- @Done:
- pop esi
- end; {AlignedFwdMoveSSE}
- {-------------------------------------------------------------------------}
- {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
- {Move also dispatches here when the source lies below a destination that it does not overlap.}
- {Three size classes:}
- {  ECX <= SMALLMOVESIZE+32: first 32 bytes via two unaligned XMM moves, rest via SmallForwardMove_3}
- {  ECX < LARGESIZE        : 32 bytes per iteration with 16-byte aligned stores; the first 16 bytes are held in XMM0 and stored last}
- {  ECX >= LARGESIZE       : destination aligned to 16 bytes, bulk handled by AlignedFwdMoveSSE_3, 0..15 byte tail via SmallForwardMove_3}
- procedure Forwards_SSE_3;assembler;nostackframe;
- const
- LARGESIZE = 2048;
- asm
- cmp ecx,LARGESIZE
- jge @FwdLargeMove
- cmp ecx,SMALLMOVESIZE+32
- movups xmm0,[eax] {First 16 Bytes; movups leaves the flags of the cmp intact}
- jg @FwdMoveSSE
- movups xmm1,[eax+16]
- movups [edx],xmm0
- movups [edx+16],xmm1
- add eax,ecx
- add edx,ecx
- sub ecx,32 {ECX = bytes still to copy (5..36)}
- jmp SmallForwardMove_3
- @FwdMoveSSE:
- push ebx
- mov ebx,edx {EBX = original Dest, target of the deferred first 16 bytes}
- {Align Writes}
- add eax,ecx
- add ecx,edx
- add edx,15
- and edx,-16
- sub ecx,edx
- add edx,ecx
- {Now Aligned}
- sub ecx,32
- neg ecx {ECX = negative offset from the end, counts up towards zero}
- @FwdLoopSSE:
- movups xmm1,[eax+ecx-32]
- movups xmm2,[eax+ecx-16]
- movaps [edx+ecx-32],xmm1
- movaps [edx+ecx-16],xmm2
- add ecx,32
- jle @FwdLoopSSE
- movups [ebx],xmm0 {First 16 Bytes}
- neg ecx
- add ecx,32 {ECX = bytes still to copy (0..31)}
- pop ebx
- jmp SmallForwardMove_3
- @FwdLargeMove:
- push ebx
- mov ebx,ecx {EBX = Count}
- test edx,15
- jz @FwdLargeAligned
- {16 byte Align Destination}
- mov ecx,edx
- add ecx,15
- and ecx,-16
- sub ecx,edx {ECX = bytes needed to reach the next 16-byte boundary (1..15)}
- add eax,ecx
- add edx,ecx
- sub ebx,ecx
- {Destination now 16 Byte Aligned}
- call SmallForwardMove_3 {Copies the ECX bytes that end at the advanced EAX/EDX}
- mov ecx,ebx
- @FwdLargeAligned:
- and ecx,-16
- sub ebx,ecx {EBX = Remainder}
- {EAX, ECX and EDX are saved because AlignedFwdMoveSSE_3 modifies them}
- push edx
- push eax
- push ecx
- call AlignedFwdMoveSSE_3
- pop ecx
- pop eax
- pop edx
- add ecx,ebx
- add eax,ecx {EAX = end of source}
- add edx,ecx {EDX = end of destination}
- mov ecx,ebx {ECX = bytes still to copy (0..15)}
- pop ebx
- jmp SmallForwardMove_3
- end; {Forwards_SSE}
- {-------------------------------------------------------------------------}
- {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
- {ECX <= SMALLMOVESIZE+32: the last 32 bytes go through two unaligned XMM moves (both loaded before either store), the rest through SmallBackwardMove_3.}
- {Otherwise 32 bytes per iteration are copied highest address first with 16-byte aligned stores.}
- {The last 16 source bytes are read into XMM0 at entry and written after the loop; the 0..31 leading bytes are finished by SmallBackwardMove_3.}
- procedure Backwards_SSE_3;assembler;nostackframe;
- asm
- cmp ecx,SMALLMOVESIZE+32
- jg @BwdMoveSSE
- sub ecx,32 {ECX = bytes left at the start (5..36) and offset of the final 32 bytes}
- movups xmm1,[eax+ecx]
- movups xmm2,[eax+ecx+16]
- movups [edx+ecx],xmm1
- movups [edx+ecx+16],xmm2
- jmp SmallBackwardMove_3
- @BwdMoveSSE:
- push ebx
- movups xmm0,[eax+ecx-16] {Last 16 Bytes}
- {Align Writes}
- lea ebx,[edx+ecx]
- and ebx,15
- sub ecx,ebx
- add ebx,ecx {EBX = original Count}
- {Now Aligned}
- sub ecx,32
- @BwdLoop:
- movups xmm1,[eax+ecx]
- movups xmm2,[eax+ecx+16]
- movaps [edx+ecx],xmm1
- movaps [edx+ecx+16],xmm2
- sub ecx,32
- jge @BwdLoop
- movups [edx+ebx-16],xmm0 {Last 16 Bytes}
- add ecx,32 {ECX = bytes still to copy at the start (0..31)}
- pop ebx
- jmp SmallBackwardMove_3
- end; {Backwards_SSE}
- {$endif ndef FASTMOVE_DISABLE_SSE3}
- const
- {Entry points Move jumps to for counts above SMALLMOVESIZE.}
- {They default to the plain IA32 routines; setup_fastmove overwrites them with the Valgrind, SSE or MMX variants.}
- fastmoveproc_forward : pointer = @Forwards_IA32_3;
- fastmoveproc_backward : pointer = @Backwards_IA32_3;
- {Copies count bytes from source to dest; the two ranges may overlap.}
- {On entry EAX = address of source, EDX = address of dest, ECX = count.}
- {Nothing is copied when count <= 0 or when source and dest are the same address.}
- {Counts up to SMALLMOVESIZE are served by the jump-table routines, larger ones by the routines stored in fastmoveproc_forward / fastmoveproc_backward.}
- {Addresses are compared UNSIGNED (jbe/ja). A signed comparison picks the wrong copy direction as soon as the buffers}
- {straddle or lie on opposite sides of $80000000, which corrupts overlapping moves. Only Count is treated as signed.}
- procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
- asm
- cmp ecx,SMALLMOVESIZE
- ja @Large {Count > SMALLMOVESIZE or Count < 0}
- cmp eax,edx
- lea eax,[eax+ecx] {EAX = Source+Count; lea leaves the flags of the cmp intact}
- jbe @SmallCheck {Source <= Dest (unsigned address compare)}
- @SmallForward:
- add edx,ecx {EDX = Dest+Count, as SmallForwardMove_3 expects}
- jmp SmallForwardMove_3
- @SmallCheck:
- je @Done {For Compatibility with Delphi's move for Source = Dest}
- sub eax,ecx {Restore EAX = Source, as SmallBackwardMove_3 expects}
- jmp SmallBackwardMove_3
- @Large:
- jng @Done {For Compatibility with Delphi's move for Count < 0}
- cmp eax,edx
- ja @moveforward {Source > Dest (unsigned): a forward copy is always safe}
- je @Done {For Compatibility with Delphi's move for Source = Dest}
- push eax
- add eax,ecx
- cmp eax,edx {Does Source+Count reach past Dest?}
- pop eax {push/pop leave the flags intact}
- ja @movebackward
- @moveforward:
- jmp dword ptr fastmoveproc_forward
- @movebackward:
- jmp dword ptr fastmoveproc_backward {Source/Dest Overlap}
- @Done:
- end;
- {$asmmode att}
- {$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
- {Without indirect entry information the valgrind flag is imported directly by its link name; setup_fastmove reads it.}
- var
- valgrind_used : boolean;external name '__fpc_valgrind';
- {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
- {Selects the routines Move uses for counts above SMALLMOVESIZE.}
- {Priority: the Valgrind-safe pair, then SSE (unless FASTMOVE_DISABLE_SSE3 is defined), then MMX.}
- {If none applies, the IA32 defaults in fastmoveproc_forward / fastmoveproc_backward are left untouched.}
- procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
- var
-   under_valgrind : boolean;
- begin
-   { workaround valgrind bug }
- {$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-   under_valgrind:=EntryInformation.valgrind_used;
- {$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-   under_valgrind:=valgrind_used;
- {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
-   if under_valgrind then
-     begin
-       fastmoveproc_forward:=@Forwards_Valgrind;
-       fastmoveproc_backward:=@Backwards_Valgrind;
-       exit;
-     end;
- {$ifndef FASTMOVE_DISABLE_SSE3}
-   if has_sse_support then
-     begin
-       fastmoveproc_forward:=@Forwards_SSE_3;
-       fastmoveproc_backward:=@Backwards_SSE_3;
-       exit;
-     end;
- {$endif ndef FASTMOVE_DISABLE_SSE3}
-   if has_mmx_support then
-     begin
-       fastmoveproc_forward:=@Forwards_MMX_3;
-       fastmoveproc_backward:=@Backwards_MMX_3;
-     end;
- end;
- {$endif FPC_SYSTEM_HAS_MOVE}
- {$endif}
|