fastmove.inc 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861
  1. {
  2. $Id$
  3. Copyright (c) 2004, John O'Harrow ([email protected])
  4. This software is provided 'as-is', without any express or implied warranty.
  5. In no event will the authors be held liable for any damages arising from the
  6. use of this software.
  7. Permission is granted to anyone to use this software for any purpose, including
  8. commercial applications, and to alter it and redistribute it freely, subject to
  9. the following restrictions:
  10. 1. The origin of this software must not be misrepresented; you must not claim
  11. that you wrote the original software. If you use this software in a product,
  12. an acknowledgment in the product documentation would be appreciated but is
  13. not required.
  14. 2. Altered source versions must be plainly marked as such, and must not be
  15. misrepresented as being the original software.
  16. 3. This notice may not be removed or altered from any source distribution.
  17. -------------------------------------------------------------------------------
  18. Version: 1.40 - 16-SEP-2004
  19. }
  20. {$ifndef VER1_0} {Everything below is skipped when VER1_0 is defined}
  21. {$if (FPC_VERSION>1) or ((FPC_RELEASE>=9) and (FPC_PATCH>6))}
  22. { $define USE_FASTMOVE} {NOTE(review): the space before $define makes this a plain comment, so this line does not define USE_FASTMOVE - confirm that is intended}
  23. {$endif}
  24. {$endif}
  25. {$ifdef USE_FASTMOVE} {The fast Move implementation is compiled only when USE_FASTMOVE is defined}
  26. {$ifndef FPC_SYSTEM_HAS_MOVE} {Skip if a Move implementation was already announced}
  27. {$define FPC_SYSTEM_HAS_MOVE}
  28. {$asmmode intel} {The assembler routines below use Intel syntax}
  29. {-------------------------------------------------------------------------}
  30. (*
  31. {Just to show that a good Pascal algorithm can beat the default BASM}
  32. procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
  33. var
  34. S, D : PtrUInt;
  35. Temp, C, I : PtrInt;
  36. L : PPtrInt;
  37. begin
  38. S := Cardinal(@Source);
  39. D := Cardinal(@Dest);
  40. if S = D then
  41. Exit;
  42. if Count <= 4 then
  43. case Count of
  44. 1 : PByte(@Dest)^ := PByte(S)^;
  45. 2 : PWord(@Dest)^ := PWord(S)^;
  46. 3 : if D > S then
  47. begin
  48. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  49. PWord(@Dest)^ := PWord(S)^;
  50. end
  51. else
  52. begin
  53. PWord(@Dest)^ := PWord(S)^;
  54. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  55. end;
  56. 4 : PInteger(@Dest)^ := PInteger(S)^
  57. else Exit; {Count <= 0}
  58. end
  59. else
  60. if D > S then
  61. begin
  62. Temp := PInteger(S)^;
  63. I := Integer(@Dest);
  64. C := Count - 4;
  65. L := PInteger(Integer(@Dest) + C);
  66. Inc(S, C);
  67. repeat
  68. L^ := PInteger(S)^;
  69. if Count <= 8 then
  70. Break;
  71. Dec(Count, 4);
  72. Dec(S, 4);
  73. Dec(L);
  74. until False;
  75. PInteger(I)^ := Temp;
  76. end
  77. else
  78. begin
  79. C := Count - 4;
  80. Temp := PInteger(S + Cardinal(C))^;
  81. I := Integer(@Dest) + C;
  82. L := @Dest;
  83. repeat
  84. L^ := PInteger(S)^;
  85. if Count <= 8 then
  86. Break;
  87. Dec(Count, 4);
  88. Inc(S, 4);
  89. Inc(L);
  90. until False;
  91. PInteger(I)^ := Temp;
  92. end;
  93. end; {MoveJOH_PAS}
  94. *)
  95. const
  96. SMALLMOVESIZE = 36; {Largest byte count handled by the jump-table routines; their tables hold entries for 0..36}
  97. {-------------------------------------------------------------------------}
  98. {Perform Forward Move of 0..36 Bytes, lowest addresses first, through a 37-entry jump table indexed by Count}
  99. {On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX. The table index is not range checked, so callers must keep ECX within 0..36}
  100. procedure SmallForwardMove_3;assembler;nostackframe;
  101. asm
  102. jmp dword ptr @@FwdJumpTable[ecx*4] {Dispatch on Count; each target copies via ECX and falls through to the next smaller label}
  103. align 16
  104. @@FwdJumpTable:
  105. dd @@Done {Removes need to test for zero size move}
  106. dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
  107. dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
  108. dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
  109. dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
  110. dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
  111. @@Fwd36: {Chain for Count mod 4 = 0: one DWORD per label, offsets relative to the end pointers}
  112. mov ecx,[eax-36]
  113. mov [edx-36],ecx
  114. @@Fwd32:
  115. mov ecx,[eax-32]
  116. mov [edx-32],ecx
  117. @@Fwd28:
  118. mov ecx,[eax-28]
  119. mov [edx-28],ecx
  120. @@Fwd24:
  121. mov ecx,[eax-24]
  122. mov [edx-24],ecx
  123. @@Fwd20:
  124. mov ecx,[eax-20]
  125. mov [edx-20],ecx
  126. @@Fwd16:
  127. mov ecx,[eax-16]
  128. mov [edx-16],ecx
  129. @@Fwd12:
  130. mov ecx,[eax-12]
  131. mov [edx-12],ecx
  132. @@Fwd08:
  133. mov ecx,[eax-8]
  134. mov [edx-8],ecx
  135. @@Fwd04:
  136. mov ecx,[eax-4]
  137. mov [edx-4],ecx
  138. ret
  139. @@Fwd35: {Chain for Count mod 4 = 3}
  140. mov ecx,[eax-35]
  141. mov [edx-35],ecx
  142. @@Fwd31:
  143. mov ecx,[eax-31]
  144. mov [edx-31],ecx
  145. @@Fwd27:
  146. mov ecx,[eax-27]
  147. mov [edx-27],ecx
  148. @@Fwd23:
  149. mov ecx,[eax-23]
  150. mov [edx-23],ecx
  151. @@Fwd19:
  152. mov ecx,[eax-19]
  153. mov [edx-19],ecx
  154. @@Fwd15:
  155. mov ecx,[eax-15]
  156. mov [edx-15],ecx
  157. @@Fwd11:
  158. mov ecx,[eax-11]
  159. mov [edx-11],ecx
  160. @@Fwd07:
  161. mov ecx,[eax-7]
  162. mov [edx-7],ecx
  163. mov ecx,[eax-4] {Final DWORD overlaps the previous one by one byte, so 7 bytes need only two DWORD moves}
  164. mov [edx-4],ecx
  165. ret
  166. @@Fwd03: {Exactly 3 bytes: one word followed by one byte}
  167. movzx ecx, word ptr [eax-3]
  168. mov [edx-3],cx
  169. movzx ecx, byte ptr [eax-1]
  170. mov [edx-1],cl
  171. ret
  172. @@Fwd34: {Chain for Count mod 4 = 2: DWORDs followed by a final word}
  173. mov ecx,[eax-34]
  174. mov [edx-34],ecx
  175. @@Fwd30:
  176. mov ecx,[eax-30]
  177. mov [edx-30],ecx
  178. @@Fwd26:
  179. mov ecx,[eax-26]
  180. mov [edx-26],ecx
  181. @@Fwd22:
  182. mov ecx,[eax-22]
  183. mov [edx-22],ecx
  184. @@Fwd18:
  185. mov ecx,[eax-18]
  186. mov [edx-18],ecx
  187. @@Fwd14:
  188. mov ecx,[eax-14]
  189. mov [edx-14],ecx
  190. @@Fwd10:
  191. mov ecx,[eax-10]
  192. mov [edx-10],ecx
  193. @@Fwd06:
  194. mov ecx,[eax-6]
  195. mov [edx-6],ecx
  196. @@Fwd02:
  197. movzx ecx, word ptr [eax-2]
  198. mov [edx-2],cx
  199. ret
  200. @@Fwd33: {Chain for Count mod 4 = 1: DWORDs followed by a final byte}
  201. mov ecx,[eax-33]
  202. mov [edx-33],ecx
  203. @@Fwd29:
  204. mov ecx,[eax-29]
  205. mov [edx-29],ecx
  206. @@Fwd25:
  207. mov ecx,[eax-25]
  208. mov [edx-25],ecx
  209. @@Fwd21:
  210. mov ecx,[eax-21]
  211. mov [edx-21],ecx
  212. @@Fwd17:
  213. mov ecx,[eax-17]
  214. mov [edx-17],ecx
  215. @@Fwd13:
  216. mov ecx,[eax-13]
  217. mov [edx-13],ecx
  218. @@Fwd09:
  219. mov ecx,[eax-9]
  220. mov [edx-9],ecx
  221. @@Fwd05:
  222. mov ecx,[eax-5]
  223. mov [edx-5],ecx
  224. @@Fwd01:
  225. movzx ecx, byte ptr [eax-1]
  226. mov [edx-1],cl
  227. @@Done: {Reached directly for Count = 0 and by fall-through from the mod 4 = 1 chain}
  228. end; {SmallForwardMove}
  229. {-------------------------------------------------------------------------}
  230. {Perform Backward Move of 0..36 Bytes, highest addresses first, through a 37-entry jump table indexed by Count}
  231. {On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX. The table index is not range checked, so callers must keep ECX within 0..36}
  232. procedure SmallBackwardMove_3;assembler;nostackframe;
  233. asm
  234. jmp dword ptr @@BwdJumpTable[ecx*4] {Dispatch on Count; each target copies via ECX and falls through to the next smaller label}
  235. align 16
  236. @@BwdJumpTable:
  237. dd @@Done {Removes need to test for zero size move}
  238. dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
  239. dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
  240. dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
  241. dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
  242. dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
  243. @@Bwd36: {Chain for Count mod 4 = 0: one DWORD per label, offsets relative to the start pointers}
  244. mov ecx,[eax+32]
  245. mov [edx+32],ecx
  246. @@Bwd32:
  247. mov ecx,[eax+28]
  248. mov [edx+28],ecx
  249. @@Bwd28:
  250. mov ecx,[eax+24]
  251. mov [edx+24],ecx
  252. @@Bwd24:
  253. mov ecx,[eax+20]
  254. mov [edx+20],ecx
  255. @@Bwd20:
  256. mov ecx,[eax+16]
  257. mov [edx+16],ecx
  258. @@Bwd16:
  259. mov ecx,[eax+12]
  260. mov [edx+12],ecx
  261. @@Bwd12:
  262. mov ecx,[eax+8]
  263. mov [edx+8],ecx
  264. @@Bwd08:
  265. mov ecx,[eax+4]
  266. mov [edx+4],ecx
  267. @@Bwd04:
  268. mov ecx,[eax]
  269. mov [edx],ecx
  270. ret
  271. @@Bwd35: {Chain for Count mod 4 = 3}
  272. mov ecx,[eax+31]
  273. mov [edx+31],ecx
  274. @@Bwd31:
  275. mov ecx,[eax+27]
  276. mov [edx+27],ecx
  277. @@Bwd27:
  278. mov ecx,[eax+23]
  279. mov [edx+23],ecx
  280. @@Bwd23:
  281. mov ecx,[eax+19]
  282. mov [edx+19],ecx
  283. @@Bwd19:
  284. mov ecx,[eax+15]
  285. mov [edx+15],ecx
  286. @@Bwd15:
  287. mov ecx,[eax+11]
  288. mov [edx+11],ecx
  289. @@Bwd11:
  290. mov ecx,[eax+7]
  291. mov [edx+7],ecx
  292. @@Bwd07:
  293. mov ecx,[eax+3]
  294. mov [edx+3],ecx
  295. mov ecx,[eax] {Final DWORD overlaps the previous one by one byte, so 7 bytes need only two DWORD moves}
  296. mov [edx],ecx
  297. ret
  298. @@Bwd03: {Exactly 3 bytes: the upper word first, then the first byte}
  299. movzx ecx, word ptr [eax+1]
  300. mov [edx+1],cx
  301. movzx ecx, byte ptr [eax]
  302. mov [edx],cl
  303. ret
  304. @@Bwd34: {Chain for Count mod 4 = 2: DWORDs followed by a final word}
  305. mov ecx,[eax+30]
  306. mov [edx+30],ecx
  307. @@Bwd30:
  308. mov ecx,[eax+26]
  309. mov [edx+26],ecx
  310. @@Bwd26:
  311. mov ecx,[eax+22]
  312. mov [edx+22],ecx
  313. @@Bwd22:
  314. mov ecx,[eax+18]
  315. mov [edx+18],ecx
  316. @@Bwd18:
  317. mov ecx,[eax+14]
  318. mov [edx+14],ecx
  319. @@Bwd14:
  320. mov ecx,[eax+10]
  321. mov [edx+10],ecx
  322. @@Bwd10:
  323. mov ecx,[eax+6]
  324. mov [edx+6],ecx
  325. @@Bwd06:
  326. mov ecx,[eax+2]
  327. mov [edx+2],ecx
  328. @@Bwd02:
  329. movzx ecx, word ptr [eax]
  330. mov [edx],cx
  331. ret
  332. @@Bwd33: {Chain for Count mod 4 = 1: DWORDs followed by a final byte}
  333. mov ecx,[eax+29]
  334. mov [edx+29],ecx
  335. @@Bwd29:
  336. mov ecx,[eax+25]
  337. mov [edx+25],ecx
  338. @@Bwd25:
  339. mov ecx,[eax+21]
  340. mov [edx+21],ecx
  341. @@Bwd21:
  342. mov ecx,[eax+17]
  343. mov [edx+17],ecx
  344. @@Bwd17:
  345. mov ecx,[eax+13]
  346. mov [edx+13],ecx
  347. @@Bwd13:
  348. mov ecx,[eax+9]
  349. mov [edx+9],ecx
  350. @@Bwd09:
  351. mov ecx,[eax+5]
  352. mov [edx+5],ecx
  353. @@Bwd05:
  354. mov ecx,[eax+1]
  355. mov [edx+1],ecx
  356. @@Bwd01:
  357. movzx ecx, byte ptr[eax]
  358. mov [edx],cl
  359. @@Done: {Reached directly for Count = 0 and by fall-through from the mod 4 = 1 chain}
  360. end; {SmallBackwardMove}
  361. {-------------------------------------------------------------------------}
  362. {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE). Copies 16 bytes per loop pass with x87 FILD/FISTP QWORD pairs, then tail-jumps to SmallForwardMove_3 for the rest}
  363. procedure Forwards_IA32_3;assembler;nostackframe;
  364. asm
  365. push ebx
  366. mov ebx,edx {EBX = original Dest, used for the deferred first QWORD store}
  367. fild qword ptr [eax] {Keep the first 8 source bytes on the FPU stack until after the loop}
  368. add eax,ecx {QWORD Align Writes} {EAX = Source+Count}
  369. add ecx,edx {ECX = Dest+Count}
  370. add edx,7
  371. and edx,-8 {EDX = Dest rounded up to a multiple of 8}
  372. sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  373. add edx,ecx {Now QWORD Aligned} {EDX = Dest+Count; the loop addresses memory relative to the end}
  374. sub ecx,16
  375. neg ecx {ECX = negative offset that counts up towards zero}
  376. @FwdLoop:
  377. fild qword ptr [eax+ecx-16]
  378. fistp qword ptr [edx+ecx-16]
  379. fild qword ptr [eax+ecx-8]
  380. fistp qword ptr [edx+ecx-8]
  381. add ecx,16
  382. jle @FwdLoop
  383. fistp qword ptr [ebx] {Store the saved first 8 bytes at the original, possibly unaligned, Dest}
  384. neg ecx
  385. add ecx,16 {ECX = bytes still to copy (0..15)}
  386. pop ebx
  387. jmp SmallForwardMove_3 {EAX/EDX already hold Source+Count/Dest+Count as that routine expects}
  388. end; {Forwards_IA32}
  389. {-------------------------------------------------------------------------}
  390. {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE). Copies 16 bytes per loop pass from the end downwards with x87 FILD/FISTP QWORD pairs, then tail-jumps to SmallBackwardMove_3 for the rest}
  391. procedure Backwards_IA32_3;assembler;nostackframe;
  392. asm
  393. push ebx
  394. fild qword ptr [eax+ecx-8] {Keep the last 8 source bytes on the FPU stack until after the loop}
  395. lea ebx,[edx+ecx] {QWORD Align Writes}
  396. and ebx,7 {EBX = number of bytes by which Dest+Count exceeds a multiple of 8}
  397. sub ecx,ebx {ECX = Count trimmed so that Dest+ECX is QWORD aligned}
  398. add ebx,ecx {Now QWORD Aligned, EBX = Original Length}
  399. sub ecx,16
  400. @BwdLoop:
  401. fild qword ptr [eax+ecx] {Both QWORDs are loaded before either is stored}
  402. fild qword ptr [eax+ecx+8]
  403. fistp qword ptr [edx+ecx+8]
  404. fistp qword ptr [edx+ecx]
  405. sub ecx,16
  406. jge @BwdLoop
  407. fistp qword ptr [edx+ebx-8] {Store the saved last 8 bytes at the original end of Dest}
  408. add ecx,16 {ECX = bytes still to copy at the start (0..15)}
  409. pop ebx
  410. jmp SmallBackwardMove_3 {EAX/EDX still hold Source/Dest as that routine expects}
  411. end; {Backwards_IA32}
  412. {-------------------------------------------------------------------------}
  413. {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE). Counts below 72 go to Forwards_IA32_3, counts below LARGESIZE use a 32-byte MMX loop, larger counts use 64-byte MMX blocks after aligning Dest to 16 bytes}
  414. procedure Forwards_MMX_3;assembler;nostackframe;
  415. const
  416. LARGESIZE = 1024; {Counts of at least this many bytes take the @FwdLargeMove path}
  417. asm
  418. cmp ecx,LARGESIZE
  419. jge @FwdLargeMove
  420. cmp ecx,72 {Size at which using MMX becomes worthwhile}
  421. jl Forwards_IA32_3
  422. push ebx
  423. mov ebx,edx {EBX = original Dest, used for the deferred first QWORD store}
  424. movq mm0,[eax] {First 8 Characters}
  425. {QWORD Align Writes}
  426. add eax,ecx {EAX = Source+Count}
  427. add ecx,edx
  428. add edx,7
  429. and edx,-8 {EDX = Dest rounded up to a multiple of 8}
  430. sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  431. add edx,ecx {EDX = Dest+Count}
  432. {Now QWORD Aligned}
  433. sub ecx,32
  434. neg ecx {ECX = negative offset that counts up towards zero}
  435. @FwdLoopMMX:
  436. movq mm1,[eax+ecx-32]
  437. movq mm2,[eax+ecx-24]
  438. movq mm3,[eax+ecx-16]
  439. movq mm4,[eax+ecx- 8]
  440. movq [edx+ecx-32],mm1
  441. movq [edx+ecx-24],mm2
  442. movq [edx+ecx-16],mm3
  443. movq [edx+ecx- 8],mm4
  444. add ecx,32
  445. jle @FwdLoopMMX
  446. movq [ebx],mm0 {First 8 Characters}
  447. emms
  448. pop ebx
  449. neg ecx
  450. add ecx,32 {ECX = bytes still to copy (0..31)}
  451. jmp SmallForwardMove_3 {EAX/EDX already hold Source+Count/Dest+Count as that routine expects}
  452. @FwdLargeMove:
  453. push ebx
  454. mov ebx,ecx {EBX = Count}
  455. test edx,15
  456. jz @FwdAligned
  457. {16 byte Align Destination}
  458. mov ecx,edx
  459. add ecx,15
  460. and ecx,-16
  461. sub ecx,edx {ECX = bytes needed to reach a 16-byte boundary (1..15)}
  462. add eax,ecx
  463. add edx,ecx
  464. sub ebx,ecx
  465. {Destination now 16 Byte Aligned}
  466. call SmallForwardMove_3 {Copies the ECX bytes that end at the advanced EAX/EDX}
  467. @FwdAligned:
  468. mov ecx,ebx
  469. and ecx,-16
  470. sub ebx,ecx {EBX = Remainder}
  471. push esi
  472. push edi
  473. mov esi,eax {ESI = Source}
  474. mov edi,edx {EDI = Dest}
  475. mov eax,ecx {EAX = Count}
  476. and eax,-64 {EAX = No of Bytes to Blocks Moves}
  477. and ecx,$3F {ECX = Remaining Bytes to Move (0..63)}
  478. add esi,eax
  479. add edi,eax
  480. shr eax,3 {EAX = No of QWORD's to Block Move}
  481. neg eax {Negative index that counts up to zero; ESI/EDI point just past the block area}
  482. @MMXcopyloop:
  483. movq mm0,[esi+eax*8 ]
  484. movq mm1,[esi+eax*8+ 8]
  485. movq mm2,[esi+eax*8+16]
  486. movq mm3,[esi+eax*8+24]
  487. movq mm4,[esi+eax*8+32]
  488. movq mm5,[esi+eax*8+40]
  489. movq mm6,[esi+eax*8+48]
  490. movq mm7,[esi+eax*8+56]
  491. movq [edi+eax*8 ],mm0
  492. movq [edi+eax*8+ 8],mm1
  493. movq [edi+eax*8+16],mm2
  494. movq [edi+eax*8+24],mm3
  495. movq [edi+eax*8+32],mm4
  496. movq [edi+eax*8+40],mm5
  497. movq [edi+eax*8+48],mm6
  498. movq [edi+eax*8+56],mm7
  499. add eax,8
  500. jnz @MMXcopyloop
  501. emms {Empty MMX State}
  502. add ecx,ebx {ECX = bytes left after the 64-byte blocks}
  503. shr ecx,2
  504. rep movsd {NOTE(review): relies on the direction flag being clear; this routine never executes CLD - confirm callers guarantee it}
  505. mov ecx,ebx
  506. and ecx,3 {Bytes left after the DWORD copy}
  507. rep movsb
  508. pop edi
  509. pop esi
  510. pop ebx
  511. end; {Forwards_MMX}
  512. {-------------------------------------------------------------------------}
  513. {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE). Counts below 72 go to Backwards_IA32_3; otherwise 32 bytes per loop pass are copied from the end downwards with MMX, then SmallBackwardMove_3 copies the rest}
  514. procedure Backwards_MMX_3;assembler;nostackframe;
  515. asm
  516. cmp ecx,72 {Size at which using MMX becomes worthwhile}
  517. jl Backwards_IA32_3
  518. push ebx
  519. movq mm0,[eax+ecx-8] {Get Last QWORD}
  520. {QWORD Align Writes}
  521. lea ebx,[edx+ecx]
  522. and ebx,7 {EBX = number of bytes by which Dest+Count exceeds a multiple of 8}
  523. sub ecx,ebx {ECX = Count trimmed so that Dest+ECX is QWORD aligned}
  524. add ebx,ecx {EBX = original Count again}
  525. {Now QWORD Aligned}
  526. sub ecx,32
  527. @BwdLoopMMX:
  528. movq mm1,[eax+ecx ]
  529. movq mm2,[eax+ecx+ 8]
  530. movq mm3,[eax+ecx+16]
  531. movq mm4,[eax+ecx+24]
  532. movq [edx+ecx+24],mm4
  533. movq [edx+ecx+16],mm3
  534. movq [edx+ecx+ 8],mm2
  535. movq [edx+ecx ],mm1
  536. sub ecx,32
  537. jge @BwdLoopMMX
  538. movq [edx+ebx-8], mm0 {Last QWORD}
  539. emms
  540. add ecx,32 {ECX = bytes still to copy at the start (0..31)}
  541. pop ebx
  542. jmp SmallBackwardMove_3 {EAX/EDX still hold Source/Dest as that routine expects}
  543. end; {Backwards_MMX}
  544. {-------------------------------------------------------------------------}
  545. {Dest MUST be 16-Bytes Aligned, Count MUST be multiple of 16. Forward SSE copy in 128-byte blocks plus a 16-byte remainder loop. The block loops test at the bottom, so one full block is copied even when Count < 128; callers must pass Count >= 128. Modifies EAX, ECX and EDX}
  546. procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
  547. const
  548. Prefetch = 512; {Byte distance ahead of the current read position used by prefetchnta}
  549. asm
  550. push esi
  551. mov esi,eax {ESI = Source}
  552. mov eax,ecx {EAX = Count}
  553. and eax,-128 {EAX = No of Bytes to Block Move}
  554. add esi,eax
  555. add edx,eax
  556. shr eax,3 {EAX = No of QWORD's to Block Move}
  557. neg eax {Negative index that counts up to zero; ESI/EDX point just past the block area}
  558. cmp eax, -(32*1024) {Count > 256K}
  559. jl @Large
  560. @Small: {Count<=256K}
  561. test esi,15 {Check if Both Source/Dest Aligned}
  562. jnz @SmallUnaligned
  563. @SmallAligned: {Both Source and Dest 16-Byte Aligned}
  564. @SmallAlignedLoop:
  565. movaps xmm0,[esi+8*eax]
  566. movaps xmm1,[esi+8*eax+16]
  567. movaps xmm2,[esi+8*eax+32]
  568. movaps xmm3,[esi+8*eax+48]
  569. movaps [edx+8*eax],xmm0
  570. movaps [edx+8*eax+16],xmm1
  571. movaps [edx+8*eax+32],xmm2
  572. movaps [edx+8*eax+48],xmm3
  573. movaps xmm4,[esi+8*eax+64]
  574. movaps xmm5,[esi+8*eax+80]
  575. movaps xmm6,[esi+8*eax+96]
  576. movaps xmm7,[esi+8*eax+112]
  577. movaps [edx+8*eax+64],xmm4
  578. movaps [edx+8*eax+80],xmm5
  579. movaps [edx+8*eax+96],xmm6
  580. movaps [edx+8*eax+112],xmm7
  581. add eax,16 {16 QWORDs = 128 bytes per pass}
  582. js @SmallAlignedLoop
  583. jmp @Remainder
  584. @SmallUnaligned: {Source Not 16-Byte Aligned}
  585. @SmallUnalignedLoop:
  586. movups xmm0,[esi+8*eax] {Unaligned loads, aligned stores}
  587. movups xmm1,[esi+8*eax+16]
  588. movups xmm2,[esi+8*eax+32]
  589. movups xmm3,[esi+8*eax+48]
  590. movaps [edx+8*eax],xmm0
  591. movaps [edx+8*eax+16],xmm1
  592. movaps [edx+8*eax+32],xmm2
  593. movaps [edx+8*eax+48],xmm3
  594. movups xmm4,[esi+8*eax+64]
  595. movups xmm5,[esi+8*eax+80]
  596. movups xmm6,[esi+8*eax+96]
  597. movups xmm7,[esi+8*eax+112]
  598. movaps [edx+8*eax+64],xmm4
  599. movaps [edx+8*eax+80],xmm5
  600. movaps [edx+8*eax+96],xmm6
  601. movaps [edx+8*eax+112],xmm7
  602. add eax,16
  603. js @SmallUnalignedLoop
  604. jmp @Remainder
  605. @Large: {Count>256K}
  606. test esi,15 {Check if Both Source/Dest Aligned}
  607. jnz @LargeUnaligned
  608. @LargeAligned: {Both Source and Dest 16-Byte Aligned}
  609. @LargeAlignedLoop:
  610. prefetchnta [esi+8*eax+Prefetch]
  611. prefetchnta [esi+8*eax+Prefetch+64]
  612. movaps xmm0,[esi+8*eax]
  613. movaps xmm1,[esi+8*eax+16]
  614. movaps xmm2,[esi+8*eax+32]
  615. movaps xmm3,[esi+8*eax+48]
  616. movntps [edx+8*eax],xmm0 {Non-temporal stores are used on the large paths}
  617. movntps [edx+8*eax+16],xmm1
  618. movntps [edx+8*eax+32],xmm2
  619. movntps [edx+8*eax+48],xmm3
  620. movaps xmm4,[esi+8*eax+64]
  621. movaps xmm5,[esi+8*eax+80]
  622. movaps xmm6,[esi+8*eax+96]
  623. movaps xmm7,[esi+8*eax+112]
  624. movntps [edx+8*eax+64],xmm4
  625. movntps [edx+8*eax+80],xmm5
  626. movntps [edx+8*eax+96],xmm6
  627. movntps [edx+8*eax+112],xmm7
  628. add eax,16
  629. js @LargeAlignedLoop
  630. sfence {Order the non-temporal stores before continuing}
  631. jmp @Remainder
  632. @LargeUnaligned: {Source Not 16-Byte Aligned}
  633. @LargeUnalignedLoop:
  634. prefetchnta [esi+8*eax+Prefetch]
  635. prefetchnta [esi+8*eax+Prefetch+64]
  636. movups xmm0,[esi+8*eax]
  637. movups xmm1,[esi+8*eax+16]
  638. movups xmm2,[esi+8*eax+32]
  639. movups xmm3,[esi+8*eax+48]
  640. movntps [edx+8*eax],xmm0
  641. movntps [edx+8*eax+16],xmm1
  642. movntps [edx+8*eax+32],xmm2
  643. movntps [edx+8*eax+48],xmm3
  644. movups xmm4,[esi+8*eax+64]
  645. movups xmm5,[esi+8*eax+80]
  646. movups xmm6,[esi+8*eax+96]
  647. movups xmm7,[esi+8*eax+112]
  648. movntps [edx+8*eax+64],xmm4
  649. movntps [edx+8*eax+80],xmm5
  650. movntps [edx+8*eax+96],xmm6
  651. movntps [edx+8*eax+112],xmm7
  652. add eax,16
  653. js @LargeUnalignedLoop
  654. sfence {Order the non-temporal stores before continuing}
  655. @Remainder:
  656. and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
  657. jz @Done
  658. add esi,ecx
  659. add edx,ecx
  660. neg ecx {Negative index that counts up to zero in steps of 16}
  661. @RemainderLoop:
  662. movups xmm0,[esi+ecx]
  663. movaps [edx+ecx],xmm0
  664. add ecx,16
  665. jnz @RemainderLoop
  666. @Done:
  667. pop esi
  668. end; {AlignedFwdMoveSSE}
  669. {-------------------------------------------------------------------------}
  670. {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE). Three paths: up to SMALLMOVESIZE+32 bytes, a 32-byte SSE loop below LARGESIZE, and AlignedFwdMoveSSE_3 for LARGESIZE and above}
  671. procedure Forwards_SSE_3;assembler;nostackframe;
  672. const
  673. LARGESIZE = 2048; {Counts of at least this many bytes take the @FwdLargeMove path}
  674. asm
  675. cmp ecx,LARGESIZE
  676. jge @FwdLargeMove
  677. cmp ecx,SMALLMOVESIZE+32
  678. movups xmm0,[eax] {First 16 source bytes; the jg below still tests the flags of the cmp above}
  679. jg @FwdMoveSSE
  680. movups xmm1,[eax+16]
  681. movups [edx],xmm0
  682. movups [edx+16],xmm1
  683. add eax,ecx
  684. add edx,ecx
  685. sub ecx,32 {ECX = bytes left after the first 32, at most SMALLMOVESIZE}
  686. jmp SmallForwardMove_3
  687. @FwdMoveSSE:
  688. push ebx
  689. mov ebx,edx {EBX = original Dest, used for the deferred first 16-byte store}
  690. {Align Writes}
  691. add eax,ecx {EAX = Source+Count}
  692. add ecx,edx
  693. add edx,15
  694. and edx,-16 {EDX = Dest rounded up to a multiple of 16}
  695. sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  696. add edx,ecx {EDX = Dest+Count}
  697. {Now Aligned}
  698. sub ecx,32
  699. neg ecx {ECX = negative offset that counts up towards zero}
  700. @FwdLoopSSE:
  701. movups xmm1,[eax+ecx-32]
  702. movups xmm2,[eax+ecx-16]
  703. movaps [edx+ecx-32],xmm1
  704. movaps [edx+ecx-16],xmm2
  705. add ecx,32
  706. jle @FwdLoopSSE
  707. movups [ebx],xmm0 {First 16 Bytes}
  708. neg ecx
  709. add ecx,32 {ECX = bytes still to copy (0..31)}
  710. pop ebx
  711. jmp SmallForwardMove_3 {EAX/EDX already hold Source+Count/Dest+Count as that routine expects}
  712. @FwdLargeMove:
  713. push ebx
  714. mov ebx,ecx {EBX = Count}
  715. test edx,15
  716. jz @FwdLargeAligned {ECX still holds Count on this branch}
  717. {16 byte Align Destination}
  718. mov ecx,edx
  719. add ecx,15
  720. and ecx,-16
  721. sub ecx,edx {ECX = bytes needed to reach a 16-byte boundary (1..15)}
  722. add eax,ecx
  723. add edx,ecx
  724. sub ebx,ecx
  725. {Destination now 16 Byte Aligned}
  726. call SmallForwardMove_3 {Copies the ECX bytes that end at the advanced EAX/EDX}
  727. mov ecx,ebx
  728. @FwdLargeAligned:
  729. and ecx,-16 {ECX = largest multiple of 16 not above the remaining count}
  730. sub ebx,ecx {EBX = Remainder}
  731. push edx {Save EDX/EAX/ECX; AlignedFwdMoveSSE_3 modifies all three}
  732. push eax
  733. push ecx
  734. call AlignedFwdMoveSSE_3 {EAX = Source, EDX = Dest, ECX = Count}
  735. pop ecx
  736. pop eax
  737. pop edx
  738. add ecx,ebx {ECX = bytes covered by the aligned copy plus the remainder}
  739. add eax,ecx {EAX = end of Source}
  740. add edx,ecx {EDX = end of Dest}
  741. mov ecx,ebx {ECX = Remainder (0..15)}
  742. pop ebx
  743. jmp SmallForwardMove_3
  744. end; {Forwards_SSE}
  745. {-------------------------------------------------------------------------}
  746. {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE). Up to SMALLMOVESIZE+32 bytes: copy the last 32 bytes, then SmallBackwardMove_3; otherwise a 32-byte SSE loop running from the end downwards}
  747. procedure Backwards_SSE_3;assembler;nostackframe;
  748. asm
  749. cmp ecx,SMALLMOVESIZE+32
  750. jg @BwdMoveSSE
  751. sub ecx,32 {ECX = bytes left in front of the last 32, at most SMALLMOVESIZE}
  752. movups xmm1,[eax+ecx] {Both loads are done before either store}
  753. movups xmm2,[eax+ecx+16]
  754. movups [edx+ecx],xmm1
  755. movups [edx+ecx+16],xmm2
  756. jmp SmallBackwardMove_3
  757. @BwdMoveSSE:
  758. push ebx
  759. movups xmm0,[eax+ecx-16] {Last 16 Bytes}
  760. {Align Writes}
  761. lea ebx,[edx+ecx]
  762. and ebx,15 {EBX = number of bytes by which Dest+Count exceeds a multiple of 16}
  763. sub ecx,ebx {ECX = Count trimmed so that Dest+ECX is 16-byte aligned}
  764. add ebx,ecx {EBX = original Count again}
  765. {Now Aligned}
  766. sub ecx,32
  767. @BwdLoop:
  768. movups xmm1,[eax+ecx]
  769. movups xmm2,[eax+ecx+16]
  770. movaps [edx+ecx],xmm1
  771. movaps [edx+ecx+16],xmm2
  772. sub ecx,32
  773. jge @BwdLoop
  774. movups [edx+ebx-16],xmm0 {Last 16 Bytes}
  775. add ecx,32 {ECX = bytes still to copy at the start (0..31)}
  776. pop ebx
  777. jmp SmallBackwardMove_3 {EAX/EDX still hold Source/Dest as that routine expects}
  778. end; {Backwards_SSE}
  779. const {Typed constants holding the routines Move jumps to for counts above SMALLMOVESIZE}
  780. fastmoveproc_forward : pointer = @Forwards_IA32_3; {Initial value is the IA32 routine; setup_fastmove may replace it}
  781. fastmoveproc_backward : pointer = @Backwards_IA32_3; {Initial value is the IA32 routine; setup_fastmove may replace it}
  782. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe; {Copies count bytes from source to dest and handles overlapping ranges. Registers on entry: EAX = @source, EDX = @dest, ECX = count. Does nothing when count <= 0 or source = dest}
  783. asm
  784. cmp ecx,SMALLMOVESIZE
  785. ja @Large {Unsigned test, so negative counts also land at @Large and are rejected there}
  786. cmp eax,edx
  787. lea eax,[eax+ecx] {LEA leaves the flags of the compare above untouched}
  788. jbe @SmallCheck {Addresses are unsigned: a signed test picks the wrong direction across the $80000000 boundary}
  789. @SmallForward:
  790. add edx,ecx {SmallForwardMove_3 expects Source+Count and Dest+Count}
  791. jmp SmallForwardMove_3
  792. @SmallCheck:
  793. je @Done {For Compatibility with Delphi's move for Source = Dest}
  794. sub eax,ecx {Undo the LEA; SmallBackwardMove_3 expects the start addresses}
  795. jmp SmallBackwardMove_3
  796. @Large:
  797. jng @Done {For Compatibility with Delphi's move for Count < 0} {Signed test on the flags of the count compare above}
  798. cmp eax,edx
  799. ja @moveforward {Source above Dest (unsigned): a forward copy is always safe}
  800. je @Done {For Compatibility with Delphi's move for Source = Dest}
  801. push eax
  802. add eax,ecx
  803. cmp eax,edx
  804. pop eax {POP leaves the flags of the compare above untouched}
  805. ja @movebackward {Source+Count above Dest (unsigned) means the ranges overlap}
  806. @moveforward:
  807. jmp dword ptr fastmoveproc_forward
  808. @movebackward:
  809. jmp dword ptr fastmoveproc_backward {Source/Dest Overlap}
  810. @Done:
  811. end;
  812. {$asmmode att}
  813. procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif} {Points fastmoveproc_forward/backward at the SSE or MMX routines according to has_sse_support/has_mmx_support}
  814. begin
  815. if has_sse_support then {SSE is checked first, so it wins when both flags are set}
  816. begin
  817. fastmoveproc_forward:=@Forwards_SSE_3;
  818. fastmoveproc_backward:=@Backwards_SSE_3;
  819. end
  820. else if has_mmx_support then
  821. begin
  822. fastmoveproc_forward:=@Forwards_MMX_3;
  823. fastmoveproc_backward:=@Backwards_MMX_3;
  824. end; {With neither flag set both pointers keep their current values}
  825. end;
  826. {$endif FPC_SYSTEM_HAS_MOVE}
  827. {$else}
  828. procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif} {Empty variant compiled when USE_FASTMOVE is not defined, so calls to setup_fastmove still resolve}
  829. begin
  830. end;
  831. {$endif}