{ fastmove.inc }
  1. {
  2. $Id$
  3. Copyright (c) 2004, John O'Harrow ([email protected])
  4. This software is provided 'as-is', without any express or implied warranty.
  5. In no event will the authors be held liable for any damages arising from the
  6. use of this software.
  7. Permission is granted to anyone to use this software for any purpose, including
  8. commercial applications, and to alter it and redistribute it freely, subject to
  9. the following restrictions:
  10. 1. The origin of this software must not be misrepresented; you must not claim
  11. that you wrote the original software. If you use this software in a product,
  12. an acknowledgment in the product documentation would be appreciated but is
  13. not required.
  14. 2. Altered source versions must be plainly marked as such, and must not be
  15. misrepresented as being the original software.
  16. 3. This notice may not be removed or altered from any source distribution.
  17. -------------------------------------------------------------------------------
  18. Version: 1.40 - 16-SEP-2004
  19. }
  20. {$if (FPC_VERSION>1) or ((FPC_RELEASE>=9) and (FPC_PATCH>6))}
  21. {$ifndef FPC_SYSTEM_HAS_MOVE}
  22. {$define FPC_SYSTEM_HAS_MOVE}
  23. {$asmmode intel}
  24. {-------------------------------------------------------------------------}
  25. {Just to show that a good Pascal algorithm can beat the default BASM}
  26. procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
  27. var
  28. S, D : PtrUInt;
  29. Temp, C, I : PtrInt;
  30. L : PPtrInt;
  31. begin
  32. S := Cardinal(@Source);
  33. D := Cardinal(@Dest);
  34. if S = D then
  35. Exit;
  36. if Count <= 4 then
  37. case Count of
  38. 1 : PByte(@Dest)^ := PByte(S)^;
  39. 2 : PWord(@Dest)^ := PWord(S)^;
  40. 3 : if D > S then
  41. begin
  42. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  43. PWord(@Dest)^ := PWord(S)^;
  44. end
  45. else
  46. begin
  47. PWord(@Dest)^ := PWord(S)^;
  48. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  49. end;
  50. 4 : PInteger(@Dest)^ := PInteger(S)^
  51. else Exit; {Count <= 0}
  52. end
  53. else
  54. if D > S then
  55. begin
  56. Temp := PInteger(S)^;
  57. I := Integer(@Dest);
  58. C := Count - 4;
  59. L := PInteger(Integer(@Dest) + C);
  60. Inc(S, C);
  61. repeat
  62. L^ := PInteger(S)^;
  63. if Count <= 8 then
  64. Break;
  65. Dec(Count, 4);
  66. Dec(S, 4);
  67. Dec(L);
  68. until False;
  69. PInteger(I)^ := Temp;
  70. end
  71. else
  72. begin
  73. C := Count - 4;
  74. Temp := PInteger(S + Cardinal(C))^;
  75. I := Integer(@Dest) + C;
  76. L := @Dest;
  77. repeat
  78. L^ := PInteger(S)^;
  79. if Count <= 8 then
  80. Break;
  81. Dec(Count, 4);
  82. Inc(S, 4);
  83. Inc(L);
  84. until False;
  85. PInteger(I)^ := Temp;
  86. end;
  87. end; {MoveJOH_PAS}
const
  {Largest byte count that Move hands directly to SmallForwardMove_3 /
   SmallBackwardMove_3; their jump tables have entries for 0..36}
  SMALLMOVESIZE = 36;
{-------------------------------------------------------------------------}
{Perform Forward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX}
{Dispatches through a 37-entry jump table indexed by ECX; no range check is
 done here, so the caller must guarantee ECX is in 0..36.
 Each entry copies dwords from the lowest address upwards (offsets are
 negative, relative to the END of the buffers) and falls through the labels
 that are 4 bytes shorter; the last 1..3 bytes are finished with word/byte
 moves or with an overlapping dword.}
procedure SmallForwardMove_3;assembler;nostackframe;
asm
  jmp dword ptr @@FwdJumpTable[ecx*4]
  align 16
@@FwdJumpTable:
  dd @@Done {Removes need to test for zero size move}
  dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
  dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
  dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
  dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
  dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
  {--- Counts that are a multiple of 4: whole dwords only ---}
@@Fwd36:
  mov ecx,[eax-36]
  mov [edx-36],ecx
@@Fwd32:
  mov ecx,[eax-32]
  mov [edx-32],ecx
@@Fwd28:
  mov ecx,[eax-28]
  mov [edx-28],ecx
@@Fwd24:
  mov ecx,[eax-24]
  mov [edx-24],ecx
@@Fwd20:
  mov ecx,[eax-20]
  mov [edx-20],ecx
@@Fwd16:
  mov ecx,[eax-16]
  mov [edx-16],ecx
@@Fwd12:
  mov ecx,[eax-12]
  mov [edx-12],ecx
@@Fwd08:
  mov ecx,[eax-8]
  mov [edx-8],ecx
@@Fwd04:
  mov ecx,[eax-4]
  mov [edx-4],ecx
  ret
  {--- Counts = 3 (mod 4) ---}
@@Fwd35:
  mov ecx,[eax-35]
  mov [edx-35],ecx
@@Fwd31:
  mov ecx,[eax-31]
  mov [edx-31],ecx
@@Fwd27:
  mov ecx,[eax-27]
  mov [edx-27],ecx
@@Fwd23:
  mov ecx,[eax-23]
  mov [edx-23],ecx
@@Fwd19:
  mov ecx,[eax-19]
  mov [edx-19],ecx
@@Fwd15:
  mov ecx,[eax-15]
  mov [edx-15],ecx
@@Fwd11:
  mov ecx,[eax-11]
  mov [edx-11],ecx
@@Fwd07:
  {Last 7 bytes as two dwords that overlap by one byte (-7..-4 and -4..-1)}
  mov ecx,[eax-7]
  mov [edx-7],ecx
  mov ecx,[eax-4]
  mov [edx-4],ecx
  ret
@@Fwd03:
  {Exactly 3 bytes: one word followed by one byte}
  movzx ecx, word ptr [eax-3]
  mov [edx-3],cx
  movzx ecx, byte ptr [eax-1]
  mov [edx-1],cl
  ret
  {--- Counts = 2 (mod 4): dwords, then a trailing word ---}
@@Fwd34:
  mov ecx,[eax-34]
  mov [edx-34],ecx
@@Fwd30:
  mov ecx,[eax-30]
  mov [edx-30],ecx
@@Fwd26:
  mov ecx,[eax-26]
  mov [edx-26],ecx
@@Fwd22:
  mov ecx,[eax-22]
  mov [edx-22],ecx
@@Fwd18:
  mov ecx,[eax-18]
  mov [edx-18],ecx
@@Fwd14:
  mov ecx,[eax-14]
  mov [edx-14],ecx
@@Fwd10:
  mov ecx,[eax-10]
  mov [edx-10],ecx
@@Fwd06:
  mov ecx,[eax-6]
  mov [edx-6],ecx
@@Fwd02:
  movzx ecx, word ptr [eax-2]
  mov [edx-2],cx
  ret
  {--- Counts = 1 (mod 4): dwords, then a trailing byte ---}
@@Fwd33:
  mov ecx,[eax-33]
  mov [edx-33],ecx
@@Fwd29:
  mov ecx,[eax-29]
  mov [edx-29],ecx
@@Fwd25:
  mov ecx,[eax-25]
  mov [edx-25],ecx
@@Fwd21:
  mov ecx,[eax-21]
  mov [edx-21],ecx
@@Fwd17:
  mov ecx,[eax-17]
  mov [edx-17],ecx
@@Fwd13:
  mov ecx,[eax-13]
  mov [edx-13],ecx
@@Fwd09:
  mov ecx,[eax-9]
  mov [edx-9],ecx
@@Fwd05:
  mov ecx,[eax-5]
  mov [edx-5],ecx
@@Fwd01:
  movzx ecx, byte ptr [eax-1]
  mov [edx-1],cl
@@Done:
end; {SmallForwardMove}
{-------------------------------------------------------------------------}
{Perform Backward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX}
{Dispatches through a 37-entry jump table indexed by ECX; no range check is
 done here, so the caller must guarantee ECX is in 0..36.
 Each entry copies dwords from the highest address downwards (offsets are
 positive, relative to the START of the buffers) and falls through the
 labels that are 4 bytes shorter; the first 1..3 bytes are finished with
 word/byte moves or with an overlapping dword.}
procedure SmallBackwardMove_3;assembler;nostackframe;
asm
  jmp dword ptr @@BwdJumpTable[ecx*4]
  align 16
@@BwdJumpTable:
  dd @@Done {Removes need to test for zero size move}
  dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
  dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
  dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
  dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
  dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
  {--- Counts that are a multiple of 4: whole dwords only ---}
@@Bwd36:
  mov ecx,[eax+32]
  mov [edx+32],ecx
@@Bwd32:
  mov ecx,[eax+28]
  mov [edx+28],ecx
@@Bwd28:
  mov ecx,[eax+24]
  mov [edx+24],ecx
@@Bwd24:
  mov ecx,[eax+20]
  mov [edx+20],ecx
@@Bwd20:
  mov ecx,[eax+16]
  mov [edx+16],ecx
@@Bwd16:
  mov ecx,[eax+12]
  mov [edx+12],ecx
@@Bwd12:
  mov ecx,[eax+8]
  mov [edx+8],ecx
@@Bwd08:
  mov ecx,[eax+4]
  mov [edx+4],ecx
@@Bwd04:
  mov ecx,[eax]
  mov [edx],ecx
  ret
  {--- Counts = 3 (mod 4) ---}
@@Bwd35:
  mov ecx,[eax+31]
  mov [edx+31],ecx
@@Bwd31:
  mov ecx,[eax+27]
  mov [edx+27],ecx
@@Bwd27:
  mov ecx,[eax+23]
  mov [edx+23],ecx
@@Bwd23:
  mov ecx,[eax+19]
  mov [edx+19],ecx
@@Bwd19:
  mov ecx,[eax+15]
  mov [edx+15],ecx
@@Bwd15:
  mov ecx,[eax+11]
  mov [edx+11],ecx
@@Bwd11:
  mov ecx,[eax+7]
  mov [edx+7],ecx
@@Bwd07:
  {First 7 bytes as two dwords that overlap by one byte (3..6 and 0..3)}
  mov ecx,[eax+3]
  mov [edx+3],ecx
  mov ecx,[eax]
  mov [edx],ecx
  ret
@@Bwd03:
  {Exactly 3 bytes: the upper word first, then the first byte}
  movzx ecx, word ptr [eax+1]
  mov [edx+1],cx
  movzx ecx, byte ptr [eax]
  mov [edx],cl
  ret
  {--- Counts = 2 (mod 4): dwords, then the leading word ---}
@@Bwd34:
  mov ecx,[eax+30]
  mov [edx+30],ecx
@@Bwd30:
  mov ecx,[eax+26]
  mov [edx+26],ecx
@@Bwd26:
  mov ecx,[eax+22]
  mov [edx+22],ecx
@@Bwd22:
  mov ecx,[eax+18]
  mov [edx+18],ecx
@@Bwd18:
  mov ecx,[eax+14]
  mov [edx+14],ecx
@@Bwd14:
  mov ecx,[eax+10]
  mov [edx+10],ecx
@@Bwd10:
  mov ecx,[eax+6]
  mov [edx+6],ecx
@@Bwd06:
  mov ecx,[eax+2]
  mov [edx+2],ecx
@@Bwd02:
  movzx ecx, word ptr [eax]
  mov [edx],cx
  ret
  {--- Counts = 1 (mod 4): dwords, then the leading byte ---}
@@Bwd33:
  mov ecx,[eax+29]
  mov [edx+29],ecx
@@Bwd29:
  mov ecx,[eax+25]
  mov [edx+25],ecx
@@Bwd25:
  mov ecx,[eax+21]
  mov [edx+21],ecx
@@Bwd21:
  mov ecx,[eax+17]
  mov [edx+17],ecx
@@Bwd17:
  mov ecx,[eax+13]
  mov [edx+13],ecx
@@Bwd13:
  mov ecx,[eax+9]
  mov [edx+9],ecx
@@Bwd09:
  mov ecx,[eax+5]
  mov [edx+5],ecx
@@Bwd05:
  mov ecx,[eax+1]
  mov [edx+1],ecx
@@Bwd01:
  movzx ecx, byte ptr[eax]
  mov [edx],cl
@@Done:
end; {SmallBackwardMove}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
{Copies 16 bytes per iteration through the x87 FPU (fild/fistp qword) with
 the destination writes QWORD aligned. The first 8 source bytes are loaded
 before the loop and stored last, which covers the unaligned head of the
 destination. The trailing 0..15 bytes are finished by SmallForwardMove_3.
 NOTE(review): an exact 64-bit round trip through fild/fistp presumably
 relies on the FPU running with extended precision - confirm.}
procedure Forwards_IA32_3;assembler;nostackframe;
asm
  push ebx
  mov ebx,edx {EBX = original Dest, target of the final head store}
  fild qword ptr [eax] {First 8 bytes stay on the FPU stack until the end}
  add eax,ecx {QWORD Align Writes} {EAX = Source+Count}
  add ecx,edx {ECX = Dest+Count}
  add edx,7
  and edx,-8 {EDX = Dest rounded up to a multiple of 8}
  sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  add edx,ecx {Now QWORD Aligned} {EDX = Dest+Count}
  sub ecx,16
  neg ecx {ECX = 16 - bytes left; loop runs while this is <= 0}
@FwdLoop:
  fild qword ptr [eax+ecx-16]
  fistp qword ptr [edx+ecx-16]
  fild qword ptr [eax+ecx-8]
  fistp qword ptr [edx+ecx-8]
  add ecx,16
  jle @FwdLoop
  fistp qword ptr [ebx] {Store the saved first 8 bytes}
  neg ecx
  add ecx,16 {ECX = remaining tail bytes (0..15)}
  pop ebx
  jmp SmallForwardMove_3 {EAX/EDX already hold Source+Count / Dest+Count}
end; {Forwards_IA32}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
{Copies 16 bytes per iteration from the top of the buffers downwards through
 the x87 FPU (fild/fistp qword) with the destination writes QWORD aligned.
 The last 8 source bytes are loaded before the loop and stored after it,
 which covers the unaligned end of the destination. The leading 0..15 bytes
 are finished by SmallBackwardMove_3.}
procedure Backwards_IA32_3;assembler;nostackframe;
asm
  push ebx
  fild qword ptr [eax+ecx-8] {Last 8 bytes stay on the FPU stack until the end}
  lea ebx,[edx+ecx] {QWORD Align Writes}
  and ebx,7 {EBX = misalignment of Dest+Count}
  sub ecx,ebx {ECX = Count up to the aligned end of Dest}
  add ebx,ecx {Now QWORD Aligned, EBX = Original Length}
  sub ecx,16
@BwdLoop:
  {Both qwords are loaded before either is stored; the FPU stack returns
   them in reverse order, so the upper qword is written first}
  fild qword ptr [eax+ecx]
  fild qword ptr [eax+ecx+8]
  fistp qword ptr [edx+ecx+8]
  fistp qword ptr [edx+ecx]
  sub ecx,16
  jge @BwdLoop
  fistp qword ptr [edx+ebx-8] {Store the saved last 8 bytes}
  add ecx,16 {ECX = remaining head bytes (0..15)}
  pop ebx
  jmp SmallBackwardMove_3 {EAX/EDX still hold Source / Dest}
end; {Backwards_IA32}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
{Three size classes:
   Count < 72          : handed to Forwards_IA32_3
   72 <= Count < 1024  : 32 bytes per iteration through MM1..MM4 with QWORD
                         aligned writes; first 8 bytes saved in MM0 and
                         stored last; tail (0..31) via SmallForwardMove_3
   Count >= LARGESIZE  : destination is first aligned to 16 bytes with
                         SmallForwardMove_3, then 64 bytes per iteration
                         through MM0..MM7, remainder with rep movsd/movsb}
procedure Forwards_MMX_3;assembler;nostackframe;
const
  LARGESIZE = 1024;
asm
  cmp ecx,LARGESIZE
  jge @FwdLargeMove
  cmp ecx,72 {Size at which using MMX becomes worthwhile}
  jl Forwards_IA32_3
  push ebx
  mov ebx,edx {EBX = original Dest, target of the final head store}
  movq mm0,[eax] {First 8 Characters}
  {QWORD Align Writes}
  add eax,ecx {EAX = Source+Count}
  add ecx,edx {ECX = Dest+Count}
  add edx,7
  and edx,-8 {EDX = Dest rounded up to a multiple of 8}
  sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  add edx,ecx {EDX = Dest+Count}
  {Now QWORD Aligned}
  sub ecx,32
  neg ecx {ECX = 32 - bytes left; loop runs while this is <= 0}
@FwdLoopMMX:
  movq mm1,[eax+ecx-32]
  movq mm2,[eax+ecx-24]
  movq mm3,[eax+ecx-16]
  movq mm4,[eax+ecx- 8]
  movq [edx+ecx-32],mm1
  movq [edx+ecx-24],mm2
  movq [edx+ecx-16],mm3
  movq [edx+ecx- 8],mm4
  add ecx,32
  jle @FwdLoopMMX
  movq [ebx],mm0 {First 8 Characters}
  emms
  pop ebx
  neg ecx
  add ecx,32 {ECX = remaining tail bytes (0..31)}
  jmp SmallForwardMove_3
@FwdLargeMove:
  push ebx
  mov ebx,ecx {EBX = Count}
  test edx,15
  jz @FwdAligned
  {16 byte Align Destination}
  mov ecx,edx
  add ecx,15
  and ecx,-16
  sub ecx,edx {ECX = bytes needed to reach a 16 byte boundary (1..15)}
  add eax,ecx {Advance past the head: this is also the Source+Count /}
  add edx,ecx {Dest+Count form that SmallForwardMove_3 expects}
  sub ebx,ecx {EBX = bytes still to move after the head}
  {Destination now 16 Byte Aligned}
  call SmallForwardMove_3
@FwdAligned:
  mov ecx,ebx
  and ecx,-16
  sub ebx,ecx {EBX = Remainder}
  push esi
  push edi
  mov esi,eax {ESI = Source}
  mov edi,edx {EDI = Dest}
  mov eax,ecx {EAX = Count}
  and eax,-64 {EAX = No of Bytes to Blocks Moves}
  and ecx,$3F {ECX = Remaining Bytes to Move (0..63)}
  add esi,eax {ESI/EDI point just past the block-moved region;}
  add edi,eax {the loop indexes it with a negative count in EAX}
  shr eax,3 {EAX = No of QWORD's to Block Move}
  neg eax
@MMXcopyloop:
  movq mm0,[esi+eax*8 ]
  movq mm1,[esi+eax*8+ 8]
  movq mm2,[esi+eax*8+16]
  movq mm3,[esi+eax*8+24]
  movq mm4,[esi+eax*8+32]
  movq mm5,[esi+eax*8+40]
  movq mm6,[esi+eax*8+48]
  movq mm7,[esi+eax*8+56]
  movq [edi+eax*8 ],mm0
  movq [edi+eax*8+ 8],mm1
  movq [edi+eax*8+16],mm2
  movq [edi+eax*8+24],mm3
  movq [edi+eax*8+32],mm4
  movq [edi+eax*8+40],mm5
  movq [edi+eax*8+48],mm6
  movq [edi+eax*8+56],mm7
  add eax,8
  jnz @MMXcopyloop
  emms {Empty MMX State}
  {Finish with string moves starting at ESI/EDI.
   NOTE(review): assumes the direction flag is clear - confirm}
  add ecx,ebx {ECX = all bytes left (0..63 plus the 0..15 remainder)}
  shr ecx,2
  rep movsd
  mov ecx,ebx
  and ecx,3 {Low two bits of the total equal those of EBX}
  rep movsb
  pop edi
  pop esi
  pop ebx
end; {Forwards_MMX}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
{Counts below 72 are handed to Backwards_IA32_3. Otherwise 32 bytes per
 iteration are copied from the top downwards through MM1..MM4 with QWORD
 aligned writes; the last 8 source bytes are saved in MM0 and stored after
 the loop; the leading 0..31 bytes are finished by SmallBackwardMove_3.}
procedure Backwards_MMX_3;assembler;nostackframe;
asm
  cmp ecx,72 {Size at which using MMX becomes worthwhile}
  jl Backwards_IA32_3
  push ebx
  movq mm0,[eax+ecx-8] {Get Last QWORD}
  {QWORD Align Writes}
  lea ebx,[edx+ecx]
  and ebx,7 {EBX = misalignment of Dest+Count}
  sub ecx,ebx {ECX = Count up to the aligned end of Dest}
  add ebx,ecx {EBX = original Count}
  {Now QWORD Aligned}
  sub ecx,32
@BwdLoopMMX:
  {All four qwords are loaded before any is stored; stores go top down}
  movq mm1,[eax+ecx ]
  movq mm2,[eax+ecx+ 8]
  movq mm3,[eax+ecx+16]
  movq mm4,[eax+ecx+24]
  movq [edx+ecx+24],mm4
  movq [edx+ecx+16],mm3
  movq [edx+ecx+ 8],mm2
  movq [edx+ecx ],mm1
  sub ecx,32
  jge @BwdLoopMMX
  movq [edx+ebx-8], mm0 {Last QWORD}
  emms
  add ecx,32 {ECX = remaining head bytes (0..31)}
  pop ebx
  jmp SmallBackwardMove_3 {EAX/EDX still hold Source / Dest}
end; {Backwards_MMX}
{-------------------------------------------------------------------------}
{Dest MUST be 16-Bytes Aligned, Count MUST be multiple of 16 }
{Forward block copy with SSE. On entry EAX = Source, EDX = Dest,
 ECX = Count (see the register moves at the top of the body).
 The bulk is copied in 128 byte blocks by one of four loops, chosen by
 size (more than 256K selects the prefetchnta/movntps variants followed by
 sfence) and by whether Source is 16 byte aligned (movaps vs movups
 loads). The remaining 0..112 bytes are copied 16 at a time.
 Every block loop executes its body before testing the counter, so Count
 must be at least 128; the only caller in this file, Forwards_SSE_3,
 reaches this routine only for moves of LARGESIZE (2048) bytes or more.}
procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
const
  Prefetch = 512;
asm
  push esi
  mov esi,eax {ESI = Source}
  mov eax,ecx {EAX = Count}
  and eax,-128 {EAX = No of Bytes to Block Move}
  add esi,eax {ESI/EDX point just past the block-moved region;}
  add edx,eax {the loops index it with a negative count in EAX}
  shr eax,3 {EAX = No of QWORD's to Block Move}
  neg eax
  cmp eax, -(32*1024) {Count > 256K}
  jl @Large
@Small: {Count<=256K}
  test esi,15 {Check if Both Source/Dest Aligned}
  jnz @SmallUnaligned
@SmallAligned: {Both Source and Dest 16-Byte Aligned}
@SmallAlignedLoop:
  movaps xmm0,[esi+8*eax]
  movaps xmm1,[esi+8*eax+16]
  movaps xmm2,[esi+8*eax+32]
  movaps xmm3,[esi+8*eax+48]
  movaps [edx+8*eax],xmm0
  movaps [edx+8*eax+16],xmm1
  movaps [edx+8*eax+32],xmm2
  movaps [edx+8*eax+48],xmm3
  movaps xmm4,[esi+8*eax+64]
  movaps xmm5,[esi+8*eax+80]
  movaps xmm6,[esi+8*eax+96]
  movaps xmm7,[esi+8*eax+112]
  movaps [edx+8*eax+64],xmm4
  movaps [edx+8*eax+80],xmm5
  movaps [edx+8*eax+96],xmm6
  movaps [edx+8*eax+112],xmm7
  add eax,16 {16 qwords = 128 bytes per iteration}
  js @SmallAlignedLoop
  jmp @Remainder
@SmallUnaligned: {Source Not 16-Byte Aligned}
@SmallUnalignedLoop:
  movups xmm0,[esi+8*eax]
  movups xmm1,[esi+8*eax+16]
  movups xmm2,[esi+8*eax+32]
  movups xmm3,[esi+8*eax+48]
  movaps [edx+8*eax],xmm0
  movaps [edx+8*eax+16],xmm1
  movaps [edx+8*eax+32],xmm2
  movaps [edx+8*eax+48],xmm3
  movups xmm4,[esi+8*eax+64]
  movups xmm5,[esi+8*eax+80]
  movups xmm6,[esi+8*eax+96]
  movups xmm7,[esi+8*eax+112]
  movaps [edx+8*eax+64],xmm4
  movaps [edx+8*eax+80],xmm5
  movaps [edx+8*eax+96],xmm6
  movaps [edx+8*eax+112],xmm7
  add eax,16
  js @SmallUnalignedLoop
  jmp @Remainder
@Large: {Count>256K}
  test esi,15 {Check if Both Source/Dest Aligned}
  jnz @LargeUnaligned
@LargeAligned: {Both Source and Dest 16-Byte Aligned}
@LargeAlignedLoop:
  prefetchnta [esi+8*eax+Prefetch] {Prefetch 512 bytes ahead of the reads}
  prefetchnta [esi+8*eax+Prefetch+64]
  movaps xmm0,[esi+8*eax]
  movaps xmm1,[esi+8*eax+16]
  movaps xmm2,[esi+8*eax+32]
  movaps xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movaps xmm4,[esi+8*eax+64]
  movaps xmm5,[esi+8*eax+80]
  movaps xmm6,[esi+8*eax+96]
  movaps xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add eax,16
  js @LargeAlignedLoop
  sfence {Fence after the movntps stores}
  jmp @Remainder
@LargeUnaligned: {Source Not 16-Byte Aligned}
@LargeUnalignedLoop:
  prefetchnta [esi+8*eax+Prefetch]
  prefetchnta [esi+8*eax+Prefetch+64]
  movups xmm0,[esi+8*eax]
  movups xmm1,[esi+8*eax+16]
  movups xmm2,[esi+8*eax+32]
  movups xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movups xmm4,[esi+8*eax+64]
  movups xmm5,[esi+8*eax+80]
  movups xmm6,[esi+8*eax+96]
  movups xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add eax,16
  js @LargeUnalignedLoop
  sfence {Fence after the movntps stores}
@Remainder:
  and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
  jz @Done
  add esi,ecx
  add edx,ecx
  neg ecx {Negative index counting up to zero, 16 bytes per step}
@RemainderLoop:
  movups xmm0,[esi+ecx]
  movaps [edx+ecx],xmm0
  add ecx,16
  jnz @RemainderLoop
@Done:
  pop esi
end; {AlignedFwdMoveSSE}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
{Three size classes:
   Count <= 68         : first 32 bytes with two unaligned SSE moves, the
                         other Count-32 bytes via SmallForwardMove_3
   68 < Count < 2048   : 32 bytes per iteration with 16 byte aligned
                         writes; first 16 bytes saved in XMM0 and stored
                         last; tail (0..31) via SmallForwardMove_3
   Count >= LARGESIZE  : destination aligned to 16 bytes with
                         SmallForwardMove_3, bulk via AlignedFwdMoveSSE_3,
                         tail (0..15) via SmallForwardMove_3}
procedure Forwards_SSE_3;assembler;nostackframe;
const
  LARGESIZE = 2048;
asm
  cmp ecx,LARGESIZE
  jge @FwdLargeMove
  cmp ecx,SMALLMOVESIZE+32
  movups xmm0,[eax] {First 16 bytes; the jg below still uses the cmp above}
  jg @FwdMoveSSE
  movups xmm1,[eax+16]
  movups [edx],xmm0
  movups [edx+16],xmm1
  add eax,ecx {EAX = Source+Count}
  add edx,ecx {EDX = Dest+Count}
  sub ecx,32 {ECX = bytes after the first 32 (5..36)}
  jmp SmallForwardMove_3
@FwdMoveSSE:
  push ebx
  mov ebx,edx {EBX = original Dest, target of the final head store}
  {Align Writes}
  add eax,ecx {EAX = Source+Count}
  add ecx,edx {ECX = Dest+Count}
  add edx,15
  and edx,-16 {EDX = Dest rounded up to a multiple of 16}
  sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  add edx,ecx {EDX = Dest+Count}
  {Now Aligned}
  sub ecx,32
  neg ecx {ECX = 32 - bytes left; loop runs while this is <= 0}
@FwdLoopSSE:
  movups xmm1,[eax+ecx-32]
  movups xmm2,[eax+ecx-16]
  movaps [edx+ecx-32],xmm1
  movaps [edx+ecx-16],xmm2
  add ecx,32
  jle @FwdLoopSSE
  movups [ebx],xmm0 {First 16 Bytes}
  neg ecx
  add ecx,32 {ECX = remaining tail bytes (0..31)}
  pop ebx
  jmp SmallForwardMove_3
@FwdLargeMove:
  push ebx
  mov ebx,ecx {EBX = Count}
  test edx,15
  jz @FwdLargeAligned {ECX still equals EBX on this path}
  {16 byte Align Destination}
  mov ecx,edx
  add ecx,15
  and ecx,-16
  sub ecx,edx {ECX = bytes needed to reach a 16 byte boundary (1..15)}
  add eax,ecx {Advance past the head: this is also the Source+Count /}
  add edx,ecx {Dest+Count form that SmallForwardMove_3 expects}
  sub ebx,ecx {EBX = bytes still to move after the head}
  {Destination now 16 Byte Aligned}
  call SmallForwardMove_3
  mov ecx,ebx
@FwdLargeAligned:
  and ecx,-16 {ECX = multiple-of-16 part for the aligned block move}
  sub ebx,ecx {EBX = Remainder}
  {EAX/EDX/ECX are saved around the call and restored afterwards}
  push edx
  push eax
  push ecx
  call AlignedFwdMoveSSE_3
  pop ecx
  pop eax
  pop edx
  add ecx,ebx {ECX = block bytes + remainder}
  add eax,ecx {EAX = end of Source}
  add edx,ecx {EDX = end of Dest}
  mov ecx,ebx {ECX = remaining tail bytes (0..15)}
  pop ebx
  jmp SmallForwardMove_3
end; {Forwards_SSE}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
{Two size classes:
   Count <= 68 : last 32 bytes with two unaligned SSE moves, the first
                 Count-32 bytes via SmallBackwardMove_3
   Count > 68  : 32 bytes per iteration from the top downwards with
                 16 byte aligned writes; last 16 bytes saved in XMM0 and
                 stored after the loop; head (0..31) via SmallBackwardMove_3}
procedure Backwards_SSE_3;assembler;nostackframe;
asm
  cmp ecx,SMALLMOVESIZE+32
  jg @BwdMoveSSE
  sub ecx,32 {ECX = bytes before the last 32 (5..36)}
  {Both loads are done before either store}
  movups xmm1,[eax+ecx]
  movups xmm2,[eax+ecx+16]
  movups [edx+ecx],xmm1
  movups [edx+ecx+16],xmm2
  jmp SmallBackwardMove_3 {EAX/EDX still hold Source / Dest}
@BwdMoveSSE:
  push ebx
  movups xmm0,[eax+ecx-16] {Last 16 Bytes}
  {Align Writes}
  lea ebx,[edx+ecx]
  and ebx,15 {EBX = misalignment of Dest+Count}
  sub ecx,ebx {ECX = Count up to the aligned end of Dest}
  add ebx,ecx {EBX = original Count}
  {Now Aligned}
  sub ecx,32
@BwdLoop:
  movups xmm1,[eax+ecx]
  movups xmm2,[eax+ecx+16]
  movaps [edx+ecx],xmm1
  movaps [edx+ecx+16],xmm2
  sub ecx,32
  jge @BwdLoop
  movups [edx+ebx-16],xmm0 {Last 16 Bytes}
  add ecx,32 {ECX = remaining head bytes (0..31)}
  pop ebx
  jmp SmallBackwardMove_3
end; {Backwards_SSE}
  772. const
  773. fastmoveproc_forward : pointer = @Forwards_SSE_3;
  774. fastmoveproc_backward : pointer = @Backwards_SSE_3;
  775. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
  776. asm
  777. cmp ecx,SMALLMOVESIZE
  778. ja @Large
  779. cmp eax,edx
  780. lea eax,[eax+ecx]
  781. jle @SmallCheck
  782. @SmallForward:
  783. add edx,ecx
  784. jmp SmallForwardMove_3
  785. @SmallCheck:
  786. je @Done {For Compatibility with Delphi's move for Source = Dest}
  787. sub eax,ecx
  788. jmp SmallBackwardMove_3
  789. @Large:
  790. jng @Done {For Compatibility with Delphi's move for Count < 0}
  791. cmp eax,edx
  792. jg @moveforward
  793. je @Done {For Compatibility with Delphi's move for Source = Dest}
  794. push eax
  795. add eax,ecx
  796. cmp eax,edx
  797. pop eax
  798. jg @movebackward
  799. @moveforward:
  800. jmp dword ptr fastmoveproc_forward
  801. @movebackward:
  802. jmp dword ptr fastmoveproc_backward {Source/Dest Overlap}
  803. @Done:
  804. end;
  805. {$asmmode att}
  806. procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
  807. begin
  808. if has_sse_support then
  809. begin
  810. fastmoveproc_forward:=@Forwards_SSE_3;
  811. fastmoveproc_backward:=@Backwards_SSE_3;
  812. end
  813. else if has_mmx_support then
  814. begin
  815. fastmoveproc_forward:=@Forwards_MMX_3;
  816. fastmoveproc_backward:=@Backwards_MMX_3;
  817. end;
  818. end;
  819. {$endif FPC_SYSTEM_HAS_MOVE}
  820. {$else}
{Empty stand-in compiled when the version check at the top of this file
 fails, so that setup_fastmove is still declared; it selects nothing.}
procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
end;
  824. {$endif}