fastmove.inc 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854
  1. {
  2. Copyright (c) 2004, John O'Harrow ([email protected])
  3. This software is provided 'as-is', without any express or implied warranty.
  4. In no event will the authors be held liable for any damages arising from the
  5. use of this software.
  6. Permission is granted to anyone to use this software for any purpose, including
  7. commercial applications, and to alter it and redistribute it freely, subject to
  8. the following restrictions:
  9. 1. The origin of this software must not be misrepresented; you must not claim
  10. that you wrote the original software. If you use this software in a product,
  11. an acknowledgment in the product documentation would be appreciated but is
  12. not required.
  13. 2. Altered source versions must be plainly marked as such, and must not be
  14. misrepresented as being the original software.
  15. 3. This notice may not be removed or altered from any source distribution.
  16. -------------------------------------------------------------------------------
  17. Version: 1.40 - 16-SEP-2004
  18. }
  19. {$ifdef USE_FASTMOVE}
  20. {$ifndef FPC_SYSTEM_HAS_MOVE}
  21. {$define FPC_SYSTEM_HAS_MOVE}
  22. {$asmmode intel}
  23. {-------------------------------------------------------------------------}
  24. (*
  25. {Just to show that a good Pascal algorithm can beat the default BASM}
  26. procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
  27. var
  28. S, D : PtrUInt;
  29. Temp, C, I : PtrInt;
  30. L : PPtrInt;
  31. begin
  32. S := Cardinal(@Source);
  33. D := Cardinal(@Dest);
  34. if S = D then
  35. Exit;
  36. if Count <= 4 then
  37. case Count of
  38. 1 : PByte(@Dest)^ := PByte(S)^;
  39. 2 : PWord(@Dest)^ := PWord(S)^;
  40. 3 : if D > S then
  41. begin
  42. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  43. PWord(@Dest)^ := PWord(S)^;
  44. end
  45. else
  46. begin
  47. PWord(@Dest)^ := PWord(S)^;
  48. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  49. end;
  50. 4 : PInteger(@Dest)^ := PInteger(S)^
  51. else Exit; {Count <= 0}
  52. end
  53. else
  54. if D > S then
  55. begin
  56. Temp := PInteger(S)^;
  57. I := Integer(@Dest);
  58. C := Count - 4;
  59. L := PInteger(Integer(@Dest) + C);
  60. Inc(S, C);
  61. repeat
  62. L^ := PInteger(S)^;
  63. if Count <= 8 then
  64. Break;
  65. Dec(Count, 4);
  66. Dec(S, 4);
  67. Dec(L);
  68. until False;
  69. PInteger(I)^ := Temp;
  70. end
  71. else
  72. begin
  73. C := Count - 4;
  74. Temp := PInteger(S + Cardinal(C))^;
  75. I := Integer(@Dest) + C;
  76. L := @Dest;
  77. repeat
  78. L^ := PInteger(S)^;
  79. if Count <= 8 then
  80. Break;
  81. Dec(Count, 4);
  82. Inc(S, 4);
  83. Inc(L);
  84. until False;
  85. PInteger(I)^ := Temp;
  86. end;
  87. end; {MoveJOH_PAS}
  88. *)
  const
    {Largest byte count that SmallForwardMove_3 / SmallBackwardMove_3 handle
     directly: their jump tables have entries for counts 0..36 only.}
    SMALLMOVESIZE = 36;
  {-------------------------------------------------------------------------}
  {Perform Forward Move of 0..36 Bytes}
  {On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX}
  {Dispatches through a 37-entry jump table indexed by ECX. Every entry point
   addresses the data with negative offsets from the END of both buffers and
   copies in ascending address order, falling through from label to label
   four bytes at a time. There is one fall-through chain per value of
   Count mod 4; only the last step of each chain differs.
   ECX is not range-checked: the caller must guarantee 0 <= ECX <= 36.}
  procedure SmallForwardMove_3;assembler;nostackframe;
  asm
    jmp dword ptr @@FwdJumpTable[ecx*4]  {Indirect jump to the entry for this Count}
    align 16
  @@FwdJumpTable:
    dd @@Done {Removes need to test for zero size move}
    dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
    dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
    dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
    dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
    dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
    {Chain for Count = 4,8,..,36: whole DWORDs only}
  @@Fwd36:
    mov ecx,[eax-36]
    mov [edx-36],ecx
  @@Fwd32:
    mov ecx,[eax-32]
    mov [edx-32],ecx
  @@Fwd28:
    mov ecx,[eax-28]
    mov [edx-28],ecx
  @@Fwd24:
    mov ecx,[eax-24]
    mov [edx-24],ecx
  @@Fwd20:
    mov ecx,[eax-20]
    mov [edx-20],ecx
  @@Fwd16:
    mov ecx,[eax-16]
    mov [edx-16],ecx
  @@Fwd12:
    mov ecx,[eax-12]
    mov [edx-12],ecx
  @@Fwd08:
    mov ecx,[eax-8]
    mov [edx-8],ecx
  @@Fwd04:
    mov ecx,[eax-4]
    mov [edx-4],ecx
    ret
    {Chain for Count = 7,11,..,35: finishes with two DWORDs that overlap by one byte}
  @@Fwd35:
    mov ecx,[eax-35]
    mov [edx-35],ecx
  @@Fwd31:
    mov ecx,[eax-31]
    mov [edx-31],ecx
  @@Fwd27:
    mov ecx,[eax-27]
    mov [edx-27],ecx
  @@Fwd23:
    mov ecx,[eax-23]
    mov [edx-23],ecx
  @@Fwd19:
    mov ecx,[eax-19]
    mov [edx-19],ecx
  @@Fwd15:
    mov ecx,[eax-15]
    mov [edx-15],ecx
  @@Fwd11:
    mov ecx,[eax-11]
    mov [edx-11],ecx
  @@Fwd07:
    mov ecx,[eax-7]
    mov [edx-7],ecx
    mov ecx,[eax-4]   {Last DWORD re-copies byte -4, covering bytes -3..-1}
    mov [edx-4],ecx
    ret
    {Count = 3 cannot use a DWORD, so it is a WORD followed by a BYTE}
  @@Fwd03:
    movzx ecx, word ptr [eax-3]
    mov [edx-3],cx
    movzx ecx, byte ptr [eax-1]
    mov [edx-1],cl
    ret
    {Chain for Count = 2,6,..,34: DWORDs followed by a final WORD}
  @@Fwd34:
    mov ecx,[eax-34]
    mov [edx-34],ecx
  @@Fwd30:
    mov ecx,[eax-30]
    mov [edx-30],ecx
  @@Fwd26:
    mov ecx,[eax-26]
    mov [edx-26],ecx
  @@Fwd22:
    mov ecx,[eax-22]
    mov [edx-22],ecx
  @@Fwd18:
    mov ecx,[eax-18]
    mov [edx-18],ecx
  @@Fwd14:
    mov ecx,[eax-14]
    mov [edx-14],ecx
  @@Fwd10:
    mov ecx,[eax-10]
    mov [edx-10],ecx
  @@Fwd06:
    mov ecx,[eax-6]
    mov [edx-6],ecx
  @@Fwd02:
    movzx ecx, word ptr [eax-2]
    mov [edx-2],cx
    ret
    {Chain for Count = 1,5,..,33: DWORDs followed by a final BYTE}
  @@Fwd33:
    mov ecx,[eax-33]
    mov [edx-33],ecx
  @@Fwd29:
    mov ecx,[eax-29]
    mov [edx-29],ecx
  @@Fwd25:
    mov ecx,[eax-25]
    mov [edx-25],ecx
  @@Fwd21:
    mov ecx,[eax-21]
    mov [edx-21],ecx
  @@Fwd17:
    mov ecx,[eax-17]
    mov [edx-17],ecx
  @@Fwd13:
    mov ecx,[eax-13]
    mov [edx-13],ecx
  @@Fwd09:
    mov ecx,[eax-9]
    mov [edx-9],ecx
  @@Fwd05:
    mov ecx,[eax-5]
    mov [edx-5],ecx
  @@Fwd01:
    movzx ecx, byte ptr [eax-1]
    mov [edx-1],cl
    {Falls through to @@Done, which is also the table entry for Count = 0}
  @@Done:
  end; {SmallForwardMove}
  {-------------------------------------------------------------------------}
  {Perform Backward Move of 0..36 Bytes}
  {On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX}
  {Mirror image of SmallForwardMove_3: a 37-entry jump table indexed by ECX,
   with every entry point addressing the data with non-negative offsets from
   the START of both buffers and copying in descending address order. There
   is one fall-through chain per value of Count mod 4.
   ECX is not range-checked: the caller must guarantee 0 <= ECX <= 36.}
  procedure SmallBackwardMove_3;assembler;nostackframe;
  asm
    jmp dword ptr @@BwdJumpTable[ecx*4]  {Indirect jump to the entry for this Count}
    align 16
  @@BwdJumpTable:
    dd @@Done {Removes need to test for zero size move}
    dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
    dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
    dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
    dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
    dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
    {Chain for Count = 4,8,..,36: whole DWORDs only}
  @@Bwd36:
    mov ecx,[eax+32]
    mov [edx+32],ecx
  @@Bwd32:
    mov ecx,[eax+28]
    mov [edx+28],ecx
  @@Bwd28:
    mov ecx,[eax+24]
    mov [edx+24],ecx
  @@Bwd24:
    mov ecx,[eax+20]
    mov [edx+20],ecx
  @@Bwd20:
    mov ecx,[eax+16]
    mov [edx+16],ecx
  @@Bwd16:
    mov ecx,[eax+12]
    mov [edx+12],ecx
  @@Bwd12:
    mov ecx,[eax+8]
    mov [edx+8],ecx
  @@Bwd08:
    mov ecx,[eax+4]
    mov [edx+4],ecx
  @@Bwd04:
    mov ecx,[eax]
    mov [edx],ecx
    ret
    {Chain for Count = 7,11,..,35: finishes with two DWORDs that overlap by one byte}
  @@Bwd35:
    mov ecx,[eax+31]
    mov [edx+31],ecx
  @@Bwd31:
    mov ecx,[eax+27]
    mov [edx+27],ecx
  @@Bwd27:
    mov ecx,[eax+23]
    mov [edx+23],ecx
  @@Bwd23:
    mov ecx,[eax+19]
    mov [edx+19],ecx
  @@Bwd19:
    mov ecx,[eax+15]
    mov [edx+15],ecx
  @@Bwd15:
    mov ecx,[eax+11]
    mov [edx+11],ecx
  @@Bwd11:
    mov ecx,[eax+7]
    mov [edx+7],ecx
  @@Bwd07:
    mov ecx,[eax+3]
    mov [edx+3],ecx
    mov ecx,[eax]     {First DWORD re-copies byte 3, covering bytes 0..2}
    mov [edx],ecx
    ret
    {Count = 3 cannot use a DWORD, so it is a WORD followed by a BYTE}
  @@Bwd03:
    movzx ecx, word ptr [eax+1]
    mov [edx+1],cx
    movzx ecx, byte ptr [eax]
    mov [edx],cl
    ret
    {Chain for Count = 2,6,..,34: DWORDs followed by a final WORD}
  @@Bwd34:
    mov ecx,[eax+30]
    mov [edx+30],ecx
  @@Bwd30:
    mov ecx,[eax+26]
    mov [edx+26],ecx
  @@Bwd26:
    mov ecx,[eax+22]
    mov [edx+22],ecx
  @@Bwd22:
    mov ecx,[eax+18]
    mov [edx+18],ecx
  @@Bwd18:
    mov ecx,[eax+14]
    mov [edx+14],ecx
  @@Bwd14:
    mov ecx,[eax+10]
    mov [edx+10],ecx
  @@Bwd10:
    mov ecx,[eax+6]
    mov [edx+6],ecx
  @@Bwd06:
    mov ecx,[eax+2]
    mov [edx+2],ecx
  @@Bwd02:
    movzx ecx, word ptr [eax]
    mov [edx],cx
    ret
    {Chain for Count = 1,5,..,33: DWORDs followed by a final BYTE}
  @@Bwd33:
    mov ecx,[eax+29]
    mov [edx+29],ecx
  @@Bwd29:
    mov ecx,[eax+25]
    mov [edx+25],ecx
  @@Bwd25:
    mov ecx,[eax+21]
    mov [edx+21],ecx
  @@Bwd21:
    mov ecx,[eax+17]
    mov [edx+17],ecx
  @@Bwd17:
    mov ecx,[eax+13]
    mov [edx+13],ecx
  @@Bwd13:
    mov ecx,[eax+9]
    mov [edx+9],ecx
  @@Bwd09:
    mov ecx,[eax+5]
    mov [edx+5],ecx
  @@Bwd05:
    mov ecx,[eax+1]
    mov [edx+1],ecx
  @@Bwd01:
    movzx ecx, byte ptr[eax]
    mov [edx],cl
    {Falls through to @@Done, which is also the table entry for Count = 0}
  @@Done:
  end; {SmallBackwardMove}
  {-------------------------------------------------------------------------}
  {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
  {Copies 8 bytes at a time through the x87 stack (fild/fistp on QWORDs) in
   ascending address order. The first 8 source bytes are loaded before any
   store and written last, which lets the loop start at the first QWORD-
   aligned destination address. The tail that the 16-byte loop does not
   reach is finished by SmallForwardMove_3. EBX is saved and restored.}
  {NOTE(review): the fild/fistp round trip presumably needs the FPU precision
   control set to 64-bit to be bit-exact - confirm for hosts that change the
   FPU control word.}
  procedure Forwards_IA32_3;assembler;nostackframe;
  asm
    push ebx
    mov ebx,edx                   {EBX = original Dest, used for the final store}
    fild qword ptr [eax]          {Keep first 8 source bytes on the FPU stack}
    add eax,ecx {QWORD Align Writes}
    add ecx,edx                   {EAX = Source+Count, ECX = Dest+Count}
    add edx,7
    and edx,-8                    {EDX = Dest rounded up to a multiple of 8}
    sub ecx,edx                   {ECX = bytes from aligned Dest to end of Dest}
    add edx,ecx {Now QWORD Aligned}
    sub ecx,16
    neg ecx                       {ECX = 16 - bytes left: negative index counting up}
  @FwdLoop:
    fild qword ptr [eax+ecx-16]
    fistp qword ptr [edx+ecx-16]
    fild qword ptr [eax+ecx-8]
    fistp qword ptr [edx+ecx-8]
    add ecx,16
    jle @FwdLoop                  {Loop while at least 16 bytes remain}
    fistp qword ptr [ebx]         {Store the saved first 8 bytes at original Dest}
    neg ecx
    add ecx,16                    {ECX = leftover tail bytes (0..15)}
    pop ebx
    jmp SmallForwardMove_3        {EAX/EDX already point at Source/Dest end}
  end; {Forwards_IA32}
  {-------------------------------------------------------------------------}
  {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  {Copies 8 bytes at a time through the x87 stack in descending address
   order. The last 8 source bytes are loaded before any store and written
   last, so the loop can work on a QWORD-aligned destination end. The head
   that the 16-byte loop does not reach is finished by SmallBackwardMove_3.
   EBX is saved and restored.}
  {NOTE(review): the fild/fistp round trip presumably needs the FPU precision
   control set to 64-bit to be bit-exact - confirm for hosts that change the
   FPU control word.}
  procedure Backwards_IA32_3;assembler;nostackframe;
  asm
    push ebx
    fild qword ptr [eax+ecx-8]    {Keep last 8 source bytes on the FPU stack}
    lea ebx,[edx+ecx] {QWORD Align Writes}
    and ebx,7                     {EBX = misalignment of Dest end (0..7)}
    sub ecx,ebx                   {Dest+ECX is now a multiple of 8}
    add ebx,ecx {Now QWORD Aligned, EBX = Original Length}
    sub ecx,16                    {ECX = offset of the topmost 16-byte chunk}
  @BwdLoop:
    fild qword ptr [eax+ecx]      {Both QWORDs are loaded before either is stored}
    fild qword ptr [eax+ecx+8]
    fistp qword ptr [edx+ecx+8]
    fistp qword ptr [edx+ecx]
    sub ecx,16
    jge @BwdLoop                  {Loop while a full 16-byte chunk remains below}
    fistp qword ptr [edx+ebx-8]   {Store the saved last 8 bytes at end of Dest}
    add ecx,16                    {ECX = leftover head bytes (0..15)}
    pop ebx
    jmp SmallBackwardMove_3       {EAX/EDX still point at Source/Dest start}
  end; {Backwards_IA32}
  {-------------------------------------------------------------------------}
  {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
  {Three size classes, all copying in ascending address order:
     ECX < 72          - tail-jumps to Forwards_IA32_3;
     72 <= ECX < 1024  - 32 bytes per iteration via mm1..mm4 with QWORD-aligned
                         writes, tail finished by SmallForwardMove_3;
     ECX >= 1024       - Dest is first brought to a 16-byte boundary, then
                         64 bytes per iteration via mm0..mm7, and the rest
                         is finished with rep movsd / rep movsb.
   EBX, ESI and EDI are saved and restored on the paths that use them; emms
   is executed after each MMX loop.}
  procedure Forwards_MMX_3;assembler;nostackframe;
  const
    LARGESIZE = 1024;
  asm
    cmp ecx,LARGESIZE
    jge @FwdLargeMove
    cmp ecx,72 {Size at which using MMX becomes worthwhile}
    jl Forwards_IA32_3
    push ebx
    mov ebx,edx                   {EBX = original Dest, used for the final store}
    movq mm0,[eax] {First 8 Characters}
    {QWORD Align Writes}
    add eax,ecx                   {EAX = Source+Count}
    add ecx,edx                   {ECX = Dest+Count}
    add edx,7
    and edx,-8                    {EDX = Dest rounded up to a multiple of 8}
    sub ecx,edx                   {ECX = bytes from aligned Dest to end of Dest}
    add edx,ecx                   {EDX = Dest+Count}
    {Now QWORD Aligned}
    sub ecx,32
    neg ecx                       {ECX = 32 - bytes left: negative index counting up}
  @FwdLoopMMX:
    movq mm1,[eax+ecx-32]
    movq mm2,[eax+ecx-24]
    movq mm3,[eax+ecx-16]
    movq mm4,[eax+ecx- 8]
    movq [edx+ecx-32],mm1
    movq [edx+ecx-24],mm2
    movq [edx+ecx-16],mm3
    movq [edx+ecx- 8],mm4
    add ecx,32
    jle @FwdLoopMMX               {Loop while at least 32 bytes remain}
    movq [ebx],mm0 {First 8 Characters}
    emms
    pop ebx
    neg ecx
    add ecx,32                    {ECX = leftover tail bytes (0..31)}
    jmp SmallForwardMove_3        {EAX/EDX already point at Source/Dest end}
  @FwdLargeMove:
    push ebx
    mov ebx,ecx                   {EBX = Count}
    test edx,15
    jz @FwdAligned
    {16 byte Align Destination}
    mov ecx,edx
    add ecx,15
    and ecx,-16
    sub ecx,edx                   {ECX = bytes needed to align Dest (1..15)}
    add eax,ecx                   {Advance both pointers past the prefix, as}
    add edx,ecx                   {SmallForwardMove_3 expects end pointers}
    sub ebx,ecx                   {EBX = bytes left after the prefix}
    {Destination now 16 Byte Aligned}
    call SmallForwardMove_3
  @FwdAligned:
    mov ecx,ebx
    and ecx,-16                   {ECX = bytes left rounded down to 16}
    sub ebx,ecx {EBX = Remainder}
    push esi
    push edi
    mov esi,eax {ESI = Source}
    mov edi,edx {EDI = Dest}
    mov eax,ecx {EAX = Count}
    and eax,-64 {EAX = No of Bytes to Blocks Moves}
    and ecx,$3F {ECX = Remaining Bytes to Move (0..63)}
    add esi,eax                   {ESI/EDI = end of the block-moved region}
    add edi,eax
    shr eax,3 {EAX = No of QWORD's to Block Move}
    neg eax                       {Negative QWORD index counting up to zero}
  @MMXcopyloop:
    movq mm0,[esi+eax*8 ]
    movq mm1,[esi+eax*8+ 8]
    movq mm2,[esi+eax*8+16]
    movq mm3,[esi+eax*8+24]
    movq mm4,[esi+eax*8+32]
    movq mm5,[esi+eax*8+40]
    movq mm6,[esi+eax*8+48]
    movq mm7,[esi+eax*8+56]
    movq [edi+eax*8 ],mm0
    movq [edi+eax*8+ 8],mm1
    movq [edi+eax*8+16],mm2
    movq [edi+eax*8+24],mm3
    movq [edi+eax*8+32],mm4
    movq [edi+eax*8+40],mm5
    movq [edi+eax*8+48],mm6
    movq [edi+eax*8+56],mm7
    add eax,8                     {8 QWORDs = 64 bytes per iteration}
    jnz @MMXcopyloop
    emms {Empty MMX State}
    add ecx,ebx                   {ECX = all bytes still to copy}
    shr ecx,2
    rep movsd                     {Copy the whole DWORDs from ESI to EDI}
    mov ecx,ebx
    and ecx,3                     {Low 2 bits of EBX = bytes left after the DWORDs}
    rep movsb
    pop edi
    pop esi
    pop ebx
  end; {Forwards_MMX}
  {-------------------------------------------------------------------------}
  {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  {Moves of fewer than 72 bytes tail-jump to Backwards_IA32_3. Larger moves
   copy 32 bytes per iteration via mm1..mm4 in descending address order with
   a QWORD-aligned destination end. The last 8 source bytes are loaded first
   and stored last, and the head that the loop does not reach is finished by
   SmallBackwardMove_3. EBX is saved and restored.}
  procedure Backwards_MMX_3;assembler;nostackframe;
  asm
    cmp ecx,72 {Size at which using MMX becomes worthwhile}
    jl Backwards_IA32_3
    push ebx
    movq mm0,[eax+ecx-8] {Get Last QWORD}
    {QWORD Align Writes}
    lea ebx,[edx+ecx]
    and ebx,7                     {EBX = misalignment of Dest end (0..7)}
    sub ecx,ebx                   {Dest+ECX is now a multiple of 8}
    add ebx,ecx                   {EBX = original Count}
    {Now QWORD Aligned}
    sub ecx,32                    {ECX = offset of the topmost 32-byte chunk}
  @BwdLoopMMX:
    movq mm1,[eax+ecx ]           {All four QWORDs are loaded before any store}
    movq mm2,[eax+ecx+ 8]
    movq mm3,[eax+ecx+16]
    movq mm4,[eax+ecx+24]
    movq [edx+ecx+24],mm4
    movq [edx+ecx+16],mm3
    movq [edx+ecx+ 8],mm2
    movq [edx+ecx ],mm1
    sub ecx,32
    jge @BwdLoopMMX               {Loop while a full 32-byte chunk remains below}
    movq [edx+ebx-8], mm0 {Last QWORD}
    emms
    add ecx,32                    {ECX = leftover head bytes (0..31)}
    pop ebx
    jmp SmallBackwardMove_3       {EAX/EDX still point at Source/Dest start}
  end; {Backwards_MMX}
  {-------------------------------------------------------------------------}
  {Dest MUST be 16-Byte Aligned, Count MUST be multiple of 16 }
  {Forward block copy using SSE registers. The code reads Source from EAX,
   Dest from EDX and Count from ECX.
   Count is split into a part that is a multiple of 128, copied 128 bytes per
   iteration through xmm0..xmm7, and a remainder of 0..112 bytes copied
   16 bytes at a time. Four variants of the 128-byte loop are selected by:
     - size: above 256K bytes of block data the loop prefetches with
       prefetchnta and stores with movntps (followed by sfence), otherwise
       it uses plain movaps stores;
     - source alignment: movaps loads if Source is 16-byte aligned, movups
       loads if not.
   The block loops are bottom-tested, so one full 128-byte iteration runs
   even when Count < 128: callers must pass Count >= 128. The only caller in
   this file, Forwards_SSE_3, calls it only for moves of 2048 bytes or more.
   ESI is saved and restored; EAX, ECX and EDX are modified.}
  procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
  const
    Prefetch = 512;
  asm
    push esi
    mov esi,eax {ESI = Source}
    mov eax,ecx {EAX = Count}
    and eax,-128 {EAX = No of Bytes to Block Move}
    add esi,eax                   {ESI/EDX = end of the block-moved region}
    add edx,eax
    shr eax,3 {EAX = No of QWORD's to Block Move}
    neg eax                       {Negative QWORD index counting up to zero}
    cmp eax, -(32*1024) {Count > 256K}
    jl @Large
  @Small: {Count<=256K}
    test esi,15 {Check if Both Source/Dest Aligned}
    jnz @SmallUnaligned
  @SmallAligned: {Both Source and Dest 16-Byte Aligned}
  @SmallAlignedLoop:
    movaps xmm0,[esi+8*eax]
    movaps xmm1,[esi+8*eax+16]
    movaps xmm2,[esi+8*eax+32]
    movaps xmm3,[esi+8*eax+48]
    movaps [edx+8*eax],xmm0
    movaps [edx+8*eax+16],xmm1
    movaps [edx+8*eax+32],xmm2
    movaps [edx+8*eax+48],xmm3
    movaps xmm4,[esi+8*eax+64]
    movaps xmm5,[esi+8*eax+80]
    movaps xmm6,[esi+8*eax+96]
    movaps xmm7,[esi+8*eax+112]
    movaps [edx+8*eax+64],xmm4
    movaps [edx+8*eax+80],xmm5
    movaps [edx+8*eax+96],xmm6
    movaps [edx+8*eax+112],xmm7
    add eax,16                    {16 QWORDs = 128 bytes per iteration}
    js @SmallAlignedLoop
    jmp @Remainder
  @SmallUnaligned: {Source Not 16-Byte Aligned}
  @SmallUnalignedLoop:
    movups xmm0,[esi+8*eax]
    movups xmm1,[esi+8*eax+16]
    movups xmm2,[esi+8*eax+32]
    movups xmm3,[esi+8*eax+48]
    movaps [edx+8*eax],xmm0
    movaps [edx+8*eax+16],xmm1
    movaps [edx+8*eax+32],xmm2
    movaps [edx+8*eax+48],xmm3
    movups xmm4,[esi+8*eax+64]
    movups xmm5,[esi+8*eax+80]
    movups xmm6,[esi+8*eax+96]
    movups xmm7,[esi+8*eax+112]
    movaps [edx+8*eax+64],xmm4
    movaps [edx+8*eax+80],xmm5
    movaps [edx+8*eax+96],xmm6
    movaps [edx+8*eax+112],xmm7
    add eax,16                    {16 QWORDs = 128 bytes per iteration}
    js @SmallUnalignedLoop
    jmp @Remainder
  @Large: {Count>256K}
    test esi,15 {Check if Both Source/Dest Aligned}
    jnz @LargeUnaligned
  @LargeAligned: {Both Source and Dest 16-Byte Aligned}
  @LargeAlignedLoop:
    prefetchnta [esi+8*eax+Prefetch]
    prefetchnta [esi+8*eax+Prefetch+64]
    movaps xmm0,[esi+8*eax]
    movaps xmm1,[esi+8*eax+16]
    movaps xmm2,[esi+8*eax+32]
    movaps xmm3,[esi+8*eax+48]
    movntps [edx+8*eax],xmm0
    movntps [edx+8*eax+16],xmm1
    movntps [edx+8*eax+32],xmm2
    movntps [edx+8*eax+48],xmm3
    movaps xmm4,[esi+8*eax+64]
    movaps xmm5,[esi+8*eax+80]
    movaps xmm6,[esi+8*eax+96]
    movaps xmm7,[esi+8*eax+112]
    movntps [edx+8*eax+64],xmm4
    movntps [edx+8*eax+80],xmm5
    movntps [edx+8*eax+96],xmm6
    movntps [edx+8*eax+112],xmm7
    add eax,16                    {16 QWORDs = 128 bytes per iteration}
    js @LargeAlignedLoop
    sfence                        {Issued once after the movntps stores}
    jmp @Remainder
  @LargeUnaligned: {Source Not 16-Byte Aligned}
  @LargeUnalignedLoop:
    prefetchnta [esi+8*eax+Prefetch]
    prefetchnta [esi+8*eax+Prefetch+64]
    movups xmm0,[esi+8*eax]
    movups xmm1,[esi+8*eax+16]
    movups xmm2,[esi+8*eax+32]
    movups xmm3,[esi+8*eax+48]
    movntps [edx+8*eax],xmm0
    movntps [edx+8*eax+16],xmm1
    movntps [edx+8*eax+32],xmm2
    movntps [edx+8*eax+48],xmm3
    movups xmm4,[esi+8*eax+64]
    movups xmm5,[esi+8*eax+80]
    movups xmm6,[esi+8*eax+96]
    movups xmm7,[esi+8*eax+112]
    movntps [edx+8*eax+64],xmm4
    movntps [edx+8*eax+80],xmm5
    movntps [edx+8*eax+96],xmm6
    movntps [edx+8*eax+112],xmm7
    add eax,16                    {16 QWORDs = 128 bytes per iteration}
    js @LargeUnalignedLoop
    sfence                        {Issued once after the movntps stores}
  @Remainder:
    and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
    jz @Done
    add esi,ecx                   {ESI/EDX = end of the remainder}
    add edx,ecx
    neg ecx                       {Negative byte index counting up to zero}
  @RemainderLoop:
    movups xmm0,[esi+ecx]         {Source alignment is not assumed here}
    movaps [edx+ecx],xmm0
    add ecx,16
    jnz @RemainderLoop
  @Done:
    pop esi
  end; {AlignedFwdMoveSSE}
  {-------------------------------------------------------------------------}
  {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
  {Three size classes, all copying in ascending address order:
     ECX <= 68         - the first 32 bytes go through xmm0/xmm1 and the
                         remaining ECX-32 bytes through SmallForwardMove_3;
     68 < ECX < 2048   - 32 bytes per iteration with 16-byte-aligned writes;
                         the first 16 source bytes are loaded up front and
                         stored last, tail finished by SmallForwardMove_3;
     ECX >= 2048       - Dest is brought to a 16-byte boundary, the bulk
                         (a multiple of 16) is handed to AlignedFwdMoveSSE_3
                         and the last 0..15 bytes go to SmallForwardMove_3.
   EBX is saved and restored on the paths that use it.}
  procedure Forwards_SSE_3;assembler;nostackframe;
  const
    LARGESIZE = 2048;
  asm
    cmp ecx,LARGESIZE
    jge @FwdLargeMove
    cmp ecx,SMALLMOVESIZE+32
    movups xmm0,[eax]             {First 16 bytes; movups leaves the flags intact}
    jg @FwdMoveSSE
    movups xmm1,[eax+16]          {Both loads are done before either store}
    movups [edx],xmm0
    movups [edx+16],xmm1
    add eax,ecx                   {EAX/EDX = Source/Dest end}
    add edx,ecx
    sub ecx,32                    {ECX = bytes left after the first 32}
    jmp SmallForwardMove_3
  @FwdMoveSSE:
    push ebx
    mov ebx,edx                   {EBX = original Dest, used for the final store}
    {Align Writes}
    add eax,ecx                   {EAX = Source+Count}
    add ecx,edx                   {ECX = Dest+Count}
    add edx,15
    and edx,-16                   {EDX = Dest rounded up to a multiple of 16}
    sub ecx,edx                   {ECX = bytes from aligned Dest to end of Dest}
    add edx,ecx                   {EDX = Dest+Count}
    {Now Aligned}
    sub ecx,32
    neg ecx                       {ECX = 32 - bytes left: negative index counting up}
  @FwdLoopSSE:
    movups xmm1,[eax+ecx-32]
    movups xmm2,[eax+ecx-16]
    movaps [edx+ecx-32],xmm1
    movaps [edx+ecx-16],xmm2
    add ecx,32
    jle @FwdLoopSSE               {Loop while at least 32 bytes remain}
    movups [ebx],xmm0 {First 16 Bytes}
    neg ecx
    add ecx,32                    {ECX = leftover tail bytes (0..31)}
    pop ebx
    jmp SmallForwardMove_3        {EAX/EDX already point at Source/Dest end}
  @FwdLargeMove:
    push ebx
    mov ebx,ecx                   {EBX = Count}
    test edx,15
    jz @FwdLargeAligned           {ECX still holds Count on this path}
    {16 byte Align Destination}
    mov ecx,edx
    add ecx,15
    and ecx,-16
    sub ecx,edx                   {ECX = bytes needed to align Dest (1..15)}
    add eax,ecx                   {Advance both pointers past the prefix, as}
    add edx,ecx                   {SmallForwardMove_3 expects end pointers}
    sub ebx,ecx                   {EBX = bytes left after the prefix}
    {Destination now 16 Byte Aligned}
    call SmallForwardMove_3
    mov ecx,ebx                   {SmallForwardMove_3 destroyed ECX}
  @FwdLargeAligned:
    and ecx,-16                   {ECX = bytes left rounded down to 16}
    sub ebx,ecx {EBX = Remainder}
    push edx                      {AlignedFwdMoveSSE_3 modifies EAX, ECX and EDX}
    push eax
    push ecx
    call AlignedFwdMoveSSE_3
    pop ecx
    pop eax
    pop edx
    add ecx,ebx                   {ECX = bulk + remainder}
    add eax,ecx                   {EAX/EDX = Source/Dest end}
    add edx,ecx
    mov ecx,ebx                   {ECX = remainder (0..15)}
    pop ebx
    jmp SmallForwardMove_3
  end; {Forwards_SSE}
  {-------------------------------------------------------------------------}
  {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  {Two size classes, both copying in descending address order:
     ECX <= 68 - the last 32 bytes go through xmm1/xmm2 and the first ECX-32
                 bytes through SmallBackwardMove_3;
     ECX > 68  - 32 bytes per iteration with a 16-byte-aligned destination
                 end; the last 16 source bytes are loaded up front and stored
                 last, and the head is finished by SmallBackwardMove_3.
   EBX is saved and restored on the path that uses it.}
  procedure Backwards_SSE_3;assembler;nostackframe;
  asm
    cmp ecx,SMALLMOVESIZE+32
    jg @BwdMoveSSE
    sub ecx,32                    {ECX = offset of the last 32 bytes = bytes left}
    movups xmm1,[eax+ecx]         {Both loads are done before either store}
    movups xmm2,[eax+ecx+16]
    movups [edx+ecx],xmm1
    movups [edx+ecx+16],xmm2
    jmp SmallBackwardMove_3
  @BwdMoveSSE:
    push ebx
    movups xmm0,[eax+ecx-16] {Last 16 Bytes}
    {Align Writes}
    lea ebx,[edx+ecx]
    and ebx,15                    {EBX = misalignment of Dest end (0..15)}
    sub ecx,ebx                   {Dest+ECX is now a multiple of 16}
    add ebx,ecx                   {EBX = original Count}
    {Now Aligned}
    sub ecx,32                    {ECX = offset of the topmost 32-byte chunk}
  @BwdLoop:
    movups xmm1,[eax+ecx]
    movups xmm2,[eax+ecx+16]
    movaps [edx+ecx],xmm1
    movaps [edx+ecx+16],xmm2
    sub ecx,32
    jge @BwdLoop                  {Loop while a full 32-byte chunk remains below}
    movups [edx+ebx-16],xmm0 {Last 16 Bytes}
    add ecx,32                    {ECX = leftover head bytes (0..31)}
    pop ebx
    jmp SmallBackwardMove_3       {EAX/EDX still point at Source/Dest start}
  end; {Backwards_SSE}
  const
    {Routines that Move jumps to for counts above SMALLMOVESIZE. They start
     out as the plain IA32 versions; setup_fastmove replaces them with the
     SSE or MMX versions when the matching support flag is set.}
    fastmoveproc_forward : pointer = @Forwards_IA32_3;
    fastmoveproc_backward : pointer = @Backwards_IA32_3;
  {Copy count bytes from source to dest; the two regions may overlap.
   On entry EAX = @source, EDX = @dest, ECX = count.
     count <= 0      - nothing is copied;
     source = dest   - nothing is copied;
     count <= 36     - handled by SmallForwardMove_3 (source above dest) or
                       SmallBackwardMove_3 (source below dest);
     count > 36      - jumps through fastmoveproc_backward only when source is
                       below dest AND the regions overlap, otherwise through
                       fastmoveproc_forward.
   Addresses are compared UNSIGNED (ja/jbe). Signed jumps pick the wrong
   direction when an address, or source+count, is at or above $80000000 while
   the other operand is below it, which corrupts overlapping moves there.
   Only the test for a negative count is a signed comparison.}
  procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
  asm
    cmp ecx,SMALLMOVESIZE
    ja @Large                     {Unsigned: count > 36 or count < 0}
    cmp eax,edx
    lea eax,[eax+ecx]             {EAX = source+count; lea leaves the flags intact}
    jbe @SmallCheck               {Unsigned: source <= dest}
  @SmallForward:
    add edx,ecx                   {SmallForwardMove_3 expects end pointers}
    jmp SmallForwardMove_3
  @SmallCheck:
    je @Done {For Compatibility with Delphi's move for Source = Dest}
    sub eax,ecx                   {Restore EAX = source for the backward move}
    jmp SmallBackwardMove_3
  @Large:
    jng @Done {For Compatibility with Delphi's move for Count < 0}
    cmp eax,edx
    ja @moveforward               {Unsigned: source above dest}
    je @Done {For Compatibility with Delphi's move for Source = Dest}
    push eax
    add eax,ecx
    cmp eax,edx                   {Compare source+count with dest}
    pop eax                       {pop leaves the flags intact}
    ja @movebackward              {Unsigned: source+count > dest, regions overlap}
  @moveforward:
    jmp dword ptr fastmoveproc_forward
  @movebackward:
    jmp dword ptr fastmoveproc_backward {Source/Dest Overlap}
  @Done:
  end;
  806. {$asmmode att}
  {Select the widest move routines the CPU flags allow: SSE first, then MMX.
   When neither flag is set the fastmoveproc_* pointers are left unchanged.}
  procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    if has_sse_support then
      begin
        fastmoveproc_forward:=@Forwards_SSE_3;
        fastmoveproc_backward:=@Backwards_SSE_3;
        exit;
      end;
    if not has_mmx_support then
      exit;
    fastmoveproc_forward:=@Forwards_MMX_3;
    fastmoveproc_backward:=@Backwards_MMX_3;
  end;
  820. {$endif FPC_SYSTEM_HAS_MOVE}
  821. {$else}
  {Empty stand-in compiled when USE_FASTMOVE is not defined, so that
   setup_fastmove is declared in both configurations.}
  procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
  end;
  825. {$endif}