fastmove.inc 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905
  1. {
  2. Copyright (c) 2004, John O'Harrow ([email protected])
  3. This software is provided 'as-is', without any express or implied warranty.
  4. In no event will the authors be held liable for any damages arising from the
  5. use of this software.
  6. Permission is granted to anyone to use this software for any purpose, including
  7. commercial applications, and to alter it and redistribute it freely, subject to
  8. the following restrictions:
  9. 1. The origin of this software must not be misrepresented; you must not claim
  10. that you wrote the original software. If you use this software in a product,
  11. an acknowledgment in the product documentation would be appreciated but is
  12. not required.
  13. 2. Altered source versions must be plainly marked as such, and must not be
  14. misrepresented as being the original software.
  15. 3. This notice may not be removed or altered from any source distribution.
  16. -------------------------------------------------------------------------------
  17. Version: 1.40 - 16-SEP-2004
  18. }
  19. {$ifdef USE_FASTMOVE}
  20. {$ifndef FPC_SYSTEM_HAS_MOVE}
  21. {$define FPC_SYSTEM_HAS_MOVE}
  22. {$asmmode intel}
  23. {-------------------------------------------------------------------------}
  24. (*
  25. {Just to show that a good Pascal algorithm can beat the default BASM}
  26. procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
  27. var
  28. S, D : PtrUInt;
  29. Temp, C, I : PtrInt;
  30. L : PPtrInt;
  31. begin
  32. S := Cardinal(@Source);
  33. D := Cardinal(@Dest);
  34. if S = D then
  35. Exit;
  36. if Count <= 4 then
  37. case Count of
  38. 1 : PByte(@Dest)^ := PByte(S)^;
  39. 2 : PWord(@Dest)^ := PWord(S)^;
  40. 3 : if D > S then
  41. begin
  42. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  43. PWord(@Dest)^ := PWord(S)^;
  44. end
  45. else
  46. begin
  47. PWord(@Dest)^ := PWord(S)^;
  48. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  49. end;
  50. 4 : PInteger(@Dest)^ := PInteger(S)^
  51. else Exit; {Count <= 0}
  52. end
  53. else
  54. if D > S then
  55. begin
  56. Temp := PInteger(S)^;
  57. I := Integer(@Dest);
  58. C := Count - 4;
  59. L := PInteger(Integer(@Dest) + C);
  60. Inc(S, C);
  61. repeat
  62. L^ := PInteger(S)^;
  63. if Count <= 8 then
  64. Break;
  65. Dec(Count, 4);
  66. Dec(S, 4);
  67. Dec(L);
  68. until False;
  69. PInteger(I)^ := Temp;
  70. end
  71. else
  72. begin
  73. C := Count - 4;
  74. Temp := PInteger(S + Cardinal(C))^;
  75. I := Integer(@Dest) + C;
  76. L := @Dest;
  77. repeat
  78. L^ := PInteger(S)^;
  79. if Count <= 8 then
  80. Break;
  81. Dec(Count, 4);
  82. Inc(S, 4);
  83. Inc(L);
  84. until False;
  85. PInteger(I)^ := Temp;
  86. end;
  87. end; {MoveJOH_PAS}
  88. *)
  89. const
  90. SMALLMOVESIZE = 36;
  91. {-------------------------------------------------------------------------}
  92. {Perform Forward Move of 0..36 Bytes}
  93. {On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX}
  94. procedure SmallForwardMove_3;assembler;nostackframe;
  95. asm
  96. jmp dword ptr @@FwdJumpTable[ecx*4]
  97. align 16
  98. @@FwdJumpTable:
  99. dd @@Done {Removes need to test for zero size move}
  100. dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
  101. dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
  102. dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
  103. dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
  104. dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
  105. @@Fwd36:
  106. mov ecx,[eax-36]
  107. mov [edx-36],ecx
  108. @@Fwd32:
  109. mov ecx,[eax-32]
  110. mov [edx-32],ecx
  111. @@Fwd28:
  112. mov ecx,[eax-28]
  113. mov [edx-28],ecx
  114. @@Fwd24:
  115. mov ecx,[eax-24]
  116. mov [edx-24],ecx
  117. @@Fwd20:
  118. mov ecx,[eax-20]
  119. mov [edx-20],ecx
  120. @@Fwd16:
  121. mov ecx,[eax-16]
  122. mov [edx-16],ecx
  123. @@Fwd12:
  124. mov ecx,[eax-12]
  125. mov [edx-12],ecx
  126. @@Fwd08:
  127. mov ecx,[eax-8]
  128. mov [edx-8],ecx
  129. @@Fwd04:
  130. mov ecx,[eax-4]
  131. mov [edx-4],ecx
  132. ret
  133. @@Fwd35:
  134. mov ecx,[eax-35]
  135. mov [edx-35],ecx
  136. @@Fwd31:
  137. mov ecx,[eax-31]
  138. mov [edx-31],ecx
  139. @@Fwd27:
  140. mov ecx,[eax-27]
  141. mov [edx-27],ecx
  142. @@Fwd23:
  143. mov ecx,[eax-23]
  144. mov [edx-23],ecx
  145. @@Fwd19:
  146. mov ecx,[eax-19]
  147. mov [edx-19],ecx
  148. @@Fwd15:
  149. mov ecx,[eax-15]
  150. mov [edx-15],ecx
  151. @@Fwd11:
  152. mov ecx,[eax-11]
  153. mov [edx-11],ecx
  154. @@Fwd07:
  155. mov ecx,[eax-7]
  156. mov [edx-7],ecx
  157. mov ecx,[eax-4]
  158. mov [edx-4],ecx
  159. ret
  160. @@Fwd03:
  161. movzx ecx, word ptr [eax-3]
  162. mov [edx-3],cx
  163. movzx ecx, byte ptr [eax-1]
  164. mov [edx-1],cl
  165. ret
  166. @@Fwd34:
  167. mov ecx,[eax-34]
  168. mov [edx-34],ecx
  169. @@Fwd30:
  170. mov ecx,[eax-30]
  171. mov [edx-30],ecx
  172. @@Fwd26:
  173. mov ecx,[eax-26]
  174. mov [edx-26],ecx
  175. @@Fwd22:
  176. mov ecx,[eax-22]
  177. mov [edx-22],ecx
  178. @@Fwd18:
  179. mov ecx,[eax-18]
  180. mov [edx-18],ecx
  181. @@Fwd14:
  182. mov ecx,[eax-14]
  183. mov [edx-14],ecx
  184. @@Fwd10:
  185. mov ecx,[eax-10]
  186. mov [edx-10],ecx
  187. @@Fwd06:
  188. mov ecx,[eax-6]
  189. mov [edx-6],ecx
  190. @@Fwd02:
  191. movzx ecx, word ptr [eax-2]
  192. mov [edx-2],cx
  193. ret
  194. @@Fwd33:
  195. mov ecx,[eax-33]
  196. mov [edx-33],ecx
  197. @@Fwd29:
  198. mov ecx,[eax-29]
  199. mov [edx-29],ecx
  200. @@Fwd25:
  201. mov ecx,[eax-25]
  202. mov [edx-25],ecx
  203. @@Fwd21:
  204. mov ecx,[eax-21]
  205. mov [edx-21],ecx
  206. @@Fwd17:
  207. mov ecx,[eax-17]
  208. mov [edx-17],ecx
  209. @@Fwd13:
  210. mov ecx,[eax-13]
  211. mov [edx-13],ecx
  212. @@Fwd09:
  213. mov ecx,[eax-9]
  214. mov [edx-9],ecx
  215. @@Fwd05:
  216. mov ecx,[eax-5]
  217. mov [edx-5],ecx
  218. @@Fwd01:
  219. movzx ecx, byte ptr [eax-1]
  220. mov [edx-1],cl
  221. @@Done:
  222. end; {SmallForwardMove}
  223. {-------------------------------------------------------------------------}
  224. {Perform Backward Move of 0..36 Bytes}
  225. {On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX}
  226. procedure SmallBackwardMove_3;assembler;nostackframe;
  227. asm
  228. jmp dword ptr @@BwdJumpTable[ecx*4]
  229. align 16
  230. @@BwdJumpTable:
  231. dd @@Done {Removes need to test for zero size move}
  232. dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
  233. dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
  234. dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
  235. dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
  236. dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
  237. @@Bwd36:
  238. mov ecx,[eax+32]
  239. mov [edx+32],ecx
  240. @@Bwd32:
  241. mov ecx,[eax+28]
  242. mov [edx+28],ecx
  243. @@Bwd28:
  244. mov ecx,[eax+24]
  245. mov [edx+24],ecx
  246. @@Bwd24:
  247. mov ecx,[eax+20]
  248. mov [edx+20],ecx
  249. @@Bwd20:
  250. mov ecx,[eax+16]
  251. mov [edx+16],ecx
  252. @@Bwd16:
  253. mov ecx,[eax+12]
  254. mov [edx+12],ecx
  255. @@Bwd12:
  256. mov ecx,[eax+8]
  257. mov [edx+8],ecx
  258. @@Bwd08:
  259. mov ecx,[eax+4]
  260. mov [edx+4],ecx
  261. @@Bwd04:
  262. mov ecx,[eax]
  263. mov [edx],ecx
  264. ret
  265. @@Bwd35:
  266. mov ecx,[eax+31]
  267. mov [edx+31],ecx
  268. @@Bwd31:
  269. mov ecx,[eax+27]
  270. mov [edx+27],ecx
  271. @@Bwd27:
  272. mov ecx,[eax+23]
  273. mov [edx+23],ecx
  274. @@Bwd23:
  275. mov ecx,[eax+19]
  276. mov [edx+19],ecx
  277. @@Bwd19:
  278. mov ecx,[eax+15]
  279. mov [edx+15],ecx
  280. @@Bwd15:
  281. mov ecx,[eax+11]
  282. mov [edx+11],ecx
  283. @@Bwd11:
  284. mov ecx,[eax+7]
  285. mov [edx+7],ecx
  286. @@Bwd07:
  287. mov ecx,[eax+3]
  288. mov [edx+3],ecx
  289. mov ecx,[eax]
  290. mov [edx],ecx
  291. ret
  292. @@Bwd03:
  293. movzx ecx, word ptr [eax+1]
  294. mov [edx+1],cx
  295. movzx ecx, byte ptr [eax]
  296. mov [edx],cl
  297. ret
  298. @@Bwd34:
  299. mov ecx,[eax+30]
  300. mov [edx+30],ecx
  301. @@Bwd30:
  302. mov ecx,[eax+26]
  303. mov [edx+26],ecx
  304. @@Bwd26:
  305. mov ecx,[eax+22]
  306. mov [edx+22],ecx
  307. @@Bwd22:
  308. mov ecx,[eax+18]
  309. mov [edx+18],ecx
  310. @@Bwd18:
  311. mov ecx,[eax+14]
  312. mov [edx+14],ecx
  313. @@Bwd14:
  314. mov ecx,[eax+10]
  315. mov [edx+10],ecx
  316. @@Bwd10:
  317. mov ecx,[eax+6]
  318. mov [edx+6],ecx
  319. @@Bwd06:
  320. mov ecx,[eax+2]
  321. mov [edx+2],ecx
  322. @@Bwd02:
  323. movzx ecx, word ptr [eax]
  324. mov [edx],cx
  325. ret
  326. @@Bwd33:
  327. mov ecx,[eax+29]
  328. mov [edx+29],ecx
  329. @@Bwd29:
  330. mov ecx,[eax+25]
  331. mov [edx+25],ecx
  332. @@Bwd25:
  333. mov ecx,[eax+21]
  334. mov [edx+21],ecx
  335. @@Bwd21:
  336. mov ecx,[eax+17]
  337. mov [edx+17],ecx
  338. @@Bwd17:
  339. mov ecx,[eax+13]
  340. mov [edx+13],ecx
  341. @@Bwd13:
  342. mov ecx,[eax+9]
  343. mov [edx+9],ecx
  344. @@Bwd09:
  345. mov ecx,[eax+5]
  346. mov [edx+5],ecx
  347. @@Bwd05:
  348. mov ecx,[eax+1]
  349. mov [edx+1],ecx
  350. @@Bwd01:
  351. movzx ecx, byte ptr[eax]
  352. mov [edx],cl
  353. @@Done:
  354. end; {SmallBackwardMove}
  355. { at least valgrind up to 3.3 has a bug which prevents the default code to
  356. work so we use a rather simple implementation here
  357. }
  358. procedure Forwards_Valgrind;assembler;nostackframe;
  359. asm
  360. {$ifdef FPC_ENABLED_CLD}
  361. cld
  362. {$endif FPC_ENABLED_CLD}
  363. push esi
  364. push edi
  365. mov esi,eax
  366. mov edi,edx
  367. rep movsb
  368. pop edi
  369. pop esi
  370. end;
  371. { at least valgrind up to 3.3 has a bug which prevents the default code to
  372. work so we use a rather simple implementation here
  373. }
  374. procedure Backwards_Valgrind;assembler;nostackframe;
  375. asm
  376. push esi
  377. push edi
  378. lea esi,[eax+ecx-1]
  379. lea edi,[edx+ecx-1]
  380. @@repeat:
  381. mov al,[esi]
  382. mov [edi],al
  383. dec esi
  384. dec edi
  385. dec ecx
  386. jnz @@repeat
  387. pop edi
  388. pop esi
  389. end;
  390. {-------------------------------------------------------------------------}
  391. {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
  392. procedure Forwards_IA32_3;assembler;nostackframe;
  393. asm
  394. push ebx
  395. mov ebx,edx
  396. fild qword ptr [eax]
  397. add eax,ecx {QWORD Align Writes}
  398. add ecx,edx
  399. add edx,7
  400. and edx,-8
  401. sub ecx,edx
  402. add edx,ecx {Now QWORD Aligned}
  403. sub ecx,16
  404. neg ecx
  405. @FwdLoop:
  406. fild qword ptr [eax+ecx-16]
  407. fistp qword ptr [edx+ecx-16]
  408. fild qword ptr [eax+ecx-8]
  409. fistp qword ptr [edx+ecx-8]
  410. add ecx,16
  411. jle @FwdLoop
  412. fistp qword ptr [ebx]
  413. neg ecx
  414. add ecx,16
  415. pop ebx
  416. jmp SmallForwardMove_3
  417. end; {Forwards_IA32}
  418. {-------------------------------------------------------------------------}
  419. {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  420. procedure Backwards_IA32_3;assembler;nostackframe;
  421. asm
  422. push ebx
  423. fild qword ptr [eax+ecx-8]
  424. lea ebx,[edx+ecx] {QWORD Align Writes}
  425. and ebx,7
  426. sub ecx,ebx
  427. add ebx,ecx {Now QWORD Aligned, EBX = Original Length}
  428. sub ecx,16
  429. @BwdLoop:
  430. fild qword ptr [eax+ecx]
  431. fild qword ptr [eax+ecx+8]
  432. fistp qword ptr [edx+ecx+8]
  433. fistp qword ptr [edx+ecx]
  434. sub ecx,16
  435. jge @BwdLoop
  436. fistp qword ptr [edx+ebx-8]
  437. add ecx,16
  438. pop ebx
  439. jmp SmallBackwardMove_3
  440. end; {Backwards_IA32}
  441. {-------------------------------------------------------------------------}
  442. {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
  443. procedure Forwards_MMX_3;assembler;nostackframe;
  444. const
  445. LARGESIZE = 1024;
  446. asm
  447. cmp ecx,LARGESIZE
  448. jge @FwdLargeMove
  449. cmp ecx,72 {Size at which using MMX becomes worthwhile}
  450. jl Forwards_IA32_3
  451. push ebx
  452. mov ebx,edx
  453. movq mm0,[eax] {First 8 Characters}
  454. {QWORD Align Writes}
  455. add eax,ecx
  456. add ecx,edx
  457. add edx,7
  458. and edx,-8
  459. sub ecx,edx
  460. add edx,ecx
  461. {Now QWORD Aligned}
  462. sub ecx,32
  463. neg ecx
  464. @FwdLoopMMX:
  465. movq mm1,[eax+ecx-32]
  466. movq mm2,[eax+ecx-24]
  467. movq mm3,[eax+ecx-16]
  468. movq mm4,[eax+ecx- 8]
  469. movq [edx+ecx-32],mm1
  470. movq [edx+ecx-24],mm2
  471. movq [edx+ecx-16],mm3
  472. movq [edx+ecx- 8],mm4
  473. add ecx,32
  474. jle @FwdLoopMMX
  475. movq [ebx],mm0 {First 8 Characters}
  476. emms
  477. pop ebx
  478. neg ecx
  479. add ecx,32
  480. jmp SmallForwardMove_3
  481. @FwdLargeMove:
  482. push ebx
  483. mov ebx,ecx
  484. test edx,15
  485. jz @FwdAligned
  486. {16 byte Align Destination}
  487. mov ecx,edx
  488. add ecx,15
  489. and ecx,-16
  490. sub ecx,edx
  491. add eax,ecx
  492. add edx,ecx
  493. sub ebx,ecx
  494. {Destination now 16 Byte Aligned}
  495. call SmallForwardMove_3
  496. @FwdAligned:
  497. mov ecx,ebx
  498. and ecx,-16
  499. sub ebx,ecx {EBX = Remainder}
  500. push esi
  501. push edi
  502. mov esi,eax {ESI = Source}
  503. mov edi,edx {EDI = Dest}
  504. mov eax,ecx {EAX = Count}
  505. and eax,-64 {EAX = No of Bytes to Blocks Moves}
  506. and ecx,$3F {ECX = Remaining Bytes to Move (0..63)}
  507. add esi,eax
  508. add edi,eax
  509. shr eax,3 {EAX = No of QWORD's to Block Move}
  510. neg eax
  511. @MMXcopyloop:
  512. movq mm0,[esi+eax*8 ]
  513. movq mm1,[esi+eax*8+ 8]
  514. movq mm2,[esi+eax*8+16]
  515. movq mm3,[esi+eax*8+24]
  516. movq mm4,[esi+eax*8+32]
  517. movq mm5,[esi+eax*8+40]
  518. movq mm6,[esi+eax*8+48]
  519. movq mm7,[esi+eax*8+56]
  520. movq [edi+eax*8 ],mm0
  521. movq [edi+eax*8+ 8],mm1
  522. movq [edi+eax*8+16],mm2
  523. movq [edi+eax*8+24],mm3
  524. movq [edi+eax*8+32],mm4
  525. movq [edi+eax*8+40],mm5
  526. movq [edi+eax*8+48],mm6
  527. movq [edi+eax*8+56],mm7
  528. add eax,8
  529. jnz @MMXcopyloop
  530. emms {Empty MMX State}
  531. {$ifdef FPC_ENABLED_CLD}
  532. cld
  533. {$endif FPC_ENABLED_CLD}
  534. add ecx,ebx
  535. shr ecx,2
  536. rep movsd
  537. mov ecx,ebx
  538. and ecx,3
  539. rep movsb
  540. pop edi
  541. pop esi
  542. pop ebx
  543. end; {Forwards_MMX}
  544. {-------------------------------------------------------------------------}
  545. {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  546. procedure Backwards_MMX_3;assembler;nostackframe;
  547. asm
  548. cmp ecx,72 {Size at which using MMX becomes worthwhile}
  549. jl Backwards_IA32_3
  550. push ebx
  551. movq mm0,[eax+ecx-8] {Get Last QWORD}
  552. {QWORD Align Writes}
  553. lea ebx,[edx+ecx]
  554. and ebx,7
  555. sub ecx,ebx
  556. add ebx,ecx
  557. {Now QWORD Aligned}
  558. sub ecx,32
  559. @BwdLoopMMX:
  560. movq mm1,[eax+ecx ]
  561. movq mm2,[eax+ecx+ 8]
  562. movq mm3,[eax+ecx+16]
  563. movq mm4,[eax+ecx+24]
  564. movq [edx+ecx+24],mm4
  565. movq [edx+ecx+16],mm3
  566. movq [edx+ecx+ 8],mm2
  567. movq [edx+ecx ],mm1
  568. sub ecx,32
  569. jge @BwdLoopMMX
  570. movq [edx+ebx-8], mm0 {Last QWORD}
  571. emms
  572. add ecx,32
  573. pop ebx
  574. jmp SmallBackwardMove_3
  575. end; {Backwards_MMX}
  576. {-------------------------------------------------------------------------}
  577. {Dest MUST be 16-Byes Aligned, Count MUST be multiple of 16 }
  578. procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
  579. const
  580. Prefetch = 512;
  581. asm
  582. push esi
  583. mov esi,eax {ESI = Source}
  584. mov eax,ecx {EAX = Count}
  585. and eax,-128 {EAX = No of Bytes to Block Move}
  586. add esi,eax
  587. add edx,eax
  588. shr eax,3 {EAX = No of QWORD's to Block Move}
  589. neg eax
  590. cmp eax, -(32*1024) {Count > 256K}
  591. jl @Large
  592. @Small: {Count<=256K}
  593. test esi,15 {Check if Both Source/Dest Aligned}
  594. jnz @SmallUnaligned
  595. @SmallAligned: {Both Source and Dest 16-Byte Aligned}
  596. @SmallAlignedLoop:
  597. movaps xmm0,[esi+8*eax]
  598. movaps xmm1,[esi+8*eax+16]
  599. movaps xmm2,[esi+8*eax+32]
  600. movaps xmm3,[esi+8*eax+48]
  601. movaps [edx+8*eax],xmm0
  602. movaps [edx+8*eax+16],xmm1
  603. movaps [edx+8*eax+32],xmm2
  604. movaps [edx+8*eax+48],xmm3
  605. movaps xmm4,[esi+8*eax+64]
  606. movaps xmm5,[esi+8*eax+80]
  607. movaps xmm6,[esi+8*eax+96]
  608. movaps xmm7,[esi+8*eax+112]
  609. movaps [edx+8*eax+64],xmm4
  610. movaps [edx+8*eax+80],xmm5
  611. movaps [edx+8*eax+96],xmm6
  612. movaps [edx+8*eax+112],xmm7
  613. add eax,16
  614. js @SmallAlignedLoop
  615. jmp @Remainder
  616. @SmallUnaligned: {Source Not 16-Byte Aligned}
  617. @SmallUnalignedLoop:
  618. movups xmm0,[esi+8*eax]
  619. movups xmm1,[esi+8*eax+16]
  620. movups xmm2,[esi+8*eax+32]
  621. movups xmm3,[esi+8*eax+48]
  622. movaps [edx+8*eax],xmm0
  623. movaps [edx+8*eax+16],xmm1
  624. movaps [edx+8*eax+32],xmm2
  625. movaps [edx+8*eax+48],xmm3
  626. movups xmm4,[esi+8*eax+64]
  627. movups xmm5,[esi+8*eax+80]
  628. movups xmm6,[esi+8*eax+96]
  629. movups xmm7,[esi+8*eax+112]
  630. movaps [edx+8*eax+64],xmm4
  631. movaps [edx+8*eax+80],xmm5
  632. movaps [edx+8*eax+96],xmm6
  633. movaps [edx+8*eax+112],xmm7
  634. add eax,16
  635. js @SmallUnalignedLoop
  636. jmp @Remainder
  637. @Large: {Count>256K}
  638. test esi,15 {Check if Both Source/Dest Aligned}
  639. jnz @LargeUnaligned
  640. @LargeAligned: {Both Source and Dest 16-Byte Aligned}
  641. @LargeAlignedLoop:
  642. prefetchnta [esi+8*eax+Prefetch]
  643. prefetchnta [esi+8*eax+Prefetch+64]
  644. movaps xmm0,[esi+8*eax]
  645. movaps xmm1,[esi+8*eax+16]
  646. movaps xmm2,[esi+8*eax+32]
  647. movaps xmm3,[esi+8*eax+48]
  648. movntps [edx+8*eax],xmm0
  649. movntps [edx+8*eax+16],xmm1
  650. movntps [edx+8*eax+32],xmm2
  651. movntps [edx+8*eax+48],xmm3
  652. movaps xmm4,[esi+8*eax+64]
  653. movaps xmm5,[esi+8*eax+80]
  654. movaps xmm6,[esi+8*eax+96]
  655. movaps xmm7,[esi+8*eax+112]
  656. movntps [edx+8*eax+64],xmm4
  657. movntps [edx+8*eax+80],xmm5
  658. movntps [edx+8*eax+96],xmm6
  659. movntps [edx+8*eax+112],xmm7
  660. add eax,16
  661. js @LargeAlignedLoop
  662. sfence
  663. jmp @Remainder
  664. @LargeUnaligned: {Source Not 16-Byte Aligned}
  665. @LargeUnalignedLoop:
  666. prefetchnta [esi+8*eax+Prefetch]
  667. prefetchnta [esi+8*eax+Prefetch+64]
  668. movups xmm0,[esi+8*eax]
  669. movups xmm1,[esi+8*eax+16]
  670. movups xmm2,[esi+8*eax+32]
  671. movups xmm3,[esi+8*eax+48]
  672. movntps [edx+8*eax],xmm0
  673. movntps [edx+8*eax+16],xmm1
  674. movntps [edx+8*eax+32],xmm2
  675. movntps [edx+8*eax+48],xmm3
  676. movups xmm4,[esi+8*eax+64]
  677. movups xmm5,[esi+8*eax+80]
  678. movups xmm6,[esi+8*eax+96]
  679. movups xmm7,[esi+8*eax+112]
  680. movntps [edx+8*eax+64],xmm4
  681. movntps [edx+8*eax+80],xmm5
  682. movntps [edx+8*eax+96],xmm6
  683. movntps [edx+8*eax+112],xmm7
  684. add eax,16
  685. js @LargeUnalignedLoop
  686. sfence
  687. @Remainder:
  688. and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
  689. jz @Done
  690. add esi,ecx
  691. add edx,ecx
  692. neg ecx
  693. @RemainderLoop:
  694. movups xmm0,[esi+ecx]
  695. movaps [edx+ecx],xmm0
  696. add ecx,16
  697. jnz @RemainderLoop
  698. @Done:
  699. pop esi
  700. end; {AlignedFwdMoveSSE}
  701. {-------------------------------------------------------------------------}
  702. {Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
  703. procedure Forwards_SSE_3;assembler;nostackframe;
  704. const
  705. LARGESIZE = 2048;
  706. asm
  707. cmp ecx,LARGESIZE
  708. jge @FwdLargeMove
  709. cmp ecx,SMALLMOVESIZE+32
  710. movups xmm0,[eax]
  711. jg @FwdMoveSSE
  712. movups xmm1,[eax+16]
  713. movups [edx],xmm0
  714. movups [edx+16],xmm1
  715. add eax,ecx
  716. add edx,ecx
  717. sub ecx,32
  718. jmp SmallForwardMove_3
  719. @FwdMoveSSE:
  720. push ebx
  721. mov ebx,edx
  722. {Align Writes}
  723. add eax,ecx
  724. add ecx,edx
  725. add edx,15
  726. and edx,-16
  727. sub ecx,edx
  728. add edx,ecx
  729. {Now Aligned}
  730. sub ecx,32
  731. neg ecx
  732. @FwdLoopSSE:
  733. movups xmm1,[eax+ecx-32]
  734. movups xmm2,[eax+ecx-16]
  735. movaps [edx+ecx-32],xmm1
  736. movaps [edx+ecx-16],xmm2
  737. add ecx,32
  738. jle @FwdLoopSSE
  739. movups [ebx],xmm0 {First 16 Bytes}
  740. neg ecx
  741. add ecx,32
  742. pop ebx
  743. jmp SmallForwardMove_3
  744. @FwdLargeMove:
  745. push ebx
  746. mov ebx,ecx
  747. test edx,15
  748. jz @FwdLargeAligned
  749. {16 byte Align Destination}
  750. mov ecx,edx
  751. add ecx,15
  752. and ecx,-16
  753. sub ecx,edx
  754. add eax,ecx
  755. add edx,ecx
  756. sub ebx,ecx
  757. {Destination now 16 Byte Aligned}
  758. call SmallForwardMove_3
  759. mov ecx,ebx
  760. @FwdLargeAligned:
  761. and ecx,-16
  762. sub ebx,ecx {EBX = Remainder}
  763. push edx
  764. push eax
  765. push ecx
  766. call AlignedFwdMoveSSE_3
  767. pop ecx
  768. pop eax
  769. pop edx
  770. add ecx,ebx
  771. add eax,ecx
  772. add edx,ecx
  773. mov ecx,ebx
  774. pop ebx
  775. jmp SmallForwardMove_3
  776. end; {Forwards_SSE}
  777. {-------------------------------------------------------------------------}
  778. {Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  779. procedure Backwards_SSE_3;assembler;nostackframe;
  780. asm
  781. cmp ecx,SMALLMOVESIZE+32
  782. jg @BwdMoveSSE
  783. sub ecx,32
  784. movups xmm1,[eax+ecx]
  785. movups xmm2,[eax+ecx+16]
  786. movups [edx+ecx],xmm1
  787. movups [edx+ecx+16],xmm2
  788. jmp SmallBackwardMove_3
  789. @BwdMoveSSE:
  790. push ebx
  791. movups xmm0,[eax+ecx-16] {Last 16 Bytes}
  792. {Align Writes}
  793. lea ebx,[edx+ecx]
  794. and ebx,15
  795. sub ecx,ebx
  796. add ebx,ecx
  797. {Now Aligned}
  798. sub ecx,32
  799. @BwdLoop:
  800. movups xmm1,[eax+ecx]
  801. movups xmm2,[eax+ecx+16]
  802. movaps [edx+ecx],xmm1
  803. movaps [edx+ecx+16],xmm2
  804. sub ecx,32
  805. jge @BwdLoop
  806. movups [edx+ebx-16],xmm0 {Last 16 Bytes}
  807. add ecx,32
  808. pop ebx
  809. jmp SmallBackwardMove_3
  810. end; {Backwards_SSE}
  811. const
  812. fastmoveproc_forward : pointer = @Forwards_IA32_3;
  813. fastmoveproc_backward : pointer = @Backwards_IA32_3;
  814. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
  815. asm
  816. cmp ecx,SMALLMOVESIZE
  817. ja @Large
  818. cmp eax,edx
  819. lea eax,[eax+ecx]
  820. jle @SmallCheck
  821. @SmallForward:
  822. add edx,ecx
  823. jmp SmallForwardMove_3
  824. @SmallCheck:
  825. je @Done {For Compatibility with Delphi's move for Source = Dest}
  826. sub eax,ecx
  827. jmp SmallBackwardMove_3
  828. @Large:
  829. jng @Done {For Compatibility with Delphi's move for Count < 0}
  830. cmp eax,edx
  831. jg @moveforward
  832. je @Done {For Compatibility with Delphi's move for Source = Dest}
  833. push eax
  834. add eax,ecx
  835. cmp eax,edx
  836. pop eax
  837. jg @movebackward
  838. @moveforward:
  839. jmp dword ptr fastmoveproc_forward
  840. @movebackward:
  841. jmp dword ptr fastmoveproc_backward {Source/Dest Overlap}
  842. @Done:
  843. end;
  844. {$asmmode att}
  845. {$ifndef FPC_HAS_INDIRECT_MAIN_INFORMATION}
  846. var
  847. valgrind_used : boolean;external name '__fpc_valgrind';
  848. {$endif FPC_HAS_INDIRECT_MAIN_INFORMATION}
  849. procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
  850. begin
  851. { workaround valgrind bug }
  852. {$ifdef FPC_HAS_INDIRECT_MAIN_INFORMATION}
  853. if EntryInformation.valgrind_used then
  854. {$else FPC_HAS_INDIRECT_MAIN_INFORMATION}
  855. if valgrind_used then
  856. {$endif FPC_HAS_INDIRECT_MAIN_INFORMATION}
  857. begin
  858. fastmoveproc_forward:=@Forwards_Valgrind;
  859. fastmoveproc_backward:=@Backwards_Valgrind;
  860. end
  861. else
  862. if has_sse_support then
  863. begin
  864. fastmoveproc_forward:=@Forwards_SSE_3;
  865. fastmoveproc_backward:=@Backwards_SSE_3;
  866. end
  867. else if has_mmx_support then
  868. begin
  869. fastmoveproc_forward:=@Forwards_MMX_3;
  870. fastmoveproc_backward:=@Backwards_MMX_3;
  871. end;
  872. end;
  873. {$endif FPC_SYSTEM_HAS_MOVE}
  874. {$endif}