fastmove.inc 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907
  1. {
  2. Copyright (c) 2004, John O'Harrow ([email protected])
  3. This software is provided 'as-is', without any express or implied warranty.
  4. In no event will the authors be held liable for any damages arising from the
  5. use of this software.
  6. Permission is granted to anyone to use this software for any purpose, including
  7. commercial applications, and to alter it and redistribute it freely, subject to
  8. the following restrictions:
  9. 1. The origin of this software must not be misrepresented; you must not claim
  10. that you wrote the original software. If you use this software in a product,
  11. an acknowledgment in the product documentation would be appreciated but is
  12. not required.
  13. 2. Altered source versions must be plainly marked as such, and must not be
  14. misrepresented as being the original software.
  15. 3. This notice may not be removed or altered from any source distribution.
  16. -------------------------------------------------------------------------------
  17. Version: 1.40 - 16-SEP-2004
  18. }
  19. {$ifdef USE_FASTMOVE}
  20. {$ifndef FPC_SYSTEM_HAS_MOVE}
  21. {$define FPC_SYSTEM_HAS_MOVE}
  22. {$asmmode intel}
  23. {-------------------------------------------------------------------------}
  24. (*
  25. {Just to show that a good Pascal algorithm can beat the default BASM}
  26. procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
  27. var
  28. S, D : PtrUInt;
  29. Temp, C, I : PtrInt;
  30. L : PPtrInt;
  31. begin
  32. S := Cardinal(@Source);
  33. D := Cardinal(@Dest);
  34. if S = D then
  35. Exit;
  36. if Count <= 4 then
  37. case Count of
  38. 1 : PByte(@Dest)^ := PByte(S)^;
  39. 2 : PWord(@Dest)^ := PWord(S)^;
  40. 3 : if D > S then
  41. begin
  42. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  43. PWord(@Dest)^ := PWord(S)^;
  44. end
  45. else
  46. begin
  47. PWord(@Dest)^ := PWord(S)^;
  48. PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
  49. end;
  50. 4 : PInteger(@Dest)^ := PInteger(S)^
  51. else Exit; {Count <= 0}
  52. end
  53. else
  54. if D > S then
  55. begin
  56. Temp := PInteger(S)^;
  57. I := Integer(@Dest);
  58. C := Count - 4;
  59. L := PInteger(Integer(@Dest) + C);
  60. Inc(S, C);
  61. repeat
  62. L^ := PInteger(S)^;
  63. if Count <= 8 then
  64. Break;
  65. Dec(Count, 4);
  66. Dec(S, 4);
  67. Dec(L);
  68. until False;
  69. PInteger(I)^ := Temp;
  70. end
  71. else
  72. begin
  73. C := Count - 4;
  74. Temp := PInteger(S + Cardinal(C))^;
  75. I := Integer(@Dest) + C;
  76. L := @Dest;
  77. repeat
  78. L^ := PInteger(S)^;
  79. if Count <= 8 then
  80. Break;
  81. Dec(Count, 4);
  82. Inc(S, 4);
  83. Inc(L);
  84. until False;
  85. PInteger(I)^ := Temp;
  86. end;
  87. end; {MoveJOH_PAS}
  88. *)
  89. const
  90. SMALLMOVESIZE = 36; {Largest Count that Move hands straight to SmallForwardMove_3 / SmallBackwardMove_3, whose jump tables have entries 0..36}
  91. {-------------------------------------------------------------------------}
  92. {Perform Forward Move of 0..36 Bytes: dispatches through a jump table to an unrolled copy sequence for the exact Count}
  93. {On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX. ECX is not range checked here and must be 0..36}
  94. procedure SmallForwardMove_3;assembler;nostackframe;
  95. asm
  96. jmp dword ptr @@FwdJumpTable[ecx*4] {Index the table by Count; each entry is a 4 byte address}
  97. align 16
  98. @@FwdJumpTable:
  99. dd @@Done {Removes need to test for zero size move}
  100. dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
  101. dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
  102. dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
  103. dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
  104. dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
  105. @@Fwd36: {Counts 36,32,..,4: whole DWORDs copied lowest address first, each label falling into the next}
  106. mov ecx,[eax-36]
  107. mov [edx-36],ecx
  108. @@Fwd32:
  109. mov ecx,[eax-32]
  110. mov [edx-32],ecx
  111. @@Fwd28:
  112. mov ecx,[eax-28]
  113. mov [edx-28],ecx
  114. @@Fwd24:
  115. mov ecx,[eax-24]
  116. mov [edx-24],ecx
  117. @@Fwd20:
  118. mov ecx,[eax-20]
  119. mov [edx-20],ecx
  120. @@Fwd16:
  121. mov ecx,[eax-16]
  122. mov [edx-16],ecx
  123. @@Fwd12:
  124. mov ecx,[eax-12]
  125. mov [edx-12],ecx
  126. @@Fwd08:
  127. mov ecx,[eax-8]
  128. mov [edx-8],ecx
  129. @@Fwd04:
  130. mov ecx,[eax-4]
  131. mov [edx-4],ecx
  132. ret
  133. @@Fwd35: {Counts 35,31,..,7: DWORDs falling through to Fwd07}
  134. mov ecx,[eax-35]
  135. mov [edx-35],ecx
  136. @@Fwd31:
  137. mov ecx,[eax-31]
  138. mov [edx-31],ecx
  139. @@Fwd27:
  140. mov ecx,[eax-27]
  141. mov [edx-27],ecx
  142. @@Fwd23:
  143. mov ecx,[eax-23]
  144. mov [edx-23],ecx
  145. @@Fwd19:
  146. mov ecx,[eax-19]
  147. mov [edx-19],ecx
  148. @@Fwd15:
  149. mov ecx,[eax-15]
  150. mov [edx-15],ecx
  151. @@Fwd11:
  152. mov ecx,[eax-11]
  153. mov [edx-11],ecx
  154. @@Fwd07:
  155. mov ecx,[eax-7]
  156. mov [edx-7],ecx
  157. mov ecx,[eax-4] {The last DWORD overlaps the previous one by one byte to finish the 7 byte tail}
  158. mov [edx-4],ecx
  159. ret
  160. @@Fwd03: {3 bytes: one WORD followed by one BYTE}
  161. movzx ecx, word ptr [eax-3]
  162. mov [edx-3],cx
  163. movzx ecx, byte ptr [eax-1]
  164. mov [edx-1],cl
  165. ret
  166. @@Fwd34: {Counts 34,30,..,6: DWORDs, then the final WORD at Fwd02}
  167. mov ecx,[eax-34]
  168. mov [edx-34],ecx
  169. @@Fwd30:
  170. mov ecx,[eax-30]
  171. mov [edx-30],ecx
  172. @@Fwd26:
  173. mov ecx,[eax-26]
  174. mov [edx-26],ecx
  175. @@Fwd22:
  176. mov ecx,[eax-22]
  177. mov [edx-22],ecx
  178. @@Fwd18:
  179. mov ecx,[eax-18]
  180. mov [edx-18],ecx
  181. @@Fwd14:
  182. mov ecx,[eax-14]
  183. mov [edx-14],ecx
  184. @@Fwd10:
  185. mov ecx,[eax-10]
  186. mov [edx-10],ecx
  187. @@Fwd06:
  188. mov ecx,[eax-6]
  189. mov [edx-6],ecx
  190. @@Fwd02:
  191. movzx ecx, word ptr [eax-2]
  192. mov [edx-2],cx
  193. ret
  194. @@Fwd33: {Counts 33,29,..,5: DWORDs, then the final BYTE at Fwd01}
  195. mov ecx,[eax-33]
  196. mov [edx-33],ecx
  197. @@Fwd29:
  198. mov ecx,[eax-29]
  199. mov [edx-29],ecx
  200. @@Fwd25:
  201. mov ecx,[eax-25]
  202. mov [edx-25],ecx
  203. @@Fwd21:
  204. mov ecx,[eax-21]
  205. mov [edx-21],ecx
  206. @@Fwd17:
  207. mov ecx,[eax-17]
  208. mov [edx-17],ecx
  209. @@Fwd13:
  210. mov ecx,[eax-13]
  211. mov [edx-13],ecx
  212. @@Fwd09:
  213. mov ecx,[eax-9]
  214. mov [edx-9],ecx
  215. @@Fwd05:
  216. mov ecx,[eax-5]
  217. mov [edx-5],ecx
  218. @@Fwd01:
  219. movzx ecx, byte ptr [eax-1]
  220. mov [edx-1],cl
  221. @@Done: {Count = 0 jumps straight here and copies nothing; the Fwd01 chain also ends here}
  222. end; {SmallForwardMove}
  223. {-------------------------------------------------------------------------}
  224. {Perform Backward Move of 0..36 Bytes: dispatches through a jump table to an unrolled copy sequence that works from the highest address down}
  225. {On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX. ECX is not range checked here and must be 0..36}
  226. procedure SmallBackwardMove_3;assembler;nostackframe;
  227. asm
  228. jmp dword ptr @@BwdJumpTable[ecx*4] {Index the table by Count; each entry is a 4 byte address}
  229. align 16
  230. @@BwdJumpTable:
  231. dd @@Done {Removes need to test for zero size move}
  232. dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
  233. dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
  234. dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
  235. dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
  236. dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
  237. @@Bwd36: {Counts 36,32,..,4: whole DWORDs copied highest address first, each label falling into the next}
  238. mov ecx,[eax+32]
  239. mov [edx+32],ecx
  240. @@Bwd32:
  241. mov ecx,[eax+28]
  242. mov [edx+28],ecx
  243. @@Bwd28:
  244. mov ecx,[eax+24]
  245. mov [edx+24],ecx
  246. @@Bwd24:
  247. mov ecx,[eax+20]
  248. mov [edx+20],ecx
  249. @@Bwd20:
  250. mov ecx,[eax+16]
  251. mov [edx+16],ecx
  252. @@Bwd16:
  253. mov ecx,[eax+12]
  254. mov [edx+12],ecx
  255. @@Bwd12:
  256. mov ecx,[eax+8]
  257. mov [edx+8],ecx
  258. @@Bwd08:
  259. mov ecx,[eax+4]
  260. mov [edx+4],ecx
  261. @@Bwd04:
  262. mov ecx,[eax]
  263. mov [edx],ecx
  264. ret
  265. @@Bwd35: {Counts 35,31,..,7: DWORDs falling through to Bwd07}
  266. mov ecx,[eax+31]
  267. mov [edx+31],ecx
  268. @@Bwd31:
  269. mov ecx,[eax+27]
  270. mov [edx+27],ecx
  271. @@Bwd27:
  272. mov ecx,[eax+23]
  273. mov [edx+23],ecx
  274. @@Bwd23:
  275. mov ecx,[eax+19]
  276. mov [edx+19],ecx
  277. @@Bwd19:
  278. mov ecx,[eax+15]
  279. mov [edx+15],ecx
  280. @@Bwd15:
  281. mov ecx,[eax+11]
  282. mov [edx+11],ecx
  283. @@Bwd11:
  284. mov ecx,[eax+7]
  285. mov [edx+7],ecx
  286. @@Bwd07:
  287. mov ecx,[eax+3]
  288. mov [edx+3],ecx
  289. mov ecx,[eax] {The first DWORD overlaps the previous one by one byte to finish the 7 byte head}
  290. mov [edx],ecx
  291. ret
  292. @@Bwd03: {3 bytes: the upper WORD followed by the first BYTE}
  293. movzx ecx, word ptr [eax+1]
  294. mov [edx+1],cx
  295. movzx ecx, byte ptr [eax]
  296. mov [edx],cl
  297. ret
  298. @@Bwd34: {Counts 34,30,..,6: DWORDs, then the first WORD at Bwd02}
  299. mov ecx,[eax+30]
  300. mov [edx+30],ecx
  301. @@Bwd30:
  302. mov ecx,[eax+26]
  303. mov [edx+26],ecx
  304. @@Bwd26:
  305. mov ecx,[eax+22]
  306. mov [edx+22],ecx
  307. @@Bwd22:
  308. mov ecx,[eax+18]
  309. mov [edx+18],ecx
  310. @@Bwd18:
  311. mov ecx,[eax+14]
  312. mov [edx+14],ecx
  313. @@Bwd14:
  314. mov ecx,[eax+10]
  315. mov [edx+10],ecx
  316. @@Bwd10:
  317. mov ecx,[eax+6]
  318. mov [edx+6],ecx
  319. @@Bwd06:
  320. mov ecx,[eax+2]
  321. mov [edx+2],ecx
  322. @@Bwd02:
  323. movzx ecx, word ptr [eax]
  324. mov [edx],cx
  325. ret
  326. @@Bwd33: {Counts 33,29,..,5: DWORDs, then the first BYTE at Bwd01}
  327. mov ecx,[eax+29]
  328. mov [edx+29],ecx
  329. @@Bwd29:
  330. mov ecx,[eax+25]
  331. mov [edx+25],ecx
  332. @@Bwd25:
  333. mov ecx,[eax+21]
  334. mov [edx+21],ecx
  335. @@Bwd21:
  336. mov ecx,[eax+17]
  337. mov [edx+17],ecx
  338. @@Bwd17:
  339. mov ecx,[eax+13]
  340. mov [edx+13],ecx
  341. @@Bwd13:
  342. mov ecx,[eax+9]
  343. mov [edx+9],ecx
  344. @@Bwd09:
  345. mov ecx,[eax+5]
  346. mov [edx+5],ecx
  347. @@Bwd05:
  348. mov ecx,[eax+1]
  349. mov [edx+1],ecx
  350. @@Bwd01:
  351. movzx ecx, byte ptr[eax]
  352. mov [edx],cl
  353. @@Done: {Count = 0 jumps straight here and copies nothing; the Bwd01 chain also ends here}
  354. end; {SmallBackwardMove}
  355. { At least valgrind up to 3.3 has a bug which prevents the default code from
  356. working, so a deliberately simple byte-wise forward copy is used here instead.
  357. On entry EAX = Source, EDX = Dest, ECX = Count. }
  358. procedure Forwards_Valgrind;assembler;nostackframe;
  359. asm
  360. {$ifdef FPC_ENABLED_CLD}
  361. cld {Clear the direction flag so that movsb moves towards higher addresses}
  362. {$endif FPC_ENABLED_CLD}
  363. push esi {ESI and EDI are saved here and restored after the copy}
  364. push edi
  365. mov esi,eax {ESI = Source}
  366. mov edi,edx {EDI = Dest}
  367. rep movsb {Copy ECX bytes, one byte per step}
  368. pop edi
  369. pop esi
  370. end;
  371. { At least valgrind up to 3.3 has a bug which prevents the default code from
  372. working, so a deliberately simple byte-wise backward copy is used here instead.
  373. On entry EAX = Source, EDX = Dest, ECX = Count; ECX must not be zero. }
  374. procedure Backwards_Valgrind;assembler;nostackframe;
  375. asm
  376. push esi {ESI and EDI are saved here and restored after the copy}
  377. push edi
  378. lea esi,[eax+ecx-1] {ESI = last byte of Source}
  379. lea edi,[edx+ecx-1] {EDI = last byte of Dest}
  380. @@repeat:
  381. mov al,[esi] {AL is the scratch byte, so the low byte of EAX is overwritten}
  382. mov [edi],al
  383. dec esi
  384. dec edi
  385. dec ecx
  386. jnz @@repeat {The count is tested only after a byte has been copied}
  387. pop edi
  388. pop esi
  389. end;
  390. {-------------------------------------------------------------------------}
  391. {Move ECX Bytes from EAX to EDX in ascending address order using 8 byte FPU integer loads and stores; ECX > 36 (SMALLMOVESIZE). Used where EAX > EDX; Move also jumps to the forward routine when EAX < EDX and the ranges do not overlap}
  392. procedure Forwards_IA32_3;assembler;nostackframe;
  393. asm
  394. push ebx
  395. mov ebx,edx {EBX = original Dest, needed for the final store of the head}
  396. fild qword ptr [eax] {Keep the first 8 source bytes on the FPU stack until after the loop}
  397. add eax,ecx {QWORD Align Writes; EAX = end of Source}
  398. add ecx,edx {ECX = end of Dest}
  399. add edx,7
  400. and edx,-8 {EDX = Dest rounded up to a multiple of 8}
  401. sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  402. add edx,ecx {Now QWORD Aligned; EDX = end of Dest}
  403. sub ecx,16
  404. neg ecx {ECX = 16 - bytes remaining, so EAX+ECX-16 addresses the next uncopied byte}
  405. @FwdLoop:
  406. fild qword ptr [eax+ecx-16]
  407. fistp qword ptr [edx+ecx-16]
  408. fild qword ptr [eax+ecx-8]
  409. fistp qword ptr [edx+ecx-8]
  410. add ecx,16
  411. jle @FwdLoop {Continue while at least 16 bytes remain}
  412. fistp qword ptr [ebx] {Store the saved first 8 bytes at the original, possibly unaligned, Dest}
  413. neg ecx
  414. add ecx,16 {ECX = bytes still to copy (0..15)}
  415. pop ebx
  416. jmp SmallForwardMove_3 {Tail jump: EAX and EDX already hold the end of Source and Dest}
  417. end; {Forwards_IA32}
  418. {-------------------------------------------------------------------------}
  419. {Move ECX Bytes from EAX to EDX in descending address order using 8 byte FPU integer loads and stores, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  420. procedure Backwards_IA32_3;assembler;nostackframe;
  421. asm
  422. push ebx
  423. fild qword ptr [eax+ecx-8] {Keep the last 8 source bytes on the FPU stack until after the loop}
  424. lea ebx,[edx+ecx] {QWORD Align Writes}
  425. and ebx,7 {EBX = bytes by which the end of Dest exceeds an 8 byte boundary}
  426. sub ecx,ebx {ECX = length up to that boundary}
  427. add ebx,ecx {Now QWORD Aligned, EBX = Original Length}
  428. sub ecx,16 {ECX = offset of the highest 16 byte chunk}
  429. @BwdLoop:
  430. fild qword ptr [eax+ecx]
  431. fild qword ptr [eax+ecx+8]
  432. fistp qword ptr [edx+ecx+8] {The FPU stack is last-in first-out, so the upper QWORD is stored first}
  433. fistp qword ptr [edx+ecx]
  434. sub ecx,16
  435. jge @BwdLoop {Continue while a full 16 byte chunk remains below}
  436. fistp qword ptr [edx+ebx-8] {Store the saved last 8 bytes at the possibly unaligned end of Dest}
  437. add ecx,16 {ECX = bytes still to copy at the start (0..15)}
  438. pop ebx
  439. jmp SmallBackwardMove_3 {Tail jump with EAX = Source and EDX = Dest unchanged}
  440. end; {Backwards_IA32}
  441. {-------------------------------------------------------------------------}
  442. {Move ECX Bytes from EAX to EDX in ascending address order using MMX registers; ECX > 36 (SMALLMOVESIZE). Used where EAX > EDX; Move also jumps to the forward routine when EAX < EDX and the ranges do not overlap}
  443. procedure Forwards_MMX_3;assembler;nostackframe;
  444. const
  445. LARGESIZE = 1024; {Counts of at least this many bytes take the @FwdLargeMove path}
  446. asm
  447. cmp ecx,LARGESIZE
  448. jge @FwdLargeMove
  449. cmp ecx,72 {Size at which using MMX becomes worthwhile}
  450. jl Forwards_IA32_3 {Below that size, tail jump to the FPU based routine}
  451. push ebx
  452. mov ebx,edx {EBX = original Dest, needed for the final store of the head}
  453. movq mm0,[eax] {First 8 Characters}
  454. {QWORD Align Writes}
  455. add eax,ecx {EAX = end of Source}
  456. add ecx,edx {ECX = end of Dest}
  457. add edx,7
  458. and edx,-8 {EDX = Dest rounded up to a multiple of 8}
  459. sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  460. add edx,ecx {EDX = end of Dest}
  461. {Now QWORD Aligned}
  462. sub ecx,32
  463. neg ecx {ECX = 32 - bytes remaining}
  464. @FwdLoopMMX:
  465. movq mm1,[eax+ecx-32]
  466. movq mm2,[eax+ecx-24]
  467. movq mm3,[eax+ecx-16]
  468. movq mm4,[eax+ecx- 8]
  469. movq [edx+ecx-32],mm1
  470. movq [edx+ecx-24],mm2
  471. movq [edx+ecx-16],mm3
  472. movq [edx+ecx- 8],mm4
  473. add ecx,32
  474. jle @FwdLoopMMX {Continue while at least 32 bytes remain}
  475. movq [ebx],mm0 {First 8 Characters}
  476. emms {Empty MMX State}
  477. pop ebx
  478. neg ecx
  479. add ecx,32 {ECX = bytes still to copy (0..31)}
  480. jmp SmallForwardMove_3 {Tail jump: EAX and EDX already hold the end of Source and Dest}
  481. @FwdLargeMove:
  482. push ebx
  483. mov ebx,ecx {EBX = Count}
  484. test edx,15
  485. jz @FwdAligned
  486. {16 byte Align Destination}
  487. mov ecx,edx
  488. add ecx,15
  489. and ecx,-16
  490. sub ecx,edx {ECX = bytes needed to reach a 16 byte boundary (1..15)}
  491. add eax,ecx
  492. add edx,ecx {EAX and EDX now point just past the head bytes}
  493. sub ebx,ecx {EBX = bytes left after the head}
  494. {Destination now 16 Byte Aligned}
  495. call SmallForwardMove_3 {Copies the ECX head bytes that end at EAX and EDX}
  496. @FwdAligned:
  497. mov ecx,ebx
  498. and ecx,-16
  499. sub ebx,ecx {EBX = Remainder}
  500. push esi
  501. push edi
  502. mov esi,eax {ESI = Source}
  503. mov edi,edx {EDI = Dest}
  504. mov eax,ecx {EAX = Count}
  505. and eax,-64 {EAX = No of Bytes to Blocks Moves}
  506. and ecx,$3F {ECX = Remaining Bytes to Move (0..63)}
  507. add esi,eax
  508. add edi,eax {ESI and EDI = end of the 64 byte blocks}
  509. shr eax,3 {EAX = No of QWORD's to Block Move}
  510. neg eax {Negative QWORD index that counts up to zero}
  511. @MMXcopyloop:
  512. movq mm0,[esi+eax*8 ]
  513. movq mm1,[esi+eax*8+ 8]
  514. movq mm2,[esi+eax*8+16]
  515. movq mm3,[esi+eax*8+24]
  516. movq mm4,[esi+eax*8+32]
  517. movq mm5,[esi+eax*8+40]
  518. movq mm6,[esi+eax*8+48]
  519. movq mm7,[esi+eax*8+56]
  520. movq [edi+eax*8 ],mm0
  521. movq [edi+eax*8+ 8],mm1
  522. movq [edi+eax*8+16],mm2
  523. movq [edi+eax*8+24],mm3
  524. movq [edi+eax*8+32],mm4
  525. movq [edi+eax*8+40],mm5
  526. movq [edi+eax*8+48],mm6
  527. movq [edi+eax*8+56],mm7
  528. add eax,8
  529. jnz @MMXcopyloop {64 bytes per pass until the index reaches zero}
  530. emms {Empty MMX State}
  531. {$ifdef FPC_ENABLED_CLD}
  532. cld
  533. {$endif FPC_ENABLED_CLD}
  534. add ecx,ebx {ECX = bytes left after the 64 byte blocks}
  535. shr ecx,2
  536. rep movsd {Copy the whole DWORDs of what is left}
  537. mov ecx,ebx
  538. and ecx,3 {ECX = final 0..3 bytes}
  539. rep movsb
  540. pop edi
  541. pop esi
  542. pop ebx
  543. end; {Forwards_MMX}
  544. {-------------------------------------------------------------------------}
  545. {Move ECX Bytes from EAX to EDX in descending address order using MMX registers, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  546. procedure Backwards_MMX_3;assembler;nostackframe;
  547. asm
  548. cmp ecx,72 {Size at which using MMX becomes worthwhile}
  549. jl Backwards_IA32_3 {Below that size, tail jump to the FPU based routine}
  550. push ebx
  551. movq mm0,[eax+ecx-8] {Get Last QWORD}
  552. {QWORD Align Writes}
  553. lea ebx,[edx+ecx]
  554. and ebx,7 {EBX = bytes by which the end of Dest exceeds an 8 byte boundary}
  555. sub ecx,ebx {ECX = length up to that boundary}
  556. add ebx,ecx {EBX = original length}
  557. {Now QWORD Aligned}
  558. sub ecx,32 {ECX = offset of the highest 32 byte chunk}
  559. @BwdLoopMMX:
  560. movq mm1,[eax+ecx ]
  561. movq mm2,[eax+ecx+ 8]
  562. movq mm3,[eax+ecx+16]
  563. movq mm4,[eax+ecx+24]
  564. movq [edx+ecx+24],mm4
  565. movq [edx+ecx+16],mm3
  566. movq [edx+ecx+ 8],mm2
  567. movq [edx+ecx ],mm1
  568. sub ecx,32
  569. jge @BwdLoopMMX {Continue while a full 32 byte chunk remains below}
  570. movq [edx+ebx-8], mm0 {Last QWORD}
  571. emms {Empty MMX State}
  572. add ecx,32 {ECX = bytes still to copy at the start (0..31)}
  573. pop ebx
  574. jmp SmallBackwardMove_3 {Tail jump with EAX = Source and EDX = Dest unchanged}
  575. end; {Backwards_MMX}
  576. {$ifndef FASTMOVE_DISABLE_SSE3}
  577. {-------------------------------------------------------------------------}
  578. {Forward block copy with SSE registers. Dest MUST be 16-Byte Aligned, Count MUST be a multiple of 16. NOTE(review): every 128 byte block loop runs its body before testing the index, so Count is assumed to be at least 128}
  579. procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
  580. const
  581. Prefetch = 512; {Byte distance ahead of the current read position used by the prefetchnta instructions}
  582. asm
  583. push esi
  584. mov esi,eax {ESI = Source}
  585. mov eax,ecx {EAX = Count}
  586. and eax,-128 {EAX = No of Bytes to Block Move}
  587. add esi,eax
  588. add edx,eax {ESI and EDX = end of the 128 byte blocks}
  589. shr eax,3 {EAX = No of QWORD's to Block Move}
  590. neg eax {Negative QWORD index that counts up to zero}
  591. cmp eax, -(32*1024) {Count > 256K}
  592. jl @Large
  593. @Small: {Count<=256K}
  594. test esi,15 {Check if Both Source/Dest Aligned}
  595. jnz @SmallUnaligned
  596. @SmallAligned: {Both Source and Dest 16-Byte Aligned}
  597. @SmallAlignedLoop:
  598. movaps xmm0,[esi+8*eax]
  599. movaps xmm1,[esi+8*eax+16]
  600. movaps xmm2,[esi+8*eax+32]
  601. movaps xmm3,[esi+8*eax+48]
  602. movaps [edx+8*eax],xmm0
  603. movaps [edx+8*eax+16],xmm1
  604. movaps [edx+8*eax+32],xmm2
  605. movaps [edx+8*eax+48],xmm3
  606. movaps xmm4,[esi+8*eax+64]
  607. movaps xmm5,[esi+8*eax+80]
  608. movaps xmm6,[esi+8*eax+96]
  609. movaps xmm7,[esi+8*eax+112]
  610. movaps [edx+8*eax+64],xmm4
  611. movaps [edx+8*eax+80],xmm5
  612. movaps [edx+8*eax+96],xmm6
  613. movaps [edx+8*eax+112],xmm7
  614. add eax,16
  615. js @SmallAlignedLoop {128 bytes per pass while the index is still negative}
  616. jmp @Remainder
  617. @SmallUnaligned: {Source Not 16-Byte Aligned}
  618. @SmallUnalignedLoop:
  619. movups xmm0,[esi+8*eax]
  620. movups xmm1,[esi+8*eax+16]
  621. movups xmm2,[esi+8*eax+32]
  622. movups xmm3,[esi+8*eax+48]
  623. movaps [edx+8*eax],xmm0
  624. movaps [edx+8*eax+16],xmm1
  625. movaps [edx+8*eax+32],xmm2
  626. movaps [edx+8*eax+48],xmm3
  627. movups xmm4,[esi+8*eax+64]
  628. movups xmm5,[esi+8*eax+80]
  629. movups xmm6,[esi+8*eax+96]
  630. movups xmm7,[esi+8*eax+112]
  631. movaps [edx+8*eax+64],xmm4
  632. movaps [edx+8*eax+80],xmm5
  633. movaps [edx+8*eax+96],xmm6
  634. movaps [edx+8*eax+112],xmm7
  635. add eax,16
  636. js @SmallUnalignedLoop {128 bytes per pass while the index is still negative}
  637. jmp @Remainder
  638. @Large: {Count>256K}
  639. test esi,15 {Check if Both Source/Dest Aligned}
  640. jnz @LargeUnaligned
  641. @LargeAligned: {Both Source and Dest 16-Byte Aligned}
  642. @LargeAlignedLoop:
  643. prefetchnta [esi+8*eax+Prefetch]
  644. prefetchnta [esi+8*eax+Prefetch+64]
  645. movaps xmm0,[esi+8*eax]
  646. movaps xmm1,[esi+8*eax+16]
  647. movaps xmm2,[esi+8*eax+32]
  648. movaps xmm3,[esi+8*eax+48]
  649. movntps [edx+8*eax],xmm0 {Large copies use non-temporal stores}
  650. movntps [edx+8*eax+16],xmm1
  651. movntps [edx+8*eax+32],xmm2
  652. movntps [edx+8*eax+48],xmm3
  653. movaps xmm4,[esi+8*eax+64]
  654. movaps xmm5,[esi+8*eax+80]
  655. movaps xmm6,[esi+8*eax+96]
  656. movaps xmm7,[esi+8*eax+112]
  657. movntps [edx+8*eax+64],xmm4
  658. movntps [edx+8*eax+80],xmm5
  659. movntps [edx+8*eax+96],xmm6
  660. movntps [edx+8*eax+112],xmm7
  661. add eax,16
  662. js @LargeAlignedLoop {128 bytes per pass while the index is still negative}
  663. sfence {Store fence after the non-temporal stores}
  664. jmp @Remainder
  665. @LargeUnaligned: {Source Not 16-Byte Aligned}
  666. @LargeUnalignedLoop:
  667. prefetchnta [esi+8*eax+Prefetch]
  668. prefetchnta [esi+8*eax+Prefetch+64]
  669. movups xmm0,[esi+8*eax]
  670. movups xmm1,[esi+8*eax+16]
  671. movups xmm2,[esi+8*eax+32]
  672. movups xmm3,[esi+8*eax+48]
  673. movntps [edx+8*eax],xmm0
  674. movntps [edx+8*eax+16],xmm1
  675. movntps [edx+8*eax+32],xmm2
  676. movntps [edx+8*eax+48],xmm3
  677. movups xmm4,[esi+8*eax+64]
  678. movups xmm5,[esi+8*eax+80]
  679. movups xmm6,[esi+8*eax+96]
  680. movups xmm7,[esi+8*eax+112]
  681. movntps [edx+8*eax+64],xmm4
  682. movntps [edx+8*eax+80],xmm5
  683. movntps [edx+8*eax+96],xmm6
  684. movntps [edx+8*eax+112],xmm7
  685. add eax,16
  686. js @LargeUnalignedLoop {128 bytes per pass while the index is still negative}
  687. sfence {Store fence after the non-temporal stores}
  688. @Remainder:
  689. and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
  690. jz @Done
  691. add esi,ecx
  692. add edx,ecx {ESI and EDX = end of the remainder}
  693. neg ecx {Negative byte index that counts up to zero}
  694. @RemainderLoop:
  695. movups xmm0,[esi+ecx]
  696. movaps [edx+ecx],xmm0
  697. add ecx,16
  698. jnz @RemainderLoop {16 bytes per pass until the index reaches zero}
  699. @Done:
  700. pop esi
  701. end; {AlignedFwdMoveSSE}
  702. {-------------------------------------------------------------------------}
  703. {Move ECX Bytes from EAX to EDX in ascending address order using SSE registers; ECX > 36 (SMALLMOVESIZE). Used where EAX > EDX; Move also jumps to the forward routine when EAX < EDX and the ranges do not overlap}
  704. procedure Forwards_SSE_3;assembler;nostackframe;
  705. const
  706. LARGESIZE = 2048; {Counts of at least this many bytes take the @FwdLargeMove path}
  707. asm
  708. cmp ecx,LARGESIZE
  709. jge @FwdLargeMove
  710. cmp ecx,SMALLMOVESIZE+32
  711. movups xmm0,[eax] {First 16 source bytes; the load leaves the flags of the compare intact}
  712. jg @FwdMoveSSE {More than 68 bytes: use the aligned loop}
  713. movups xmm1,[eax+16]
  714. movups [edx],xmm0
  715. movups [edx+16],xmm1 {The first 32 bytes are now copied}
  716. add eax,ecx
  717. add edx,ecx {EAX and EDX = end of Source and Dest}
  718. sub ecx,32 {ECX = bytes still to copy (5..36)}
  719. jmp SmallForwardMove_3
  720. @FwdMoveSSE:
  721. push ebx
  722. mov ebx,edx {EBX = original Dest, needed for the final store of the head}
  723. {Align Writes}
  724. add eax,ecx {EAX = end of Source}
  725. add ecx,edx {ECX = end of Dest}
  726. add edx,15
  727. and edx,-16 {EDX = Dest rounded up to a multiple of 16}
  728. sub ecx,edx {ECX = bytes from the aligned Dest to the end}
  729. add edx,ecx {EDX = end of Dest}
  730. {Now Aligned}
  731. sub ecx,32
  732. neg ecx {ECX = 32 - bytes remaining}
  733. @FwdLoopSSE:
  734. movups xmm1,[eax+ecx-32]
  735. movups xmm2,[eax+ecx-16]
  736. movaps [edx+ecx-32],xmm1
  737. movaps [edx+ecx-16],xmm2
  738. add ecx,32
  739. jle @FwdLoopSSE {Continue while at least 32 bytes remain}
  740. movups [ebx],xmm0 {First 16 Bytes}
  741. neg ecx
  742. add ecx,32 {ECX = bytes still to copy (0..31)}
  743. pop ebx
  744. jmp SmallForwardMove_3 {Tail jump: EAX and EDX already hold the end of Source and Dest}
  745. @FwdLargeMove:
  746. push ebx
  747. mov ebx,ecx {EBX = Count}
  748. test edx,15
  749. jz @FwdLargeAligned
  750. {16 byte Align Destination}
  751. mov ecx,edx
  752. add ecx,15
  753. and ecx,-16
  754. sub ecx,edx {ECX = bytes needed to reach a 16 byte boundary (1..15)}
  755. add eax,ecx
  756. add edx,ecx {EAX and EDX now point just past the head bytes}
  757. sub ebx,ecx {EBX = bytes left after the head}
  758. {Destination now 16 Byte Aligned}
  759. call SmallForwardMove_3 {Copies the ECX head bytes that end at EAX and EDX}
  760. mov ecx,ebx
  761. @FwdLargeAligned:
  762. and ecx,-16 {ECX = byte count for the aligned block copy}
  763. sub ebx,ecx {EBX = Remainder}
  764. push edx
  765. push eax
  766. push ecx {Keep the three arguments across the call}
  767. call AlignedFwdMoveSSE_3
  768. pop ecx
  769. pop eax
  770. pop edx
  771. add ecx,ebx {ECX = aligned byte count plus remainder}
  772. add eax,ecx
  773. add edx,ecx {EAX and EDX = end of Source and Dest}
  774. mov ecx,ebx {ECX = Remainder (0..15)}
  775. pop ebx
  776. jmp SmallForwardMove_3 {Tail jump that copies the remainder}
  777. end; {Forwards_SSE}
  778. {-------------------------------------------------------------------------}
  779. {Move ECX Bytes from EAX to EDX in descending address order using SSE registers, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
  780. procedure Backwards_SSE_3;assembler;nostackframe;
  781. asm
  782. cmp ecx,SMALLMOVESIZE+32
  783. jg @BwdMoveSSE {More than 68 bytes: use the aligned loop}
  784. sub ecx,32 {ECX = bytes left for the small move (5..36) and also the offset of the last 32 bytes}
  785. movups xmm1,[eax+ecx] {Both halves of the last 32 bytes are loaded before either is stored}
  786. movups xmm2,[eax+ecx+16]
  787. movups [edx+ecx],xmm1
  788. movups [edx+ecx+16],xmm2
  789. jmp SmallBackwardMove_3 {Copies the first ECX bytes}
  790. @BwdMoveSSE:
  791. push ebx
  792. movups xmm0,[eax+ecx-16] {Last 16 Bytes}
  793. {Align Writes}
  794. lea ebx,[edx+ecx]
  795. and ebx,15 {EBX = bytes by which the end of Dest exceeds a 16 byte boundary}
  796. sub ecx,ebx {ECX = length up to that boundary}
  797. add ebx,ecx {EBX = original length}
  798. {Now Aligned}
  799. sub ecx,32 {ECX = offset of the highest 32 byte chunk}
  800. @BwdLoop:
  801. movups xmm1,[eax+ecx]
  802. movups xmm2,[eax+ecx+16]
  803. movaps [edx+ecx],xmm1
  804. movaps [edx+ecx+16],xmm2
  805. sub ecx,32
  806. jge @BwdLoop {Continue while a full 32 byte chunk remains below}
  807. movups [edx+ebx-16],xmm0 {Last 16 Bytes}
  808. add ecx,32 {ECX = bytes still to copy at the start (0..31)}
  809. pop ebx
  810. jmp SmallBackwardMove_3 {Tail jump with EAX = Source and EDX = Dest unchanged}
  811. end; {Backwards_SSE}
  812. {$endif ndef FASTMOVE_DISABLE_SSE3}
  813. const
  814. fastmoveproc_forward : pointer = @Forwards_IA32_3; {Routine Move jumps to for forward copies above SMALLMOVESIZE; reassigned by setup_fastmove}
  815. fastmoveproc_backward : pointer = @Backwards_IA32_3; {Routine Move jumps to for overlapping copies where Source is below Dest; reassigned by setup_fastmove}
  816. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
  817. asm {On entry EAX = address of source, EDX = address of dest, ECX = count. Copies count bytes and picks the copy direction so that overlapping ranges are handled; count <= 0 or source = dest copies nothing}
  818. cmp ecx,SMALLMOVESIZE
  819. ja @Large {Unsigned test: counts above SMALLMOVESIZE and negative counts both go to @Large}
  820. cmp eax,edx
  821. lea eax,[eax+ecx] {EAX = source+count; lea leaves the flags of the compare intact}
  822. jbe @SmallCheck {Addresses are unsigned values: source <= dest}
  823. @SmallForward:
  824. add edx,ecx {EDX = dest+count, as SmallForwardMove_3 expects end pointers}
  825. jmp SmallForwardMove_3
  826. @SmallCheck:
  827. je @Done {For Compatibility with Delphi's move for Source = Dest}
  828. sub eax,ecx {Restore EAX = source for the backward copy}
  829. jmp SmallBackwardMove_3
  830. @Large:
  831. jng @Done {For Compatibility with Delphi's move for Count < 0; signed test on the flags of the count compare above}
  832. cmp eax,edx
  833. ja @moveforward {Unsigned: source above dest, a forward copy is always safe}
  834. je @Done {For Compatibility with Delphi's move for Source = Dest}
  835. push eax
  836. add eax,ecx
  837. cmp eax,edx {Does source+count reach past dest?}
  838. pop eax {pop leaves the flags of the compare intact}
  839. ja @movebackward {Unsigned, so ranges that straddle the 2 GB address boundary are ordered correctly}
  840. @moveforward:
  841. jmp dword ptr fastmoveproc_forward
  842. @movebackward:
  843. jmp dword ptr fastmoveproc_backward {Source/Dest Overlap}
  844. @Done:
  845. end;
  846. {$asmmode att}
  847. {$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  848. var
  849. valgrind_used : boolean;external name '__fpc_valgrind'; {Defined outside this file under the linker name __fpc_valgrind; setup_fastmove reads it to select the valgrind-safe copy routines}
  850. {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  851. procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
  852. begin {Select the routines that Move jumps to for counts above SMALLMOVESIZE. Order of preference: valgrind-safe, SSE, MMX; if no branch matches, the initial IA32 routines stay in place}
  853. { Work around a valgrind bug: use the plain byte copy routines when valgrind_used is set }
  854. {$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  855. if EntryInformation.valgrind_used then
  856. {$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  857. if valgrind_used then
  858. {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  859. begin
  860. fastmoveproc_forward:=@Forwards_Valgrind;
  861. fastmoveproc_backward:=@Backwards_Valgrind;
  862. end
  863. {$ifndef FASTMOVE_DISABLE_SSE3}
  864. else if has_sse_support then { has_sse_support is declared outside this file; presumably a CPU feature flag - confirm }
  865. begin
  866. fastmoveproc_forward:=@Forwards_SSE_3;
  867. fastmoveproc_backward:=@Backwards_SSE_3;
  868. end
  869. {$endif ndef FASTMOVE_DISABLE_SSE3}
  870. else if has_mmx_support then { has_mmx_support is declared outside this file; presumably a CPU feature flag - confirm }
  871. begin
  872. fastmoveproc_forward:=@Forwards_MMX_3;
  873. fastmoveproc_backward:=@Backwards_MMX_3;
  874. end;
  875. end;
  876. {$endif FPC_SYSTEM_HAS_MOVE}
  877. {$endif}