{
  Copyright (c) 2004, John O'Harrow ([email protected])

  This software is provided 'as-is', without any express or implied warranty.
  In no event will the authors be held liable for any damages arising from the
  use of this software.

  Permission is granted to anyone to use this software for any purpose, including
  commercial applications, and to alter it and redistribute it freely, subject to
  the following restrictions:

  1. The origin of this software must not be misrepresented; you must not claim
     that you wrote the original software. If you use this software in a product,
     an acknowledgment in the product documentation would be appreciated but is
     not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  -------------------------------------------------------------------------------
  Version: 1.40 - 16-SEP-2004
}

{$ifdef USE_FASTMOVE}
{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{$asmmode intel}

{-------------------------------------------------------------------------}
(*
{Just to show that a good Pascal algorithm can beat the default BASM}
procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
var
  S, D : PtrUInt;
  Temp, C, I : PtrInt;
  L : PPtrInt;
begin
  S := Cardinal(@Source);
  D := Cardinal(@Dest);
  if S = D then
    Exit;
  if Count <= 4 then
    case Count of
      1 : PByte(@Dest)^ := PByte(S)^;
      2 : PWord(@Dest)^ := PWord(S)^;
      3 : if D > S then
            begin
              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
              PWord(@Dest)^ := PWord(S)^;
            end
          else
            begin
              PWord(@Dest)^ := PWord(S)^;
              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
            end;
      4 : PInteger(@Dest)^ := PInteger(S)^
      else Exit; {Count <= 0}
    end
  else
    if D > S then
      begin
        Temp := PInteger(S)^;
        I := Integer(@Dest);
        C := Count - 4;
        L := PInteger(Integer(@Dest) + C);
        Inc(S, C);
        repeat
          L^ := PInteger(S)^;
          if Count <= 8 then
            Break;
          Dec(Count, 4);
          Dec(S, 4);
          Dec(L);
        until False;
        PInteger(I)^ := Temp;
      end
    else
      begin
        C := Count - 4;
        Temp := PInteger(S + Cardinal(C))^;
        I := Integer(@Dest) + C;
        L := @Dest;
        repeat
          L^ := PInteger(S)^;
          if Count <= 8 then
            Break;
          Dec(Count, 4);
          Inc(S, 4);
          Inc(L);
        until False;
        PInteger(I)^ := Temp;
      end;
end; {MoveJOH_PAS}
*)

const
  SMALLMOVESIZE = 36;

{-------------------------------------------------------------------------}
{Perform Forward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX}
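{ The small movers avoid loops entirely: they index a jump table with the byte
  count (0..36) and fall through a chain of unrolled 4-byte load/store pairs,
  finishing sizes that are not a multiple of four with an overlapping dword or
  with word/byte moves. Entry 0 of the table jumps straight to @@Done, so a
  zero count needs no separate test. }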
procedure SmallForwardMove_3;assembler;nostackframe;
asm
  jmp dword ptr @@FwdJumpTable[ecx*4]
  align 16
@@FwdJumpTable:
  dd @@Done {Removes need to test for zero size move}
  dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
  dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
  dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
  dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
  dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
@@Fwd36:
  mov ecx,[eax-36]
  mov [edx-36],ecx
@@Fwd32:
  mov ecx,[eax-32]
  mov [edx-32],ecx
@@Fwd28:
  mov ecx,[eax-28]
  mov [edx-28],ecx
@@Fwd24:
  mov ecx,[eax-24]
  mov [edx-24],ecx
@@Fwd20:
  mov ecx,[eax-20]
  mov [edx-20],ecx
@@Fwd16:
  mov ecx,[eax-16]
  mov [edx-16],ecx
@@Fwd12:
  mov ecx,[eax-12]
  mov [edx-12],ecx
@@Fwd08:
  mov ecx,[eax-8]
  mov [edx-8],ecx
@@Fwd04:
  mov ecx,[eax-4]
  mov [edx-4],ecx
  ret
@@Fwd35:
  mov ecx,[eax-35]
  mov [edx-35],ecx
@@Fwd31:
  mov ecx,[eax-31]
  mov [edx-31],ecx
@@Fwd27:
  mov ecx,[eax-27]
  mov [edx-27],ecx
@@Fwd23:
  mov ecx,[eax-23]
  mov [edx-23],ecx
@@Fwd19:
  mov ecx,[eax-19]
  mov [edx-19],ecx
@@Fwd15:
  mov ecx,[eax-15]
  mov [edx-15],ecx
@@Fwd11:
  mov ecx,[eax-11]
  mov [edx-11],ecx
@@Fwd07:
  mov ecx,[eax-7]
  mov [edx-7],ecx
  mov ecx,[eax-4]
  mov [edx-4],ecx
  ret
@@Fwd03:
  movzx ecx, word ptr [eax-3]
  mov [edx-3],cx
  movzx ecx, byte ptr [eax-1]
  mov [edx-1],cl
  ret
@@Fwd34:
  mov ecx,[eax-34]
  mov [edx-34],ecx
@@Fwd30:
  mov ecx,[eax-30]
  mov [edx-30],ecx
@@Fwd26:
  mov ecx,[eax-26]
  mov [edx-26],ecx
@@Fwd22:
  mov ecx,[eax-22]
  mov [edx-22],ecx
@@Fwd18:
  mov ecx,[eax-18]
  mov [edx-18],ecx
@@Fwd14:
  mov ecx,[eax-14]
  mov [edx-14],ecx
@@Fwd10:
  mov ecx,[eax-10]
  mov [edx-10],ecx
@@Fwd06:
  mov ecx,[eax-6]
  mov [edx-6],ecx
@@Fwd02:
  movzx ecx, word ptr [eax-2]
  mov [edx-2],cx
  ret
@@Fwd33:
  mov ecx,[eax-33]
  mov [edx-33],ecx
@@Fwd29:
  mov ecx,[eax-29]
  mov [edx-29],ecx
@@Fwd25:
  mov ecx,[eax-25]
  mov [edx-25],ecx
@@Fwd21:
  mov ecx,[eax-21]
  mov [edx-21],ecx
@@Fwd17:
  mov ecx,[eax-17]
  mov [edx-17],ecx
@@Fwd13:
  mov ecx,[eax-13]
  mov [edx-13],ecx
@@Fwd09:
  mov ecx,[eax-9]
  mov [edx-9],ecx
@@Fwd05:
  mov ecx,[eax-5]
  mov [edx-5],ecx
@@Fwd01:
  movzx ecx, byte ptr [eax-1]
  mov [edx-1],cl
@@Done:
end; {SmallForwardMove}

{-------------------------------------------------------------------------}
{Perform Backward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX}
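{ Mirror image of SmallForwardMove_3: the same jump-table dispatch, but the
  unrolled dword copies run from the highest offset down to offset 0, which
  keeps the move safe when the destination overlaps the end of the source. }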
procedure SmallBackwardMove_3;assembler;nostackframe;
asm
  jmp dword ptr @@BwdJumpTable[ecx*4]
  align 16
@@BwdJumpTable:
  dd @@Done {Removes need to test for zero size move}
  dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
  dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
  dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
  dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
  dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
@@Bwd36:
  mov ecx,[eax+32]
  mov [edx+32],ecx
@@Bwd32:
  mov ecx,[eax+28]
  mov [edx+28],ecx
@@Bwd28:
  mov ecx,[eax+24]
  mov [edx+24],ecx
@@Bwd24:
  mov ecx,[eax+20]
  mov [edx+20],ecx
@@Bwd20:
  mov ecx,[eax+16]
  mov [edx+16],ecx
@@Bwd16:
  mov ecx,[eax+12]
  mov [edx+12],ecx
@@Bwd12:
  mov ecx,[eax+8]
  mov [edx+8],ecx
@@Bwd08:
  mov ecx,[eax+4]
  mov [edx+4],ecx
@@Bwd04:
  mov ecx,[eax]
  mov [edx],ecx
  ret
@@Bwd35:
  mov ecx,[eax+31]
  mov [edx+31],ecx
@@Bwd31:
  mov ecx,[eax+27]
  mov [edx+27],ecx
@@Bwd27:
  mov ecx,[eax+23]
  mov [edx+23],ecx
@@Bwd23:
  mov ecx,[eax+19]
  mov [edx+19],ecx
@@Bwd19:
  mov ecx,[eax+15]
  mov [edx+15],ecx
@@Bwd15:
  mov ecx,[eax+11]
  mov [edx+11],ecx
@@Bwd11:
  mov ecx,[eax+7]
  mov [edx+7],ecx
@@Bwd07:
  mov ecx,[eax+3]
  mov [edx+3],ecx
  mov ecx,[eax]
  mov [edx],ecx
  ret
@@Bwd03:
  movzx ecx, word ptr [eax+1]
  mov [edx+1],cx
  movzx ecx, byte ptr [eax]
  mov [edx],cl
  ret
@@Bwd34:
  mov ecx,[eax+30]
  mov [edx+30],ecx
@@Bwd30:
  mov ecx,[eax+26]
  mov [edx+26],ecx
@@Bwd26:
  mov ecx,[eax+22]
  mov [edx+22],ecx
@@Bwd22:
  mov ecx,[eax+18]
  mov [edx+18],ecx
@@Bwd18:
  mov ecx,[eax+14]
  mov [edx+14],ecx
@@Bwd14:
  mov ecx,[eax+10]
  mov [edx+10],ecx
@@Bwd10:
  mov ecx,[eax+6]
  mov [edx+6],ecx
@@Bwd06:
  mov ecx,[eax+2]
  mov [edx+2],ecx
@@Bwd02:
  movzx ecx, word ptr [eax]
  mov [edx],cx
  ret
@@Bwd33:
  mov ecx,[eax+29]
  mov [edx+29],ecx
@@Bwd29:
  mov ecx,[eax+25]
  mov [edx+25],ecx
@@Bwd25:
  mov ecx,[eax+21]
  mov [edx+21],ecx
@@Bwd21:
  mov ecx,[eax+17]
  mov [edx+17],ecx
@@Bwd17:
  mov ecx,[eax+13]
  mov [edx+13],ecx
@@Bwd13:
  mov ecx,[eax+9]
  mov [edx+9],ecx
@@Bwd09:
  mov ecx,[eax+5]
  mov [edx+5],ecx
@@Bwd05:
  mov ecx,[eax+1]
  mov [edx+1],ecx
@@Bwd01:
  movzx ecx, byte ptr [eax]
  mov [edx],cl
@@Done:
end; {SmallBackwardMove}

{ At least valgrind up to 3.3 has a bug which prevents the default code from
  working, so we use a rather simple implementation here. }
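{ The two valgrind fallbacks below copy a single byte at a time (rep movsb
  forwards, an explicit byte loop backwards) and so sidestep whatever in the
  default routines triggers the valgrind bug mentioned above. They are only
  installed as the large-move dispatch pointers, where Count is always greater
  than SMALLMOVESIZE, so the backward loop needs no zero-count check. }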
procedure Forwards_Valgrind;assembler;nostackframe;
asm
  push esi
  push edi
  mov esi,eax
  mov edi,edx
  rep movsb
  pop edi
  pop esi
end;

{ At least valgrind up to 3.3 has a bug which prevents the default code from
  working, so we use a rather simple implementation here. }
procedure Backwards_Valgrind;assembler;nostackframe;
asm
  push esi
  push edi
  lea esi,[eax+ecx-1]
  lea edi,[edx+ecx-1]
@@repeat:
  mov al,[esi]
  mov [edi],al
  dec esi
  dec edi
  dec ecx
  jnz @@repeat
  pop edi
  pop esi
end;

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
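{ Plain IA-32 fallback: it moves eight bytes per fild/fistp pair through an x87
  register (a lossless round trip for 64-bit integer loads and stores), with
  the writes rounded up to a QWORD boundary. The very first source QWORD is
  kept on the FPU stack and stored to the original, possibly unaligned,
  destination at the end, which also covers the bytes skipped by the alignment;
  the remaining tail is finished by SmallForwardMove_3. }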
procedure Forwards_IA32_3;assembler;nostackframe;
asm
  push ebx
  mov ebx,edx
  fild qword ptr [eax]
  add eax,ecx {QWORD Align Writes}
  add ecx,edx
  add edx,7
  and edx,-8
  sub ecx,edx
  add edx,ecx {Now QWORD Aligned}
  sub ecx,16
  neg ecx
@FwdLoop:
  fild qword ptr [eax+ecx-16]
  fistp qword ptr [edx+ecx-16]
  fild qword ptr [eax+ecx-8]
  fistp qword ptr [edx+ecx-8]
  add ecx,16
  jle @FwdLoop
  fistp qword ptr [ebx]
  neg ecx
  add ecx,16
  pop ebx
  jmp SmallForwardMove_3
end; {Forwards_IA32}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
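{ Backward counterpart of Forwards_IA32_3: the copy runs from the end of the
  block towards the start (safe when the destination overlaps the end of the
  source), with the last source QWORD parked on the FPU stack and written to
  the unaligned end after the aligned loop; the head of the block is finished
  by SmallBackwardMove_3. }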
procedure Backwards_IA32_3;assembler;nostackframe;
asm
  push ebx
  fild qword ptr [eax+ecx-8]
  lea ebx,[edx+ecx] {QWORD Align Writes}
  and ebx,7
  sub ecx,ebx
  add ebx,ecx {Now QWORD Aligned, EBX = Original Length}
  sub ecx,16
@BwdLoop:
  fild qword ptr [eax+ecx]
  fild qword ptr [eax+ecx+8]
  fistp qword ptr [edx+ecx+8]
  fistp qword ptr [edx+ecx]
  sub ecx,16
  jge @BwdLoop
  fistp qword ptr [edx+ebx-8]
  add ecx,16
  pop ebx
  jmp SmallBackwardMove_3
end; {Backwards_IA32}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
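{ MMX forward mover: counts below 72 bytes are handed straight to
  Forwards_IA32_3; medium counts (72..1023) are copied 32 bytes per iteration
  with movq through MM1..MM4, writes QWORD-aligned, with the first source QWORD
  kept in MM0 to patch the unaligned head and the tail finished by
  SmallForwardMove_3. Counts of LARGESIZE (1024) and up first align the
  destination to 16 bytes, copy 64-byte blocks through all eight MMX registers,
  and let rep movsd/movsb sweep up the remainder. }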
procedure Forwards_MMX_3;assembler;nostackframe;
const
  LARGESIZE = 1024;
asm
  cmp ecx,LARGESIZE
  jge @FwdLargeMove
  cmp ecx,72 {Size at which using MMX becomes worthwhile}
  jl Forwards_IA32_3
  push ebx
  mov ebx,edx
  movq mm0,[eax] {First 8 Characters}
  {QWORD Align Writes}
  add eax,ecx
  add ecx,edx
  add edx,7
  and edx,-8
  sub ecx,edx
  add edx,ecx
  {Now QWORD Aligned}
  sub ecx,32
  neg ecx
@FwdLoopMMX:
  movq mm1,[eax+ecx-32]
  movq mm2,[eax+ecx-24]
  movq mm3,[eax+ecx-16]
  movq mm4,[eax+ecx- 8]
  movq [edx+ecx-32],mm1
  movq [edx+ecx-24],mm2
  movq [edx+ecx-16],mm3
  movq [edx+ecx- 8],mm4
  add ecx,32
  jle @FwdLoopMMX
  movq [ebx],mm0 {First 8 Characters}
  emms
  pop ebx
  neg ecx
  add ecx,32
  jmp SmallForwardMove_3
@FwdLargeMove:
  push ebx
  mov ebx,ecx
  test edx,15
  jz @FwdAligned
  {16 byte Align Destination}
  mov ecx,edx
  add ecx,15
  and ecx,-16
  sub ecx,edx
  add eax,ecx
  add edx,ecx
  sub ebx,ecx
  {Destination now 16 Byte Aligned}
  call SmallForwardMove_3
@FwdAligned:
  mov ecx,ebx
  and ecx,-16
  sub ebx,ecx {EBX = Remainder}
  push esi
  push edi
  mov esi,eax {ESI = Source}
  mov edi,edx {EDI = Dest}
  mov eax,ecx {EAX = Count}
  and eax,-64 {EAX = No of Bytes to Block Move}
  and ecx,$3F {ECX = Remaining Bytes to Move (0..63)}
  add esi,eax
  add edi,eax
  shr eax,3 {EAX = No of QWORDs to Block Move}
  neg eax
@MMXcopyloop:
  movq mm0,[esi+eax*8 ]
  movq mm1,[esi+eax*8+ 8]
  movq mm2,[esi+eax*8+16]
  movq mm3,[esi+eax*8+24]
  movq mm4,[esi+eax*8+32]
  movq mm5,[esi+eax*8+40]
  movq mm6,[esi+eax*8+48]
  movq mm7,[esi+eax*8+56]
  movq [edi+eax*8 ],mm0
  movq [edi+eax*8+ 8],mm1
  movq [edi+eax*8+16],mm2
  movq [edi+eax*8+24],mm3
  movq [edi+eax*8+32],mm4
  movq [edi+eax*8+40],mm5
  movq [edi+eax*8+48],mm6
  movq [edi+eax*8+56],mm7
  add eax,8
  jnz @MMXcopyloop
  emms {Empty MMX State}
  add ecx,ebx
  shr ecx,2
  rep movsd
  mov ecx,ebx
  and ecx,3
  rep movsb
  pop edi
  pop esi
  pop ebx
end; {Forwards_MMX}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
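{ Backward counterpart of Forwards_MMX_3 (no separate large-block path): counts
  below 72 bytes fall through to Backwards_IA32_3; otherwise the last source
  QWORD is held in MM0, the 32-bytes-per-iteration movq loop runs from the end
  of the block downwards with QWORD-aligned writes, and the remaining head
  bytes are finished by SmallBackwardMove_3. }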
procedure Backwards_MMX_3;assembler;nostackframe;
asm
  cmp ecx,72 {Size at which using MMX becomes worthwhile}
  jl Backwards_IA32_3
  push ebx
  movq mm0,[eax+ecx-8] {Get Last QWORD}
  {QWORD Align Writes}
  lea ebx,[edx+ecx]
  and ebx,7
  sub ecx,ebx
  add ebx,ecx
  {Now QWORD Aligned}
  sub ecx,32
@BwdLoopMMX:
  movq mm1,[eax+ecx ]
  movq mm2,[eax+ecx+ 8]
  movq mm3,[eax+ecx+16]
  movq mm4,[eax+ecx+24]
  movq [edx+ecx+24],mm4
  movq [edx+ecx+16],mm3
  movq [edx+ecx+ 8],mm2
  movq [edx+ecx ],mm1
  sub ecx,32
  jge @BwdLoopMMX
  movq [edx+ebx-8], mm0 {Last QWORD}
  emms
  add ecx,32
  pop ebx
  jmp SmallBackwardMove_3
end; {Backwards_MMX}

{-------------------------------------------------------------------------}
{Dest MUST be 16-Byte Aligned, Count MUST be a multiple of 16}
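{ Bulk SSE copier used by Forwards_SSE_3 for large blocks. It moves 128 bytes
  per iteration through XMM0..XMM7, using movaps loads when the source happens
  to be 16-byte aligned as well and movups otherwise. Up to 256 KB the stores
  are ordinary cacheable movaps; above that it switches to prefetchnta plus
  movntps non-temporal stores (closed with sfence) so a huge copy does not
  flush the cache. Any remaining multiple of 16 bytes is copied 16 bytes at a
  time at @Remainder. }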
procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
const
  Prefetch = 512;
asm
  push esi
  mov esi,eax {ESI = Source}
  mov eax,ecx {EAX = Count}
  and eax,-128 {EAX = No of Bytes to Block Move}
  add esi,eax
  add edx,eax
  shr eax,3 {EAX = No of QWORDs to Block Move}
  neg eax
  cmp eax, -(32*1024) {Count > 256K}
  jl @Large
@Small: {Count<=256K}
  test esi,15 {Check if Both Source/Dest Aligned}
  jnz @SmallUnaligned
@SmallAligned: {Both Source and Dest 16-Byte Aligned}
@SmallAlignedLoop:
  movaps xmm0,[esi+8*eax]
  movaps xmm1,[esi+8*eax+16]
  movaps xmm2,[esi+8*eax+32]
  movaps xmm3,[esi+8*eax+48]
  movaps [edx+8*eax],xmm0
  movaps [edx+8*eax+16],xmm1
  movaps [edx+8*eax+32],xmm2
  movaps [edx+8*eax+48],xmm3
  movaps xmm4,[esi+8*eax+64]
  movaps xmm5,[esi+8*eax+80]
  movaps xmm6,[esi+8*eax+96]
  movaps xmm7,[esi+8*eax+112]
  movaps [edx+8*eax+64],xmm4
  movaps [edx+8*eax+80],xmm5
  movaps [edx+8*eax+96],xmm6
  movaps [edx+8*eax+112],xmm7
  add eax,16
  js @SmallAlignedLoop
  jmp @Remainder
@SmallUnaligned: {Source Not 16-Byte Aligned}
@SmallUnalignedLoop:
  movups xmm0,[esi+8*eax]
  movups xmm1,[esi+8*eax+16]
  movups xmm2,[esi+8*eax+32]
  movups xmm3,[esi+8*eax+48]
  movaps [edx+8*eax],xmm0
  movaps [edx+8*eax+16],xmm1
  movaps [edx+8*eax+32],xmm2
  movaps [edx+8*eax+48],xmm3
  movups xmm4,[esi+8*eax+64]
  movups xmm5,[esi+8*eax+80]
  movups xmm6,[esi+8*eax+96]
  movups xmm7,[esi+8*eax+112]
  movaps [edx+8*eax+64],xmm4
  movaps [edx+8*eax+80],xmm5
  movaps [edx+8*eax+96],xmm6
  movaps [edx+8*eax+112],xmm7
  add eax,16
  js @SmallUnalignedLoop
  jmp @Remainder
@Large: {Count>256K}
  test esi,15 {Check if Both Source/Dest Aligned}
  jnz @LargeUnaligned
@LargeAligned: {Both Source and Dest 16-Byte Aligned}
@LargeAlignedLoop:
  prefetchnta [esi+8*eax+Prefetch]
  prefetchnta [esi+8*eax+Prefetch+64]
  movaps xmm0,[esi+8*eax]
  movaps xmm1,[esi+8*eax+16]
  movaps xmm2,[esi+8*eax+32]
  movaps xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movaps xmm4,[esi+8*eax+64]
  movaps xmm5,[esi+8*eax+80]
  movaps xmm6,[esi+8*eax+96]
  movaps xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add eax,16
  js @LargeAlignedLoop
  sfence
  jmp @Remainder
@LargeUnaligned: {Source Not 16-Byte Aligned}
@LargeUnalignedLoop:
  prefetchnta [esi+8*eax+Prefetch]
  prefetchnta [esi+8*eax+Prefetch+64]
  movups xmm0,[esi+8*eax]
  movups xmm1,[esi+8*eax+16]
  movups xmm2,[esi+8*eax+32]
  movups xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movups xmm4,[esi+8*eax+64]
  movups xmm5,[esi+8*eax+80]
  movups xmm6,[esi+8*eax+96]
  movups xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add eax,16
  js @LargeUnalignedLoop
  sfence
@Remainder:
  and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
  jz @Done
  add esi,ecx
  add edx,ecx
  neg ecx
@RemainderLoop:
  movups xmm0,[esi+ecx]
  movaps [edx+ecx],xmm0
  add ecx,16
  jnz @RemainderLoop
@Done:
  pop esi
end; {AlignedFwdMoveSSE}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
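{ SSE forward mover, three regimes: up to SMALLMOVESIZE+32 bytes it copies the
  first 32 bytes with two unaligned 16-byte moves and passes the rest to
  SmallForwardMove_3; up to LARGESIZE (2048) it copies 32 bytes per iteration
  with movups loads and aligned movaps stores, keeping the first 16 source
  bytes in XMM0 to patch the unaligned head; for larger counts it aligns the
  destination, calls AlignedFwdMoveSSE_3 for the bulk, and finishes the tail
  with SmallForwardMove_3. }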
procedure Forwards_SSE_3;assembler;nostackframe;
const
  LARGESIZE = 2048;
asm
  cmp ecx,LARGESIZE
  jge @FwdLargeMove
  cmp ecx,SMALLMOVESIZE+32
  movups xmm0,[eax]
  jg @FwdMoveSSE
  movups xmm1,[eax+16]
  movups [edx],xmm0
  movups [edx+16],xmm1
  add eax,ecx
  add edx,ecx
  sub ecx,32
  jmp SmallForwardMove_3
@FwdMoveSSE:
  push ebx
  mov ebx,edx
  {Align Writes}
  add eax,ecx
  add ecx,edx
  add edx,15
  and edx,-16
  sub ecx,edx
  add edx,ecx
  {Now Aligned}
  sub ecx,32
  neg ecx
@FwdLoopSSE:
  movups xmm1,[eax+ecx-32]
  movups xmm2,[eax+ecx-16]
  movaps [edx+ecx-32],xmm1
  movaps [edx+ecx-16],xmm2
  add ecx,32
  jle @FwdLoopSSE
  movups [ebx],xmm0 {First 16 Bytes}
  neg ecx
  add ecx,32
  pop ebx
  jmp SmallForwardMove_3
@FwdLargeMove:
  push ebx
  mov ebx,ecx
  test edx,15
  jz @FwdLargeAligned
  {16 byte Align Destination}
  mov ecx,edx
  add ecx,15
  and ecx,-16
  sub ecx,edx
  add eax,ecx
  add edx,ecx
  sub ebx,ecx
  {Destination now 16 Byte Aligned}
  call SmallForwardMove_3
  mov ecx,ebx
@FwdLargeAligned:
  and ecx,-16
  sub ebx,ecx {EBX = Remainder}
  push edx
  push eax
  push ecx
  call AlignedFwdMoveSSE_3
  pop ecx
  pop eax
  pop edx
  add ecx,ebx
  add eax,ecx
  add edx,ecx
  mov ecx,ebx
  pop ebx
  jmp SmallForwardMove_3
end; {Forwards_SSE}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
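{ SSE backward mover: below SMALLMOVESIZE+32 bytes it copies the last 32 bytes
  with unaligned 16-byte moves and lets SmallBackwardMove_3 handle the rest;
  otherwise it keeps the last 16 source bytes in XMM0, runs a 32-bytes-per-
  iteration loop from the end of the block downwards with aligned stores, and
  finishes the head with SmallBackwardMove_3. There is no separate non-temporal
  large path here; fastmoveproc_backward is only taken in Move below when the
  destination overlaps the source. }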
procedure Backwards_SSE_3;assembler;nostackframe;
asm
  cmp ecx,SMALLMOVESIZE+32
  jg @BwdMoveSSE
  sub ecx,32
  movups xmm1,[eax+ecx]
  movups xmm2,[eax+ecx+16]
  movups [edx+ecx],xmm1
  movups [edx+ecx+16],xmm2
  jmp SmallBackwardMove_3
@BwdMoveSSE:
  push ebx
  movups xmm0,[eax+ecx-16] {Last 16 Bytes}
  {Align Writes}
  lea ebx,[edx+ecx]
  and ebx,15
  sub ecx,ebx
  add ebx,ecx
  {Now Aligned}
  sub ecx,32
@BwdLoop:
  movups xmm1,[eax+ecx]
  movups xmm2,[eax+ecx+16]
  movaps [edx+ecx],xmm1
  movaps [edx+ecx+16],xmm2
  sub ecx,32
  jge @BwdLoop
  movups [edx+ebx-16],xmm0 {Last 16 Bytes}
  add ecx,32
  pop ebx
  jmp SmallBackwardMove_3
end; {Backwards_SSE}

const
  fastmoveproc_forward : pointer = @Forwards_IA32_3;
  fastmoveproc_backward : pointer = @Backwards_IA32_3;

procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
asm
  cmp ecx,SMALLMOVESIZE
  ja @Large
  cmp eax,edx
  lea eax,[eax+ecx]
  jle @SmallCheck
@SmallForward:
  add edx,ecx
  jmp SmallForwardMove_3
@SmallCheck:
  je @Done {For Compatibility with Delphi's move for Source = Dest}
  sub eax,ecx
  jmp SmallBackwardMove_3
@Large:
  jng @Done {For Compatibility with Delphi's move for Count < 0}
  cmp eax,edx
  jg @moveforward
  je @Done {For Compatibility with Delphi's move for Source = Dest}
  push eax
  add eax,ecx
  cmp eax,edx
  pop eax
  jg @movebackward
@moveforward:
  jmp dword ptr fastmoveproc_forward
@movebackward:
  jmp dword ptr fastmoveproc_backward {Source/Dest Overlap}
@Done:
end;
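{ For reference, a rough Pascal sketch of the decision tree implemented by the
  assembler dispatcher above, kept commented out in the style of MoveJOH_PAS_3
  at the top of this file. The procedure name is illustrative only; the
  comments name the assembler routines that actually run. }
(*
procedure MoveDispatchSketch(const Source; var Dest; Count : SizeInt);
var
  S, D : PtrUInt;
begin
  S := PtrUInt(@Source);
  D := PtrUInt(@Dest);
  if (Count <= 0) or (S = D) then
    Exit;                              { Delphi-compatible no-op }
  if Count <= SMALLMOVESIZE then
    begin
      if S > D then
        { SmallForwardMove_3 with EAX = S+Count, EDX = D+Count, ECX = Count }
      else
        { SmallBackwardMove_3 with EAX = S, EDX = D, ECX = Count };
    end
  else if (S > D) or (S + PtrUInt(Count) <= D) then
    { fastmoveproc_forward^ : Forwards_IA32_3 / Forwards_MMX_3 / Forwards_SSE_3 }
  else
    { fastmoveproc_backward^ : destination overlaps the source tail, copy backwards };
end;
*)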
{$asmmode att}

{$ifdef FPC_HAS_VALGRINDBOOL}
{$ifndef FPC_HAS_INDIRECT_MAIN_INFORMATION}
var
  valgrind_used : boolean;external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_MAIN_INFORMATION}
{$endif FPC_HAS_VALGRINDBOOL}
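{ setup_fastmove patches the two dispatch pointers according to the detected
  environment: the valgrind-safe loops when running under valgrind, the SSE
  routines when has_sse_support is set, the MMX routines when only
  has_mmx_support is set, and otherwise the IA-32 defaults the pointers are
  initialised with. has_sse_support and has_mmx_support come from the RTL's
  CPU detection code outside this include file. }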
procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
{$ifdef FPC_HAS_VALGRINDBOOL}
  { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_MAIN_INFORMATION}
  if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_MAIN_INFORMATION}
  if valgrind_used then
{$endif FPC_HAS_INDIRECT_MAIN_INFORMATION}
    begin
      fastmoveproc_forward:=@Forwards_Valgrind;
      fastmoveproc_backward:=@Backwards_Valgrind;
    end
  else
{$endif FPC_HAS_VALGRINDBOOL}
  if has_sse_support then
    begin
      fastmoveproc_forward:=@Forwards_SSE_3;
      fastmoveproc_backward:=@Backwards_SSE_3;
    end
  else if has_mmx_support then
    begin
      fastmoveproc_forward:=@Forwards_MMX_3;
      fastmoveproc_backward:=@Backwards_MMX_3;
    end;
end;

{$endif FPC_SYSTEM_HAS_MOVE}
{$endif USE_FASTMOVE}