{ fastmove.inc }

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{ At least valgrind up to 3.3 has a bug which prevents the default code
  from working, so we use a rather simple implementation here. }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
    sub %eax, %edx { edx = dest - src }
    cmp %edx, %ecx
    ja .LBack { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
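    { E.g. src = $1000, dest = $1003, count = 8: dest - src = 3 and 8 > 3, so a plain
      forward byte copy would overwrite source bytes before reading them; go backwards.
      If dest < src, the subtraction wraps to a huge unsigned value, the branch is not
      taken, and the forward copy is safe even when the regions overlap. }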
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    push %esi
    push %edi
    mov %eax, %esi
    lea (%edx,%eax), %edi
    rep movsb
    pop %edi
    pop %esi
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret
.LBack:
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    add %ecx, %eax
.LNextb:
    dec %eax
    mov (%eax), %bl
    mov %bl, (%edx,%eax)
    dec %ecx
    jnz .LNextb
    pop %ebx
end;

{$if not defined(CPUX86_HAS_SSEUNIT) or defined(FASTMOVE_DISABLE_SSE)}
{$define fastmove_has_ia32_and_mmx}
procedure Move_8OrMore_IA32; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
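    { fild/fistpq move 8 bytes at a time through the x87 stack; a 64-bit integer
      round-trips exactly (the extended-precision mantissa is 64 bits wide), so this
      works for arbitrary byte patterns on CPUs without MMX or SSE. }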
    fildq (%eax) { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
    fildq -8(%eax,%ecx)
    cmp $16, %ecx
    jle .L9to16
    cmp $32, %ecx
    jg .L33OrMore
    fildq 8(%eax)
    fildq -16(%eax,%ecx)
    fistpq -16(%edx,%ecx)
    fistpq 8(%edx)
.L9to16:
    fistpq -8(%edx,%ecx) { 9–16 bytes }
    fistpq (%edx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret
.Lcancel:
    fstp %st(0) { Pop the "second int64 from the end" that .L33OrMore loads. }
    fucompp { Pop the two elements loaded at the beginning. }
    pop %ebx
    ret
    .byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
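    { 102 = $66 (operand-size prefix), 144 = $90 (nop): never-executed filler after the ret,
      presumably sized so that the assembler reaches the .balign 16 below already aligned and
      emits no padding on the fall-through path into .Lloop16f. }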
.L33OrMore:
    fildq -16(%eax,%ecx) { Second int64 from the end. }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    sub %edx, %eax { eax = src - dest }
    jz .Lcancel { exit if src=dest }
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
    mov %edx, %ebx { remember original dest to write first 8 bytes }
    add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
    add $8, %edx
    and $-8, %edx
    sub %edx, %ecx
    sub $16, %ecx
    jbe .LPost16f
    .balign 16 { no-op }
.Lloop16f:
    fildq (%eax,%edx)
    fistpq (%edx)
    fildq 8(%eax,%edx)
    fistpq 8(%edx)
    add $16, %edx
    sub $16, %ecx
    ja .Lloop16f
.LPost16f: { +16 fixup not applied after the 16-bytes-per-iteration loop, ecx = remaining - 16 here. }
    fistpq (%edx,%ecx)
    fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
    fistpq (%ebx) { Important for <8-byte step between src and dest. }
    pop %ebx
    ret
    .byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
    fstp %st(0)
    fildq 8(%eax,%edx) { Second int64 from the start. }
    lea -1(%edx,%ecx), %ebx { points to the last byte of dest; remember to write last 8 bytes }
    mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
    and $-8, %ecx
    sub %edx, %ecx
    add %ecx, %edx
    sub $16, %ecx
    jbe .LPost16b
    .balign 16 { no-op }
.Lloop16b:
    sub $16, %edx
    fildq 8(%eax,%edx)
    fistpq 8(%edx)
    fildq (%eax,%edx)
    fistpq (%edx)
    sub $16, %ecx
    ja .Lloop16b
.LPost16b:
    sub %ecx, %edx
    fistpq -8(%edx)
    fistpq -7(%ebx)
    fistpq -16(%edx)
    pop %ebx
end;

procedure Move_8OrMore_MMX; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
    cmp $72, %ecx { Size at which using MMX becomes worthwhile. }
    jl Move_8OrMore_IA32
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    movq (%eax), %mm4 { First and last 8 bytes. }
    movq -8(%eax,%ecx), %mm5
    movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
    sub %edx, %eax { eax = src - dest }
    jz .Lquit { exit if src=dest }
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
    mov %edx, %ebx { remember original dest to write first 8 bytes }
    add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
    add $8, %edx
    and $-8, %edx
    sub %edx, %ecx
    sub $16, %ecx
    jbe .LPost16f
    .balign 16
.Lloop16f:
    movq (%eax,%edx), %mm0
    movq %mm0, (%edx)
    movq 8(%eax,%edx), %mm0
    movq %mm0, 8(%edx)
    add $16, %edx
    sub $16, %ecx
    ja .Lloop16f
.LPost16f: { +16 fixup not applied after the 16-bytes-per-iteration loop, ecx = remaining - 16 here. }
    movq %mm3, (%edx,%ecx)
    movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
    movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
.Lquit:
    emms
    pop %ebx
    ret
    .byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
    movq 8(%eax,%edx), %mm3 { Second vector from the start. }
    lea -1(%edx,%ecx), %ebx { points to the last byte of dest; remember to write last 8 bytes }
    mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
    and $-8, %ecx
    sub %edx, %ecx
    add %ecx, %edx
    sub $16, %ecx
    jbe .LPost16b
    .balign 16 { no-op }
.Lloop16b:
    sub $16, %edx
    movq 8(%eax,%edx), %mm0
    movq %mm0, 8(%edx)
    movq (%eax,%edx), %mm0
    movq %mm0, (%edx)
    sub $16, %ecx
    ja .Lloop16b
.LPost16b:
    sub %ecx, %edx
    movq %mm3, -8(%edx)
    movq %mm4, -16(%edx)
    movq %mm5, -7(%ebx)
    emms
    pop %ebx
end;
{$endif need IA32 and MMX versions}

{$ifndef FASTMOVE_DISABLE_SSE}
label
  Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;

const
  Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }

procedure Move_8OrMore_SSE; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
const
  PrefetchDistance = 512;
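  { Distance (in bytes) by which the non-temporal loops below prefetch ahead of the current copy position. }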
asm
    cmp $15, %ecx
    jle Move_8OrMore_SSE_9to15
    movups (%eax), %xmm4 { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and the 16–32 branch. }
    movups -16(%eax,%ecx), %xmm5
    cmp $32, %ecx
    jg Move_8OrMore_SSE_33OrMore
    movups %xmm4, (%edx) { 16–32 bytes }
    movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret
Move_8OrMore_SSE_9to15:
    movlps (%eax), %xmm0
    movlps -8(%eax,%ecx), %xmm1
    movlps %xmm0, (%edx)
    movlps %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret
    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
    movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
    { but -32(%eax,%ecx) is about to become not so easily accessible, .Lback is rare, and a small .Lback is even rarer / matters even less. }
    sub %edx, %eax { eax = src - dest }
    jz .Lquit { exit if src=dest }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
    mov %edx, %ebx { remember original dest to write first 16 bytes }
    add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
    add $16, %edx
    and $-16, %edx
    sub %edx, %ecx
.LRestAfterNTf:
    sub $32, %ecx { During the N-bytes-per-iteration loops, ecx is N less than what actually remains, to allow "sub N + jae .LLoop" instead of "sub N + cmp N + jae .LLoop". }
    jbe .LPost32f
    cmp $Move_NtThreshold-32, %ecx
    jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves, so it is better to keep these checks out of here... }
.LNtIsNotBetterF:
    test $15, %eax
    jz .Lalignedloop32f
    .balign 16 { no-op }
.Lloop32f:
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add $32, %edx
    sub $32, %ecx
    ja .Lloop32f
.LPost32f: { +32 fixup not applied after the 32-bytes-per-iteration loop, ecx = remaining - 32 here. }
    movups %xmm3, (%edx,%ecx)
    movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
    movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
    pop %ebx
    ret
    .balign 16
.Lalignedloop32f: { Same as above starting from .Lloop32f, but with MOVAPSes. }
    movaps (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movaps 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add $32, %edx
    sub $32, %ecx
    ja .Lalignedloop32f
.LalignedPost32f:
    movups %xmm3, (%edx,%ecx)
    movups %xmm5, 16(%edx,%ecx)
    movups %xmm4, (%ebx)
    pop %ebx
    ret
.Lntf:
    cmp $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other }
    jb .LNtIsNotBetterF { (this check is performed here so as not to stand in the way of smaller counts) }
    sub $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
    test $15, %eax
    jz .Lalignedntloop64f
    .balign 16
.Lntloop64f:
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add $64, %edx
    sub $64, %ecx
    jae .Lntloop64f
    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTf { go handle remaining bytes }
    .balign 16
.Lalignedntloop64f: { Same as above starting from .Lntloop64f, but with MOVAPSes. }
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movaps (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movaps 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movaps 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movaps 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add $64, %edx
    sub $64, %ecx
    jae .Lalignedntloop64f
    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTf
    .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
Move_8OrMore_SSE_CancelERMSBackwards:
    { Adapted from Move_8OrMore_SSE_ERMS.Lback, where eax = src, edx = dest - src, xmm4 and xmm5 aren't read, and ebx isn't pushed if not FPC_PIC. }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    add %eax, %edx
    movups (%eax), %xmm4
    movups -16(%eax,%ecx), %xmm5
    sub %edx, %eax
{ backwards move }
.Lback:
    movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
    lea -1(%edx,%ecx), %ebx { points to the last byte of dest; remember to write last 16 bytes }
    mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
    and $-16, %ecx
    sub %edx, %ecx
    add %ecx, %edx
.LRestAfterNTb:
    sub $32, %ecx
    jbe .LPost32b
    cmp $Move_NtThreshold-32, %ecx
    jae .Lntb
    .balign 16 { no-op }
.Lloop32b:
    sub $32, %edx
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    sub $32, %ecx
    ja .Lloop32b
.LPost32b:
    sub %ecx, %edx
    movups %xmm3, -16(%edx)
    movups %xmm4, -32(%edx)
    movups %xmm5, -15(%ebx)
    pop %ebx
    ret
.Lntb:
    cmp $-Move_NtThreshold, %eax
    ja .Lloop32b
    sub $PrefetchDistance+32, %ecx
    .balign 16
.Lntloop64b:
    prefetchnta -PrefetchDistance(%eax,%edx,1)
    sub $64, %edx
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    sub $64, %ecx
    jae .Lntloop64b
    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTb
end;

procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
const
  ErmsThreshold = 1536;
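  { Counts at or above this use REP MOVSB ("Enhanced REP MOVSB", hence ERMS); smaller counts go through the SSE code. }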
asm
    cmp $15, %ecx
    jle Move_8OrMore_SSE_9to15
    cmp $ErmsThreshold, %ecx
    jae .LRepMovs
    movups (%eax), %xmm4 { Same as in Move_8OrMore_SSE. }
    movups -16(%eax,%ecx), %xmm5
    cmp $32, %ecx
    jg Move_8OrMore_SSE_33OrMore
    movups %xmm4, (%edx) { 16–32 bytes }
    movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret
.LRepMovs:
    sub %eax, %edx { edx = dest - src }
    jz .Lquit { exit if src=dest }
    cmp %edx, %ecx { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
    ja .Lback
    cmp $Move_NtThreshold+16, %ecx
    jae .LNtF { Even enhanced REP MOVSB does not seem to use NT stores, so it falls behind on huge moves; prioritize the NT path done by Move_8OrMore_SSE. }
.LNtIsNotBetterF:
    push %esi
    push %edi
    mov %eax, %esi
    lea (%edx,%eax), %edi
    rep movsb
    pop %edi
    pop %esi
.Lquit:
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret
.LNtF:
    cmp $-Move_NtThreshold, %edx { Check move distance. Bad case for forward NT is 0 < src - dest < NtThreshold => unsigned(dest - src) > unsigned(-NtThreshold). }
    ja .LNtIsNotBetterF { NT is not better, and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add %eax, %edx { Recover edx = dest. }
    jmp Move_8OrMore_SSE { Will perform NT. }
.Lback:
    { dst = 3
              v
      Move(abcdefghijXXX, count=10)
           ^
           src = 0

      = abcABCDEFGHIJ

      can be moved right to left in non-overlapping groups of "dst - src" bytes:

      abcdefghijHIJ
                ^^^
      abcdefgEFGhij
             ^^^
      abcdBCDefghij
          ^^^
      abcAbcdefghij <- the tail is handled by restarting the Move with the corresponding count instead, as it can have 0 to dst - src - 1 bytes.
         ^

      Only REP MOVs with DF=0 are fast with ERMS, in case you're wondering why not just use DF=1. }
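    { In rough Pascal terms (illustrative sketch only; CopyForwardBytes stands for the
      rep movsb of one piece, and pointer arithmetic is simplified), the piece-wise
      backward loop below does:

        step := dst - src;                     // here 0 < step < count
        remaining := count - step;
        p := src + count;
        repeat                                 // at least one piece is copied
          p := p - step;
          CopyForwardBytes(p, p + step, step);
          remaining := remaining - step;
        until remaining < 0;
        Move(source, dest, remaining + step);  // leftover tail of 0..step-1 bytes

      matching the diagram above. }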
    cmp $ErmsThreshold, %edx
    jb Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can't benefit the common case of a small distance (like inserting 1 array element at the beginning). :( }
    cmp $Move_NtThreshold+16, %ecx
    jae .LNtB
.LNtIsNotBetterB:
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    mov %ecx, %ebx { ebx = remaining }
    sub %edx, %ebx { edx = dst - src = step; remaining -= step. }
    add %ecx, %eax
    push %esi
    push %edi
.LRepMovsNextPieceB: { At least 1 iteration is always performed. }
    mov %eax, %edi { edi = src before subtracting step = dst = rep movsb dest }
    sub %edx, %eax { src -= step }
    mov %eax, %esi { esi = src = rep movsb source }
    mov %edx, %ecx { ecx = step = rep movsb count }
    rep movsb
    sub %edx, %ebx { remaining -= step }
    jnc .LRepMovsNextPieceB { CF=1 after remaining -= step means remaining became strictly negative, so the loop must stop. }
    pop %edi
    pop %esi
    lea (%edx,%ebx), %ecx { ecx = remaining }
    sub %ecx, %eax { eax = src }
    add %eax, %edx { edx = dest }
    pop %ebx
    jmp Move { Remaining piece ("a" in the example above). Could save a few jumps by doing more checks and jumping to more specific places, but whatever. }
.LNtB:
    cmp $Move_NtThreshold, %edx { Check move distance. Bad case for backward NT is dest - src < NtThreshold; src is always < dest. }
    jb .LNtIsNotBetterB { NT is not better, and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add %eax, %edx { Recover edx = dest. }
    jmp Move_8OrMore_SSE { Will perform NT. }
end;
{$endif ndef FASTMOVE_DISABLE_SSE}

procedure Move_8OrMore_Dispatch; forward;

var
  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
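  { Starts out pointing at the dispatcher; Move_8OrMore_HumanFriendlyDispatch replaces it with the routine chosen for this CPU once fpc_cpucodeinit_performed is set. }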
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  valgrind_used : boolean; external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

function Move_8OrMore_HumanFriendlyDispatch: pointer;
begin
  { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
  else if fast_large_repmovstosb then
    result:=@Move_8OrMore_SSE_ERMS
  else {$ifdef fastmove_has_ia32_and_mmx} if has_sse_support then {$endif}
    result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
{$ifdef fastmove_has_ia32_and_mmx}
  else if has_mmx_support then
    result:=@Move_8OrMore_MMX
  else
    result:=@Move_8OrMore_IA32
{$endif fastmove_has_ia32_and_mmx};
  if fpc_cpucodeinit_performed then
    fastmoveproc:=result;
end;

procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    push %eax
    push %edx
    push %ecx
    call Move_8OrMore_HumanFriendlyDispatch
    mov %eax, %ebx
    pop %ecx
    pop %edx
    pop %eax
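    { With FPC_PIC, ebx was already pushed by the caller and the selected routine pops it, so tail-jump; otherwise call the routine and restore the ebx pushed above. }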
{$ifdef FPC_PIC}
    jmp %ebx
{$else}
    call %ebx
    pop %ebx
{$endif}
end;

procedure Move(const source; var dest; count: SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
asm
    cmp $8, %ecx
    jle .L8OrLess
{$ifdef FPC_PIC}
    push %ebx
    call fpc_geteipasebx
    addl $_GLOBAL_OFFSET_TABLE_, %ebx
    movl fastmoveproc@GOT(%ebx), %ebx
    jmp (%ebx)
{$else}
    jmp fastmoveproc
{$endif}
.L8OrLess:
    cmp $3, %ecx
    jle .L3OrLess
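    { 4 to 8 bytes: two possibly overlapping 4-byte moves cover the whole range. }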
    push %ebx
    mov (%eax), %ebx
    mov -4(%eax,%ecx), %eax
    mov %ebx, (%edx)
    mov %eax, -4(%edx,%ecx)
    pop %ebx
    ret
.L3OrLess:
    cmp $1, %ecx
    jl .LZero
    push %ebx
    movzbl (%eax), %ebx
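    { Flags from "cmp $1, %ecx" survive the push and movzbl: exactly 1 byte skips the word copy below. }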
    je .LOne
    movzwl -2(%eax,%ecx), %eax
    mov %ax, -2(%edx,%ecx)
.LOne:
    mov %bl, (%edx)
    pop %ebx
.LZero:
end;
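
{ Illustration only (not part of the unit): overlapping Move calls like the one below are
  what the backward-copy paths above exist for; names and values here are made up.

    var
      a: array[0..9] of LongInt;
      i: LongInt;
    begin
      for i := 0 to 9 do
        a[i] := i;
      Move(a[0], a[1], 9 * SizeOf(LongInt));  // source and dest overlap; copied backwards
      a[0] := -1;                             // a is now -1, 0, 1, ..., 8
    end;
}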
{$endif FPC_SYSTEM_HAS_MOVE}