{ fastmove.inc }
{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{ At least valgrind up to 3.3 has a bug which prevents the default code from
  working, so we use a rather simple implementation here. }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
        sub     %eax, %edx           { edx = dest - src }
        cmp     %edx, %ecx
        ja      .LBack               { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
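        { The single unsigned comparison covers both directions: e.g.
          src = 100, dest = 104, count = 16 gives dest - src = 4 < 16, so the
          regions overlap and a backward copy is required; if dest < src,
          dest - src wraps to a huge unsigned value and the forward copy
          below is always chosen. }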
{$ifdef FPC_ENABLED_CLD}
        cld
{$endif FPC_ENABLED_CLD}
        push    %esi
        push    %edi
        mov     %eax, %esi
        lea     (%edx,%eax), %edi
        rep movsb
        pop     %edi
        pop     %esi
{$ifdef FPC_PIC}
        pop     %ebx
{$endif}
        ret
.LBack:
{$ifndef FPC_PIC}
        push    %ebx
{$endif}
        add     %ecx, %eax
.LNextb:
        dec     %eax
        mov     (%eax), %bl
        mov     %bl, (%edx,%eax)
        dec     %ecx
        jnz     .LNextb
        pop     %ebx
end;
procedure Move_8OrMore_IA32; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
        fildq   (%eax)               { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
        fildq   -8(%eax,%ecx)
        cmp     $16, %ecx
        jle     .L9to16
        cmp     $32, %ecx
        jg      .L33OrMore
        fildq   8(%eax)
        fildq   -16(%eax,%ecx)
        fistpq  -16(%edx,%ecx)
        fistpq  8(%edx)
.L9to16:
        fistpq  -8(%edx,%ecx)        { 9–16 bytes }
        fistpq  (%edx)
{$ifdef FPC_PIC}
        pop     %ebx
{$endif}
        ret
.Lcancel:
        fstp    %st(0)               { Pop the “second int64 from the end” .L33OrMore loads. }
        fucompp                      { Pop two elements loaded at the beginning. }
        pop     %ebx
        ret
        .byte   102,102,144          { Turns .balign 16 before .Lloop16f into a no-op. }
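        { 102,102,144 = $66,$66,$90, an operand-size-prefixed NOP. It is never
          executed (it follows the ret); it only pads the code so that the
          .balign 16 before .Lloop16f needs no fill bytes of its own. }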
.L33OrMore:
        fildq   -16(%eax,%ecx)       { Second int64 from the end. }
{$ifndef FPC_PIC}
        push    %ebx
{$endif}
        sub     %edx, %eax           { eax = src - dest }
        jz      .Lcancel             { exit if src=dest }
        mov     %eax, %ebx
        neg     %ebx
        cmp     %ebx, %ecx
        ja      .Lback               { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
        mov     %edx, %ebx           { remember original dest to write first 8 bytes }
        add     %edx, %ecx           { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
        add     $8, %edx
        and     $-8, %edx
        sub     %edx, %ecx
        sub     $16, %ecx
        jbe     .LPost16f
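        { Example: dest = $1005, count = 40. ecx := $102D (one past the last
          dest byte), edx := $1008 (dest rounded up to the next 8-byte
          boundary; dest+8 if dest were already aligned), ecx := $25 = 37
          bytes from the aligned dest to the end, then ecx -= 16 for the
          loop-counter trick used by .Lloop16f below. }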
        .balign 16                   { no-op }
.Lloop16f:
        fildq   (%eax,%edx)
        fistpq  (%edx)
        fildq   8(%eax,%edx)
        fistpq  8(%edx)
        add     $16, %edx
        sub     $16, %ecx
        ja      .Lloop16f
.LPost16f:                           { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
        fistpq  (%edx,%ecx)
        fistpq  8(%edx,%ecx)         { Write first and last 8 bytes after everything else. }
        fistpq  (%ebx)               { Important for <8-byte step between src and dest. }
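        { These values were loaded before the loop on purpose: with a forward
          copy whose step (src - dest) is smaller than 8 bytes, an early
          8-byte store to the original dest would clobber source bytes that
          .Lloop16f still has to read, so the first 8 bytes are stored only
          now, from the value captured up front. }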
        pop     %ebx
        ret
        .byte   102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
        fstp    %st(0)
        fildq   8(%eax,%edx)         { Second int64 from the start. }
        lea     -1(%edx,%ecx), %ebx  { points to the end of dest; remember to write last 8 bytes }
        mov     %ebx, %ecx           { move dest to the previous 8-byte boundary... }
        and     $-8, %ecx
        sub     %edx, %ecx
        add     %ecx, %edx
        sub     $16, %ecx
        jbe     .LPost16b
        .balign 16                   { no-op }
.Lloop16b:
        sub     $16, %edx
        fildq   8(%eax,%edx)
        fistpq  8(%edx)
        fildq   (%eax,%edx)
        fistpq  (%edx)
        sub     $16, %ecx
        ja      .Lloop16b
.LPost16b:
        sub     %ecx, %edx
        fistpq  -8(%edx)
        fistpq  -7(%ebx)
        fistpq  -16(%edx)
        pop     %ebx
end;
procedure Move_8OrMore_MMX; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
        cmp     $72, %ecx            { Size at which using MMX becomes worthwhile. }
        jl      Move_8OrMore_IA32
{$ifndef FPC_PIC}
        push    %ebx
{$endif}
        movq    (%eax), %mm4         { First and last 8 bytes. }
        movq    -8(%eax,%ecx), %mm5
        movq    -16(%eax,%ecx), %mm3 { Second vector from the end. }
        sub     %edx, %eax           { eax = src - dest }
        jz      .Lquit               { exit if src=dest }
        mov     %eax, %ebx
        neg     %ebx
        cmp     %ebx, %ecx
        ja      .Lback               { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
        mov     %edx, %ebx           { remember original dest to write first 8 bytes }
        add     %edx, %ecx           { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
        add     $8, %edx
        and     $-8, %edx
        sub     %edx, %ecx
        sub     $16, %ecx
        jbe     .LPost16f
        .balign 16
.Lloop16f:
        movq    (%eax,%edx), %mm0
        movq    %mm0, (%edx)
        movq    8(%eax,%edx), %mm0
        movq    %mm0, 8(%edx)
        add     $16, %edx
        sub     $16, %ecx
        ja      .Lloop16f
.LPost16f:                           { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
        movq    %mm3, (%edx,%ecx)
        movq    %mm5, 8(%edx,%ecx)   { Write first and last 8 bytes after everything else. }
        movq    %mm4, (%ebx)         { Important for <8-byte step between src and dest. }
.Lquit:
        emms
        pop     %ebx
        ret
        .byte   144                  { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
        movq    8(%eax,%edx), %mm3   { Second vector from the start. }
        lea     -1(%edx,%ecx), %ebx  { points to the end of dest; remember to write last 8 bytes }
        mov     %ebx, %ecx           { move dest to the previous 8-byte boundary... }
        and     $-8, %ecx
        sub     %edx, %ecx
        add     %ecx, %edx
        sub     $16, %ecx
        jbe     .LPost16b
        .balign 16                   { no-op }
.Lloop16b:
        sub     $16, %edx
        movq    8(%eax,%edx), %mm0
        movq    %mm0, 8(%edx)
        movq    (%eax,%edx), %mm0
        movq    %mm0, (%edx)
        sub     $16, %ecx
        ja      .Lloop16b
.LPost16b:
        sub     %ecx, %edx
        movq    %mm3, -8(%edx)
        movq    %mm4, -16(%edx)
        movq    %mm5, -7(%ebx)
        emms
        pop     %ebx
end;
{$ifndef FASTMOVE_DISABLE_SSE}
label
  Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;

const
  Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
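{ Above Move_NtThreshold the SSE code switches to MOVNTPS: non-temporal
  stores bypass the cache, which avoids evicting the working set when the
  copy is far larger than L2, at the cost of being slower for data that is
  reused soon afterwards. }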
procedure Move_8OrMore_SSE; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
const
  PrefetchDistance = 512;
asm
        cmp     $15, %ecx
        jle     Move_8OrMore_SSE_9to15
        movups  (%eax), %xmm4        { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and the 16–32 branch. }
        movups  -16(%eax,%ecx), %xmm5
        cmp     $32, %ecx
        jg      Move_8OrMore_SSE_33OrMore
        movups  %xmm4, (%edx)        { 16–32 bytes }
        movups  %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
        pop     %ebx
{$endif}
        ret
Move_8OrMore_SSE_9to15:
        movlps  (%eax), %xmm0
        movlps  -8(%eax,%ecx), %xmm1
        movlps  %xmm0, (%edx)
        movlps  %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
        pop     %ebx
{$endif}
        ret
        .byte   {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
        movups  -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
                                      { but -32(%eax,%ecx) is about to become harder to address, .Lback is rare, and a small .Lback is rarer still / matters even less. }
        sub     %edx, %eax           { eax = src - dest }
        jz      .Lquit               { exit if src=dest }
{$ifndef FPC_PIC}
        push    %ebx
{$endif}
        mov     %eax, %ebx
        neg     %ebx
        cmp     %ebx, %ecx
        ja      .Lback               { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
        mov     %edx, %ebx           { remember original dest to write first 16 bytes }
        add     %edx, %ecx           { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
        add     $16, %edx
        and     $-16, %edx
        sub     %edx, %ecx
.LRestAfterNTf:
        sub     $32, %ecx            { During the N× loop, ecx is N bytes less than what actually remains, to allow "sub N + jae .LLoop" instead of "sub N + cmp N + jae .LLoop". }
        jbe     .LPost32f
        cmp     $Move_NtThreshold-32, %ecx
        jae     .Lntf                { Might jump back right away after more checks, but the branch is taken only on huge moves, so it is better to keep those checks out of here... }
.LNtIsNotBetterF:
        test    $15, %eax
        jz      .Lalignedloop32f
        .balign 16                   { no-op }
.Lloop32f:
        movups  (%eax,%edx), %xmm0
        movaps  %xmm0, (%edx)
        movups  16(%eax,%edx), %xmm0
        movaps  %xmm0, 16(%edx)
        add     $32, %edx
        sub     $32, %ecx
        ja      .Lloop32f
.LPost32f:                           { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
        movups  %xmm3, (%edx,%ecx)
        movups  %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
        movups  %xmm4, (%ebx)        { Important for <16-byte step between src and dest. }
        pop     %ebx
        ret
        .balign 16
.Lalignedloop32f:                    { Same as above starting from .Lloop32f but with MOVAPSes. }
        movaps  (%eax,%edx), %xmm0
        movaps  %xmm0, (%edx)
        movaps  16(%eax,%edx), %xmm0
        movaps  %xmm0, 16(%edx)
        add     $32, %edx
        sub     $32, %ecx
        ja      .Lalignedloop32f
.LalignedPost32f:
        movups  %xmm3, (%edx,%ecx)
        movups  %xmm5, 16(%edx,%ecx)
        movups  %xmm4, (%ebx)
        pop     %ebx
        ret
.Lntf:
        cmp     $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other }
        jb      .LNtIsNotBetterF     { (this check is performed here to not stand in the way of smaller counts). }
        sub     $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
        test    $15, %eax
        jz      .Lalignedntloop64f
        .balign 16
.Lntloop64f:
        prefetchnta 0+PrefetchDistance(%eax,%edx,1)
        movups  (%eax,%edx,1), %xmm0
        movntps %xmm0, (%edx)
        movups  16(%eax,%edx,1), %xmm0
        movntps %xmm0, 16(%edx)
        movups  32(%eax,%edx,1), %xmm0
        movntps %xmm0, 32(%edx)
        movups  48(%eax,%edx,1), %xmm0
        movntps %xmm0, 48(%edx)
        add     $64, %edx
        sub     $64, %ecx
        jae     .Lntloop64f
        sfence
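        { sfence orders the weakly-ordered MOVNTPS stores above before any
          ordinary stores that follow. }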
        add     $PrefetchDistance+64, %ecx
        jmp     .LRestAfterNTf       { go handle remaining bytes }
        .balign 16
.Lalignedntloop64f:                  { Same as above starting from .Lntloop64f but with MOVAPSes. }
        prefetchnta 0+PrefetchDistance(%eax,%edx,1)
        movaps  (%eax,%edx,1), %xmm0
        movntps %xmm0, (%edx)
        movaps  16(%eax,%edx,1), %xmm0
        movntps %xmm0, 16(%edx)
        movaps  32(%eax,%edx,1), %xmm0
        movntps %xmm0, 32(%edx)
        movaps  48(%eax,%edx,1), %xmm0
        movntps %xmm0, 48(%edx)
        add     $64, %edx
        sub     $64, %ecx
        jae     .Lalignedntloop64f
        sfence
        add     $PrefetchDistance+64, %ecx
        jmp     .LRestAfterNTf
        .byte   {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
Move_8OrMore_SSE_CancelERMSBackwards:
{ Adapted from Move_8OrMore_SSE_ERMS.Lback, where eax = src, edx = dest - src, xmm4 and xmm5 have not been loaded, and ebx isn't pushed if not FPC_PIC. }
{$ifndef FPC_PIC}
        push    %ebx
{$endif}
        add     %eax, %edx
        movups  (%eax), %xmm4
        movups  -16(%eax,%ecx), %xmm5
        sub     %edx, %eax
{ backwards move }
.Lback:
        movups  16(%eax,%edx), %xmm3 { Second vector from the start. }
        lea     -1(%edx,%ecx), %ebx  { points to the end of dest; remember to write last 16 bytes }
        mov     %ebx, %ecx           { move dest to the previous 16-byte boundary... }
        and     $-16, %ecx
        sub     %edx, %ecx
        add     %ecx, %edx
.LRestAfterNTb:
        sub     $32, %ecx
        jbe     .LPost32b
        cmp     $Move_NtThreshold-32, %ecx
        jae     .Lntb
        .balign 16                   { no-op }
.Lloop32b:
        sub     $32, %edx
        movups  16(%eax,%edx), %xmm0
        movaps  %xmm0, 16(%edx)
        movups  (%eax,%edx), %xmm0
        movaps  %xmm0, (%edx)
        sub     $32, %ecx
        ja      .Lloop32b
.LPost32b:
        sub     %ecx, %edx
        movups  %xmm3, -16(%edx)
        movups  %xmm4, -32(%edx)
        movups  %xmm5, -15(%ebx)
        pop     %ebx
        ret
.Lntb:
        cmp     $-Move_NtThreshold, %eax
        ja      .Lloop32b
        sub     $PrefetchDistance+32, %ecx
        .balign 16
.Lntloop64b:
        prefetchnta -PrefetchDistance(%eax,%edx,1)
        sub     $64, %edx
        movups  48(%eax,%edx,1), %xmm0
        movntps %xmm0, 48(%edx)
        movups  32(%eax,%edx,1), %xmm0
        movntps %xmm0, 32(%edx)
        movups  16(%eax,%edx,1), %xmm0
        movntps %xmm0, 16(%edx)
        movups  (%eax,%edx,1), %xmm0
        movntps %xmm0, (%edx)
        sub     $64, %ecx
        jae     .Lntloop64b
        sfence
        add     $PrefetchDistance+64, %ecx
        jmp     .LRestAfterNTb
end;
procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
const
  ErmsThreshold = 1536;
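{ ERMS = "Enhanced REP MOVSB/STOSB" CPUID feature (exposed here via
  fast_large_repmovstosb): on such CPUs REP MOVSB is fast for medium and
  large copies, so from ErmsThreshold bytes upwards this routine prefers it
  over the SSE loops; below the threshold it falls through to the same code
  paths as Move_8OrMore_SSE. }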
asm
        cmp     $15, %ecx
        jle     Move_8OrMore_SSE_9to15
        cmp     $ErmsThreshold, %ecx
        jae     .LRepMovs
        movups  (%eax), %xmm4        { Same as in Move_8OrMore_SSE. }
        movups  -16(%eax,%ecx), %xmm5
        cmp     $32, %ecx
        jg      Move_8OrMore_SSE_33OrMore
        movups  %xmm4, (%edx)        { 16–32 bytes }
        movups  %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
        pop     %ebx
{$endif}
        ret
.LRepMovs:
        sub     %eax, %edx           { edx = dest - src }
        jz      .Lquit               { exit if src=dest }
        cmp     %edx, %ecx           { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
        ja      .Lback
        cmp     $Move_NtThreshold+16, %ecx
        jae     .LNtF                { Even enhanced REP MOVSB does not appear to use NT stores, so it falls behind on huge moves. Prioritize the NT path done by Move_8OrMore_SSE. }
.LNtIsNotBetterF:
        push    %esi
        push    %edi
        mov     %eax, %esi
        lea     (%edx,%eax), %edi
        rep movsb
        pop     %edi
        pop     %esi
.Lquit:
{$ifdef FPC_PIC}
        pop     %ebx
{$endif}
        ret
.LNtF:
        cmp     $-Move_NtThreshold, %edx { Check move distance. Bad case for forward NT is 0 < src - dest < NtThreshold => unsigned(dest - src) > unsigned(-NtThreshold). }
        ja      .LNtIsNotBetterF     { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
        add     %eax, %edx           { Recover edx = dest. }
        jmp     Move_8OrMore_SSE     { Will perform NT. }
.Lback:
        {          dst = 3
                   v
           Move(abcdefghijXXX, count=10)
                ^
                src = 0

              = abcABCDEFGHIJ

           can be moved right to left in non-overlapping groups of "dst - src":

                abcdefghijHIJ
                          ^^^
                abcdefgEFGhij
                       ^^^
                abcdBCDefghij
                    ^^^
                abcAbcdefghij <- tail is handled by restarting the Move with the corresponding count instead, as it can have 0 to dst - src - 1 bytes.
                   ^

           Only REP MOVs with DF=0 are fast with ERMS, in case you're wondering why not just use DF=1. }
        cmp     $ErmsThreshold, %edx
        jb      Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can't benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
        cmp     $Move_NtThreshold+16, %ecx
        jae     .LNtB
.LNtIsNotBetterB:
{$ifndef FPC_PIC}
        push    %ebx
{$endif}
        mov     %ecx, %ebx           { ebx = remaining }
        sub     %edx, %ebx           { edx = dst - src = step; remaining -= step. }
        add     %ecx, %eax
        push    %esi
        push    %edi
.LRepMovsNextPieceB:                 { At least 1 iteration is always performed. }
        mov     %eax, %edi           { edi = src before subtracting step = dst = rep movsb dest }
        sub     %edx, %eax           { src -= step }
        mov     %eax, %esi           { esi = src = rep movsb source }
        mov     %edx, %ecx           { ecx = step = rep movsb count }
        rep movsb
        sub     %edx, %ebx           { remaining -= step }
        jnc     .LRepMovsNextPieceB  { CF=1 after remaining -= step means that remaining became strictly negative and the loop must stop. }
        pop     %edi
        pop     %esi
        lea     (%edx,%ebx), %ecx    { ecx = remaining }
        sub     %ecx, %eax           { eax = src }
        add     %eax, %edx           { edx = dest }
        pop     %ebx
        jmp     Move                 { Remaining piece ("a" in the example above). Could save a few jumps by doing more checks and jumping to more specific places, but whatever. }
.LNtB:
        cmp     $Move_NtThreshold, %edx { Check move distance. Bad case for backward NT is dest - src < NtThreshold; src is always < dest. }
        jb      .LNtIsNotBetterB     { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
        add     %eax, %edx           { Recover edx = dest. }
        jmp     Move_8OrMore_SSE     { Will perform NT. }
end;
{$endif ndef FASTMOVE_DISABLE_SSE}

procedure Move_8OrMore_Dispatch; forward;

var
  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
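  { fastmoveproc initially points at Move_8OrMore_Dispatch: the first Move of
    more than 8 bytes lands there, Move_8OrMore_HumanFriendlyDispatch picks
    the best routine for the current CPU, and once fpc_cpucodeinit_performed
    is set the choice is cached back into fastmoveproc so that later calls
    jump to the selected routine directly. }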
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  valgrind_used : boolean; external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

function Move_8OrMore_HumanFriendlyDispatch: pointer;
begin
  { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
  else if fast_large_repmovstosb then
    result:=@Move_8OrMore_SSE_ERMS
  else if has_sse_support then
    result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
  else if has_mmx_support then
    result:=@Move_8OrMore_MMX
  else
    result:=@Move_8OrMore_IA32;
  if fpc_cpucodeinit_performed then
    fastmoveproc:=result;
end;

procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
        push    %ebx
{$endif}
        push    %eax
        push    %edx
        push    %ecx
        call    Move_8OrMore_HumanFriendlyDispatch
        mov     %eax, %ebx
        pop     %ecx
        pop     %edx
        pop     %eax
{$ifdef FPC_PIC}
        jmp     %ebx
{$else}
        call    %ebx
        pop     %ebx
{$endif}
end;
procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
asm
        cmp     $8, %ecx
        jle     .L8OrLess
{$ifdef FPC_PIC}
        push    %ebx
        call    fpc_geteipasebx
        addl    $_GLOBAL_OFFSET_TABLE_, %ebx
        movl    fastmoveproc@GOT(%ebx), %ebx
        jmp     (%ebx)
{$else}
        jmp     fastmoveproc
{$endif}
.L8OrLess:
        cmp     $3, %ecx
        jle     .L3OrLess
        push    %ebx
        mov     (%eax), %ebx
        mov     -4(%eax,%ecx), %eax
        mov     %ebx, (%edx)
        mov     %eax, -4(%edx,%ecx)
        pop     %ebx
        ret
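        { 4–8 bytes are copied as two possibly overlapping dwords: the first 4
          and the last 4 bytes of the range. E.g. count=5 writes bytes 0–3 and
          1–4; both values are loaded before either store, so this is also
          safe when source and dest overlap. }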
.L3OrLess:
        cmp     $1, %ecx
        jl      .LZero
        push    %ebx
        movzbl  (%eax), %ebx
        je      .LOne
        movzwl  -2(%eax,%ecx), %eax
        mov     %ax, -2(%edx,%ecx)
.LOne:
        mov     %bl, (%edx)
        pop     %ebx
.LZero:
end;
{$endif FPC_SYSTEM_HAS_MOVE}
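{ Illustrative use of the Move entry point above (not part of this include;
  "buf" is just a hypothetical local variable):

    var
      buf: array[0..15] of byte;
    ...
      Move(buf[0], buf[4], 12);  // shift the first 12 bytes up by 4
      Move(buf[4], buf[0], 12);  // and back down again

  Both calls involve overlapping ranges, which the forward/backward paths
  above handle; counts of 8 or less never leave Move itself. }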