{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{ At least valgrind up to 3.3 has a bug which prevents the default code
  from working, so we use a rather simple implementation here. }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
    push    %ebx
{$endif}
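    { eax := source - dest. With this delta, (%eax,%edx) re-derives the source
      address from any dest pointer, and the borrow of the subtraction decides
      the copy direction: if source >= dest, a forward copy is always safe. }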
    sub     %edx, %eax
    jae     .LForward
    mov     %ecx, %ebx
    add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb      .LBack              { if no overlap, still do forward move }
.LForward:
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    push    %esi
    push    %edi
    lea     (%eax,%edx), %esi
    mov     %edx, %edi
    rep movsb
    pop     %edi
    pop     %esi
    pop     %ebx
    ret
.LBack:
    add     %ecx, %edx
.LNextb:
    dec     %edx
    mov     (%eax,%edx), %bl
    mov     %bl, (%edx)
    dec     %ecx
    jnz     .LNextb
    pop     %ebx
end;

procedure Move_8OrMore_IA32; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
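    { Plain IA-32 fallback: FILDQ/FISTPQ pairs are used purely as 8-byte loads
      and stores through the x87 stack, since without MMX/SSE the
      general-purpose registers can only move 4 bytes at a time. }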
    fildq   (%eax)              { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
    fildq   -8(%eax,%ecx)
    cmp     $16, %ecx
    jle     .L9to16
    cmp     $32, %ecx
    jg      .L33OrMore
    fildq   8(%eax)
    fildq   -16(%eax,%ecx)
    fistpq  -16(%edx,%ecx)
    fistpq  8(%edx)
.L9to16:
    fistpq  -8(%edx,%ecx)       { 9–16 bytes }
    fistpq  (%edx)
{$ifdef FPC_PIC}
    pop     %ebx
{$endif}
    ret
.Lcancel:
    fucompp                     { Pop two elements loaded at the beginning. }
{$ifdef FPC_PIC}
    pop     %ebx
{$endif}
    ret
    .byte   {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
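    { 102 = $66 operand-size prefix, 144 = $90 NOP: a filler of exactly the size
      needed so that the .balign 16 further down has no padding to insert. }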
.L33OrMore:
    sub     %edx, %eax          { eax = src - dest }
    jz      .Lcancel            { exit if src=dest }
{$ifndef FPC_PIC}
    push    %ebx
{$endif}
    jnb     .LForward           { src>dest => forward move }
    mov     %ecx, %ebx
    add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb      .Lback              { if no overlap, still do forward move }
.LForward:
    mov     %edx, %ebx          { remember original dest to write first 8 bytes }
    add     %edx, %ecx          { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add     $8, %edx
    and     $-8, %edx
    sub     %edx, %ecx
    sub     $16, %ecx
    jbe     .LPost16f
    .balign 16                  { no-op }
.Lloop16f:
    fildq   (%eax,%edx)
    fistpq  (%edx)
    fildq   8(%eax,%edx)
    fistpq  8(%edx)
    add     $16, %edx
    sub     $16, %ecx
    ja      .Lloop16f
.LPost16f:                      { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
    cmp     $-8, %ecx
    jle     .LFirstAndLast8f
    fildq   (%eax,%edx)
    fistpq  (%edx)
.LFirstAndLast8f:
    fistpq  8(%edx,%ecx)        { Write first and last 8 bytes after everything else. }
    fistpq  (%ebx)              { Important for <8-byte step between src and dest. }
    pop     %ebx
    ret
    .byte   102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
    lea     -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov     %ebx, %ecx          { move dest to the previous 8-byte boundary... }
    and     $-8, %ecx
    sub     %edx, %ecx
    add     %ecx, %edx
    sub     $16, %ecx
    jbe     .LPost16b
    .balign 16                  { no-op }
.Lloop16b:
    sub     $16, %edx
    fildq   8(%eax,%edx)
    fistpq  8(%edx)
    fildq   (%eax,%edx)
    fistpq  (%edx)
    sub     $16, %ecx
    ja      .Lloop16b
.LPost16b:
    cmp     $-8, %ecx
    jle     .LFirstAndLast8b
    fildq   -8(%eax,%edx)
    fistpq  -8(%edx)
.LFirstAndLast8b:
    sub     %ecx, %edx
    fistpq  -7(%ebx)
    fistpq  -16(%edx)
    pop     %ebx
end;

procedure Move_8OrMore_MMX; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
    cmp     $72, %ecx           { Size at which using MMX becomes worthwhile. }
    jl      Move_8OrMore_IA32
{$ifndef FPC_PIC}
    push    %ebx
{$endif}
    movq    (%eax), %mm4        { First and last 8 bytes. }
    movq    -8(%eax,%ecx), %mm5
    sub     %edx, %eax          { eax = src - dest }
    jz      .Lquit              { exit if src=dest }
    jnb     .LForward           { src>dest => forward move }
    mov     %ecx, %ebx
    add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb      .Lback              { if no overlap, still do forward move }
.LForward:
    mov     %edx, %ebx          { remember original dest to write first 8 bytes }
    add     %edx, %ecx          { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add     $8, %edx
    and     $-8, %edx
    sub     %edx, %ecx
    sub     $16, %ecx
    jbe     .LPost16f
    .balign 16
.Lloop16f:
    movq    (%eax,%edx), %mm0
    movq    %mm0, (%edx)
    movq    8(%eax,%edx), %mm0
    movq    %mm0, 8(%edx)
    add     $16, %edx
    sub     $16, %ecx
    ja      .Lloop16f
.LPost16f:                      { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
    cmp     $-8, %ecx
    jle     .LFirstAndLast8f
    movq    (%eax,%edx), %mm0
    movq    %mm0, (%edx)
.LFirstAndLast8f:
    movq    %mm5, 8(%edx,%ecx)  { Write first and last 8 bytes after everything else. }
    movq    %mm4, (%ebx)        { Important for <8-byte step between src and dest. }
.Lquit:
    emms
    pop     %ebx
    ret
    .byte   102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
    lea     -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov     %ebx, %ecx          { move dest to the previous 8-byte boundary... }
    and     $-8, %ecx
    sub     %edx, %ecx
    add     %ecx, %edx
    sub     $16, %ecx
    jbe     .LPost16b
    .balign 16                  { no-op }
.Lloop16b:
    sub     $16, %edx
    movq    8(%eax,%edx), %mm0
    movq    %mm0, 8(%edx)
    movq    (%eax,%edx), %mm0
    movq    %mm0, (%edx)
    sub     $16, %ecx
    ja      .Lloop16b
.LPost16b:
    cmp     $-8, %ecx
    jle     .LFirstAndLast8b
    movq    -8(%eax,%edx), %mm0
    movq    %mm0, -8(%edx)
.LFirstAndLast8b:
    sub     %ecx, %edx
    movq    %mm4, -16(%edx)
    movq    %mm5, -7(%ebx)
    emms
    pop     %ebx
end;

{$ifndef FASTMOVE_DISABLE_SSE}
label
  Move_8OrMore_SSE_9to16, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;

const
  Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }

procedure Move_8OrMore_SSE; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
const
  PrefetchDistance = 512;
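  { prefetchnta is issued this many bytes ahead of the current source position
    in the non-temporal copy loops below. }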
asm
    cmp     $16, %ecx
    jle     Move_8OrMore_SSE_9to16
    movups  (%eax), %xmm4       { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and 17–32 branch. }
    movups  -16(%eax,%ecx), %xmm5
    cmp     $32, %ecx
    jg      Move_8OrMore_SSE_33OrMore
    movups  %xmm4, (%edx)       { 17–32 bytes }
    movups  %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop     %ebx
{$endif}
    ret
Move_8OrMore_SSE_9to16:
    movlps  (%eax), %xmm0
    movlps  -8(%eax,%ecx), %xmm1
    movlps  %xmm0, (%edx)
    movlps  %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
    pop     %ebx
{$endif}
    ret
    .byte   {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
    sub     %edx, %eax          { eax = src - dest }
    jz      .Lquit              { exit if src=dest }
{$ifndef FPC_PIC}
    push    %ebx
{$endif}
    jnb     .LForward           { src>dest => forward move }
    lea     -1(%ecx), %ebx
    add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb      .Lback              { if no overlap, still do forward move }
.LForward:
    mov     %edx, %ebx          { remember original dest to write first 16 bytes }
    add     %edx, %ecx          { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
    add     $16, %edx
    and     $-16, %edx
    sub     %edx, %ecx
.LRestAfterNTf:
    sub     $32, %ecx           { During the N× loop, ecx is N bytes less than what actually remains, to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
    jbe     .LPost32f
    cmp     $Move_NtThreshold-32, %ecx
    jae     .Lntf               { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
.LNtIsNotBetterF:
    test    $15, %eax
    jz      .Lalignedloop32f
    .balign 16                  { no-op }
.Lloop32f:
    movups  (%eax,%edx), %xmm0
    movaps  %xmm0, (%edx)
    movups  16(%eax,%edx), %xmm0
    movaps  %xmm0, 16(%edx)
    add     $32, %edx
    sub     $32, %ecx
    ja      .Lloop32f
.LPost32f:                      { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
    cmp     $-16, %ecx
    jle     .LFirstAndLast16f
    movups  (%eax,%edx), %xmm0
    movaps  %xmm0, (%edx)
.LFirstAndLast16f:
    movups  %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
    movups  %xmm4, (%ebx)       { Important for <16-byte step between src and dest. }
    pop     %ebx
    ret
    .balign 16
.Lalignedloop32f:               { Same as above starting from .Lloop32f but with MOVAPSes. }
    movaps  (%eax,%edx), %xmm0
    movaps  %xmm0, (%edx)
    movaps  16(%eax,%edx), %xmm0
    movaps  %xmm0, 16(%edx)
    add     $32, %edx
    sub     $32, %ecx
    ja      .Lalignedloop32f
.LalignedPost32f:
    cmp     $-16, %ecx
    jle     .LalignedFirstAndLast16f
    movaps  (%eax,%edx), %xmm0
    movaps  %xmm0, (%edx)
.LalignedFirstAndLast16f:
    movups  %xmm5, 16(%edx,%ecx)
    movups  %xmm4, (%ebx)
    pop     %ebx
    ret
.Lntf:
    cmp     $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
    jb      .LNtIsNotBetterF    { (this check is performed here to not stand in the way of smaller counts) }
    sub     $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
    test    $15, %eax
    jz      .Lalignedntloop64f
    .balign 16
.Lntloop64f:
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movups  (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movups  16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups  32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups  48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add     $64, %edx
    sub     $64, %ecx
    jae     .Lntloop64f
    sfence
    add     $PrefetchDistance+64, %ecx
    jmp     .LRestAfterNTf      { go handle remaining bytes }
    .balign 16
.Lalignedntloop64f:             { Same as above starting from .Lntloop64f but with MOVAPSes. }
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movaps  (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movaps  16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movaps  32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movaps  48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add     $64, %edx
    sub     $64, %ecx
    jae     .Lalignedntloop64f
    sfence
    add     $PrefetchDistance+64, %ecx
    jmp     .LRestAfterNTf
    .byte   {$ifdef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
Move_8OrMore_SSE_CancelERMSBackwards:
    { Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren't read, ebx isn't pushed if not FPC_PIC. }
{$ifndef FPC_PIC}
    push    %ebx
{$endif}
    add     %eax, %edx
    movups  (%eax), %xmm4
    movups  -16(%eax,%ecx), %xmm5
    sub     %edx, %eax
{ backwards move }
.Lback:
    lea     -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
    mov     %ebx, %ecx          { move dest to the previous 16-byte boundary... }
    and     $-16, %ecx
    sub     %edx, %ecx
    add     %ecx, %edx
.LRestAfterNTb:
    sub     $32, %ecx
    jbe     .LPost32b
    cmp     $Move_NtThreshold-32, %ecx
    jae     .Lntb
    .balign 16                  { no-op }
.Lloop32b:
    sub     $32, %edx
    movups  16(%eax,%edx), %xmm0
    movaps  %xmm0, 16(%edx)
    movups  (%eax,%edx), %xmm0
    movaps  %xmm0, (%edx)
    sub     $32, %ecx
    ja      .Lloop32b
.LPost32b:
    cmp     $-16, %ecx
    jle     .LFirstAndLast16b
    movups  -16(%eax,%edx), %xmm0
    movaps  %xmm0, -16(%edx)
.LFirstAndLast16b:
    sub     %ecx, %edx
    movups  %xmm4, -32(%edx)
    movups  %xmm5, -15(%ebx)
    pop     %ebx
    ret
.Lntb:
    cmp     $-Move_NtThreshold, %eax
    ja      .Lloop32b
    sub     $PrefetchDistance+32, %ecx
    .balign 16
.Lntloop64b:
    prefetchnta -PrefetchDistance(%eax,%edx,1)
    sub     $64, %edx
    movups  48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    movups  32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups  16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups  (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    sub     $64, %ecx
    jae     .Lntloop64b
    sfence
    add     $PrefetchDistance+64, %ecx
    jmp     .LRestAfterNTb
end;

procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
const
  ErmsThreshold = 1536;
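  { Counts below this stay on the Move_8OrMore_SSE code paths; from this size on,
    forward moves are done with REP MOVSB (assumed fast thanks to ERMS). }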
asm
    cmp     $16, %ecx
    jle     Move_8OrMore_SSE_9to16
    cmp     $ErmsThreshold, %ecx
    jae     .LRepMovs
    movups  (%eax), %xmm4       { Same as in Move_8OrMore_SSE. }
    movups  -16(%eax,%ecx), %xmm5
    cmp     $32, %ecx
    jg      Move_8OrMore_SSE_33OrMore
    movups  %xmm4, (%edx)       { 17–32 bytes }
    movups  %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop     %ebx
{$endif}
    ret
.LRepMovs:
    sub     %eax, %edx          { edx = dest - src }
    jz      .Lquit              { exit if src=dest }
    cmp     %edx, %ecx          { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
    ja      .Lback
    cmp     $Move_NtThreshold+16, %ecx
    jae     .LNtF               { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT done by Move_8OrMore_SSE. }
.LNtIsNotBetterF:
    push    %esi
    push    %edi
    mov     %eax, %esi
    lea     (%edx,%eax), %edi
    rep movsb
    pop     %edi
    pop     %esi
.Lquit:
{$ifdef FPC_PIC}
    pop     %ebx
{$endif}
    ret
.LNtF:
    cmp     $-Move_NtThreshold, %edx { Check move distance. Bad case for forward NT is 0 < src - dest < NtThreshold => unsigned(dest - src) > unsigned(-NtThreshold). }
    ja      .LNtIsNotBetterF    { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add     %eax, %edx          { Recover edx = dest. }
    jmp     Move_8OrMore_SSE    { Will perform NT. }
.Lback:
    {         dst = 3
              v
      Move(abcdefghijXXX, count=10)
           ^
           src = 0

      = abcABCDEFGHIJ

      can be moved right to left in non-overlapping groups of "dst - src":

      abcdefghijHIJ
                ^^^
      abcdefgEFGhij
             ^^^
      abcdBCDefghij
          ^^^
      abcAbcdefghij  <- tail is handled by restarting the Move with corresponding count instead, as it can have 0 to dst - src - 1 bytes.
         ^

      Only REP MOVs with DF=0 are fast with ERMS, in case you're wondering why not just use DF=1. }
    cmp     $ErmsThreshold, %edx
    jb      Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can't benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
    cmp     $Move_NtThreshold+16, %ecx
    jae     .LNtB
.LNtIsNotBetterB:
{$ifndef FPC_PIC}
    push    %ebx
{$endif}
    mov     %ecx, %ebx          { ebx = remaining }
    sub     %edx, %ebx          { edx = dst - src = step; remaining -= step. }
    add     %ecx, %eax
    push    %esi
    push    %edi
.LRepMovsNextPieceB:            { At least 1 iteration is always performed. }
    mov     %eax, %edi          { edi = src before subtracting step = dst = rep movsb dest }
    sub     %edx, %eax          { src -= step }
    mov     %eax, %esi          { esi = src = rep movsb source }
    mov     %edx, %ecx          { ecx = step = rep movsb count }
    rep movsb
    sub     %edx, %ebx          { remaining -= step }
    jnc     .LRepMovsNextPieceB { CF=1 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
    pop     %edi
    pop     %esi
    lea     (%edx,%ebx), %ecx   { ecx = remaining }
    sub     %ecx, %eax          { eax = src }
    add     %eax, %edx          { edx = dest }
    pop     %ebx
    jmp     Move                { Remaining piece ("a" in the example above). Can save a bit of jumps by doing more checks and jumping to more specific places, but whatever. }
.LNtB:
    cmp     $Move_NtThreshold, %edx { Check move distance. Bad case for backward NT is dest - src < NtThreshold; src is always < dest. }
    jb      .LNtIsNotBetterB    { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add     %eax, %edx          { Recover edx = dest. }
    jmp     Move_8OrMore_SSE    { Will perform NT. }
end;

{$endif ndef FASTMOVE_DISABLE_SSE}

procedure Move_8OrMore_Dispatch; forward;

var
  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
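  { Initially points at the dispatcher; Move_8OrMore_HumanFriendlyDispatch
    replaces it with the selected implementation once fpc_cpucodeinit has run. }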
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  valgrind_used : boolean; external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

function Move_8OrMore_HumanFriendlyDispatch: pointer;
begin
  { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
  else if fast_large_repmovstosb then
    result:=@Move_8OrMore_SSE_ERMS
  else if has_sse_support then
    result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
  else if has_mmx_support then
    result:=@Move_8OrMore_MMX
  else
    result:=@Move_8OrMore_IA32;
  if fpc_cpucodeinit_performed then
    fastmoveproc:=result;
end;

procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
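    { Save the Move arguments, ask Move_8OrMore_HumanFriendlyDispatch which
      implementation to use, then restore the arguments and invoke it. }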
{$ifndef FPC_PIC}
    push    %ebx
{$endif}
    push    %eax
    push    %edx
    push    %ecx
    call    Move_8OrMore_HumanFriendlyDispatch
    mov     %eax, %ebx
    pop     %ecx
    pop     %edx
    pop     %eax
{$ifdef FPC_PIC}
    jmp     %ebx
{$else}
    call    %ebx
    pop     %ebx
{$endif}
end;
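
{ Move is the generic overlap-safe block copy of the system unit; e.g.
  Move(a[0], b[0], Length(a)*SizeOf(a[0])) copies all elements of array a
  into b, regardless of whether the two ranges overlap. }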
procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
asm
    cmp     $8, %ecx
    jle     .L8OrLess
{$ifdef FPC_PIC}
    push    %ebx
    call    fpc_geteipasebx
    addl    $_GLOBAL_OFFSET_TABLE_, %ebx
    movl    fastmoveproc@GOT(%ebx), %ebx
    jmp     (%ebx)
{$else}
    jmp     fastmoveproc
{$endif}
.L8OrLess:
    cmp     $3, %ecx
    jle     .L3OrLess
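    { 4 to 8 bytes: two possibly overlapping 4-byte moves cover the whole range;
      both loads happen before the stores, so overlapping regions are safe. }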
    push    %ebx
    mov     (%eax), %ebx
    mov     -4(%eax,%ecx), %eax
    mov     %ebx, (%edx)
    mov     %eax, -4(%edx,%ecx)
    pop     %ebx
    ret
.L3OrLess:
    cmp     $1, %ecx
    jl      .LZero
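    { 1 to 3 bytes: copy the first byte and, for counts of 2 or 3, a possibly
      overlapping 2-byte move taken from the end; counts < 1 just return. }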
    push    %ebx
    movzbl  (%eax), %ebx
    je      .LOne
    movzwl  -2(%eax,%ecx), %eax
    mov     %ax, -2(%edx,%ecx)
.LOne:
    mov     %bl, (%edx)
    pop     %ebx
.LZero:
end;

{$endif FPC_SYSTEM_HAS_MOVE}