fastmove.inc
{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{ At least valgrind up to 3.3 has a bug which prevents the default code
  from working, so we use a rather simple implementation here. }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
    push   %ebx
{$endif}
    sub    %edx, %eax
    jae    .LForward
    mov    %ecx, %ebx
    add    %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb     .LBack              { if no overlap, still do forward move }
.LForward:
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    push   %esi
    push   %edi
    lea    (%eax,%edx), %esi
    mov    %edx, %edi
    rep movsb
    pop    %edi
    pop    %esi
    pop    %ebx
    ret
.LBack:
    add    %ecx, %edx
.LNextb:
    dec    %edx
    mov    (%eax,%edx), %bl
    mov    %bl, (%edx)
    dec    %ecx
    jnz    .LNextb
    pop    %ebx
end;
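
{ How the overlap test works: after "sub %edx, %eax", eax holds src-dest as an
  unsigned value. For src < dest the subtraction wraps around, so adding count
  carries exactly when dest < src+count, i.e. when a forward copy would read
  already-overwritten source bytes. A minimal sketch of the same decision in
  plain Pascal (the procedure name is illustrative, not part of the RTL):

procedure CopyChoosingDirection(src, dest: PByte; count: SizeUInt);
var
  i: SizeUInt;
begin
  if count = 0 then
    exit;
  if (PtrUInt(dest) > PtrUInt(src)) and
     (PtrUInt(dest) - PtrUInt(src) < count) then
    for i := count - 1 downto 0 do  // regions overlap: copy backward
      (dest + i)^ := (src + i)^
  else
    for i := 0 to count - 1 do      // forward copy is safe
      (dest + i)^ := (src + i)^;
end;
}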

procedure Move_8OrMore_IA32; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
    fildq  (%eax)              { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
    fildq  -8(%eax,%ecx)
    cmp    $16, %ecx
    jle    .L9to16
    cmp    $32, %ecx
    jg     .L33OrMore
    fildq  8(%eax)
    fildq  -16(%eax,%ecx)
    fistpq -16(%edx,%ecx)
    fistpq 8(%edx)
.L9to16:
    fistpq -8(%edx,%ecx)       { 9–16 bytes }
    fistpq (%edx)
{$ifdef FPC_PIC}
    pop    %ebx
{$endif}
    ret
.Lcancel:
    fucompp                    { Pop two elements loaded at the beginning. }
{$ifdef FPC_PIC}
    pop    %ebx
{$endif}
    ret
    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
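{ The .byte runs before the loops are hand-made padding: 102 = $66 (an
  operand-size prefix) and 144 = $90 (nop), so each run assembles into one
  long "66 66 ... 90" NOP whose length makes the next label land on a 16-byte
  boundary, turning the following ".balign 16" into a no-op. This keeps the
  hot loop aligned while controlling exactly what padding is emitted. The
  conditional extra prefix byte compensates for the one-byte difference in
  the preceding code between PIC and non-PIC builds. }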
.L33OrMore:
    sub    %edx, %eax          { eax = src - dest }
    jz     .Lcancel            { exit if src=dest }
{$ifndef FPC_PIC}
    push   %ebx
{$endif}
    jnb    .LForward           { src>dest => forward move }
    mov    %ecx, %ebx
    add    %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb     .Lback              { if no overlap, still do forward move }
.LForward:
    mov    %edx, %ebx          { remember original dest to write first 8 bytes }
    add    %edx, %ecx          { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add    $8, %edx
    and    $-8, %edx
    sub    %edx, %ecx
    sub    $16, %ecx
    jbe    .LPost16f
    .balign 16                 { no-op }
.Lloop16f:
    fildq  (%eax,%edx)
    fistpq (%edx)
    fildq  8(%eax,%edx)
    fistpq 8(%edx)
    add    $16, %edx
    sub    $16, %ecx
    ja     .Lloop16f
.LPost16f:                     { +16 fixup not applied after the 16-bytes-per-iteration loop; ecx = remaining - 16 here. }
    cmp    $-8, %ecx
    jle    .LFirstAndLast8f
    fildq  (%eax,%edx)
    fistpq (%edx)
.LFirstAndLast8f:
    fistpq 8(%edx,%ecx)        { Write first and last 8 bytes after everything else. }
    fistpq (%ebx)              { Important for <8-byte step between src and dest. }
    pop    %ebx
    ret
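{ The first/last trick used throughout this file: the first and last 8 bytes
  were loaded before anything was stored, and are written only after the
  aligned loop. That absorbs the misaligned head and tail without extra
  branching, and keeps the copy correct when src and dest are fewer than
  8 bytes apart, where writing the head up front would overwrite source
  bytes that the loop still needs to read. }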
    .byte 102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
    lea    -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov    %ebx, %ecx          { move dest to the previous 8-byte boundary... }
    and    $-8, %ecx
    sub    %edx, %ecx
    add    %ecx, %edx
    sub    $16, %ecx
    jbe    .LPost16b
    .balign 16                 { no-op }
.Lloop16b:
    sub    $16, %edx
    fildq  8(%eax,%edx)
    fistpq 8(%edx)
    fildq  (%eax,%edx)
    fistpq (%edx)
    sub    $16, %ecx
    ja     .Lloop16b
.LPost16b:
    cmp    $-8, %ecx
    jle    .LFirstAndLast8b
    fildq  -8(%eax,%edx)
    fistpq -8(%edx)
.LFirstAndLast8b:
    sub    %ecx, %edx
    fistpq -7(%ebx)
    fistpq -16(%edx)
    pop    %ebx
end;
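
{ Why x87 here: fildq/fistpq load and store a 64-bit integer through the FPU
  stack, the only way to move 8 bytes per instruction on a plain i386 without
  MMX or SSE. A 64-bit integer is exactly representable in the 80-bit st(0)
  format (64-bit mantissa), so the round trip is lossless for every bit
  pattern. }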

procedure Move_8OrMore_MMX; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
    cmp    $72, %ecx           { Size at which using MMX becomes worthwhile. }
    jl     Move_8OrMore_IA32
{$ifndef FPC_PIC}
    push   %ebx
{$endif}
    movq   (%eax), %mm4        { First and last 8 bytes. }
    movq   -8(%eax,%ecx), %mm5
    sub    %edx, %eax          { eax = src - dest }
    jz     .Lquit              { exit if src=dest }
    jnb    .LForward           { src>dest => forward move }
    mov    %ecx, %ebx
    add    %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb     .Lback              { if no overlap, still do forward move }
.LForward:
    mov    %edx, %ebx          { remember original dest to write first 8 bytes }
    add    %edx, %ecx          { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add    $8, %edx
    and    $-8, %edx
    sub    %edx, %ecx
    sub    $16, %ecx
    jbe    .LPost16f
    .balign 16
.Lloop16f:
    movq   (%eax,%edx), %mm0
    movq   %mm0, (%edx)
    movq   8(%eax,%edx), %mm0
    movq   %mm0, 8(%edx)
    add    $16, %edx
    sub    $16, %ecx
    ja     .Lloop16f
.LPost16f:                     { +16 fixup not applied after the 16-bytes-per-iteration loop; ecx = remaining - 16 here. }
    cmp    $-8, %ecx
    jle    .LFirstAndLast8f
    movq   (%eax,%edx), %mm0
    movq   %mm0, (%edx)
.LFirstAndLast8f:
    movq   %mm5, 8(%edx,%ecx)  { Write first and last 8 bytes after everything else. }
    movq   %mm4, (%ebx)        { Important for <8-byte step between src and dest. }
.Lquit:
    emms
    pop    %ebx
    ret
    .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
    lea    -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov    %ebx, %ecx          { move dest to the previous 8-byte boundary... }
    and    $-8, %ecx
    sub    %edx, %ecx
    add    %ecx, %edx
    sub    $16, %ecx
    jbe    .LPost16b
    .balign 16                 { no-op }
.Lloop16b:
    sub    $16, %edx
    movq   8(%eax,%edx), %mm0
    movq   %mm0, 8(%edx)
    movq   (%eax,%edx), %mm0
    movq   %mm0, (%edx)
    sub    $16, %ecx
    ja     .Lloop16b
.LPost16b:
    cmp    $-8, %ecx
    jle    .LFirstAndLast8b
    movq   -8(%eax,%edx), %mm0
    movq   %mm0, -8(%edx)
.LFirstAndLast8b:
    sub    %ecx, %edx
    movq   %mm4, -16(%edx)
    movq   %mm5, -7(%ebx)
    emms
    pop    %ebx
end;
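
{ Notes on the MMX variant: movq through an %mm register moves 8 bytes per
  instruction without going through the x87 stack, but the MMX registers
  alias the FPU registers, so "emms" must run on every exit path before any
  floating-point code executes again. Below 72 bytes the setup cost
  (including emms) does not pay off, so the routine falls through to the
  plain IA32 version. }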

{$ifndef FASTMOVE_DISABLE_SSE}
procedure Move_8OrMore_SSE; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
const
    ErmsThreshold = 1536;
    NtThreshold = 256 * 1024;  { this limit must be processor-specific (1/2 L2 cache size) }
    PrefetchDistance = 512;
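{ Strategy selection, roughly: below ErmsThreshold the copy stays in ordinary
  SSE loads/stores; from ErmsThreshold up, "rep movsb" is used when the CPU
  reports fast enhanced string moves; from NtThreshold up, non-temporal
  stores bypass the cache, since even enhanced REP MOVSB does not appear to
  use them and falls behind on huge moves. PrefetchDistance is how far ahead
  of the current position prefetchnta requests data in the non-temporal
  loops. }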
asm
    cmp    $16, %ecx
    jle    .L9to16
    movups (%eax), %xmm4       { First and last 16 bytes, used both in .L33OrMore and the 17–32 branch. }
    movups -16(%eax,%ecx), %xmm5
    cmp    $32, %ecx
    jg     .L33OrMore
    movups %xmm4, (%edx)       { 17–32 bytes }
    movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop    %ebx
{$endif}
    ret
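{ The 17–32 byte case is two possibly overlapping 16-byte stores: the first
  covers bytes 0..15, the second covers count-16..count-1, so for count = 20
  the middle bytes 4..15 are simply written twice with the same data. The
  9–16 case below does the same with two 8-byte halves. }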
.L9to16:
    movq   (%eax), %xmm0
    movq   -8(%eax,%ecx), %xmm1
    movq   %xmm0, (%edx)
    movq   %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
    pop    %ebx
{$endif}
    ret
    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.L33OrMore:
    sub    %edx, %eax          { eax = src - dest }
    jz     .Lquit              { exit if src=dest }
{$ifndef FPC_PIC}
    push   %ebx
{$endif}
    jnb    .LForward           { src>dest => forward move }
    mov    %ecx, %ebx
    add    %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
    jb     .Lback              { if no overlap, still do forward move }
.LForward:
    mov    %edx, %ebx          { remember original dest to write first 16 bytes }
    add    %edx, %ecx          { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
    add    $16, %edx
    and    $-16, %edx
    sub    %edx, %ecx
.LRestAfterNTf:
    sub    $32, %ecx           { During the N-bytes-per-iteration loops, ecx is N bytes less than what actually remains, allowing sub N + jae .LLoop instead of sub N + cmp N + jae .LLoop. }
    jbe    .LPost32f
    cmp    $NtThreshold-32, %ecx
    jae    .Lntf               { might jump back right away after more checks, but the branch is taken only on huge moves, so it's better to take these checks out of here... }
.LNtIsNotBetter:
    cmp    $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT, so it falls behind on huge moves. So prioritize NT. }
    jae    .LRepMovsF
.LRepMovsIsNotBetter:
    test   $15, %eax
    jz     .Lalignedloop32f
    .balign 16                 { no-op }
.Lloop32f:
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add    $32, %edx
    sub    $32, %ecx
    ja     .Lloop32f
.LPost32f:                     { +32 fixup not applied after the 32-bytes-per-iteration loop; ecx = remaining - 32 here. }
    cmp    $-16, %ecx
    jle    .LFirstAndLast16f
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
.LFirstAndLast16f:
    movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
    movups %xmm4, (%ebx)       { Important for <16-byte step between src and dest. }
    pop    %ebx
    ret
    .balign 16
.Lalignedloop32f:              { Same as above starting from .Lloop32f, but with MOVAPSes. }
    movaps (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movaps 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add    $32, %edx
    sub    $32, %ecx
    ja     .Lalignedloop32f
.LalignedPost32f:
    cmp    $-16, %ecx
    jle    .LalignedFirstAndLast16f
    movaps (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
.LalignedFirstAndLast16f:
    movups %xmm5, 16(%edx,%ecx)
    movups %xmm4, (%ebx)
    pop    %ebx
    ret
.LRepMovsF:
{$ifdef FPC_PIC}
    push   %ebx
    call   fpc_geteipasebx
    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
    movl   fast_large_repmovstosb@GOT(%ebx), %ebx
    cmpb   $1, (%ebx)
    pop    %ebx
{$else FPC_PIC}
    cmpb   $1, fast_large_repmovstosb
{$endif FPC_PIC}
    jne    .LRepMovsIsNotBetter
    push   %esi
    push   %edi
    lea    (%eax,%edx), %esi
    mov    %edx, %edi
    add    $32, %ecx
    rep movsb
    movups %xmm4, (%ebx)       { last 16 bytes aren't required }
    pop    %edi
    pop    %esi
    pop    %ebx
    ret
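{ The "rep movsb" path is taken only when fast_large_repmovstosb is set,
  i.e. when cpuid reported enhanced REP MOVSB/STOSB (ERMS). The string move
  starts at the 16-byte-aligned dest and runs to the true end of the block,
  so only the bytes skipped before the aligned start still need the saved
  first 16 bytes (%xmm4); unlike the vector paths, the last 16 bytes need no
  separate store. }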
.Lntf:
    cmp    $NtThreshold, %eax  { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other }
    jb     .LNtIsNotBetter     { (this check is performed here so as not to stand in the way of smaller counts) }
    sub    $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
    test   $15, %eax
    jz     .Lalignedntloop64f
    .balign 16
.Lntloop64f:
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add    $64, %edx
    sub    $64, %ecx
    jae    .Lntloop64f
    sfence
    add    $PrefetchDistance+64, %ecx
    jmp    .LRestAfterNTf      { go handle remaining bytes }
    .balign 16
.Lalignedntloop64f:            { Same as above starting from .Lntloop64f, but with MOVAPSes. }
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movaps (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movaps 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movaps 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movaps 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add    $64, %edx
    sub    $64, %ecx
    jae    .Lalignedntloop64f
    sfence
    add    $PrefetchDistance+64, %ecx
    jmp    .LRestAfterNTf
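{ movntps stores 16 bytes around the cache, so a block much bigger than the
  cache does not evict everything else; sfence then orders the weakly-ordered
  NT stores against later accesses. The "cmp $NtThreshold, %eax" guard above
  skips this path when src and dest are less than NtThreshold apart, the idea
  being that with that much overlap the same memory is touched again soon, so
  bypassing the cache would only hurt. }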
    .byte 102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
{ backwards move }
.Lback:
    lea    -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
    mov    %ebx, %ecx          { move dest to the previous 16-byte boundary... }
    and    $-16, %ecx
    sub    %edx, %ecx
    add    %ecx, %edx
.LRestAfterNTb:
    sub    $32, %ecx
    jbe    .LPost32b
    cmp    $NtThreshold-32, %ecx
    jae    .Lntb
    .balign 16                 { no-op }
.Lloop32b:
    sub    $32, %edx
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    sub    $32, %ecx
    ja     .Lloop32b
.LPost32b:
    cmp    $-16, %ecx
    jle    .LFirstAndLast16b
    movups -16(%eax,%edx), %xmm0
    movaps %xmm0, -16(%edx)
.LFirstAndLast16b:
    sub    %ecx, %edx
    movups %xmm4, -32(%edx)
    movups %xmm5, -15(%ebx)
    pop    %ebx
    ret
.Lntb:
    cmp    $-NtThreshold, %eax
    jnb    .Lloop32b
    sub    $PrefetchDistance+32, %ecx
    .balign 16
.Lntloop64b:
    prefetchnta -PrefetchDistance(%eax,%edx,1)
    sub    $64, %edx
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    sub    $64, %ecx
    jae    .Lntloop64b
    sfence
    add    $PrefetchDistance+64, %ecx
    jmp    .LRestAfterNTb
end;
{$endif ndef FASTMOVE_DISABLE_SSE}

procedure Move_8OrMore_Dispatch; forward;

var
    fastmoveproc : pointer = @Move_8OrMore_Dispatch;
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    valgrind_used : boolean; external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

function Move_8OrMore_HumanFriendlyDispatch: pointer;
begin
    { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
        result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
    else if has_sse_support then
        result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
    else if has_mmx_support then
        result:=@Move_8OrMore_MMX
    else
        result:=@Move_8OrMore_IA32;
    if fpc_cpucodeinit_performed then
        fastmoveproc:=result;
end;
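
{ Dispatch works lazily: fastmoveproc starts out pointing at
  Move_8OrMore_Dispatch, whose first call picks the best routine for the
  current CPU and, once fpc_cpucodeinit has run (so the has_*_support flags
  are trustworthy), caches it in fastmoveproc. Every later Move of more than
  8 bytes then jumps straight to the chosen implementation. }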

procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
  If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
    push   %ebx
{$endif}
    push   %eax
    push   %edx
    push   %ecx
    call   Move_8OrMore_HumanFriendlyDispatch
    mov    %eax, %ebx
    pop    %ecx
    pop    %edx
    pop    %eax
{$ifdef FPC_PIC}
    jmp    %ebx
{$else}
    call   %ebx
    pop    %ebx
{$endif}
end;
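
{ This stub is entered with the Move arguments still live in eax/edx/ecx, so
  they are saved around the call into the Pascal resolver, whose result
  (moved into ebx) is the routine to run. In PIC mode the caller has already
  pushed ebx and the selected routine will pop it, so a tail jmp suffices; in
  non-PIC mode ebx is pushed and popped here around a normal call. }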

procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
asm
    cmp    $8, %ecx
    jle    .L8OrLess
{$ifdef FPC_PIC}
    push   %ebx
    call   fpc_geteipasebx
    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
    movl   fastmoveproc@GOT(%ebx), %ebx
    jmp    (%ebx)
{$else}
    jmp    fastmoveproc
{$endif}
.L8OrLess:
    cmp    $3, %ecx
    jle    .L3OrLess
    push   %ebx
    mov    (%eax), %ebx
    mov    -4(%eax,%ecx), %eax
    mov    %ebx, (%edx)
    mov    %eax, -4(%edx,%ecx)
    pop    %ebx
    ret
.L3OrLess:
    cmp    $1, %ecx
    jl     .LZero
    push   %ebx
    movzbl (%eax), %ebx
    je     .LOne
    movzwl -2(%eax,%ecx), %eax
    mov    %ax, -2(%edx,%ecx)
.LOne:
    mov    %bl, (%edx)
    pop    %ebx
.LZero:
end;
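
{ The short-count ladder mirrors the big routines' trick at smaller
  granularity: 4–8 bytes are moved as two possibly overlapping dwords (first
  4 and last 4, both read before either write, so overlap is safe in either
  direction), 2–3 bytes as a byte plus a possibly overlapping word, 1 byte
  directly, and count <= 0 falls through to .LZero as a no-op. A hypothetical
  caller, just to illustrate the overlap guarantee (the program and buf are
  not part of this unit):

program MoveDemo;
var
  buf: array[0..7] of Byte = (1, 2, 3, 4, 5, 6, 7, 8);
begin
  Move(buf[0], buf[2], 6);  // overlapping regions, memmove semantics
  // buf is now 1, 2, 1, 2, 3, 4, 5, 6
end.
}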
{$endif FPC_SYSTEM_HAS_MOVE}