x86_64.inc 44 KB


  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2002 by Florian Klaempfl and Sergei Gorelkin
  4. Members of the Free Pascal development team
  5. Processor dependent implementation for the system unit for
  6. the x86-64 architecture
  7. See the file COPYING.FPC, included in this distribution,
  8. for details about the copyright.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  12. **********************************************************************}
  13. {$asmmode GAS}
  14. {****************************************************************************
  15. Primitives
  16. ****************************************************************************}
  { Build-time switch: use_fast_repmovstos enables the REP STOSQ path inside FillXxxx_MoreThanTwoXmms.
    It is defined for every target except win64. }
  17. {$ifndef win64}
  18. {$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
  19. {$endif}
  20. {$ifdef use_fast_repmovstos}
  21. var
  { Compared against 1 by FillXxxx_MoreThanTwoXmms before it uses REP STOSQ.
    The code that assigns this flag is not in this part of the file. }
  22. fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  23. {$endif}
  24. {$define FPC_SYSTEM_HAS_SPTR}
  { Returns the value the stack pointer register (RSP) holds inside this routine.
    No stack frame is created, so nothing is pushed before RSP is read. }
  function Sptr: Pointer; assembler; nostackframe; {$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          mov     %rsp, %rax              { result := RSP }
  end;
  29. {$IFNDEF INTERNAL_BACKTRACE}
  30. {$define FPC_SYSTEM_HAS_GET_FRAME}
  { Returns the current contents of the frame pointer register (RBP).
    The routine itself sets up no frame, so RBP is still whatever the caller left in it. }
  function get_frame: pointer; assembler; nostackframe; {$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          mov     %rbp, %rax              { result := RBP }
  end;
  35. {$ENDIF not INTERNAL_BACKTRACE}
  36. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
  { Returns the quadword stored at the top of the stack on entry.
    With no stack frame that is the return address pushed by the call, i.e. the code address right after the call site. }
  function get_pc_addr: pointer; assembler; nostackframe;
  asm
          mov     (%rsp), %rax            { result := return address of this call }
  end;
  41. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
  { Returns the second pointer-sized slot of the frame at framebp (PPointer(framebp)[1]),
    or nil when framebp is nil. The addr parameter is not used by this implementation. }
  function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    if framebp = nil then
      get_caller_addr := nil
    else
      get_caller_addr := PPointer(framebp)[1];
  end;
  48. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
  { Returns the pointer stored at framebp itself (PPointer(framebp)^),
    or nil when framebp is nil. The addr parameter is not used by this implementation. }
  function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    if framebp = nil then
      get_caller_frame := nil
    else
      get_caller_frame := PPointer(framebp)^;
  end;
  55. // The following assembler procedures are disabled for FreeBSD due to
  56. // multiple issues with its old GNU assembler (Mantis #19188).
  57. // Even after fixing them, it can be enabled only for the trunk version,
  58. // otherwise bootstrapping won't be possible.
  59. // Modified to use oldbinutils as in cpu.pp source, to allow easier use for other targets.
  60. {$ifdef freebsd}
  61. {$ifndef overridebinutils}
  62. {$define oldbinutils}
  63. {$endif}
  64. {$endif freebsd}
  65. {$ifndef oldbinutils}
  66. {$ifndef FPC_SYSTEM_HAS_MOVE}
  67. {$define FPC_SYSTEM_HAS_MOVE}
  { Move copies count bytes from source to dest. The two regions may overlap; count <= 0 copies nothing.
    Strategy:
      - up to 32 bytes: head and tail pieces (which may overlap each other) are all loaded before anything is stored;
      - 33 bytes and more: the first, last and one neighbouring 16-byte vector are preloaded into xmm4/xmm5/xmm3,
        the bulk is copied by a loop that stores to 16-byte aligned dest addresses, and the preloaded vectors are stored last;
      - the loop runs backwards when count > unsigned(dest - src), i.e. when a forward copy would overwrite unread source;
      - remaining counts of NtThreshold and more with src and dest at least NtThreshold apart use non-temporal stores with prefetch.
    After the prologue both ABIs use rcx = source, rdx = dest, r8 = count.
    Scratch registers: rax, r9, xmm0, xmm3, xmm4, xmm5. }
  68. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
  69. { Linux: rdi source, rsi dest, rdx count
  70. win64: rcx source, rdx dest, r8 count }
  71. const
  72. NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
  73. PrefetchDistance = 512;
  74. asm
  75. {$ifndef win64}
  76. mov %rdx, %r8
  77. mov %rsi, %rdx
  78. mov %rdi, %rcx
  79. {$endif win64}
  { Dispatch on count (signed compares, so negative counts end up in .L3OrLess and return). }
  80. cmp $3, %r8
  81. jle .L3OrLess
  82. cmp $8, %r8
  83. jle .L4to8
  84. cmp $16, %r8
  85. jle .L9to16
  86. movups (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
  87. movups -16(%rcx,%r8), %xmm5
  88. cmp $32, %r8
  89. jg .L33OrMore
  90. movups %xmm4, (%rdx) { 17–32 bytes }
  91. movups %xmm5, -16(%rdx,%r8)
  92. ret
  93. .balign 16
  94. .L3OrLess:
  { The flags of 'cmp $1' survive the movzbl load: jl = nothing to copy, je = exactly one byte, otherwise 2 or 3 bytes. }
  95. cmp $1, %r8
  96. jl .LZero
  97. movzbl (%rcx), %eax
  98. je .LOne
  99. movzwl -2(%rcx,%r8), %r9d
  100. mov %r9w, -2(%rdx,%r8)
  101. .LOne:
  102. mov %al, (%rdx)
  103. .LZero:
  104. ret
  105. .L4to8:
  { First and last dword; they overlap when count < 8. }
  106. mov (%rcx), %eax
  107. mov -4(%rcx,%r8), %r9d
  108. mov %eax, (%rdx)
  109. mov %r9d, -4(%rdx,%r8)
  110. ret
  111. .L9to16:
  { First and last qword; they overlap when count < 16. }
  112. mov (%rcx), %rax
  113. mov -8(%rcx,%r8), %r9
  114. mov %rax, (%rdx)
  115. mov %r9, -8(%rdx,%r8)
  116. .Lquit:
  117. ret
  118. .byte 102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
  119. .L33OrMore:
  120. movups -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
  121. { but -32(%rcx,%r8) is about to become not accessible so easily, .Lback is rare, and small .Lback is even rarer / matters even less. }
  122. sub %rdx, %rcx { rcx = src - dest }
  123. jz .Lquit { exit if src=dest }
  124. mov %rcx, %rax
  125. neg %rax
  126. cmp %rax, %r8
  127. ja .Lback { count (r8) > unsigned(dest - src) (rax) if regions overlap }
  128. mov %rdx, %r9 { remember original dest to write first 16 bytes }
  129. add %rdx, %r8 { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
  130. add $16, %rdx
  131. and $-16, %rdx
  132. sub %rdx, %r8
  133. .LRestAfterNTf:
  134. sub $32, %r8 { During the N× loop, r8 is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
  135. jbe .LPost32f
  136. cmp $NtThreshold-32, %r8
  137. jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
  138. .balign 16 { no-op }
  139. .Lloop32f:
  { Forward loop, 32 bytes per iteration: unaligned loads from src (= rcx + rdx), aligned stores to dest (rdx). }
  140. movups (%rcx,%rdx), %xmm0
  141. movaps %xmm0, (%rdx)
  142. movups 16(%rcx,%rdx), %xmm0
  143. movaps %xmm0, 16(%rdx)
  144. add $32, %rdx
  145. sub $32, %r8
  146. ja .Lloop32f
  147. .LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
  148. movups %xmm3, (%rdx, %r8)
  149. movups %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
  150. movups %xmm4, (%r9) { Important for <16-byte step between src and dest. }
  151. ret
  152. .balign 16
  153. .Lntf:
  154. cmp $NtThreshold, %rcx { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
  155. jb .Lloop32f { (this check is performed here to not stand in the way of smaller counts) }
  156. sub $PrefetchDistance+32, %r8 { r8 = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
  157. .balign 16 { no-op }
  158. .Lntloop64f:
  { Forward non-temporal loop, 64 bytes per iteration, prefetching PrefetchDistance bytes ahead in src. }
  159. prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
  160. movups (%rcx,%rdx,1), %xmm0
  161. movntps %xmm0, (%rdx)
  162. movups 16(%rcx,%rdx,1), %xmm0
  163. movntps %xmm0, 16(%rdx)
  164. movups 32(%rcx,%rdx,1), %xmm0
  165. movntps %xmm0, 32(%rdx)
  166. movups 48(%rcx,%rdx,1), %xmm0
  167. movntps %xmm0, 48(%rdx)
  168. add $64, %rdx
  169. sub $64, %r8
  170. jae .Lntloop64f
  171. sfence
  172. add $PrefetchDistance+64, %r8
  173. jmpq .LRestAfterNTf { go handle remaining bytes }
  174. .byte 102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
  175. { backwards move }
  176. .Lback:
  177. movups 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
  178. lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
  179. lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
  180. and $-16, %r8
  181. sub %rdx, %r8
  182. add %r8, %rdx
  183. .LRestAfterNTb:
  184. sub $32, %r8
  185. jbe .LPost32b
  186. cmp $NtThreshold-32, %r8
  187. jae .Lntb
  188. .balign 16 { no-op }
  189. .Lloop32b:
  { Backward loop, 32 bytes per iteration, moving rdx down; the higher half is copied first. }
  190. sub $32, %rdx
  191. movups 16(%rcx,%rdx), %xmm0
  192. movaps %xmm0, 16(%rdx)
  193. movups (%rcx,%rdx), %xmm0
  194. movaps %xmm0, (%rdx)
  195. sub $32, %r8
  196. ja .Lloop32b
  197. .LPost32b:
  { Store the preloaded vectors last: second from start (xmm3), first (xmm4), then the final 16 bytes (xmm5) at the end of dest. }
  198. sub %r8, %rdx
  199. movups %xmm3, -16(%rdx)
  200. movups %xmm4, -32(%rdx)
  201. movups %xmm5, -16(%r9)
  202. ret
  203. .balign 16
  204. .Lntb:
  { Same idea as .Lntf: fall back to the cached loop unless the unsigned value of src - dest is below -NtThreshold. }
  205. cmp $-NtThreshold,%rcx
  206. jnb .Lloop32b
  207. sub $PrefetchDistance+32, %r8
  208. .balign 16 { no-op }
  209. .Lntloop64b:
  210. prefetchnta -PrefetchDistance(%rcx,%rdx,1)
  211. sub $64, %rdx
  212. movups 48(%rcx,%rdx,1), %xmm0
  213. movntps %xmm0, 48(%rdx)
  214. movups 32(%rcx,%rdx,1), %xmm0
  215. movntps %xmm0, 32(%rdx)
  216. movups 16(%rcx,%rdx,1), %xmm0
  217. movntps %xmm0, 16(%rdx)
  218. movups (%rcx,%rdx,1), %xmm0
  219. movntps %xmm0, (%rdx)
  220. sub $64, %r8
  221. jae .Lntloop64b
  222. sfence
  223. add $PrefetchDistance+64, %r8
  224. jmpq .LRestAfterNTb
  225. end;
  226. {$endif FPC_SYSTEM_HAS_MOVE}
  227. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  228. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  229. or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  230. or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  { Common continuation of FillChar / FillWord / FillDWord / FillQWord for fills that need more than two XMM stores.
    It is not called with a normal parameter list: the Fill* procedures jump here with their state in registers
    (see Input below), after they have already stored the unaligned head with movdqu.
    Scratch registers: rax, rcx, r8; additionally r9 under FPC_PIC and rdi in the REP STOSQ path. }
  231. procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
  232. { Input:
  233. rcx = 'x'
  234. rdx = byte count
  235. xmm0 = pattern for unaligned writes
  236. xmm1 = pattern for aligned writes }
  237. const
  238. {$ifdef use_fast_repmovstos}
  239. ErmsThreshold = 1536;
  240. {$endif}
  241. NtThreshold = 4 * 1024 * 1024;
  242. asm
  243. { x can start and end misaligned on the vector boundary:
  244. x = ~~][H1][H2][...][T2][T1]~
  245. [UH] [UT]
  246. UH (“unaligned head”) potentially overlaps with H1 and is already written with 'movdqu' by the caller.
  247. At least 1 of its bytes is exclusive to it, i.e. if x is already aligned, H1 starts at byte 16.
  248. H1 and so on are called “aligned heads” or just “heads”.
  249. T1 and so on are called “aligned tails” or just “tails”.
  250. UT (“unaligned tail”) is written with another 'movdqu' after the loop.
  251. At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }
  252. lea -65(%rcx,%rdx), %r8 { r8 = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
  253. and $-16, %rcx { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
  254. movdqa %xmm1, 16(%rcx) { Write H1. }
  255. mov %r8, %rax
  256. and $-16, %rax { rax = “T4” (possibly fictive) = aligned r8 = loop bound. }
  257. cmp $49, %rdx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
  258. jle .LOneAlignedTailWrite
  259. movdqa %xmm1, 32(%rcx) { Write H2. }
  260. cmp $81, %rdx { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
  261. jle .LTwoAlignedTailWrites
  262. cmp $113, %rdx { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
  263. jle .LFourAlignedTailWrites
  264. add $48, %rcx
  { Pick the bulk strategy by size: with use_fast_repmovstos, sizes from ErmsThreshold up go to .LRepStos (which may still fall back);
    without it, sizes from NtThreshold up go straight to the non-temporal loop; everything else uses the plain 64-byte loop. }
  265. {$ifdef use_fast_repmovstos}
  266. cmp $ErmsThreshold, %rdx
  267. jae .LRepStos
  268. {$else}
  269. cmp $NtThreshold, %rdx
  270. jae .L64xNT_Body
  271. {$endif}
  272. .balign 16
  273. .L64x_Body:
  274. movdqa %xmm1, (%rcx)
  275. movdqa %xmm1, 16(%rcx)
  276. movdqa %xmm1, 32(%rcx)
  277. movdqa %xmm1, 48(%rcx)
  278. add $64, %rcx
  279. cmp %rax, %rcx
  280. jb .L64x_Body
  { The tail labels fall through into each other, so entering higher up writes more aligned tails. }
  281. .LFourAlignedTailWrites:
  282. movdqa %xmm1, (%rax) { T4 }
  283. movdqa %xmm1, 16(%rax) { T3 }
  284. .LTwoAlignedTailWrites:
  285. movdqa %xmm1, 32(%rax) { T2 }
  286. .LOneAlignedTailWrite:
  287. movdqa %xmm1, 48(%rax) { T1 }
  288. movdqu %xmm0, 65-16(%r8) { UT }
  289. ret
  290. {$ifdef use_fast_repmovstos}
  291. .LRepStos:
  { Use REP STOSQ only when the fast_large_repmovstosb flag is exactly 1. }
  292. {$ifdef FPC_PIC}
  293. movq fast_large_repmovstosb@GOTPCREL(%rip), %r9
  294. cmpb $1, (%r9)
  295. {$else FPC_PIC}
  296. cmpb $1, fast_large_repmovstosb(%rip)
  297. {$endif FPC_PIC}
  298. jne .LRepStosIsNotBetter
  299. {$ifdef win64}
  300. push %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
  301. {$endif}
  302. mov %rcx, %rdi { rdi = REP STOS destination. }
  303. lea 65-16+8-1(%r8), %rcx
  304. sub %rdi, %rcx
  305. shr $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
  306. movq %xmm1, %rax { recover pattern for aligned writes back to GPR :) }
  307. rep stosq
  308. movdqu %xmm0, 65-16(%r8) { UT }
  309. {$ifdef win64}
  310. pop %rdi
  311. {$endif}
  312. ret
  313. {$endif}
  { Without use_fast_repmovstos nothing jumps to the label below and it directly follows a ret,
    so the cmp/jb pair only matters when the REP STOSQ path declined. }
  314. .LRepStosIsNotBetter:
  315. cmp $NtThreshold, %rdx
  316. jb .L64x_Body
  317. .balign 16
  318. .L64xNT_Body:
  { Same 64-byte loop with non-temporal stores; sfence is issued before the regular tail stores. }
  319. movntdq %xmm1, (%rcx)
  320. movntdq %xmm1, 16(%rcx)
  321. movntdq %xmm1, 32(%rcx)
  322. movntdq %xmm1, 48(%rcx)
  323. add $64, %rcx
  324. cmp %rax, %rcx
  325. jb .L64xNT_Body
  326. sfence
  327. jmp .LFourAlignedTailWrites
  328. end;
  329. {$endif FPC_SYSTEM_HAS_FILLxxxx}
  330. {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  331. {$define FPC_SYSTEM_HAS_FILLCHAR}
  { FillChar stores the byte 'value' into the first 'count' bytes of x. count <= 0 writes nothing.
    After the prologue: rcx = x, rdx = count, eax = value replicated into all four bytes.
      1..3 bytes   : three single-byte stores (first, last, middle) that may coincide;
      4..15 bytes  : two or four dword stores, the tail ones overlapping the head ones;
      16..32 bytes : two unaligned 16-byte stores;
      33+ bytes    : first 16 bytes stored here, the rest handled by FillXxxx_MoreThanTwoXmms. }
  332. Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
  333. asm
  334. { win64: rcx dest, rdx count, r8b value
  335. linux: rdi dest, rsi count, rdx value }
  336. movzbl {$ifdef win64} %r8b {$else} %dl {$endif}, %eax
  337. imul $0x01010101, %eax
  338. {$ifndef win64}
  339. mov %rsi, %rdx
  340. mov %rdi, %rcx
  341. {$endif win64}
  342. cmp $3, %rdx
  343. jle .L3OrLess
  344. cmp $16, %rdx
  345. jl .L4to15
  346. movd %eax, %xmm0
  347. pshufd $0, %xmm0, %xmm0
  348. movdqu %xmm0, (%rcx)
  { For a byte fill the aligned-write pattern is the same as the unaligned one. }
  349. movdqa %xmm0, %xmm1
  350. cmp $32, %rdx
  351. jg FillXxxx_MoreThanTwoXmms
  352. movdqu %xmm0, -16(%rcx,%rdx)
  353. ret
  354. .L4to15:
  355. mov %eax, (%rcx)
  356. cmp $8, %edx
  357. jle .LLast4
  358. mov %eax, 4(%rcx)
  359. mov %eax, -8(%rcx,%rdx)
  360. .LLast4:
  361. mov %eax, -4(%rcx,%rdx)
  362. ret
  363. .L3OrLess:
  364. test %rdx, %rdx
  365. jle .LQuit
  { First byte, last byte, and the byte at index count div 2 cover every position for count = 1..3. }
  366. mov %al, (%rcx)
  367. mov %al, -1(%rcx,%rdx)
  368. shr $1, %edx
  369. mov %al, (%rcx,%rdx)
  370. .LQuit:
  371. end;
  372. {$endif FPC_SYSTEM_HAS_FILLCHAR}
  373. {$ifndef FPC_SYSTEM_HAS_FILLWORD}
  374. {$define FPC_SYSTEM_HAS_FILLWORD}
  { FillWord stores the 16-bit 'value' into the first 'count' words of x. count <= 0 writes nothing.
    After the prologue: rcx = x, rdx = count (in words), eax = value in both halves.
      1..3 words  : three word stores (first, last, middle) that may coincide;
      4..8 words  : two overlapping qword stores;
      9..16 words : two overlapping unaligned 16-byte stores;
      17+ words   : first 16 bytes stored here, then FillXxxx_MoreThanTwoXmms with rdx converted to bytes
                    and xmm1 holding the pattern rotated to match the misalignment of x. }
  375. procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
  376. asm
  377. {$ifdef win64}
  378. movzwl %r8w, %eax
  379. shl $16, %r8d
  380. or %r8d, %eax
  381. {$else}
  382. movzwl %dx, %eax
  383. shl $16, %edx
  384. or %edx, %eax
  385. mov %rsi, %rdx
  386. mov %rdi, %rcx
  387. {$endif}
  388. cmp $3, %rdx
  389. jle .L3OrLess
  390. cmp $8, %rdx
  391. jle .L4to8
  392. movd %eax, %xmm0
  393. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  394. movdqu %xmm0, (%rcx)
  395. cmp $16, %rdx
  396. jle .LTail
  397. shl $1, %rdx { rdx = byte count }
  { rcx is borrowed to form the rotate count in cl (address * 8) and restored from r8 afterwards. }
  398. mov %rcx, %r8
  399. shl $3, %ecx
  400. rol %cl, %eax { misalign the pattern by the misalignment of x }
  401. mov %r8, %rcx
  402. movd %eax, %xmm1
  403. pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
  404. jmp FillXxxx_MoreThanTwoXmms
  405. .LTail:
  406. movdqu %xmm0, -16(%rcx,%rdx,2)
  407. ret
  408. .L4to8:
  { Widen the 32-bit pattern in eax to 64 bits in rax. }
  409. mov %eax, %r8d
  410. shl $32, %r8
  411. or %r8, %rax
  412. mov %rax, (%rcx)
  413. mov %rax, -8(%rcx,%rdx,2)
  414. ret
  415. .L3OrLess:
  416. test %rdx, %rdx
  417. jle .LQuit
  418. mov %ax, (%rcx)
  419. mov %ax, -2(%rcx,%rdx,2)
  420. shr $1, %edx
  421. mov %ax, (%rcx,%rdx,2)
  422. .LQuit:
  423. end;
  424. {$endif FPC_SYSTEM_HAS_FILLWORD}
  425. {$ifndef FPC_SYSTEM_HAS_FILLDWORD}
  426. {$define FPC_SYSTEM_HAS_FILLDWORD}
  { FillDWord stores the 32-bit 'value' into the first 'count' dwords of x. count <= 0 writes nothing.
    After the prologue: rcx = x, rdx = count (in dwords), eax = value.
      1..3 dwords : three dword stores (first, last, middle) that may coincide;
      4..8 dwords : four qword stores, two from the start and two from the end, possibly overlapping;
      9+ dwords   : first 16 bytes stored here, then FillXxxx_MoreThanTwoXmms with rdx converted to bytes
                    and xmm1 holding the pattern rotated to match the misalignment of x. }
  427. procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
  428. asm
  429. {$ifdef win64}
  430. mov %r8d, %eax
  431. {$else}
  432. mov %edx, %eax
  433. mov %rsi, %rdx
  434. mov %rdi, %rcx
  435. {$endif win64}
  436. cmp $3, %rdx
  437. jle .L3OrLess
  438. cmp $8, %rdx
  439. jle .L4to8
  440. movd %eax, %xmm0
  441. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  442. movdqu %xmm0, (%rcx)
  443. shl $2, %rdx { rdx = byte count }
  { rcx is borrowed to form the rotate count in cl (address * 8) and restored from r8 afterwards. }
  444. mov %rcx, %r8
  445. shl $3, %ecx
  446. rol %cl, %eax { misalign the pattern by the misalignment of x }
  447. mov %r8, %rcx
  448. movd %eax, %xmm1
  449. pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
  450. jmp FillXxxx_MoreThanTwoXmms
  451. .L4to8:
  452. {$ifndef win64} { on win64, eax = r8d already. }
  453. mov %eax, %r8d
  454. {$endif}
  { Duplicate the dword into both halves of rax. }
  455. shl $32, %r8
  456. or %r8, %rax
  457. mov %rax, (%rcx)
  458. mov %rax, 8(%rcx)
  459. mov %rax, -16(%rcx,%rdx,4)
  460. mov %rax, -8(%rcx,%rdx,4)
  461. ret
  462. .L3OrLess:
  463. test %rdx, %rdx
  464. jle .LQuit
  465. mov %eax, (%rcx)
  466. mov %eax, -4(%rcx,%rdx,4)
  467. shr $1, %edx
  468. mov %eax, (%rcx,%rdx,4)
  469. .LQuit:
  470. end;
  471. {$endif FPC_SYSTEM_HAS_FILLDWORD}
  472. {$ifndef FPC_SYSTEM_HAS_FILLQWORD}
  473. {$define FPC_SYSTEM_HAS_FILLQWORD}
  { FillQWord stores the 64-bit 'value' into the first 'count' qwords of x. count <= 0 writes nothing.
    After the prologue: rcx = x, rdx = count (in qwords), rax = value.
      1..2 qwords : first and last qword stores (the same location when count = 1);
      3..6 qwords : three qword stores from the start and three from the end, possibly overlapping;
      7+ qwords   : first 16 bytes stored here, then FillXxxx_MoreThanTwoXmms with rdx converted to bytes
                    and xmm1 holding the pattern rotated to match the misalignment of x. }
  474. procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
  475. asm
  476. {$ifdef win64}
  477. mov %r8, %rax
  478. {$else}
  479. mov %rdx, %rax
  480. mov %rsi, %rdx
  481. mov %rdi, %rcx
  482. {$endif win64}
  483. cmp $2, %rdx
  484. jle .L2OrLess
  485. cmp $6, %rdx
  486. jle .L3to6
  487. movq %rax, %xmm0
  488. pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  489. movdqu %xmm0, (%rcx)
  490. shl $3, %rdx { rdx = byte count }
  { rcx is borrowed to form the rotate count in cl (address * 8) and restored from r8 afterwards. }
  491. mov %rcx, %r8
  492. shl $3, %ecx
  493. rol %cl, %rax { misalign the pattern by the misalignment of x }
  494. mov %r8, %rcx
  495. movq %rax, %xmm1
  496. pshufd $0b01000100, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
  497. jmp FillXxxx_MoreThanTwoXmms
  498. .L3to6:
  499. mov %rax, (%rcx)
  500. mov %rax, 8(%rcx)
  501. mov %rax, 16(%rcx)
  502. mov %rax, -24(%rcx,%rdx,8)
  503. mov %rax, -16(%rcx,%rdx,8)
  504. mov %rax, -8(%rcx,%rdx,8)
  505. ret
  506. .L2OrLess:
  507. test %rdx, %rdx
  508. jle .LQuit
  509. mov %rax, (%rcx)
  510. mov %rax, -8(%rcx,%rdx,8)
  511. .LQuit:
  512. end;
  513. {$endif FPC_SYSTEM_HAS_FILLQWORD}
  514. {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  515. {$define FPC_SYSTEM_HAS_INDEXBYTE}
  { IndexByte returns the zero-based index of the first byte equal to b among the first len bytes of buf, or -1 if there is none.
    len = 0 returns -1. All length comparisons are unsigned (ja / jbe), so a negative len acts as a very large length.
    The buffer is read in whole 16-byte aligned blocks: the first load starts at buf rounded down to 16 and
    the last one may extend past buf + len inside its aligned block; matches outside the requested range are masked off or rejected. }
  516. function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
  517. { win64: rcx buf, rdx len, r8b b
  518. linux: rdi buf, rsi len, dl b }
  519. asm
  520. test len, len
  521. jz .Lnotfound { exit if len=0 }
  522. movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
  523. {$ifdef win64}
  524. mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
  525. add $16, %rcx
  526. {$else}
  527. lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
  528. {$endif}
  { The two punpcklbw plus pshufd broadcast the low byte of xmm1 to all 16 byte lanes. }
  529. punpcklbw %xmm1, %xmm1
  530. and $-0x10, %rcx { first aligned address after buf }
  531. punpcklbw %xmm1, %xmm1
  532. pshufd $0, %xmm1, %xmm1
  533. movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
  534. sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
  535. pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
  536. pmovmskb %xmm0, %eax
  537. shl %cl, %eax { shift valid bits into high word }
  538. and $0xffff0000, %eax { clear low word containing invalid bits }
  539. shr %cl, %eax { shift back }
  540. jz .Lcontinue
  541. .Lmatch:
  { rcx = offset just past the block that produced the mask, so the match index is rcx - 16 + bit number. }
  542. bsf %eax, %eax
  543. lea -16(%rcx,%rax), %rax
  544. cmp %rax, len { check against the buffer length }
  545. jbe .Lnotfound
  546. ret
  547. .balign 16
  548. .Lloop:
  549. movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
  550. add $16, %rcx { but their sum is evenly divisible by 16. }
  551. pcmpeqb %xmm1, %xmm0
  552. pmovmskb %xmm0, %eax
  553. test %eax, %eax
  554. jnz .Lmatch
  555. .Lcontinue:
  556. cmp %rcx, len
  557. ja .Lloop
  558. .Lnotfound:
  559. or $-1, %rax
  560. end;
  561. {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  562. {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  563. {$define FPC_SYSTEM_HAS_INDEXWORD}
  { IndexWord returns the zero-based index (in words) of the first word equal to b among the first len words of buf, or -1 if there is none.
    len = 0 returns -1. Length comparisons are unsigned (ja / jbe).
    The buffer is read in whole 16-byte aligned blocks, as in IndexByte.
      - buf at an even address: word compares (pcmpeqw); rcx is converted from bytes to words after the first block.
      - buf at an odd address (.Lunaligned): byte compares against the byte-swapped pattern; a word matches when two
        adjacent byte-mask bits are set, and r10d carries the top bit of one block into the next
        so that a word straddling two blocks is still detected. In this path len and rcx are in bytes. }
  564. function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
  565. { win64: rcx buf, rdx len, r8w b
  566. linux: rdi buf, rsi len, dx b }
  567. asm
  568. test len, len
  569. jz .Lnotfound { exit if len=0 }
  570. movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
  571. {$ifdef win64}
  572. mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
  573. add $16, %rcx
  574. {$else}
  575. lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
  576. {$endif}
  { punpcklwd plus pshufd broadcast the low word of xmm1 to all 8 word lanes. }
  577. punpcklwd %xmm1, %xmm1
  578. and $-0x10, %rcx
  579. pshufd $0, %xmm1, %xmm1
  580. movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
  581. sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes }
  582. test $1, {$ifdef win64} %r8b {$else} %dil {$endif} { if buffer isn't aligned to word boundary, }
  583. jnz .Lunaligned { use a different algorithm }
  584. pcmpeqw %xmm1, %xmm0
  585. pmovmskb %xmm0, %eax
  586. shl %cl, %eax
  587. and $0xffff0000, %eax
  588. shr %cl, %eax
  589. shr $1, %ecx { bytes->words }
  590. test %eax, %eax
  591. jz .Lcontinue
  592. .Lmatch:
  593. bsf %eax, %eax
  594. shr $1, %eax { in words }
  595. lea -8(%rcx,%rax), %rax
  596. cmp %rax, len
  597. jbe .Lnotfound { if match is after the specified length, ignore it }
  598. retq
  599. .balign 16
  600. .Lloop:
  601. movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx,2), %xmm0
  602. add $8, %rcx
  603. pcmpeqw %xmm1, %xmm0
  604. pmovmskb %xmm0, %eax
  605. test %eax, %eax
  606. jnz .Lmatch
  607. .Lcontinue:
  608. cmp %rcx, len
  609. ja .Lloop
  610. .Lnotfound:
  611. or $-1, %rax
  612. retq
  613. .Lunaligned:
  614. movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
  615. psllw $8, %xmm1 { swap bytes of each word of pattern) }
  616. psrlw $8, %xmm2
  617. por %xmm2, %xmm1
  618. pcmpeqb %xmm1, %xmm0
  619. pmovmskb %xmm0, %eax
  620. shl %cl, %eax
  621. and $0xffff0000, %eax
  622. shr %cl, %eax
  623. add len, len { length words -> bytes }
  624. xor %r10d, %r10d { nothing to merge yet }
  625. jmp .Lcontinue_u
  626. .balign 16
  627. .Lloop_u:
  628. movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0
  629. add $16, %rcx
  630. pcmpeqb %xmm1, %xmm0 { compare by bytes }
  631. shr $16, %r10d { bit 16 shifts into 0 }
  632. pmovmskb %xmm0, %eax
  633. .Lcontinue_u:
  634. shl $1, %eax { 15:0 -> 16:1 }
  635. or %r10d, %eax { merge bit 0 from previous round }
  636. mov %eax, %r10d
  637. shr $1, %eax { now AND together adjacent pairs of bits }
  638. and %r10d, %eax
  639. and $0x5555, %eax { also reset odd bits }
  640. jnz .Lmatch_u
  641. cmpq %rcx, len
  642. ja .Lloop_u
  643. .Lnotfound_u:
  644. or $-1, %rax
  645. retq
  646. .Lmatch_u:
  647. bsf %eax, %eax
  648. lea -16(%rcx,%rax), %rax
  649. cmp %rax, len
  650. jbe .Lnotfound_u { if match is after the specified length, ignore it }
  651. sar $1, %rax { in words }
  652. end;
  653. {$endif FPC_SYSTEM_HAS_INDEXWORD}
  654. {$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
  655. {$define FPC_SYSTEM_HAS_INDEXDWORD}
  { IndexDWord returns the zero-based index (in dwords) of the first dword equal to b among the first len dwords of buf, or -1 if there is none.
    After the prologue: rax = current pointer, rdx = len, r8d = b; the original buf stays in rcx (win64) or rdi (others).
      - len <= 4 (signed compare): one dword at a time. That loop counts down with sub / jae, so a negative len
        is effectively treated as unbounded.
      - len > 4: four dwords per iteration with unaligned 16-byte loads; the final load is positioned to end exactly at
        buf + len * 4, so it may re-check dwords that were already compared but never reads past the end. }
  656. function IndexDWord(Const buf;len:SizeInt;b:dword):SizeInt; assembler; nostackframe;
  657. asm
  658. {$ifdef win64}
  659. mov %rcx, %rax
  660. {$else}
  661. mov %rdx, %r8
  662. mov %rsi, %rdx
  663. mov %rdi, %rax
  664. {$endif}
  665. cmp $4, %rdx
  666. jle .LDwordwise_Prepare
  { rdx is biased by -4 so that it reaches <= 0 when one (possibly partial) vector is left. }
  667. sub $4, %rdx
  668. movd %r8d, %xmm1
  669. pshufd $0, %xmm1, %xmm1
  670. .balign 16
  671. .L4x_Body:
  672. movdqu (%rax), %xmm0
  673. pcmpeqd %xmm1, %xmm0
  674. pmovmskb %xmm0, %r8d
  675. test %r8d, %r8d
  676. jnz .LFoundAtMask
  677. add $16, %rax
  678. sub $4, %rdx
  679. jg .L4x_Body
  { rdx is now in -3..0: step rax back by that many dwords so the last vector ends at the end of the buffer. }
  680. lea (%rax,%rdx,4), %rax
  681. movdqu (%rax), %xmm0
  682. pcmpeqd %xmm1, %xmm0
  683. pmovmskb %xmm0, %r8d
  684. test %r8d, %r8d
  685. jnz .LFoundAtMask
  686. or $-1, %rax
  687. ret
  688. .balign 16 { no-op }
  689. .LDwordwise_Body:
  690. cmp (%rax), %r8d
  691. je .LFoundAtRax
  692. add $4, %rax
  693. .LDwordwise_Prepare:
  694. sub $1, %rdx
  695. jae .LDwordwise_Body
  696. or $-1, %rax
  697. ret
  698. .LFoundAtMask:
  { The lowest set bit of the byte mask is the byte offset of the first matching dword within the vector. }
  699. bsf %r8d, %r8d
  700. add %r8, %rax
  701. .LFoundAtRax:
  { Convert the address of the match into a dword index relative to buf. }
  702. sub {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
  703. shr $2, %rax
  704. end;
  705. {$endif FPC_SYSTEM_HAS_INDEXDWORD}
  706. {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
  707. {$define FPC_SYSTEM_HAS_INDEXQWORD}
  { IndexQWord returns the zero-based index (in qwords) of the first qword equal to b among the first len qwords of buf,
    or -1 when no such qword exists (including len = 0).
    The remaining length is counted down with sub / jb, so a negative len is not treated as empty. }
  function IndexQWord(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
  { win64: rcx=buf, rdx=len, r8=b
    else: rdi=buf, rsi=len, rdx=b }
  asm
          lea     -8({$ifdef win64} %rcx {$else} %rdi {$endif}), %rax    { rax = buf - 8; the loop pre-increments }
          .balign 16
  .LScan:
          add     $8, %rax                                                { advance to the next candidate qword }
          sub     $1, {$ifdef win64} %rdx {$else} %rsi {$endif}           { one fewer qword left; borrow means none were left }
          jb      .LMiss
          cmp     {$ifdef win64} %r8 {$else} %rdx {$endif}, (%rax)
          jne     .LScan
          sub     {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax         { byte offset of the match from buf ... }
          shr     $3, %rax                                                { ... converted to a qword index }
          ret
  .LMiss:
          or      $-1, %rax                                               { not found }
  end;
  727. {$endif FPC_SYSTEM_HAS_INDEXQWORD}
  728. {$endif not oldbinutils}
  729. {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  730. {$define FPC_SYSTEM_HAS_COMPAREBYTE}
  { CompareByte compares the first len bytes of buf1 and buf2.
    Returns 0 when they are equal (or len = 0), a negative value when the first differing byte of buf1 is smaller
    (unsigned) than that of buf2, and a positive value when it is larger. The magnitude depends on the path taken:
    the vector paths return the difference of the two bytes, other paths return -1 / +1 or a difference of packed values.
      len = 1     : direct byte difference;
      len 2..15   : one 16-byte over-read of both buffers when neither pointer is within the last 16 bytes of a 4 KB page
                    (differences at or beyond len are ignored), otherwise scalar compares of overlapping head and tail pieces;
      len >= 16   : first two vectors, then a 32-bytes-per-iteration loop with buf1 aligned to 16, then the last two vectors;
      len < 0     : bytes are compared without any bound until the first difference is found. }
  731. function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  732. { win64: rcx buf, rdx buf, r8 len
  733. linux: rdi buf, rsi buf, rdx len }
  734. asm
  735. {$ifndef win64}
  736. mov %rdx, %r8
  737. mov %rsi, %rdx
  738. mov %rdi, %rcx
  739. {$endif win64}
  740. { rcx = buf1, rdx = buf2, r8 = len }
  741. cmp $1, %r8
  742. jle .L1OrLess
  743. cmp $16, %r8
  744. jae .LVecOrMore
  745. { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but is faster. }
  746. mov %ecx, %eax
  747. or %edx, %eax
  748. and $4095, %eax
  749. cmp $4080, %eax
  750. ja .LCantOverReadBoth
  751. { Over-read both as XMMs. }
  752. movdqu (%rcx), %xmm0
  753. movdqu (%rdx), %xmm1
  754. pcmpeqb %xmm1, %xmm0
  755. pmovmskb %xmm0, %eax
  { The mask is 0xFFFF when all 16 bytes are equal; adding 1 turns that into 0 in ax and otherwise leaves the lowest set bit at the first difference. }
  756. inc %ax
  757. jz .LNothing
  758. bsf %eax, %eax
  759. cmp %r8d, %eax { Ignore garbage beyond 'len'. }
  760. jae .LNothing
  761. movzbl (%rdx,%rax), %edx
  762. movzbl (%rcx,%rax), %eax
  763. sub %rdx, %rax
  764. ret
  765. .balign 16
  766. .LNothing:
  767. xor %eax, %eax
  768. ret
  769. .LAligned32xLoop_TwoVectorsDiffer:
  770. add %rcx, %rdx { restore rdx = buf2 }
  771. pmovmskb %xmm0, %r8d { Is there a difference in the first vector? }
  772. inc %r8w
  773. jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, eax = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
  774. mov %r8d, %eax
  775. .LVec0Differs:
  776. bsf %eax, %eax
  777. movzbl (%rdx,%rax), %edx
  778. movzbl (%rcx,%rax), %eax
  779. sub %rdx, %rax
  780. ret
  781. .byte 0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
  782. .LVecOrMore:
  783. { Compare first vectors. }
  784. movdqu (%rcx), %xmm0
  785. movdqu (%rdx), %xmm1
  786. pcmpeqb %xmm1, %xmm0
  787. pmovmskb %xmm0, %eax
  788. inc %ax
  789. jnz .LVec0Differs
  790. sub $32, %r8
  791. jbe .LLastVec
  792. { Compare second vectors. }
  793. movdqu 16(%rcx), %xmm0
  794. movdqu 16(%rdx), %xmm1
  795. pcmpeqb %xmm1, %xmm0
  796. pmovmskb %xmm0, %eax
  797. inc %ax
  798. jnz .LVec1Differs
  799. cmp $32, %r8
  800. jbe .LLastTwoVectors
  801. { More than four vectors: aligned loop. }
  802. lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
  803. sub %rcx, %rdx { rdx = buf2 - buf1 }
  804. and $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
  805. sub %rcx, %r8 { r8 = count to be handled with loop }
  806. .balign 16 { no-op }
  807. .LAligned32xLoop_Body:
  808. add $32, %rcx
  809. { Compare two XMMs, reduce the result with 'and'. }
  810. movdqu (%rdx,%rcx), %xmm0
  811. pcmpeqb (%rcx), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
  812. movdqu 16(%rdx,%rcx), %xmm1
  813. pcmpeqb 16(%rcx), %xmm1
  814. pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
  815. pmovmskb %xmm1, %eax
  816. inc %ax
  817. jnz .LAligned32xLoop_TwoVectorsDiffer
  818. sub $32, %r8
  819. ja .LAligned32xLoop_Body
  820. add %rcx, %rdx { restore rdx = buf2 }
  821. add $32, %r8
  822. .LLastTwoVectors:
  823. movdqu (%rcx,%r8), %xmm0
  824. movdqu (%rdx,%r8), %xmm1
  825. pcmpeqb %xmm1, %xmm0
  826. pmovmskb %xmm0, %eax
  827. inc %ax
  828. jnz .LVecEm2Differs
  829. .LLastVec:
  830. movdqu 16(%rcx,%r8), %xmm0
  831. movdqu 16(%rdx,%r8), %xmm1
  832. pcmpeqb %xmm1, %xmm0
  833. pmovmskb %xmm0, %eax
  834. inc %ax
  835. jnz .LVecEm1Differs
  836. xor %eax, %eax
  837. ret
  { The three labels below fall through into each other; r8 ends up as the byte offset of the differing vector. }
  838. .LVec1Differs:
  839. xor %r8d, %r8d
  840. .LVecEm1Differs:
  841. add $16, %r8
  842. .LVecEm2Differs:
  843. bsf %eax, %eax
  844. add %r8, %rax
  845. movzbl (%rdx,%rax), %edx
  846. movzbl (%rcx,%rax), %eax
  847. sub %rdx, %rax
  848. ret
  849. .LCantOverReadBoth:
  { Scalar fallback for 2..15 bytes: compare an overlapping head and tail piece of 8, 4 or 2+1 bytes. }
  850. cmp $8, %r8d
  851. ja .L9to15
  852. cmp $3, %r8d
  853. jle .L2to3
  854. mov (%rcx), %eax
  855. mov (%rdx), %r9d
  856. cmp %r9d, %eax
  857. jne .L4xOr8xDiffer
  858. mov -4(%rcx,%r8), %eax
  859. mov -4(%rdx,%r8), %r9d
  860. cmp %r9d, %eax
  861. jne .L4xOr8xDiffer
  862. xor %eax, %eax
  863. ret
  864. .L9to15:
  865. mov (%rcx), %rax
  866. mov (%rdx), %r9
  867. cmp %r9, %rax
  868. jne .L4xOr8xDiffer
  869. mov -8(%rcx,%r8), %rax
  870. mov -8(%rdx,%r8), %r9
  871. cmp %r9, %rax
  872. jne .L4xOr8xDiffer
  873. xor %eax, %eax
  874. ret
  875. .L4xOr8xDiffer:
  { bswap puts the byte at the lowest address into the most significant position, so an unsigned compare orders
    the pieces by their first differing byte; sbb / or then yields -1 or +1. }
  876. bswap %r9
  877. bswap %rax
  878. cmp %r9, %rax
  879. sbb %rax, %rax
  880. or $1, %rax
  881. ret
  882. .L2to3:
  { Pack the first two bytes (most significant first) above the last byte for each buffer and subtract the packed values. }
  883. movzwl (%rcx), %eax
  884. bswap %eax
  885. shr $1, %eax
  886. mov -1(%rcx,%r8), %al
  887. movzwl (%rdx), %ecx
  888. bswap %ecx
  889. shr $1, %ecx
  890. mov -1(%rdx,%r8), %cl
  891. sub %rcx, %rax
  892. ret
  893. .L1OrLess:
  { Still using the flags of 'cmp $1, %r8': less than 1 goes to the zero / negative handling, exactly 1 compares one byte. }
  894. jl .LUnbounded_Prepare
  895. movzbl (%rcx), %eax
  896. movzbl (%rdx), %edx
  897. sub %rdx, %rax
  898. ret
  899. .LUnbounded_Prepare:
  900. sub %rcx, %rdx { rdx = buf2 - buf1 }
  901. test %r8, %r8
  902. jnz .LUnbounded_Body
  903. xor %eax, %eax
  904. ret
  905. .balign 16
  906. .LUnbounded_Next:
  907. add $1, %rcx
  908. .LUnbounded_Body:
  { Negative len: this loop has no length check and only stops at the first differing byte. }
  909. movzbl (%rdx,%rcx), %eax
  910. cmp %al, (%rcx)
  911. je .LUnbounded_Next
  912. sbb %rax, %rax
  913. or $1, %rax
  914. end;
  915. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  916. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  917. {$define FPC_SYSTEM_HAS_COMPAREWORD}
  { CompareWord compares the first len 16-bit words of buf1 and buf2.
    Returns 0 when they are equal, a negative value when the first differing word of buf1 is smaller (unsigned)
    than that of buf2, and a positive value when it is larger. The vector paths return the difference of the two words,
    the word-wise loop returns -1 / +1.
    After the prologue: rcx = buf1, rdx = buf2 - buf1, r8 = len.
      - len <= 1, or len with any of its top two bits set (this includes negative values): word-wise loop,
        counted down with sub / jae;
      - len 2..7: one 16-byte over-read of both buffers when neither pointer is within the last 16 bytes of a 4 KB page,
        otherwise the word-wise loop;
      - len >= 8: first vector, then a 16-bytes-per-iteration loop with buf1 aligned to 16 that compares bytes,
        then the last vector. }
  918. function CompareWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  919. asm
  920. {$ifndef win64}
  921. mov %rdx, %r8
  922. mov %rsi, %rdx
  923. mov %rdi, %rcx
  924. {$endif win64}
  925. sub %rcx, %rdx { rdx = buf2 - buf1 }
  926. cmp $1, %r8
  927. jle .LWordwise_Prepare
  928. mov %r8, %rax
  929. shr $62, %rax
  930. jnz .LWordwise_Prepare
  931. cmp $8, %r8
  932. jge .LVecOrMore
  { 2 to 7 words: pessimistic page-cross check on both pointers before over-reading 16 bytes. }
  933. lea (%rdx,%rcx), %eax
  934. or %ecx, %eax
  935. and $4095, %eax
  936. cmp $4080, %eax
  937. ja .LWordwise_Prepare
  938. movdqu (%rdx,%rcx), %xmm0
  939. movdqu (%rcx), %xmm1
  940. pcmpeqw %xmm1, %xmm0
  941. pmovmskb %xmm0, %eax
  942. shl $1, %r8 { convert to bytes }
  943. inc %ax
  944. jz .LNothing
  945. bsf %eax, %eax
  { Differences at or beyond len * 2 bytes come from the over-read and are ignored. }
  946. cmp %r8d, %eax
  947. jb .LSubtractWords
  948. .LNothing:
  949. xor %eax, %eax
  950. ret
  951. .balign 16
  952. .LWordwise_Body:
  953. movzwl (%rdx,%rcx), %eax
  954. cmp %ax, (%rcx)
  955. jne .LDoSbb
  956. add $2, %rcx
  957. .LWordwise_Prepare:
  958. sub $1, %r8
  959. jae .LWordwise_Body
  960. xor %eax, %eax
  961. ret
  962. .LDoSbb:
  { The carry from 'cmp %ax, (%rcx)' is set when the buf1 word is below the buf2 word: result -1, otherwise +1. }
  963. sbb %rax, %rax
  964. or $1, %rax
  965. ret
  966. .LVec0Differs:
  967. bsf %eax, %eax
  968. .LSubtractWords:
  969. add %rcx, %rdx { recover rdx = buf2 }
  970. movzwl (%rdx,%rax), %edx
  971. movzwl (%rcx,%rax), %eax
  972. sub %rdx, %rax
  973. ret
  974. .LVecOrMore:
  975. movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
  976. movdqu (%rcx), %xmm1
  977. pcmpeqw %xmm1, %xmm0
  978. pmovmskb %xmm0, %eax
  979. inc %ax
  980. jnz .LVec0Differs
  981. shl $1, %r8 { convert to bytes }
  982. sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  983. jle .LLastVec
  984. mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
  985. add %rcx, %r8
  986. and $-16, %rcx { align buf1; +16 is performed by the loop. }
  987. sub %rcx, %r8
  988. .balign 16
  989. .LAligned8xLoop_Body:
  990. add $16, %rcx
  991. movdqu (%rdx,%rcx), %xmm0
  992. pcmpeqb (%rcx), %xmm0
  993. pmovmskb %xmm0, %eax
  994. inc %ax
  995. jnz .LAligned8xLoop_VecDiffers
  996. sub $16, %r8
  997. ja .LAligned8xLoop_Body
  998. .LLastVec:
  999. lea 16(%rcx,%r8), %rcx { point to the last 16 bytes }
  1000. movdqu (%rdx,%rcx), %xmm0
  1001. movdqu (%rcx), %xmm1
  1002. pcmpeqw %xmm1, %xmm0
  1003. pmovmskb %xmm0, %eax
  1004. inc %ax
  1005. jnz .LVec0Differs
  1006. xor %eax, %eax
  1007. ret
  1008. .LAligned8xLoop_VecDiffers:
  { rcx + bit index is the address of the first differing byte; round its distance from the original buf1 (r9)
    down to an even number to get the start of the containing word. }
  1009. bsf %eax, %eax
  1010. add %rax, %rcx
  1011. sub %r9, %rcx
  1012. and $-2, %rcx
  1013. add %r9, %rcx
  1014. movzwl (%rdx,%rcx), %edx
  1015. movzwl (%rcx), %eax
  1016. sub %rdx, %rax
  1017. end;
  1018. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1019. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1020. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
  function CompareDWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  { Compares len 32-bit values of buf1 against buf2.
    Returns 0 when no differing dword is found, otherwise -1 or +1 according to
    the unsigned comparison of the first differing dword pair.

    Register roles after the prologue:
      rcx = current buf1 position, rdx = buf2 - buf1, r8 = remaining count. }
  asm
  {$ifndef win64}
      { Non-Win64: move the three arguments into the registers the body uses. }
      mov      %rdx, %r8
      mov      %rsi, %rdx
      mov      %rdi, %rcx
  {$endif win64}
      sub      %rcx, %rdx              { rdx = buf2 - buf1 }
      cmp      $4, %r8
      jle      .LScalarCheck           { len <= 4 (signed): dword-by-dword loop }
      mov      %r8, %rax
      shr      $61, %rax
      jnz      .LScalarCheck           { one of the top three bits of len is set }
      movdqu   (%rdx,%rcx), %xmm0      { compare the first 16 bytes }
      movdqu   (%rcx), %xmm1
      pcmpeqd  %xmm1, %xmm0
      pmovmskb %xmm0, %eax
      inc      %ax                     { mask 0xFFFF (all equal) wraps to 0 }
      jnz      .LVectorMismatch
      shl      $2, %r8                 { len in bytes }
      sub      $32, %r8                { first 16 bytes done, last 16 bytes handled at .LTail }
      jle      .LTail
      mov      %rcx, %r9               { keep original buf1: needed to round a byte hit down to a dword }
      add      %rcx, %r8
      and      $-16, %rcx              { align buf1; the loop adds 16 before use }
      sub      %rcx, %r8
      .balign 16
  .LBulkLoop:
      add      $16, %rcx
      movdqu   (%rdx,%rcx), %xmm0
      pcmpeqb  (%rcx), %xmm0           { byte compare against the aligned buf1 position }
      pmovmskb %xmm0, %eax
      inc      %ax
      jnz      .LBulkMismatch
      sub      $16, %r8
      ja       .LBulkLoop
  .LTail:
      lea      16(%rcx,%r8), %rcx      { point to the last 16 bytes }
      movdqu   (%rdx,%rcx), %xmm0
      movdqu   (%rcx), %xmm1
      pcmpeqd  %xmm1, %xmm0
      pmovmskb %xmm0, %eax
      inc      %ax
      jnz      .LVectorMismatch
      xor      %eax, %eax
      ret

  .LVectorMismatch:
      bsf      %eax, %eax              { byte offset of the first differing dword }
      add      %rcx, %rdx              { recover rdx = buf2 }
      mov      (%rdx,%rax), %edx
      cmp      %edx, (%rcx,%rax)       { buf1 dword - buf2 dword }
      sbb      %rax, %rax
      or       $1, %rax
      ret

  .LBulkMismatch:
      bsf      %eax, %eax              { index of the first differing byte }
      add      %rax, %rcx
      sub      %r9, %rcx               { offset from the original buf1 ... }
      and      $-4, %rcx               { ... rounded down to a dword boundary }
      add      %r9, %rcx
      mov      (%rdx,%rcx), %edx
      cmp      %edx, (%rcx)            { buf1 dword - buf2 dword }
  .LSignFromCarry:
      sbb      %rax, %rax              { -1 if buf1 dword was below, else 0 }
      or       $1, %rax                { -> -1 or +1 }
      ret

      .balign 16
  .LScalarLoop:
      mov      (%rdx,%rcx), %eax       { buf2 dword }
      cmp      %eax, (%rcx)
      jne      .LSignFromCarry
      add      $4, %rcx
  .LScalarCheck:
      sub      $1, %r8
      jae      .LScalarLoop            { no borrow: one more dword to compare }
      xor      %eax, %eax
  end;
  1098. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1099. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
  1100. { thread-safe increment/decrement helpers }
  function declocked(var l : longint) : boolean;assembler; nostackframe;
  { Decrements l by one and returns True when the new value is zero.
    The LOCK prefix is only executed while IsMultithread is non-zero, because a
    locked instruction is expensive. }
  asm
  {$ifdef FPC_PIC}
      movq     IsMultithread@GOTPCREL(%rip),%rax
      cmpl     $0,(%rax)
  {$else FPC_PIC}
      cmpl     $0,IsMultithread(%rip)
  {$endif FPC_PIC}
      jz       .LPlainDec              { jump over the prefix byte below }
      .byte    0xF0                    // LOCK prefix.
  .LPlainDec:
  {$ifdef win64}
      decl     (%rcx)
  {$else}
      decl     (%rdi)
  {$endif}
      setzb    %al                     { result := (new value = 0) }
  end;
  1117. {$define FPC_SYSTEM_HAS_DECLOCKED_INT64}
  function declocked(var l : int64) : boolean;assembler; nostackframe;
  { 64-bit variant: decrements l by one and returns True when the new value is
    zero. The LOCK prefix is only executed while IsMultithread is non-zero,
    because a locked instruction is expensive. }
  asm
  {$ifdef FPC_PIC}
      movq     IsMultithread@GOTPCREL(%rip),%rax
      cmpl     $0,(%rax)
  {$else FPC_PIC}
      cmpl     $0,IsMultithread(%rip)
  {$endif FPC_PIC}
      jz       .LPlainDec64            { jump over the prefix byte below }
      .byte    0xF0                    // LOCK prefix.
  .LPlainDec64:
  {$ifdef win64}
      decq     (%rcx)
  {$else}
      decq     (%rdi)
  {$endif}
      setzb    %al                     { result := (new value = 0) }
  end;
  1134. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
  procedure inclocked(var l : longint);assembler; nostackframe;
  { Increments l by one. The LOCK prefix is only executed while IsMultithread
    is non-zero, because a locked instruction is expensive. }
  asm
  {$ifdef FPC_PIC}
      movq     IsMultithread@GOTPCREL(%rip),%rax
      cmpl     $0,(%rax)
  {$else FPC_PIC}
      cmpl     $0,IsMultithread(%rip)
  {$endif FPC_PIC}
      jz       .LPlainInc              { jump over the prefix byte below }
      .byte    0xF0                    // LOCK prefix.
  .LPlainInc:
  {$ifdef win64}
      incl     (%rcx)
  {$else}
      incl     (%rdi)
  {$endif}
  end;
  1150. {$define FPC_SYSTEM_HAS_INCLOCKED_INT64}
  procedure inclocked(var l : int64);assembler; nostackframe;
  { 64-bit variant: increments l by one. The LOCK prefix is only executed while
    IsMultithread is non-zero, because a locked instruction is expensive. }
  asm
  {$ifdef FPC_PIC}
      movq     IsMultithread@GOTPCREL(%rip),%rax
      cmpl     $0,(%rax)
  {$else FPC_PIC}
      cmpl     $0,IsMultithread(%rip)
  {$endif FPC_PIC}
      jz       .LPlainInc64            { jump over the prefix byte below }
      .byte    0xF0                    // LOCK prefix.
  .LPlainInc64:
  {$ifdef win64}
      incq     (%rcx)
  {$else}
      incq     (%rdi)
  {$endif}
  end;
  function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
  { Subtracts 1 from Target with a locked XADD and returns the decremented value. }
  asm
      movl     $-1,%eax
      lock
  {$ifdef win64}
      xaddl    %eax,(%rcx)             { eax := previous Target }
  {$else}
      xaddl    %eax,(%rdi)             { eax := previous Target }
  {$endif}
      decl     %eax                    { previous - 1 = value now stored }
  end;
  function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
  { Adds 1 to Target with a locked XADD and returns the incremented value. }
  asm
      movl     $1,%eax
      lock
  {$ifdef win64}
      xaddl    %eax,(%rcx)             { eax := previous Target }
  {$else}
      xaddl    %eax,(%rdi)             { eax := previous Target }
  {$endif}
      incl     %eax                    { previous + 1 = value now stored }
  end;
  function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  { Stores Source into Target and returns the value Target held before. }
  asm
  {$ifdef win64}
      movl     %edx,%eax
      xchgl    (%rcx),%eax             { eax := previous Target, Target := Source }
  {$else win64}
      movl     %esi,%eax
      xchgl    (%rdi),%eax             { eax := previous Target, Target := Source }
  {$endif win64}
  end;
  function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  { Adds Source to Target with a locked XADD and returns the value Target held
    before the addition. }
  asm
  {$ifdef win64}
      movl     %edx,%eax
      lock
      xaddl    %eax,(%rcx)             { eax := previous Target }
  {$else win64}
      movl     %esi,%eax
      lock
      xaddl    %eax,(%rdi)             { eax := previous Target }
  {$endif win64}
  end;
  function InterLockedCompareExchange(var Target: longint; NewValue, Comperand : longint): longint; assembler; nostackframe;
  { Locked CMPXCHG: when Target equals Comperand, NewValue is stored into Target.
    The result is the value Target held before the instruction. }
  asm
  {$ifndef win64}
      movl     %edx,%eax               { eax := Comperand }
      lock
      cmpxchgl %esi,(%rdi)
  {$else}
      movl     %r8d,%eax               { eax := Comperand }
      lock
      cmpxchgl %edx,(%rcx)
  {$endif}
  end;
  function InterLockedDecrement64 (var Target: int64) : int64; assembler; nostackframe;
  { Subtracts 1 from the 64-bit Target with a locked XADD and returns the
    decremented value. }
  asm
      movq     $-1,%rax
      lock
  {$ifdef win64}
      xaddq    %rax,(%rcx)             { rax := previous Target }
  {$else}
      xaddq    %rax,(%rdi)             { rax := previous Target }
  {$endif}
      decq     %rax                    { previous - 1 = value now stored }
  end;
  function InterLockedIncrement64 (var Target: int64) : int64; assembler; nostackframe;
  { Adds 1 to the 64-bit Target with a locked XADD and returns the incremented
    value. }
  asm
      movq     $1,%rax
      lock
  {$ifdef win64}
      xaddq    %rax,(%rcx)             { rax := previous Target }
  {$else}
      xaddq    %rax,(%rdi)             { rax := previous Target }
  {$endif}
      incq     %rax                    { previous + 1 = value now stored }
  end;
  function InterLockedExchange64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
  { Stores Source into the 64-bit Target and returns the value Target held before. }
  asm
  {$ifdef win64}
      movq     %rdx,%rax
      xchgq    (%rcx),%rax             { rax := previous Target, Target := Source }
  {$else win64}
      movq     %rsi,%rax
      xchgq    (%rdi),%rax             { rax := previous Target, Target := Source }
  {$endif win64}
  end;
  function InterLockedExchangeAdd64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
  { Adds Source to the 64-bit Target with a locked XADD and returns the value
    Target held before the addition. }
  asm
  {$ifdef win64}
      movq     %rdx,%rax
      lock
      xaddq    %rax,(%rcx)             { rax := previous Target }
  {$else win64}
      movq     %rsi,%rax
      lock
      xaddq    %rax,(%rdi)             { rax := previous Target }
  {$endif win64}
  end;
  function InterLockedCompareExchange64(var Target: int64; NewValue, Comperand : int64): int64; assembler; nostackframe;
  { Locked 64-bit CMPXCHG: when Target equals Comperand, NewValue is stored into
    Target. The result is the value Target held before the instruction. }
  asm
  {$ifndef win64}
      movq     %rdx,%rax               { rax := Comperand }
      lock
      cmpxchgq %rsi,(%rdi)
  {$else}
      movq     %r8,%rax                { rax := Comperand }
      lock
      cmpxchgq %rdx,(%rcx)
  {$endif}
  end;
  1262. {****************************************************************************
  1263. FPU
  1264. ****************************************************************************}
  const
    { Internal constants for use in system unit }

    { FPU_*: single-bit flags plus the mask covering the low eight bits
      (presumably x87 status-word exception flags - confirm against users). }
    FPU_Invalid        = $01;
    FPU_Denormal       = $02;
    FPU_DivisionByZero = $04;
    FPU_Overflow       = $08;
    FPU_Underflow      = $10;
    FPU_StackUnderflow = $20;
    FPU_StackOverflow  = $40;
    FPU_ExceptionMask  = $ff;

    { MM_*: single-bit flags in bits 0..5 plus their combined mask
      (presumably the MXCSR exception flags - confirm against users). }
    MM_Invalid         = $01;
    MM_Denormal        = $02;
    MM_DivisionByZero  = $04;
    MM_Overflow        = $08;
    MM_Underflow       = $10;
    MM_Precicion       = $20;
    MM_ExceptionMask   = $3f;

    { MM_Mask*: one bit each in bits 7..12. }
    MM_MaskInvalidOp   = $0080;
    MM_MaskDenorm      = $0100;
    MM_MaskDivZero     = $0200;
    MM_MaskOverflow    = $0400;
    MM_MaskUnderflow   = $0800;
    MM_MaskPrecision   = $1000;
  1288. {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
  procedure fpc_cpuinit;
  { CPU-specific start-up:
      - in a library, adopt the x87 control word and MXCSR currently in effect as
        the defaults, then (always) reset the FPU via SysResetFPU;
      - query CPUID and, when leaf 7 is available, record the ERMSB flag
        (only if use_fast_repmovstos is defined) and AVX/AVX2 availability.

    Every asm statement carries an explicit changed-register list: CPUID writes
    EAX, EBX, ECX and EDX, and RBX is a callee-saved register, so the compiler
    must be told that it is modified. }
    var
      max_leaf,xcr0_low,cpuid7_ebx,cpuid1_ecx : dword;
    begin
      { don't let libraries influence the FPU cw set by the host program }
      if IsLibrary then
        begin
          Default8087CW:=Get8087CW;
          DefaultMXCSR:=GetMXCSR;
        end;
      SysResetFPU;

      { CPUID leaf 0: EAX = highest supported standard leaf. }
      asm
        xorl %eax,%eax
        cpuid
        movl %eax,max_leaf
      end ['rax','rbx','rcx','rdx'];

      if max_leaf>=7 then
        begin
          { Leaf 1 -> ECX feature bits; leaf 7, sub-leaf 0 -> EBX feature bits. }
          asm
            movl $1,%eax
            xorl %ecx,%ecx
            cpuid
            movl %ecx,cpuid1_ecx
            movl $7,%eax
            xorl %ecx,%ecx
            cpuid
            movl %ebx,cpuid7_ebx
          end ['rax','rbx','rcx','rdx'];
  {$ifdef use_fast_repmovstos}
          fast_large_repmovstosb:=(cpuid7_ebx and (1 shl 9))<>0;
  {$endif}
          { XGETBV support? (CPUID(1).ecx bit 27) }
          if (cpuid1_ecx and $8000000)<>0 then
            begin
              { XGETBV with ECX=0; only the low half (EAX) is kept. }
              asm
                xorl %ecx,%ecx
                .byte 0x0f,0x01,0xd0 { xgetbv }
                movl %eax,xcr0_low
              end ['rax','rcx','rdx'];
              { Bits 1 and 2 must both be set before the AVX flags are trusted. }
              if (xcr0_low and 6)=6 then
                begin
                  has_avx_support:=(cpuid1_ecx and $10000000)<>0;
                  has_avx2_support:=(cpuid7_ebx and $20)<>0;
                end;
            end;
        end;
    end;
  1336. {$define FPC_SYSTEM_HAS_SYSINITFPU}
  Procedure SysInitFPU;
  { Empty on this target: the body performs no work. }
    begin
    end;
  1340. {$define FPC_SYSTEM_HAS_SYSRESETFPU}
  Procedure SysResetFPU;
  { Re-initialises the x87 unit (FNINIT) and then loads Default8087CW into the
    x87 control word and DefaultMXCSR into MXCSR. }
    var
      { Stack copies of the globals, so the assembler below does not need any
        PIC-specific addressing. }
      cw_copy    : word;
      mxcsr_copy : dword;
    begin
      cw_copy:=Default8087CW;
      mxcsr_copy:=DefaultMXCSR;
      asm
        fninit
        fwait
        fldcw   cw_copy
        ldmxcsr mxcsr_copy
      end;
    end;
  1356. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  1357. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
  procedure ReadBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  { Executes a single LFENCE instruction. }
  asm
      lfence
  end;
  procedure ReadDependencyBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  { Emits no instruction. }
  asm
      { reads imply barrier on earlier reads depended on }
  end;
  procedure ReadWriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  { Executes a single MFENCE instruction. }
  asm
      mfence
  end;
  procedure WriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  { Executes a single SFENCE instruction. }
  asm
      sfence
  end;
  1374. {$endif}
  1375. {****************************************************************************
  1376. Math Routines
  1377. ****************************************************************************}
  1378. {$define FPC_SYSTEM_HAS_SWAPENDIAN}
  1379. { SwapEndian(<16 Bit>) being inlined is faster than using assembler }
  function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  { Returns AValue with its two bytes exchanged. }
    var
      bits: Word;
    begin
      { Work on the unsigned bit pattern: shifting the signed value right would
        be done on a sign-extended longint and pull sign bits into the result
        instead of zeroes. }
      bits := Word(AValue);
      Result := SmallInt(((bits shr 8) or (bits shl 8)) and $ffff);
    end;
  function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
  { Returns AValue with its two bytes exchanged. }
    begin
      { low byte moves up, high byte moves down; the mask drops bits above 15 }
      Result := ((AValue shl 8) or (AValue shr 8)) and $ffff;
    end;
  function SwapEndian(const AValue: LongInt): LongInt; assembler; nostackframe;
  { Returns AValue with the order of its four bytes reversed (BSWAP). }
  asm
  {$ifndef win64}
      movl     %edi, %eax
  {$else}
      movl     %ecx, %eax
  {$endif}
      bswap    %eax
  end;
  function SwapEndian(const AValue: DWord): DWord; assembler; nostackframe;
  { Returns AValue with the order of its four bytes reversed (BSWAP). }
  asm
  {$ifndef win64}
      movl     %edi, %eax
  {$else}
      movl     %ecx, %eax
  {$endif}
      bswap    %eax
  end;
  function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
  { Returns AValue with the order of its eight bytes reversed (BSWAP). }
  asm
  {$ifndef win64}
      movq     %rdi, %rax
  {$else}
      movq     %rcx, %rax
  {$endif}
      bswap    %rax
  end;
  function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
  { Returns AValue with the order of its eight bytes reversed (BSWAP). }
  asm
  {$ifndef win64}
      movq     %rdi, %rax
  {$else}
      movq     %rcx, %rax
  {$endif}
      bswap    %rax
  end;
  1428. {$ifndef win64}
  1429. {$define FPC_SYSTEM_HAS_U128_DIV_U64_TO_U64}
  function u128_div_u64_to_u64( const xh, xl: qword; const y: qword; out quotient, remainder: qword ): boolean;nostackframe;assembler;
  { Divides the 128-bit value xh:xl by y.
    Returns False, leaving quotient and remainder untouched, when y <= xh;
    otherwise stores quotient and remainder and returns True.

    Register assignment (SysV):
      xh        -> RDI
      xl        -> RSI
      y         -> RDX
      quotient  -> RCX
      remainder -> R8 }
  asm
      cmpq     %rdi,%rdx
      jbe      .LNoDivide              { y <= xh: DIV is not attempted }
      movq     %rdx,%r9                { free RDX: DIV takes the dividend in RDX:RAX }
      movq     %rsi,%rax               { low half  }
      movq     %rdi,%rdx               { high half }
      divq     %r9
      movq     %rax,(%rcx)             { quotient^  }
      movq     %rdx,(%r8)              { remainder^ }
      movl     $1,%eax
      ret
  .LNoDivide:
      xorl     %eax,%eax
  end;
  1455. {$endif win64}