{ x86_64.inc }
  {
      This file is part of the Free Pascal run time library.
      Copyright (c) 2002 by Florian Klaempfl and Sergei Gorelkin
      Members of the Free Pascal development team

      Processor dependent implementation for the system unit for
      the x86-64 architecture

      See the file COPYING.FPC, included in this distribution,
      for details about the copyright.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

   **********************************************************************}
  {$asmmode GAS}

  {****************************************************************************
                                 Primitives
  ****************************************************************************}

  {$define FPC_SYSTEM_HAS_SPTR}
  { Returns the current contents of the stack pointer register (RSP). }
  Function Sptr : Pointer;assembler;{$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          movq    %rsp,%rax               { result := rsp }
  end;
  {$IFNDEF INTERNAL_BACKTRACE}
  {$define FPC_SYSTEM_HAS_GET_FRAME}
  { Returns the current contents of the frame pointer register (RBP).
    Only compiled when INTERNAL_BACKTRACE is not defined. }
  function get_frame:pointer;assembler;{$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          movq    %rbp,%rax               { result := rbp }
  end;
  {$ENDIF not INTERNAL_BACKTRACE}
  {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
  { Returns the pointer stored in the second pointer-sized slot of the frame
    at framebp (index 1), or nil when framebp is nil. }
  function get_caller_addr(framebp:pointer):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
    begin
      if assigned(framebp) then
        get_caller_addr:=PPointer(framebp)[1]
      else
        get_caller_addr:=nil;
    end;
  {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
  { Returns the pointer stored in the first pointer-sized slot of the frame
    at framebp, or nil when framebp is nil. }
  function get_caller_frame(framebp:pointer):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
    begin
      if assigned(framebp) then
        get_caller_frame:=PPointer(framebp)^
      else
        get_caller_frame:=nil;
    end;
  // The following assembler procedures are disabled for FreeBSD due to
  // multiple issues with its old GNU assembler (Mantis #19188).
  // Even after fixing them, it can be enabled only for the trunk version,
  // otherwise bootstrapping won't be possible.
  {$ifndef freebsd}

  {$ifndef FPC_SYSTEM_HAS_MOVE}
  {$define FPC_SYSTEM_HAS_MOVE}
  { Copies count bytes from source to dest; overlapping regions are handled
    by copying backwards when dest lies inside the source range.
    Nothing is copied when source and dest are the same address or when
    count <= 0.

    Incoming registers:
      Linux: rdi source, rsi dest, rdx count
      win64: rcx source, rdx dest, r8 count

    Working registers (after the non-win64 shuffle):
      rdx = current dest position
      rcx = source - dest, so (%rcx,%rdx,1) addresses the matching source byte
      r8  = bytes remaining
      r9  = block counter / scratch, rax and r10 = scratch }
  procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
  asm
  {$ifndef win64}
          mov     %rdx, %r8
          mov     %rsi, %rdx
          mov     %rdi, %rcx
  {$endif win64}

          mov     %r8, %rax
          sub     %rdx, %rcx              { rcx = src - dest }
          jz      .Lquit                  { src = dest: nothing to do }
          jnb     .L1                     { src > dest: forward copy is safe }
          add     %rcx, %rax              { rcx is negative; carry means count > dest - src }
          jb      .Lback                  { overlapping: copy backwards; otherwise fall into forward copy }

          { ---------------- forward copy ---------------- }
  .L1:
          cmp     $8, %r8
          jl      .Lless8f                { signed compare, negative count not allowed }
          test    $7, %dl
          je      .Ldestaligned
          { bring dest to an 8-byte boundary by copying 1, 2 and/or 4 bytes }
          test    $1, %dl
          je      .L2f
          mov     (%rcx,%rdx,1), %al
          dec     %r8
          mov     %al, (%rdx)
          add     $1, %rdx
  .L2f:
          test    $2, %dl
          je      .L4f
          mov     (%rcx,%rdx,1), %ax
          sub     $2, %r8
          mov     %ax, (%rdx)
          add     $2, %rdx
  .L4f:
          test    $4, %dl
          je      .Ldestaligned
          mov     (%rcx,%rdx,1), %eax
          sub     $4, %r8
          mov     %eax, (%rdx)
          add     $4, %rdx

  .Ldestaligned:
          mov     %r8, %r9
          shr     $5, %r9                 { r9 = number of 32-byte blocks }
          jne     .Lmore32

  .Ltail:                                 { fewer than 32 bytes remain }
          mov     %r8, %r9
          shr     $3, %r9                 { r9 = number of 8-byte words }
          je      .Lless8f
          .balign 16
  .Lloop8f:
          mov     (%rcx,%rdx,1), %rax
          mov     %rax, (%rdx)
          add     $8, %rdx
          dec     %r9
          jne     .Lloop8f
          and     $7, %r8

  .Lless8f:                               { byte-by-byte remainder }
          test    %r8, %r8
          jle     .Lquit
          .balign 16
  .Lloop1f:
          mov     (%rcx,%rdx,1), %al
          mov     %al, (%rdx)
          inc     %rdx
          dec     %r8
          jne     .Lloop1f
  .Lquit:
          retq

  .Lmore32:
          cmp     $0x2000, %r9            { this limit must be processor-specific (1/2 L2 cache size) }
          jnae    .Lloop32
          cmp     $0x1000, %rcx           { but don't bother bypassing cache if src and dest }
          jnb     .Lntloopf               { are close to each other }
          .balign 16
  .Lloop32:                               { 32 bytes per iteration through the cache }
          add     $32, %rdx
          mov     -32(%rcx,%rdx,1), %rax
          mov     -24(%rcx,%rdx,1), %r10
          mov     %rax, -32(%rdx)
          mov     %r10, -24(%rdx)
          dec     %r9                     { flags survive the moves below until jne }
          mov     -16(%rcx,%rdx,1), %rax
          mov     -8(%rcx,%rdx,1), %r10
          mov     %rax, -16(%rdx)
          mov     %r10, -8(%rdx)
          jne     .Lloop32
          and     $0x1f, %r8
          jmpq    .Ltail

  .Lntloopf:                              { one 4096-byte chunk with non-temporal stores }
          mov     $32, %eax
          .balign 16
  .Lpref:                                 { prefetch 32 * 128 bytes of source }
          prefetchnta (%rcx,%rdx,1)
          prefetchnta 0x40(%rcx,%rdx,1)
          add     $0x80, %rdx
          dec     %eax
          jne     .Lpref
          sub     $0x1000, %rdx           { rewind to the start of the chunk }
          mov     $64, %eax
          .balign 16
  .Loop64:                                { 64 iterations of 64 bytes }
          add     $64, %rdx
          mov     -64(%rcx,%rdx,1), %r9
          mov     -56(%rcx,%rdx,1), %r10
          movnti  %r9, -64(%rdx)
          movnti  %r10, -56(%rdx)
          mov     -48(%rcx,%rdx,1), %r9
          mov     -40(%rcx,%rdx,1), %r10
          movnti  %r9, -48(%rdx)
          movnti  %r10, -40(%rdx)
          dec     %eax                    { flags survive the moves below until jne }
          mov     -32(%rcx,%rdx,1), %r9
          mov     -24(%rcx,%rdx,1), %r10
          movnti  %r9, -32(%rdx)
          movnti  %r10, -24(%rdx)
          mov     -16(%rcx,%rdx,1), %r9
          mov     -8(%rcx,%rdx,1), %r10
          movnti  %r9, -16(%rdx)
          movnti  %r10, -8(%rdx)
          jne     .Loop64
          sub     $0x1000, %r8
          cmp     $0x1000, %r8
          jae     .Lntloopf
          mfence
          jmpq    .Ldestaligned           { go handle remaining bytes }

          { ---------------- backward copy ---------------- }
  .Lback:
          add     %r8, %rdx               { rdx points to the end of dest }
          cmp     $8, %r8
          jl      .Lless8b                { signed compare, negative count not allowed }
          test    $7, %dl
          je      .Ldestalignedb
          { bring the end of dest to an 8-byte boundary }
          test    $1, %dl
          je      .L2b
          dec     %rdx
          mov     (%rcx,%rdx,1), %al
          dec     %r8
          mov     %al, (%rdx)
  .L2b:
          test    $2, %dl
          je      .L4b
          sub     $2, %rdx
          mov     (%rcx,%rdx,1), %ax
          sub     $2, %r8
          mov     %ax, (%rdx)
  .L4b:
          test    $4, %dl
          je      .Ldestalignedb
          sub     $4, %rdx
          mov     (%rcx,%rdx,1), %eax
          sub     $4, %r8
          mov     %eax, (%rdx)

  .Ldestalignedb:
          mov     %r8, %r9
          shr     $5, %r9                 { r9 = number of 32-byte blocks }
          jne     .Lmore32b

  .Ltailb:                                { fewer than 32 bytes remain }
          mov     %r8, %r9
          shr     $3, %r9                 { r9 = number of 8-byte words }
          je      .Lless8b
  .Lloop8b:
          sub     $8, %rdx
          mov     (%rcx,%rdx,1), %rax
          dec     %r9                     { flags survive the store below until jne }
          mov     %rax, (%rdx)
          jne     .Lloop8b
          and     $7, %r8

  .Lless8b:                               { byte-by-byte remainder }
          test    %r8, %r8
          jle     .Lquit2
          .balign 16
  .Lsmallb:
          dec     %rdx
          mov     (%rcx,%rdx,1), %al
          dec     %r8                     { flags survive the store below until jnz }
          mov     %al, (%rdx)
          jnz     .Lsmallb
  .Lquit2:
          retq

  .Lmore32b:
          cmp     $0x2000, %r9            { same block-count threshold as the forward path }
          jnae    .Lloop32b
          cmp     $0xfffffffffffff000, %rcx { rcx is negative here: below -4096 means src and dest are far apart }
          jb      .Lntloopb
          .balign 16
  .Lloop32b:                              { 32 bytes per iteration through the cache }
          sub     $32, %rdx
          mov     24(%rcx,%rdx,1), %rax
          mov     16(%rcx,%rdx,1), %r10
          mov     %rax, 24(%rdx)
          mov     %r10, 16(%rdx)
          dec     %r9                     { flags survive the moves below until jne }
          mov     8(%rcx,%rdx,1), %rax
          mov     (%rcx,%rdx,1), %r10
          mov     %rax, 8(%rdx)
          mov     %r10, (%rdx)
          jne     .Lloop32b
          and     $0x1f, %r8
          jmpq    .Ltailb

  .Lntloopb:                              { one 4096-byte chunk with non-temporal stores }
          mov     $32, %eax
          .balign 16
  .Lprefb:                                { prefetch 32 * 128 bytes of source }
          sub     $0x80, %rdx
          prefetchnta (%rcx,%rdx,1)
          prefetchnta 0x40(%rcx,%rdx,1)
          dec     %eax
          jnz     .Lprefb
          add     $0x1000, %rdx           { back to the end of the chunk }
          mov     $0x40, %eax
          .balign 16
  .Lloop64b:                              { 64 iterations of 64 bytes }
          sub     $64, %rdx
          mov     56(%rcx,%rdx,1), %r9
          mov     48(%rcx,%rdx,1), %r10
          movnti  %r9, 56(%rdx)
          movnti  %r10, 48(%rdx)
          mov     40(%rcx,%rdx,1), %r9
          mov     32(%rcx,%rdx,1), %r10
          movnti  %r9, 40(%rdx)
          movnti  %r10, 32(%rdx)
          dec     %eax                    { flags survive the moves below until jne }
          mov     24(%rcx,%rdx,1), %r9
          mov     16(%rcx,%rdx,1), %r10
          movnti  %r9, 24(%rdx)
          movnti  %r10, 16(%rdx)
          mov     8(%rcx,%rdx,1), %r9
          mov     (%rcx,%rdx,1), %r10
          movnti  %r9, 8(%rdx)
          movnti  %r10, (%rdx)
          jne     .Lloop64b
          sub     $0x1000, %r8
          cmp     $0x1000, %r8
          jae     .Lntloopb
          mfence
          jmpq    .Ldestalignedb          { go handle remaining bytes }
  end;
  {$endif FPC_SYSTEM_HAS_MOVE}
  {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  {$define FPC_SYSTEM_HAS_FILLCHAR}
  { Stores the byte `value` into count consecutive bytes starting at x.
    Nothing is written when count <= 0.

    Incoming registers:
      win64: rcx dest, rdx count, r8b value
      linux: rdi dest, rsi count, rdx value

    Working registers (after the non-win64 shuffle):
      rcx = current dest position, rdx = bytes remaining,
      r8  = value replicated into all eight bytes, rax = block counter }
  Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
  asm
  {$ifndef win64}
          mov     %rdx, %r8
          mov     %rsi, %rdx
          mov     %rdi, %rcx
  {$endif win64}

          cmp     $8, %rdx
          jl      .Ltiny                  { short fills only need the low byte of r8 }

  // TODO: movz?q and movs?q are not accepted by FPC asmreader, it needs fixing.
  // `movzbl' instead is accepted and generates correct code with internal assembler,
  // but breaks targets using external GAS (Mantis #19188).
  // So use a different instruction for now.

          { expand byte value }
          andq    $0xff, %r8
  {
          movzbq  %r8b, %r8
  }
          mov     $0x0101010101010101, %r9
          imul    %r9, %r8                { every byte of r8 now equals value }

          test    $7, %cl
          je      .Laligned
          { align dest to 8 bytes by storing 1, 2 and/or 4 bytes }
          test    $1, %cl
          je      .L2
          movb    %r8b, (%rcx)
          add     $1, %rcx
          sub     $1, %rdx
  .L2:
          test    $2, %cl
          je      .L4
          movw    %r8w, (%rcx)
          add     $2, %rcx
          sub     $2, %rdx
  .L4:
          test    $4, %cl
          je      .Laligned
          movl    %r8d, (%rcx)
          add     $4, %rcx
          sub     $4, %rdx

  .Laligned:
          mov     %rdx, %rax
          and     $0x3f, %rdx             { rdx = bytes left after the 64-byte blocks }
          shr     $6, %rax                { rax = number of 64-byte blocks }
          jne     .Lmore64

  .Lless64:
          mov     %rdx, %rax
          and     $7, %rdx                { rdx = bytes left after the 8-byte words }
          shr     $3, %rax                { rax = number of 8-byte words }
          je      .Ltiny
          .balign 16
  .Lloop8:
          mov     %r8, (%rcx)
          add     $8, %rcx
          dec     %rax
          jne     .Lloop8

  .Ltiny:                                 { byte-by-byte remainder }
          test    %rdx, %rdx
          jle     .Lquit
  .Lloop1:
          movb    %r8b, (%rcx)
          inc     %rcx
          dec     %rdx
          jnz     .Lloop1
  .Lquit:
          retq

  .Lmore64:
          cmp     $0x2000, %rax           { 0x2000 or more blocks: use non-temporal stores }
          jae     .Lloop64nti
          .balign 16
  .Lloop64:                               { 64 bytes per iteration through the cache }
          add     $64, %rcx
          mov     %r8, -64(%rcx)
          mov     %r8, -56(%rcx)
          mov     %r8, -48(%rcx)
          mov     %r8, -40(%rcx)
          dec     %rax                    { flags survive the stores below until jne }
          mov     %r8, -32(%rcx)
          mov     %r8, -24(%rcx)
          mov     %r8, -16(%rcx)
          mov     %r8, -8(%rcx)
          jne     .Lloop64
          jmp     .Lless64

          .balign 16
  .Lloop64nti:                            { 64 bytes per iteration, non-temporal stores }
          add     $64, %rcx
          movnti  %r8, -64(%rcx)
          movnti  %r8, -56(%rcx)
          movnti  %r8, -48(%rcx)
          movnti  %r8, -40(%rcx)
          dec     %rax                    { flags survive the stores below until jnz }
          movnti  %r8, -32(%rcx)
          movnti  %r8, -24(%rcx)
          movnti  %r8, -16(%rcx)
          movnti  %r8, -8(%rcx)
          jnz     .Lloop64nti
          mfence
          jmp     .Lless64
  end;
  {$endif FPC_SYSTEM_HAS_FILLCHAR}
  {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  {$define FPC_SYSTEM_HAS_INDEXBYTE}
  { Returns the zero-based index of the first byte equal to b within the
    first len bytes of buf, or -1 when there is none (also when len = 0).
    len is compared as an unsigned quantity.

    Incoming registers:
      win64: rcx buf, rdx len, r8b byte
      linux: rdi buf, rsi len, rdx byte

    The buffer is read in aligned 16-byte blocks, so the first block may
    start up to 15 bytes before buf; those bytes are masked out. }
  function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
  asm
  {$ifndef win64}
          movd    %edx, %xmm1
          movq    %rdi, %rcx
          movq    %rsi, %rdx
  {$else}
          movd    %r8d, %xmm1
  {$endif}
          mov     %rcx, %r8               { r8 = original buffer pointer }
          punpcklbw %xmm1, %xmm1
          and     $-0x10, %rcx            { highest aligned address before buf }
          test    %rdx, %rdx
          punpcklbw %xmm1, %xmm1          { SSE ops leave the flags of `test` intact }
          jz      .Lnotfound              { exit if len=0 }
          add     $16, %rcx               { first aligned address after buf }
          pshufd  $0, %xmm1, %xmm1        { xmm1 = b in all 16 byte lanes }
          movdqa  -16(%rcx), %xmm0        { fetch first 16 bytes (up to 15 bytes before target) }
          sub     %r8, %rcx               { rcx = number of valid bytes in the first block }
          pcmpeqb %xmm1, %xmm0            { compare with pattern and get bitmask }
          pmovmskb %xmm0, %eax
          shl     %cl, %eax               { shift valid bits into high word }
          and     $0xffff0000, %eax       { clear low word containing invalid bits }
          shr     %cl, %eax               { shift back }
          jmp     .Lcontinue

          .balign 16
  .Lloop:
          movdqa  (%r8,%rcx), %xmm0       { r8 and rcx may have any values, }
          add     $16, %rcx               { but their sum is evenly divisible by 16. }
          pcmpeqb %xmm1, %xmm0
          pmovmskb %xmm0, %eax
  .Lcontinue:
          test    %eax, %eax
          jnz     .Lmatch
          cmp     %rcx, %rdx              { more bytes to scan while len > bytes covered }
          ja      .Lloop
  .Lnotfound:
          or      $-1, %rax
          retq

  .Lmatch:
          bsf     %eax, %eax              { position of the first match inside the block }
          lea     -16(%rcx,%rax), %rax    { convert to an index relative to buf }
          cmp     %rax, %rdx              { check against the buffer length }
          jbe     .Lnotfound
  end;
  {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  {$define FPC_SYSTEM_HAS_INDEXWORD}
  { Returns the zero-based index (in words) of the first 16-bit word equal
    to b within the first len words of buf, or -1 when there is none (also
    when len = 0). len is compared as an unsigned quantity.

    Incoming registers:
      win64: rcx buf, rdx len, r8w word
      linux: rdi buf, rsi len, rdx word

    The buffer is read in aligned 16-byte blocks. A buffer at an even
    address is compared word-wise; a buffer at an odd address takes the
    .Lunaligned path, which compares byte-wise against a byte-swapped
    pattern and pairs up adjacent byte matches. }
  function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
  asm
  {$ifndef win64}
          movd    %edx, %xmm1
          movq    %rdi, %rcx
          movq    %rsi, %rdx
  {$else}
          movd    %r8d, %xmm1
  {$endif}
          mov     %rcx, %r8               { r8 = original buffer pointer }
          punpcklwd %xmm1, %xmm1
          and     $-0x10, %rcx            { highest aligned address before buf }
          test    %rdx, %rdx
          pshufd  $0, %xmm1, %xmm1        { xmm1 = b in all 8 word lanes; flags of `test` stay intact }
          jz      .Lnotfound              { exit if len=0 }
          add     $16, %rcx               { first aligned address after buf }
          movdqa  -16(%rcx), %xmm0        { fetch first 16 bytes (up to 14 bytes before target) }
          sub     %r8, %rcx               { rcx = number of valid bytes }
          test    $1, %r8b                { if buffer isn't aligned to word boundary, }
          jnz     .Lunaligned             { use a different algorithm }

          { ---------------- word-aligned buffer ---------------- }
          pcmpeqw %xmm1, %xmm0
          pmovmskb %xmm0, %eax
          shl     %cl, %eax               { shift valid bits into high word }
          and     $0xffff0000, %eax       { clear low word containing invalid bits }
          shr     %cl, %eax               { shift back }
          shr     $1, %ecx                { bytes->words }
          jmp     .Lcontinue

          .balign 16
  .Lloop:
          movdqa  (%r8,%rcx,2), %xmm0     { rcx counts words, hence the scale of 2 }
          add     $8, %rcx
          pcmpeqw %xmm1, %xmm0
          pmovmskb %xmm0, %eax
  .Lcontinue:
          test    %eax, %eax
          jnz     .Lmatch
          cmp     %rcx, %rdx              { more words to scan while len > words covered }
          ja      .Lloop
  .Lnotfound:
          or      $-1, %rax
          retq

  .Lmatch:
          bsf     %eax, %eax              { byte position of the first match in the block }
          shr     $1, %eax                { in words }
          lea     -8(%rcx,%rax), %rax     { convert to an index relative to buf }
          cmp     %rax, %rdx
          jbe     .Lnotfound              { if match is after the specified length, ignore it }
          retq

          { ---------------- buffer at an odd address ---------------- }
  .Lunaligned:
          movdqa  %xmm1, %xmm2            { (mis)align the pattern (in this particular case: }
          psllw   $8, %xmm1               {   swap bytes of each word of pattern) }
          psrlw   $8, %xmm2
          por     %xmm2, %xmm1
          pcmpeqb %xmm1, %xmm0
          pmovmskb %xmm0, %eax
          shl     %cl, %eax               { mask out the bytes in front of buf, as above }
          and     $0xffff0000, %eax
          shr     %cl, %eax
          add     %rdx, %rdx              { length words -> bytes }
          xor     %r10d, %r10d            { nothing to merge yet }
          jmp     .Lcontinue_u

          .balign 16
  .Lloop_u:
          movdqa  (%r8,%rcx), %xmm0
          add     $16, %rcx
          pcmpeqb %xmm1, %xmm0            { compare by bytes }
          shr     $16, %r10d              { bit 16 shifts into 0 }
          pmovmskb %xmm0, %eax
  .Lcontinue_u:
          shl     $1, %eax                { 15:0 -> 16:1 }
          or      %r10d, %eax             { merge bit 0 from previous round }
          mov     %eax, %r10d
          shr     $1, %eax                { now AND together adjacent pairs of bits }
          and     %r10d, %eax
          and     $0x5555, %eax           { also reset odd bits }
          jnz     .Lmatch_u
          cmpq    %rcx, %rdx              { more bytes to scan while byte length > bytes covered }
          ja      .Lloop_u
  .Lnotfound_u:
          or      $-1, %rax
          retq

  .Lmatch_u:
          bsf     %eax, %eax
          lea     -16(%rcx,%rax), %rax    { byte offset relative to buf }
          cmp     %rax, %rdx
          jbe     .Lnotfound_u            { if match is after the specified length, ignore it }
          sar     $1, %rax                { in words }
  end;
  {$endif FPC_SYSTEM_HAS_INDEXWORD}

  {$endif freebsd}
  {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  {$define FPC_SYSTEM_HAS_COMPAREBYTE}
  { Compares the first len bytes of buf1 and buf2.
    Returns 0 when len = 0 or all len bytes are equal; otherwise returns the
    difference (buf1 byte - buf2 byte, both zero-extended) at the first
    position where the buffers differ.

    Incoming registers:
      win64: rcx buf1, rdx buf2, r8 len
      linux: rdi buf1, rsi buf2, rdx len }
  function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  asm
  {$ifndef win64}
          mov     %rdx, %r8
          mov     %rsi, %rdx
          mov     %rdi, %rcx
  {$endif win64}
          testq   %r8,%r8
          je      .LCmpbyteZero
          .balign 8
  .LCmpbyteLoop:
          movb    (%rcx),%r9b
          cmpb    (%rdx),%r9b
          leaq    1(%rcx),%rcx            { lea advances the pointers without touching }
          leaq    1(%rdx),%rdx            { the flags produced by cmpb }
          jne     .LCmpbyteExitFast
          decq    %r8
          jne     .LCmpbyteLoop
  .LCmpbyteExitFast:
          { reached on a mismatch, or after the last byte compared equal
            (in which case the difference computed below is 0) }
          movzbq  -1(%rdx),%r8            { Compare last position }
          movzbq  %r9b,%rax
          subq    %r8,%rax
          ret
  .LCmpbyteZero:
          xorl    %eax,%eax               { writing eax clears all of rax: result 0 }
          ret
  end;
  {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
  { Thread-safe decrement of a longint.
    Decrements l and returns True when the new value is zero. The `lock`
    prefix is only applied when IsMultithread is non-zero. }
  function declocked(var l : longint) : boolean;assembler; nostackframe;
  asm
          { this check should be done because a lock takes a lot of time! }
  {$ifndef FPC_PIC}
          cmpb    $0,IsMultithread(%rip)
  {$else FPC_PIC}
          movq    IsMultithread@GOTPCREL(%rip),%rax
          cmpb    $0,(%rax)
  {$endif FPC_PIC}
  {$ifndef win64}
          mov     %rdi, %rcx              { mov keeps the flags set by cmpb }
  {$endif win64}
          jz      .Ldeclockednolock
          lock
          decl    (%rcx)
          jmp     .Ldeclockedend
  .Ldeclockednolock:
          decl    (%rcx)
  .Ldeclockedend:
          setzb   %al                     { result := (new value = 0) }
  end;
  {$define FPC_SYSTEM_HAS_DECLOCKED_INT64}
  { Thread-safe decrement of an int64.
    Decrements l and returns True when the new value is zero. The `lock`
    prefix is only applied when IsMultithread is non-zero. }
  function declocked(var l : int64) : boolean;assembler; nostackframe;
  asm
          { this check should be done because a lock takes a lot of time! }
  {$ifndef FPC_PIC}
          cmpb    $0,IsMultithread(%rip)
  {$else FPC_PIC}
          movq    IsMultithread@GOTPCREL(%rip),%rax
          cmpb    $0,(%rax)
  {$endif FPC_PIC}
  {$ifndef win64}
          mov     %rdi, %rcx              { mov keeps the flags set by cmpb }
  {$endif win64}
          jz      .Ldeclockednolock
          lock
          decq    (%rcx)
          jmp     .Ldeclockedend
  .Ldeclockednolock:
          decq    (%rcx)
  .Ldeclockedend:
          setzb   %al                     { result := (new value = 0) }
  end;
  {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
  { Thread-safe increment of a longint.
    The `lock` prefix is only applied when IsMultithread is non-zero. }
  procedure inclocked(var l : longint);assembler; nostackframe;
  asm
          { this check should be done because a lock takes a lot of time! }
  {$ifndef FPC_PIC}
          cmpb    $0,IsMultithread(%rip)
  {$else FPC_PIC}
          movq    IsMultithread@GOTPCREL(%rip),%rax
          cmpb    $0,(%rax)
  {$endif FPC_PIC}
  {$ifndef win64}
          mov     %rdi, %rcx              { mov keeps the flags set by cmpb }
  {$endif win64}
          jz      .Linclockednolock
          lock
          incl    (%rcx)
          jmp     .Linclockedend
  .Linclockednolock:
          incl    (%rcx)
  .Linclockedend:
  end;
  {$define FPC_SYSTEM_HAS_INCLOCKED_INT64}
  { Thread-safe increment of an int64.
    The `lock` prefix is only applied when IsMultithread is non-zero. }
  procedure inclocked(var l : int64);assembler; nostackframe;
  asm
          { this check should be done because a lock takes a lot of time! }
  {$ifndef FPC_PIC}
          cmpb    $0,IsMultithread(%rip)
  {$else FPC_PIC}
          movq    IsMultithread@GOTPCREL(%rip),%rax
          cmpb    $0,(%rax)
  {$endif FPC_PIC}
  {$ifndef win64}
          mov     %rdi, %rcx              { mov keeps the flags set by cmpb }
  {$endif win64}
          jz      .Linclockednolock
          lock
          incq    (%rcx)
          jmp     .Linclockedend
  .Linclockednolock:
          incq    (%rcx)
  .Linclockedend:
  end;
  { Atomically subtracts 1 from Target and returns the decremented value. }
  function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
  asm
  {$ifndef win64}
          movq    %rdi,%rax               { address of Target arrives in rdi }
  {$else win64}
          movq    %rcx,%rax               { address of Target arrives in rcx }
  {$endif win64}
          movl    $-1,%edx
          xchgq   %rdx,%rax               { rdx = address of Target, eax = -1 }
          lock
          xaddl   %eax, (%rdx)            { eax = previous value, Target := Target - 1 }
          decl    %eax                    { return the new value }
  end;
  { Atomically adds 1 to Target and returns the incremented value. }
  function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
  asm
  {$ifndef win64}
          movq    %rdi,%rax               { address of Target arrives in rdi }
  {$else win64}
          movq    %rcx,%rax               { address of Target arrives in rcx }
  {$endif win64}
          movl    $1,%edx
          xchgq   %rdx,%rax               { rdx = address of Target, eax = 1 }
          lock
          xaddl   %eax, (%rdx)            { eax = previous value, Target := Target + 1 }
          incl    %eax                    { return the new value }
  end;
  { Stores Source into Target and returns the value Target held before.
    No `lock` prefix is written here: the exchange is a single xchg with a
    memory operand. }
  function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  asm
  {$ifndef win64}
          xchgl   (%rdi),%esi             { esi = previous value, Target := Source }
          movl    %esi,%eax
  {$else win64}
          xchgl   (%rcx),%edx             { edx = previous value, Target := Source }
          movl    %edx,%eax
  {$endif win64}
  end;
  { Atomically adds Source to Target and returns the value Target held
    before the addition. }
  function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  asm
  {$ifndef win64}
          xchgq   %rdi,%rsi               { rsi = address of Target, edi = Source }
          lock
          xaddl   %edi, (%rsi)            { edi = previous value, Target += Source }
          movl    %edi,%eax
  {$else win64}
          xchgq   %rcx,%rdx               { rdx = address of Target, ecx = Source }
          lock
          xaddl   %ecx, (%rdx)            { ecx = previous value, Target += Source }
          movl    %ecx,%eax
  {$endif win64}
  end;
  { Atomically compares Target with Comperand and, when they are equal,
    stores NewValue into Target. Returns the value Target held before the
    operation (cmpxchg leaves it in eax in both cases). }
  function InterLockedCompareExchange(var Target: longint; NewValue, Comperand : longint): longint; assembler; nostackframe;
  asm
  {$ifndef win64}
          movl    %edx,%eax               { eax = Comperand }
          lock
          cmpxchgl %esi,(%rdi)
  {$else win64}
          movl    %r8d,%eax               { eax = Comperand }
          lock
          cmpxchgl %edx,(%rcx)
  {$endif win64}
  end;
  { Atomically subtracts 1 from the 64-bit Target and returns the
    decremented value. }
  function InterLockedDecrement64 (var Target: int64) : int64; assembler; nostackframe;
  asm
  {$ifndef win64}
          movq    %rdi,%rax               { address of Target arrives in rdi }
  {$else win64}
          movq    %rcx,%rax               { address of Target arrives in rcx }
  {$endif win64}
          movq    $-1,%rdx
          xchgq   %rdx,%rax               { rdx = address of Target, rax = -1 }
          lock
          xaddq   %rax, (%rdx)            { rax = previous value, Target := Target - 1 }
          decq    %rax                    { return the new value }
  end;
  { Atomically adds 1 to the 64-bit Target and returns the incremented
    value. }
  function InterLockedIncrement64 (var Target: int64) : int64; assembler; nostackframe;
  asm
  {$ifndef win64}
          movq    %rdi,%rax               { address of Target arrives in rdi }
  {$else win64}
          movq    %rcx,%rax               { address of Target arrives in rcx }
  {$endif win64}
          movq    $1,%rdx
          xchgq   %rdx,%rax               { rdx = address of Target, rax = 1 }
          lock
          xaddq   %rax, (%rdx)            { rax = previous value, Target := Target + 1 }
          incq    %rax                    { return the new value }
  end;
  { Stores Source into the 64-bit Target and returns the value Target held
    before. No `lock` prefix is written here: the exchange is a single xchg
    with a memory operand. }
  function InterLockedExchange64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
  asm
  {$ifndef win64}
          xchgq   (%rdi),%rsi             { rsi = previous value, Target := Source }
          movq    %rsi,%rax
  {$else win64}
          xchgq   (%rcx),%rdx             { rdx = previous value, Target := Source }
          movq    %rdx,%rax
  {$endif win64}
  end;
  { Atomically adds Source to the 64-bit Target and returns the value
    Target held before the addition. }
  function InterLockedExchangeAdd64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
  asm
  {$ifndef win64}
          xchgq   %rdi,%rsi               { rsi = address of Target, rdi = Source }
          lock
          xaddq   %rdi, (%rsi)            { rdi = previous value, Target += Source }
          movq    %rdi,%rax
  {$else win64}
          xchgq   %rcx,%rdx               { rdx = address of Target, rcx = Source }
          lock
          xaddq   %rcx, (%rdx)            { rcx = previous value, Target += Source }
          movq    %rcx,%rax
  {$endif win64}
  end;
  { Atomically compares the 64-bit Target with Comperand and, when they
    are equal, stores NewValue into Target. Returns the value Target held
    before the operation (cmpxchg leaves it in rax in both cases). }
  function InterLockedCompareExchange64(var Target: int64; NewValue, Comperand : int64): int64; assembler; nostackframe;
  asm
  {$ifndef win64}
          movq    %rdx,%rax               { rax = Comperand }
          lock
          cmpxchgq %rsi,(%rdi)
  {$else win64}
          movq    %r8,%rax                { rax = Comperand }
          lock
          cmpxchgq %rdx,(%rcx)
  {$endif win64}
  end;
  {****************************************************************************
                                    FPU
  ****************************************************************************}

  const
    { Internal constants for use in system unit }
    FPU_Invalid        = $01;
    FPU_Denormal       = $02;
    FPU_DivisionByZero = $04;
    FPU_Overflow       = $08;
    FPU_Underflow      = $10;
    FPU_StackUnderflow = $20;
    FPU_StackOverflow  = $40;
    FPU_ExceptionMask  = $ff;

    { control word value loaded with fldcw by SysInitFPU }
    fpucw : word = $1300 or FPU_StackUnderflow or FPU_Underflow or FPU_Denormal;

    { one bit per mask, bits 7..12 }
    MM_MaskInvalidOp = $0080;
    MM_MaskDenorm    = $0100;
    MM_MaskDivZero   = $0200;
    MM_MaskOverflow  = $0400;
    MM_MaskUnderflow = $0800;
    MM_MaskPrecision = $1000;

    { value loaded with ldmxcsr by SysInitFPU and SysResetFPU }
    mxcsr : dword = MM_MaskUnderflow or MM_MaskPrecision or MM_MaskDenorm;
  { CPU-specific start-up: resets the FPU state and, for a host program,
    applies the default control settings via SysInitFPU. }
  procedure fpc_cpuinit;
    begin
      if IsLibrary then
        begin
          { don't let libraries influence the FPU cw set by the host program:
            capture the current settings so that SysResetFPU reloads them }
          Default8087CW:=Get8087CW;
          mxcsr:=GetSSECSR;
          SysResetFPU;
        end
      else
        begin
          SysResetFPU;
          SysInitFPU;
        end;
    end;
  {$define FPC_SYSTEM_HAS_SYSINITFPU}
  { Loads the x87 control word from fpucw and the SSE control/status
    register from mxcsr, then sets the softfloat exception mask. }
  Procedure SysInitFPU;
    var
      { stack copies, so we don't have to hack pic code in the assembler }
      sseCsrCopy: dword;
      x87CwCopy: word;
    begin
      sseCsrCopy:=mxcsr;
      x87CwCopy:=fpucw;
      asm
        fldcw x87CwCopy
        { set sse exceptions }
        ldmxcsr sseCsrCopy
      end ['RAX'];
      { x86-64 might use softfloat code }
      softfloat_exception_mask:=float_flag_underflow or float_flag_inexact or float_flag_denormal;
    end;
  {$define FPC_SYSTEM_HAS_SYSRESETFPU}
  { Reinitialises the x87 FPU (fninit), reloads the control word from
    Default8087CW and the SSE control/status register from mxcsr, and
    clears the softfloat exception flags. }
  Procedure SysResetFPU;
    var
      { stack copies, so we don't have to hack pic code in the assembler }
      sseCsrCopy: dword;
      x87CwCopy: word;
    begin
      x87CwCopy:=Default8087CW;
      sseCsrCopy:=mxcsr;
      asm
        fninit
        fwait
        fldcw x87CwCopy
        ldmxcsr sseCsrCopy
      end;
      { x86-64 might use softfloat code }
      softfloat_exception_flags:=0;
    end;
  {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  {$define FPC_SYSTEM_HAS_MEM_BARRIER}

  { Issues an lfence instruction. }
  procedure ReadBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          lfence
  end;

  { Intentionally empty: no instruction is emitted. }
  procedure ReadDependencyBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          { reads imply barrier on earlier reads depended on }
  end;

  { Issues an mfence instruction. }
  procedure ReadWriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          mfence
  end;

  { Issues an sfence instruction. }
  procedure WriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  asm
          sfence
  end;

  {$endif}
  {****************************************************************************
                                 Math Routines
  ****************************************************************************}

  {$define FPC_SYSTEM_HAS_SWAPENDIAN}

  { SwapEndian(<16 Bit>) being inlined is faster than using assembler }

  { Returns AValue with its two bytes exchanged. }
  function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
    begin
      { Both operands go through Word(AValue) first: a plain "AValue shr 8"
        is evaluated as "longint(AValue) shr 8", which for AValue < 0 would
        shift sign bits from the upper 16 bits into the result instead of
        zeroes. }
      Result := SmallInt((Word(AValue) shl 8) or (Word(AValue) shr 8));
    end;
  { Returns AValue with its two bytes exchanged. }
  function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
    begin
      { the Word() cast discards the bits that "shl 8" moves above bit 15 }
      Result := Word((AValue shl 8) or (AValue shr 8));
    end;
  { Returns AValue with the order of its four bytes reversed. }
  function SwapEndian(const AValue: LongInt): LongInt; assembler; nostackframe;
  asm
  {$ifndef win64}
          movl    %edi, %eax              { AValue arrives in edi }
  {$else win64}
          movl    %ecx, %eax              { AValue arrives in ecx }
  {$endif win64}
          bswap   %eax
  end;
  { Returns AValue with the order of its four bytes reversed. }
  function SwapEndian(const AValue: DWord): DWord; assembler; nostackframe;
  asm
  {$ifndef win64}
          movl    %edi, %eax              { AValue arrives in edi }
  {$else win64}
          movl    %ecx, %eax              { AValue arrives in ecx }
  {$endif win64}
          bswap   %eax
  end;
  { Returns AValue with the order of its eight bytes reversed. }
  function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
  asm
  {$ifndef win64}
          movq    %rdi, %rax              { AValue arrives in rdi }
  {$else win64}
          movq    %rcx, %rax              { AValue arrives in rcx }
  {$endif win64}
          bswap   %rax
  end;
  { Returns AValue with the order of its eight bytes reversed. }
  function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
  asm
  {$ifndef win64}
          movq    %rdi, %rax              { AValue arrives in rdi }
  {$else win64}
          movq    %rcx, %rax              { AValue arrives in rcx }
  {$endif win64}
          bswap   %rax
  end;
  919. movq %rdi, %rax
  920. {$endif win64}
  921. bswap %rax
  922. end;