
{
    This file is part of the Free Pascal run time library.
    Copyright (c) 2002 by Florian Klaempfl and Sergei Gorelkin
    Members of the Free Pascal development team

    Processor dependent implementation for the system unit for
    the x86-64 architecture

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}
{$asmmode GAS}

{****************************************************************************
                               Primitives
****************************************************************************}

{$ifndef win64}
  {$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
{$endif}

{$ifdef use_fast_repmovstos}
var
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
{$endif}
var
  has_sse41_support,fpc_cpuinit_performed : boolean;
{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;
asm
    movq %rsp,%rax
end;

{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;
asm
    movq %rbp,%rax
end;
{$ENDIF not INTERNAL_BACKTRACE}

{$define FPC_SYSTEM_HAS_GET_PC_ADDR}
function get_pc_addr:pointer;assembler;nostackframe;
asm
    movq (%rsp),%rax
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
begin
  get_caller_addr:=framebp;
  if assigned(framebp) then
    get_caller_addr:=PPointer(framebp)[1];
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
begin
  get_caller_frame:=framebp;
  if assigned(framebp) then
    get_caller_frame:=PPointer(framebp)^;
end;
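{ Illustrative sketch (not part of the RTL): the two helpers above treat a frame
  pointer as a pair of qwords, where PPointer(framebp)^ is the saved caller frame
  pointer and PPointer(framebp)[1] is the return address pushed by the call.
  Assuming frame pointers are not omitted, a backtrace could be walked like this:

    procedure DumpBacktrace;
    var
      frame, addr: pointer;
    begin
      frame:=get_frame;                    // current rbp
      while assigned(frame) do
        begin
          addr:=get_caller_addr(frame);    // return address stored above the saved rbp
          if addr=nil then
            break;
          writeln(hexstr(ptruint(addr),16));
          frame:=get_caller_frame(frame);  // follow the saved rbp chain
        end;
    end;
}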
// The following assembler procedures are disabled for FreeBSD due to
// multiple issues with its old GNU assembler (Mantis #19188).
// Even after fixing them, it can be enabled only for the trunk version,
// otherwise bootstrapping won't be possible.
// Modified to use oldbinutils as in cpu.pp source, to allow easier use for other targets.
{$ifdef freebsd}
  {$ifndef overridebinutils}
    {$define oldbinutils}
  {$endif}
{$endif freebsd}
{$ifndef oldbinutils}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{ Linux: rdi source, rsi dest, rdx count
  win64: rcx source, rdx dest, r8 count }
const
  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
  PrefetchDistance = 512;
asm
{$ifndef win64}
    mov %rdx, %r8
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif win64}
    cmp $3, %r8
    jle .L3OrLess
    cmp $8, %r8
    jle .L4to8
    cmp $16, %r8
    jle .L9to16
    movups (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
    movups -16(%rcx,%r8), %xmm5
    cmp $32, %r8
    jg .L33OrMore
    movups %xmm4, (%rdx) { 17–32 bytes }
    movups %xmm5, -16(%rdx,%r8)
    ret

    .balign 16
.L3OrLess:
    cmp $1, %r8
    jl .LZero
    movzbl (%rcx), %eax
    je .LOne
    movzwl -2(%rcx,%r8), %r9d
    mov %r9w, -2(%rdx,%r8)
.LOne:
    mov %al, (%rdx)
.LZero:
    ret

.L4to8:
    mov (%rcx), %eax
    mov -4(%rcx,%r8), %r9d
    mov %eax, (%rdx)
    mov %r9d, -4(%rdx,%r8)
    ret

.L9to16:
    mov (%rcx), %rax
    mov -8(%rcx,%r8), %r9
    mov %rax, (%rdx)
    mov %r9, -8(%rdx,%r8)
.Lquit:
    ret
    .byte 102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

.L33OrMore:
    movups -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
                                { but -32(%rcx,%r8) is about to become not accessible so easily, .Lback is rare, and small .Lback is even rarer / matters even less. }
    sub %rdx, %rcx { rcx = src - dest }
    jz .Lquit { exit if src=dest }
    mov %rcx, %rax
    neg %rax
    cmp %rax, %r8
    ja .Lback { count (r8) > unsigned(dest - src) (rax) if regions overlap }

    mov %rdx, %r9 { remember original dest to write first 16 bytes }
    add %rdx, %r8 { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
    add $16, %rdx
    and $-16, %rdx
    sub %rdx, %r8

.LRestAfterNTf:
    sub $32, %r8 { During the N× loop, r8 is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
    jbe .LPost32f
    cmp $NtThreshold-32, %r8
    jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }

    .balign 16 { no-op }
.Lloop32f:
    movups (%rcx,%rdx), %xmm0
    movaps %xmm0, (%rdx)
    movups 16(%rcx,%rdx), %xmm0
    movaps %xmm0, 16(%rdx)
    add $32, %rdx
    sub $32, %r8
    ja .Lloop32f

.LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
    movups %xmm3, (%rdx, %r8)
    movups %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
    movups %xmm4, (%r9) { Important for <16-byte step between src and dest. }
    ret

    .balign 16
.Lntf:
    cmp $NtThreshold, %rcx { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
    jb .Lloop32f { (this check is performed here to not stand in the way of smaller counts) }
    sub $PrefetchDistance+32, %r8 { r8 = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }

    .balign 16 { no-op }
.Lntloop64f:
    prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
    movups (%rcx,%rdx,1), %xmm0
    movntps %xmm0, (%rdx)
    movups 16(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 16(%rdx)
    movups 32(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 32(%rdx)
    movups 48(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 48(%rdx)
    add $64, %rdx
    sub $64, %r8
    jae .Lntloop64f

    sfence
    add $PrefetchDistance+64, %r8
    jmpq .LRestAfterNTf { go handle remaining bytes }
    .byte 102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

{ backwards move }
.Lback:
    movups 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
    lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
    lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
    and $-16, %r8
    sub %rdx, %r8
    add %r8, %rdx

.LRestAfterNTb:
    sub $32, %r8
    jbe .LPost32b
    cmp $NtThreshold-32, %r8
    jae .Lntb

    .balign 16 { no-op }
.Lloop32b:
    sub $32, %rdx
    movups 16(%rcx,%rdx), %xmm0
    movaps %xmm0, 16(%rdx)
    movups (%rcx,%rdx), %xmm0
    movaps %xmm0, (%rdx)
    sub $32, %r8
    ja .Lloop32b

.LPost32b:
    sub %r8, %rdx
    movups %xmm3, -16(%rdx)
    movups %xmm4, -32(%rdx)
    movups %xmm5, -16(%r9)
    ret

    .balign 16
.Lntb:
    cmp $-NtThreshold,%rcx
    jnb .Lloop32b
    sub $PrefetchDistance+32, %r8

    .balign 16 { no-op }
.Lntloop64b:
    prefetchnta -PrefetchDistance(%rcx,%rdx,1)
    sub $64, %rdx
    movups 48(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 48(%rdx)
    movups 32(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 32(%rdx)
    movups 16(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 16(%rdx)
    movups (%rcx,%rdx,1), %xmm0
    movntps %xmm0, (%rdx)
    sub $64, %r8
    jae .Lntloop64b

    sfence
    add $PrefetchDistance+64, %r8
    jmpq .LRestAfterNTb
end;
{$endif FPC_SYSTEM_HAS_MOVE}
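{ Worked example of the overlap test in Move above (derived from the code, for
  illustration only): with rcx = src - dest and rax = dest - src, .Lback is taken
  when count > unsigned(dest - src).
    dest = src + 8, count = 32: dest - src = 8 and 32 > 8, so the copy runs
      backwards, otherwise the forward loop would overwrite source bytes before
      having read them;
    dest = src - 8: dest - src wraps to $FFFFFFFFFFFFFFF8, which no count can
      exceed, so the forward path is taken and is always safe. }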
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
{ Input:
  rcx = 'x'
  rdx = byte count
  xmm0 = pattern for ALIGNED writes
  First and last 16 bytes are written. }
const
{$ifdef use_fast_repmovstos}
  ErmsThreshold = 1536;
{$endif}
  NtThreshold = 4 * 1024 * 1024;
asm
    { x can start and end misaligned on the vector boundary:

      x = ~~][H1][H2][...][T2][T1]~
          [UH]                 [UT]

      UH (“unaligned head”) potentially overlaps with H1 and is already written with 'movdqu' by the caller.
      At least 1 of its bytes is exclusive to it, i.e. if x is already aligned, H1 starts at byte 16.
      H1 and so on are called “aligned heads” or just “heads”.
      T1 and so on are called “aligned tails” or just “tails”.
      UT (“unaligned tail”) is written by the caller as well.
      At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }

    lea -65(%rcx,%rdx), %rax
    and $-16, %rax { rax = “T4” (possibly fictive). }
    mov %rax, %rdx { Remember T4 to rdx. }
    and $-16, %rcx { rcx = H1 − 16. }
    sub %rcx, %rax { rax = aligned byte count − 48. }
    movdqa %xmm0, 16(%rcx) { Write H1. }
    cmp $32-48, %rax
    jle .LOneAlignedTailWrite
    movdqa %xmm0, 32(%rcx) { Write H2. }
    cmp $64-48, %rax
    jle .LTwoAlignedTailWrites
    sub $48, %rax { rax = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
    jle .LFourAlignedTailWrites

    add $48, %rcx { rcx = H3. }
{$ifdef use_fast_repmovstos}
    cmp $ErmsThreshold-64, %rax { Need to write aligned byte count − 32 bytes already written. rax = aligned byte count − 96, so compare rax + 64 to ErmsThreshold, or rax to ErmsThreshold − 64. }
    jae .LRepStos
{$else}
    cmp $NtThreshold, %rax
    jae .L64xNT_Body
{$endif}

    .balign 16
.L64x_Body:
    movdqa %xmm0, (%rcx)
    movdqa %xmm0, 16(%rcx)
    movdqa %xmm0, 32(%rcx)
    movdqa %xmm0, 48(%rcx)
    add $64, %rcx
    sub $64, %rax
    ja .L64x_Body

.LFourAlignedTailWrites:
    movdqa %xmm0, (%rdx) { T4 }
    movdqa %xmm0, 16(%rdx) { T3 }
.LTwoAlignedTailWrites:
    movdqa %xmm0, 32(%rdx) { T2 }
.LOneAlignedTailWrite:
    movdqa %xmm0, 48(%rdx) { T1 }
    ret

{$ifdef use_fast_repmovstos}
.LRepStos:
{$ifdef FPC_PIC}
    movq fast_large_repmovstosb@GOTPCREL(%rip), %r8
    cmpb $1, (%r8)
{$else FPC_PIC}
    cmpb $1, fast_large_repmovstosb(%rip)
{$endif FPC_PIC}
    jne .LRepStosIsNotBetter
{$ifdef win64}
    push %rdi { For tests on Windows; however this is not SEH-compliant, so the entire use_fast_repmovstos branch is disabled by default! }
{$endif}
    mov %rcx, %rdi { rdi = REP STOS destination. }
    lea 64(%rax), %rcx
    shr $3, %rcx { rcx = count of REP STOSQ blocks up to T1 (might be 1 more than strictly required if T1 and UT overlap is 8 or more, don’t care). }
    movq %xmm0, %rax { recover pattern for aligned writes back to GPR :) }
    rep stosq
{$ifdef win64}
    pop %rdi
{$endif}
    ret
{$endif}

.LRepStosIsNotBetter:
    cmp $NtThreshold-64, %rax
    jb .L64x_Body

    .balign 16
.L64xNT_Body:
    movntdq %xmm0, (%rcx)
    movntdq %xmm0, 16(%rcx)
    movntdq %xmm0, 32(%rcx)
    movntdq %xmm0, 48(%rcx)
    add $64, %rcx
    sub $64, %rax
    ja .L64xNT_Body
    sfence
    jmp .LFourAlignedTailWrites
end;
{$endif FPC_SYSTEM_HAS_FILLxxxx}
{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
{ win64: rcx dest, rdx count, r8b value
  linux: rdi dest, rsi count, rdx value }
    movzbl {$ifdef win64} %r8b {$else} %dl {$endif}, %eax
    imul $0x01010101, %eax
{$ifndef win64}
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif win64}

    cmp $3, %rdx
    jle .L3OrLess
    cmp $16, %rdx
    jl .L4to15

    movd %eax, %xmm0
    pshufd $0, %xmm0, %xmm0
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx)
    cmp $32, %rdx
    jg FillXxxx_MoreThanTwoXmms
    ret

.L4to15:
    mov %eax, (%rcx)
    cmp $8, %edx
    jle .LLast4
    mov %eax, 4(%rcx)
    mov %eax, -8(%rcx,%rdx)
.LLast4:
    mov %eax, -4(%rcx,%rdx)
    ret

.L3OrLess:
    test %rdx, %rdx
    jle .LQuit
    mov %al, (%rcx)
    mov %al, -1(%rcx,%rdx)
    shr $1, %edx
    mov %al, (%rcx,%rdx)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}
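{ Illustrative note on the pattern setup in FillChar above: multiplying the
  zero-extended byte by $01010101 replicates it into every byte of eax, e.g.
  value = $AB gives $AB * $01010101 = $ABABABAB, so the same register can be
  stored as 1, 2 or 4 bytes at once, and movd + pshufd then broadcasts it to all
  16 bytes of xmm0 for the vector paths. }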
{$ifndef FPC_SYSTEM_HAS_FILLWORD}
{$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
{$ifdef win64}
    movzwl %r8w, %eax
    shl $16, %r8d
    or %r8d, %eax
{$else}
    movzwl %dx, %eax
    shl $16, %edx
    or %edx, %eax
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif}

    cmp $3, %rdx
    jle .L3OrLess
    cmp $8, %rdx
    jle .L4to8

    movd %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx,2)
    cmp $16, %rdx
    jg .LMoreThanTwoXMMs
    ret

.LMoreThanTwoXMMs:
    shl $1, %rdx { rdx = byte count }
    mov %rcx, %r8
    shl $3, %ecx
    rol %cl, %eax { misalign the pattern by the misalignment of x }
    mov %r8, %rcx
    movd %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
    jmp FillXxxx_MoreThanTwoXmms

.L4to8:
    mov %eax, %r8d
    shl $32, %r8
    or %r8, %rax
    mov %rax, (%rcx)
    mov %rax, -8(%rcx,%rdx,2)
    ret

.L3OrLess:
    test %rdx, %rdx
    jle .LQuit
    mov %ax, (%rcx)
    mov %ax, -2(%rcx,%rdx,2)
    shr $1, %edx
    mov %ax, (%rcx,%rdx,2)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}
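{ Illustrative note on the 'rol %cl, %eax' trick used by FillWord above (and by
  FillDWord/FillQWord below): cl is set to 8*dest and the rotate count is taken
  modulo the operand size, so the pattern is rotated left by 8 times the byte
  misalignment of the destination.  Example for FillWord with value = $1122 and
  dest mod 4 = 1:
    unaligned pattern $11221122 -> bytes 22 11 22 11 starting at dest,
    rotated pattern   $22112211 -> bytes 11 22 11 22 starting at any 16-byte
                                   aligned address,
  i.e. the same byte lands on the same absolute address either way, so the
  aligned stores of FillXxxx_MoreThanTwoXmms stay in phase with the element
  grid of x. }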
{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
{$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
asm
{$ifdef win64}
    mov %r8d, %eax
{$else}
    mov %edx, %eax
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif win64}

    cmp $3, %rdx
    jle .L3OrLess
    cmp $8, %rdx
    jle .L4to8

    movd %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx,4)
    shl $2, %rdx { rdx = byte count }
    mov %rcx, %r8
    shl $3, %ecx
    rol %cl, %eax { misalign the pattern by the misalignment of x }
    mov %r8, %rcx
    movd %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
    jmp FillXxxx_MoreThanTwoXmms

.L4to8:
{$ifndef win64} { on win64, eax = r8d already. }
    mov %eax, %r8d
{$endif}
    shl $32, %r8
    or %r8, %rax
    mov %rax, (%rcx)
    mov %rax, 8(%rcx)
    mov %rax, -16(%rcx,%rdx,4)
    mov %rax, -8(%rcx,%rdx,4)
    ret

.L3OrLess:
    test %rdx, %rdx
    jle .LQuit
    mov %eax, (%rcx)
    mov %eax, -4(%rcx,%rdx,4)
    shr $1, %edx
    mov %eax, (%rcx,%rdx,4)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}
{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
{$define FPC_SYSTEM_HAS_FILLQWORD}
procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
asm
{$ifdef win64}
    mov %r8, %rax
{$else}
    mov %rdx, %rax
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif win64}

    cmp $2, %rdx
    jle .L2OrLess
    cmp $6, %rdx
    jle .L3to6

    movq %rax, %xmm0
    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx,8)
    shl $3, %rdx { rdx = byte count }
    mov %rcx, %r8
    shl $3, %ecx
    rol %cl, %rax { misalign the pattern by the misalignment of x }
    mov %r8, %rcx
    movq %rax, %xmm0
    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
    jmp FillXxxx_MoreThanTwoXmms

.L3to6:
    mov %rax, (%rcx)
    mov %rax, 8(%rcx)
    mov %rax, 16(%rcx)
    mov %rax, -24(%rcx,%rdx,8)
    mov %rax, -16(%rcx,%rdx,8)
    mov %rax, -8(%rcx,%rdx,8)
    ret

.L2OrLess:
    test %rdx, %rdx
    jle .LQuit
    mov %rax, (%rcx)
    mov %rax, -8(%rcx,%rdx,8)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLQWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8b byte
  linux: rdi buf, rsi len, dl byte }
asm
    test len, len
    jz .Lnotfound { exit if len=0 }
    movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
    mov {$ifdef win64} %ecx {$else} %edi {$endif}, %eax
    punpcklbw %xmm1, %xmm1
    punpcklbw %xmm1, %xmm1
    and $4095, %eax
    pshufd $0, %xmm1, %xmm1
    cmp $4080, %eax
    ja .LCrossPage

    movdqu ({$ifdef win64} %rcx {$else} %rdi {$endif}), %xmm0 { Analyze first 16 bytes, unaligned. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    test %eax, %eax
    jz .LContinueAligned
    bsf %eax, %eax
    cmp len, %rax
    jae .Lnotfound
    ret
    .byte {$ifndef win64}102,102,102,102,{$endif}102,102,102,102,102,102,102,102,102,144 { Make .balign 16 before .Lloop a no-op. }

.LContinueAligned:
    cmp $16, len { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
    jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
{$ifdef win64}
    mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
    add $16, %rcx
{$else}
    lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
{$endif}
    and $-0x10, %rcx { first aligned address after buf }
    sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }

    .balign 16
.Lloop:
    movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
    add $16, %rcx { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    test %eax, %eax
    jnz .Lmatch
.Lcontinue:
    cmp %rcx, len
    ja .Lloop
.Lnotfound:
    or $-1, %rax
    ret

.LCrossPage:
{$ifdef win64}
    mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
    add $16, %rcx
{$else}
    lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
{$endif}
    and $-0x10, %rcx { first aligned address after buf }
    movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
    sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
    pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
    pmovmskb %xmm0, %eax

    shl %cl, %eax { shift valid bits into high word }
    and $0xffff0000, %eax { clear low word containing invalid bits }
    shr %cl, %eax { shift back }
    jz .Lcontinue

.Lmatch:
    bsf %eax, %eax
    lea -16(%rcx,%rax), %rax
    cmp %rax, len { check against the buffer length }
    jbe .Lnotfound
end;
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
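{ Illustrative note on the page-cross check in IndexByte above: an unaligned
  16-byte load at buf touches buf..buf+15.  If (buf and 4095) <= 4080, the last
  byte still lies at page offset <= 4095, i.e. inside the same 4K page; at
  offset 4081 the load would already touch the next, possibly unmapped, page
  (4081 + 15 = 4096).  .LCrossPage therefore loads from the preceding 16-byte
  aligned address instead and discards the bytes before buf with the
  shl %cl / and $0xffff0000 / shr %cl sequence on the match mask. }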
{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8w word
  linux: rdi buf, rsi len, dx word }
asm
    test len, len
    jz .Lnotfound { exit if len=0 }
    movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
{$ifdef win64}
    mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
    add $16, %rcx
{$else}
    lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
{$endif}
    punpcklwd %xmm1, %xmm1
    and $-0x10, %rcx
    pshufd $0, %xmm1, %xmm1
    movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
    sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes }

    test $1, {$ifdef win64} %r8b {$else} %dil {$endif} { if buffer isn't aligned to word boundary, }
    jnz .Lunaligned { use a different algorithm }

    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl %cl, %eax
    and $0xffff0000, %eax
    shr %cl, %eax
    shr $1, %ecx { bytes->words }
    test %eax, %eax
    jz .Lcontinue

.Lmatch:
    bsf %eax, %eax
    shr $1, %eax { in words }
    lea -8(%rcx,%rax), %rax
    cmp %rax, len
    jbe .Lnotfound { if match is after the specified length, ignore it }
    retq

    .balign 16
.Lloop:
    movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx,2), %xmm0
    add $8, %rcx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    test %eax, %eax
    jnz .Lmatch
.Lcontinue:
    cmp %rcx, len
    ja .Lloop
.Lnotfound:
    or $-1, %rax
    retq

.Lunaligned:
    movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
    psllw $8, %xmm1 { swap bytes of each word of pattern) }
    psrlw $8, %xmm2
    por %xmm2, %xmm1

    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl %cl, %eax
    and $0xffff0000, %eax
    shr %cl, %eax

    add len, len { length words -> bytes }
    xor %r10d, %r10d { nothing to merge yet }
    jmp .Lcontinue_u

    .balign 16
.Lloop_u:
    movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0
    add $16, %rcx
    pcmpeqb %xmm1, %xmm0 { compare by bytes }
    shr $16, %r10d { bit 16 shifts into 0 }
    pmovmskb %xmm0, %eax
.Lcontinue_u:
    shl $1, %eax { 15:0 -> 16:1 }
    or %r10d, %eax { merge bit 0 from previous round }
    mov %eax, %r10d
    shr $1, %eax { now AND together adjacent pairs of bits }
    and %r10d, %eax
    and $0x5555, %eax { also reset odd bits }
    jnz .Lmatch_u
    cmpq %rcx, len
    ja .Lloop_u

.Lnotfound_u:
    or $-1, %rax
    retq

.Lmatch_u:
    bsf %eax, %eax
    lea -16(%rcx,%rax), %rax
    cmp %rax, len
    jbe .Lnotfound_u { if match is after the specified length, ignore it }
    sar $1, %rax { in words }
end;
{$endif FPC_SYSTEM_HAS_INDEXWORD}
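{ Illustrative note on the unaligned path of IndexWord above: when buf is odd,
  every buffer word straddles an (odd, even) byte pair of the aligned vectors,
  so the pattern is byte-swapped once and the comparison is done per byte.  With
  m[j] the pcmpeqb bit for byte j (and the carry bit holding byte 15 of the
  previous vector), the shl/or/shr/and sequence sets bit j exactly when m[j-1]
  and m[j] are both set, and 'and $0x5555' keeps only even j, i.e. only pairs
  that start on an odd byte offset, which is precisely a word boundary of the
  unaligned buffer. }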
{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
function IndexDWord(Const buf;len:SizeInt;b:dword):SizeInt; assembler; nostackframe;
asm
{$ifdef win64}
    mov %rcx, %rax
{$else}
    mov %rdx, %r8
    mov %rsi, %rdx
    mov %rdi, %rax
{$endif}
    cmp $4, %rdx
    jle .LDwordwise_Prepare
    sub $4, %rdx
    movd %r8d, %xmm1
    pshufd $0, %xmm1, %xmm1

    .balign 16
.L4x_Body:
    movdqu (%rax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %r8d
    test %r8d, %r8d
    jnz .LFoundAtMask
    add $16, %rax
    sub $4, %rdx
    jg .L4x_Body

    lea (%rax,%rdx,4), %rax
    movdqu (%rax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %r8d
    test %r8d, %r8d
    jnz .LFoundAtMask
    or $-1, %rax
    ret

    .balign 16 { no-op }
.LDwordwise_Body:
    cmp (%rax), %r8d
    je .LFoundAtRax
    add $4, %rax
.LDwordwise_Prepare:
    sub $1, %rdx
    jae .LDwordwise_Body
    or $-1, %rax
    ret

.LFoundAtMask:
    bsf %r8d, %r8d
    add %r8, %rax
.LFoundAtRax:
    sub {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
    shr $2, %rax
end;
{$endif FPC_SYSTEM_HAS_INDEXDWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ win64: rcx=buf, rdx=len, r8=b
  else: rdi=buf, rsi=len, rdx=b }
asm
    mov buf, %rax
    sub $8, %rax

    .balign 16
.LQwordwise_Next:
    add $8, %rax
    sub $1, len
    jb .LNothing
    cmpq b, (%rax)
    jne .LQwordwise_Next
    sub buf, %rax
    shr $3, %rax
    ret
.LNothing:
    mov $-1, %rax
end;

function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ win64: rcx=buf, rdx=len, r8=b
  else: rdi=buf, rsi=len, rdx=b }
asm
    cmp $6, len
    jle IndexQWord_Plain
    mov buf, %rax
    movq {$ifdef win64} %r8 {$else} %rdx {$endif}, %xmm0
    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern of 'b's. }
    sub $6, len

    .balign 16
.L6x_Loop:
    movdqu (%rax), %xmm1
    pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
    movdqu 16(%rax), %xmm2
    pcmpeqq %xmm0, %xmm2
    por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
    movdqu 32(%rax), %xmm3
    pcmpeqq %xmm0, %xmm3
    por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
    ptest %xmm3, %xmm3
    jnz .LFound
    add $48, %rax
    sub $6, len
    jge .L6x_Loop

    lea (%rax,{$ifdef win64} %rdx {$else} %rsi {$endif},8), %rax { Point to last 3 vectors. }
    cmp $-5, len
    jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
    mov $-1, %rax
    ret

.LFound:
    sub buf, %rax
    ptest %xmm1, %xmm1
    jnz .LFoundAtXmm1
    ptest %xmm2, %xmm2
    jnz .LFoundAtXmm2
    add $16, %rax
    movdqa %xmm3, %xmm2
.LFoundAtXmm2:
    add $16, %rax
    movdqa %xmm2, %xmm1
.LFoundAtXmm1:
    pmovmskb %xmm1, %ecx
    bsf %ecx, %ecx
    add %rcx, %rax
    shr $3, %rax
end;

{$ifndef CPUX86_HAS_SSE4_1}
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;

var
  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;

function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  if not fpc_cpuinit_performed then
    exit(IndexQWord_Plain(buf,len,b));
  if has_sse41_support then
    IndexQWord_Impl:=@IndexQWord_SSE41
  else
    IndexQWord_Impl:=@IndexQWord_Plain;
  result:=IndexQWord_Impl(buf,len,b);
end;

function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  result:=IndexQWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE4_1}
{$endif FPC_SYSTEM_HAS_INDEXQWORD}
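{ The dispatcher above is a self-patching function pointer: the first call lands
  in IndexQWord_Dispatch, which, once fpc_cpuinit has probed CPUID, rebinds
  IndexQWord_Impl to the best implementation so that every later call is a plain
  indirect call.  A minimal sketch of the same idiom with hypothetical names
  (illustration only, not part of the RTL):

    type
      TWorker = function(x: SizeInt): SizeInt;
    function WorkerDispatch(x: SizeInt): SizeInt; forward;
    var
      Worker: TWorker = @WorkerDispatch;
    function WorkerFast(x: SizeInt): SizeInt;  begin result:=x*2; end;
    function WorkerPlain(x: SizeInt): SizeInt; begin result:=x+x; end;
    function WorkerDispatch(x: SizeInt): SizeInt;
    begin
      if has_sse41_support then
        Worker:=@WorkerFast
      else
        Worker:=@WorkerPlain;
      result:=Worker(x);   // later calls bypass the dispatcher entirely
    end;
}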
{$endif freebsd}

{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ win64: rcx buf1, rdx buf2, r8 len
  linux: rdi buf1, rsi buf2, rdx len }
asm
{$ifndef win64}
    mov %rdx, %r8
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif win64}
    { rcx = buf1, rdx = buf2, r8 = len }
    cmp $1, %r8
    jle .L1OrLess

    cmp $16, %r8
    jae .LVecOrMore

    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but is faster. }
    mov %ecx, %eax
    or %edx, %eax
    and $4095, %eax
    cmp $4080, %eax
    ja .LCantOverReadBoth

    { Over-read both as XMMs. }
    movdqu (%rcx), %xmm0
    movdqu (%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jz .LNothing
    bsf %eax, %eax
    cmp %r8d, %eax { Ignore garbage beyond 'len'. }
    jae .LNothing
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub %rdx, %rax
    ret

    .balign 16
.LNothing:
    xor %eax, %eax
    ret

.LAligned32xLoop_TwoVectorsDiffer:
    add %rcx, %rdx { restore rdx = buf2 }
    pmovmskb %xmm0, %r8d { Is there a difference in the first vector? }
    inc %r8w
    jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, eax = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    mov %r8d, %eax
.LVec0Differs:
    bsf %eax, %eax
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub %rdx, %rax
    ret
    .byte 102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }

.LVecOrMore:
    { Compare first vectors. }
    movdqu (%rcx), %xmm0
    movdqu (%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVec0Differs

    sub $32, %r8
    jbe .LLastVec

    { Compare second vectors. }
    movdqu 16(%rcx), %xmm0
    movdqu 16(%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVec1Differs

    cmp $32, %r8
    jbe .LLastTwoVectors

    { More than four vectors: aligned loop. }
    sub %rcx, %rdx { rdx = buf2 - buf1 }
    jz .LNothing { Exit if buf1 = buf2. }
    lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
    and $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub %rcx, %r8 { r8 = count to be handled with loop }

    .balign 16 { no-op }
.LAligned32xLoop_Body:
    add $32, %rcx
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu 16(%rdx,%rcx), %xmm1
    pcmpeqb 16(%rcx), %xmm1
    pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %eax
    inc %ax
    jnz .LAligned32xLoop_TwoVectorsDiffer
    sub $32, %r8
    ja .LAligned32xLoop_Body

    add %rcx, %rdx { restore rdx = buf2 }
    add $32, %r8
.LLastTwoVectors:
    movdqu (%rcx,%r8), %xmm0
    movdqu (%rdx,%r8), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVecEm2Differs
.LLastVec:
    movdqu 16(%rcx,%r8), %xmm0
    movdqu 16(%rdx,%r8), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVecEm1Differs
    xor %eax, %eax
    ret

.LVec1Differs:
    xor %r8d, %r8d
.LVecEm1Differs:
    add $16, %r8
.LVecEm2Differs:
    bsf %eax, %eax
    add %r8, %rax
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub %rdx, %rax
    ret

.LCantOverReadBoth:
    cmp $8, %r8d
    ja .L9to15
    cmp $3, %r8d
    jle .L2to3
    mov (%rcx), %eax
    mov (%rdx), %r9d
    cmp %r9d, %eax
    jne .L4xOr8xDiffer
    mov -4(%rcx,%r8), %eax
    mov -4(%rdx,%r8), %r9d
    cmp %r9d, %eax
    jne .L4xOr8xDiffer
    xor %eax, %eax
    ret

.L9to15:
    mov (%rcx), %rax
    mov (%rdx), %r9
    cmp %r9, %rax
    jne .L4xOr8xDiffer
    mov -8(%rcx,%r8), %rax
    mov -8(%rdx,%r8), %r9
    cmp %r9, %rax
    jne .L4xOr8xDiffer
    xor %eax, %eax
    ret

.L4xOr8xDiffer:
    bswap %r9
    bswap %rax
    cmp %r9, %rax
    sbb %rax, %rax
    or $1, %rax
    ret

.L2to3:
    movzwl (%rcx), %eax
    bswap %eax
    shr $1, %eax
    mov -1(%rcx,%r8), %al
    movzwl (%rdx), %ecx
    bswap %ecx
    shr $1, %ecx
    mov -1(%rdx,%r8), %cl
    sub %rcx, %rax
    ret

.L1OrLess:
    jl .LUnbounded_Prepare
    movzbl (%rcx), %eax
    movzbl (%rdx), %edx
    sub %rdx, %rax
    ret

.LUnbounded_Prepare:
    sub %rcx, %rdx { rdx = buf2 - buf1 }
    test %r8, %r8
    jnz .LUnbounded_Body
    xor %eax, %eax
    ret

    .balign 16
.LUnbounded_Next:
    add $1, %rcx
.LUnbounded_Body:
    movzbl (%rdx,%rcx), %eax
    cmp %al, (%rcx)
    je .LUnbounded_Next
    sbb %rax, %rax
    or $1, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}
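{ Illustrative note on the .L4xOr8xDiffer path of CompareByte above: the 4- or
  8-byte chunks are loaded little endian, so the first (lowest-addressed) byte
  becomes the least significant one and a plain unsigned compare would weight it
  least instead of most.  bswap reverses the byte order, after which an unsigned
  compare ranks the chunks exactly like a byte-by-byte comparison, and
  'cmp; sbb %rax,%rax; or $1' maps "below" to -1 and everything else to +1.
  Example with 4-byte chunks:
    buf1 = 02 01 01 01 and buf2 = 01 02 02 02 load as $01010102 vs $02020201,
    which compares as buf1 < buf2; after bswap ($02010101 vs $01020202) the
    compare yields buf1 > buf2, matching the first differing byte 02 > 01. }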
{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
{$define FPC_SYSTEM_HAS_COMPAREWORD}
function CompareWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
    mov %rdx, %r8
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif win64}
    sub %rcx, %rdx { rdx = buf2 - buf1 }
    cmp $1, %r8
    jle .LWordwise_Prepare
    mov %r8, %rax
    shr $62, %rax
    jnz .LWordwise_Prepare
    cmp $8, %r8
    jge .LVecOrMore

    lea (%rdx,%rcx), %eax
    or %ecx, %eax
    and $4095, %eax
    cmp $4080, %eax
    ja .LWordwise_Prepare

    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl $1, %r8 { convert to bytes }
    inc %ax
    jz .LNothing
    bsf %eax, %eax
    cmp %r8d, %eax
    jb .LSubtractWords
.LNothing:
    xor %eax, %eax
    ret

    .balign 16
.LWordwise_Body:
    movzwl (%rdx,%rcx), %eax
    cmp %ax, (%rcx)
    jne .LDoSbb
    add $2, %rcx
.LWordwise_Prepare:
    sub $1, %r8
    jae .LWordwise_Body
    xor %eax, %eax
    ret

.LDoSbb:
    sbb %rax, %rax
    or $1, %rax
    ret

.LVec0Differs:
    bsf %eax, %eax
.LSubtractWords:
    add %rcx, %rdx { recover rdx = buf2 }
    movzwl (%rdx,%rax), %edx
    movzwl (%rcx,%rax), %eax
    sub %rdx, %rax
    ret
    .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned8xLoop_Body into a no-op. }

.LVecOrMore:
    movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVec0Differs

    shl $1, %r8 { convert to bytes }
    sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec

    test %rdx, %rdx
    jz .LNothing { Exit if buf1 = buf2. }

    mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %rcx, %r8
    and $-16, %rcx { align buf1; +16 is performed by the loop. }
    sub %rcx, %r8

    .balign 16 { no-op }
.LAligned8xLoop_Body:
    add $16, %rcx
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LAligned8xLoop_VecDiffers
    sub $16, %r8
    ja .LAligned8xLoop_Body

.LLastVec:
    lea 16(%rcx,%r8), %rcx { point to the last 16 bytes }
    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVec0Differs
    xor %eax, %eax
    ret

.LAligned8xLoop_VecDiffers:
    bsf %eax, %eax
    add %rax, %rcx
    sub %r9, %rcx
    and $-2, %rcx
    add %r9, %rcx
    movzwl (%rdx,%rcx), %edx
    movzwl (%rcx), %eax
    sub %rdx, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREWORD}
{$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_COMPAREDWORD}
function CompareDWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
    mov %rdx, %r8
    mov %rsi, %rdx
    mov %rdi, %rcx
{$endif win64}
    sub %rcx, %rdx { rdx = buf2 - buf1 }
    cmp $4, %r8
    jle .LDwordwise_Prepare
    mov %r8, %rax
    shr $61, %rax
    jnz .LDwordwise_Prepare

    movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
    movdqu (%rcx), %xmm1
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVec0Differs

    shl $2, %r8 { convert to bytes }
    sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec

    test %rdx, %rdx
    jz .LNothing { Exit if buf1 = buf2. }

    mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %rcx, %r8
    and $-16, %rcx { align buf1; +16 is performed by the loop. }
    sub %rcx, %r8

    .balign 16
.LAligned4xLoop_Body:
    add $16, %rcx
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LAligned4xLoop_VecDiffers
    sub $16, %r8
    ja .LAligned4xLoop_Body

.LLastVec:
    lea 16(%rcx,%r8), %rcx { point to the last 16 bytes }
    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc %ax
    jnz .LVec0Differs
.LNothing:
    xor %eax, %eax
    ret

.LVec0Differs:
    bsf %eax, %eax
    add %rcx, %rdx { recover rdx = buf2 }
    mov (%rdx,%rax), %edx
    cmp %edx, (%rcx,%rax)
    sbb %rax, %rax
    or $1, %rax
    ret

.LAligned4xLoop_VecDiffers:
    bsf %eax, %eax
    add %rax, %rcx
    sub %r9, %rcx
    and $-4, %rcx
    add %r9, %rcx
    mov (%rdx,%rcx), %edx
    cmp %edx, (%rcx)
.LDoSbb:
    sbb %rax, %rax
    or $1, %rax
    ret

    .balign 16
.LDwordwise_Body:
    mov (%rdx,%rcx), %eax
    cmp %eax, (%rcx)
    jne .LDoSbb
    add $4, %rcx
.LDwordwise_Prepare:
    sub $1, %r8
    jae .LDwordwise_Body
    xor %eax, %eax
end;
{$endif FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ does a thread-safe inc/dec }
function declocked(var l : longint) : boolean;assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq IsMultithread@GOTPCREL(%rip),%rax
    cmpl $0,(%rax)
{$else FPC_PIC}
    cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz .Ldeclockedskiplock
    .byte 0xF0 // LOCK prefix.
.Ldeclockedskiplock:
    decl {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    setzb %al
end;

{$define FPC_SYSTEM_HAS_DECLOCKED_INT64}
function declocked(var l : int64) : boolean;assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq IsMultithread@GOTPCREL(%rip),%rax
    cmpl $0,(%rax)
{$else FPC_PIC}
    cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz .Ldeclockedskiplock
    .byte 0xF0 // LOCK prefix.
.Ldeclockedskiplock:
    decq {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    setzb %al
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure inclocked(var l : longint);assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq IsMultithread@GOTPCREL(%rip),%rax
    cmpl $0,(%rax)
{$else FPC_PIC}
    cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz .Linclockedskiplock
    .byte 0xF0 // LOCK prefix.
.Linclockedskiplock:
    incl {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_INT64}
procedure inclocked(var l : int64);assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq IsMultithread@GOTPCREL(%rip),%rax
    cmpl $0,(%rax)
{$else FPC_PIC}
    cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz .Linclockedskiplock
    .byte 0xF0 // LOCK prefix.
.Linclockedskiplock:
    incq {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;
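{ Illustrative sketch (not part of the RTL): declocked only emits the LOCK prefix
  when IsMultithread is set and returns True exactly when the counter reached
  zero, so a typical reference-count release, under those assumptions, looks
  like:

    procedure ReleaseRef(var refcount: longint; data: pointer);
    begin
      if declocked(refcount) then   // atomic decrement; True -> we dropped the last reference
        FreeMem(data);
    end;
}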
{$ifndef VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_8}
function fpc_atomic_cmp_xchg_8(var Target: shortint; NewValue: shortint; Comparand: shortint): shortint; assembler; nostackframe;
asm
    movl {$ifdef win64} %r8d {$else} %edx {$endif},%eax
    lock
    cmpxchgb NewValue,({$ifdef win64} %rcx {$else} %rdi {$endif})
end;

{$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_16}
function fpc_atomic_cmp_xchg_16(var Target: smallint; NewValue: smallint; Comparand: smallint): smallint; assembler; nostackframe;
asm
    movl {$ifdef win64} %r8d {$else} %edx {$endif},%eax
    lock
    cmpxchgw NewValue,({$ifdef win64} %rcx {$else} %rdi {$endif})
end;

{$define FPC_SYSTEM_HAS_ATOMIC_SUB_32}
function fpc_atomic_sub_32(var Target: longint; Value: longint): longint; assembler; nostackframe;
asm
    negl Value
    lock
    xaddl Value,({$ifdef win64} %rcx {$else} %rdi {$endif})
    movl Value,%eax
end;

{$define FPC_SYSTEM_HAS_ATOMIC_SUB_64}
function fpc_atomic_sub_64(var Target: int64; Value: int64): int64; assembler; nostackframe;
asm
    negq Value
    lock
    xaddq Value,({$ifdef win64} %rcx {$else} %rdi {$endif})
    movq Value,%rax
end;
{$endif VER3_2}

{$ifdef VER3_2}
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_DEC_32}
function fpc_atomic_dec_32 (var Target: longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    movl $-1,%eax
    lock
    xaddl %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    decl %eax
end;

{$ifdef VER3_2}
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_INC_32}
function fpc_atomic_inc_32 (var Target: longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    movl $1,%eax
    lock
    xaddl %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    incl %eax
end;

{$ifdef VER3_2}
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_XCHG_32}
function fpc_atomic_xchg_32 (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    xchgl ({$ifdef win64} %rcx {$else} %rdi {$endif}),Source
    movl Source,%eax
end;

{$ifdef VER3_2}
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_ADD_32}
function fpc_atomic_add_32 (var Target: longint;Value : longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    lock
    xaddl {$ifdef VER3_2} Source {$else} Value {$endif},({$ifdef win64} %rcx {$else} %rdi {$endif})
    movl {$ifdef VER3_2} Source {$else} Value {$endif},%eax
end;

{$ifdef VER3_2}
function InterLockedCompareExchange(var Target: longint; NewValue, Comperand : longint): longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_32}
function fpc_atomic_cmp_xchg_32 (var Target: longint; NewValue, Comparand : longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    movl {$ifdef VER3_2} Comperand {$else} Comparand {$endif},%eax
    lock
    cmpxchgl NewValue,({$ifdef win64} %rcx {$else} %rdi {$endif})
end;

{$ifdef VER3_2}
function InterLockedDecrement64 (var Target: int64) : int64; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_DEC_64}
function fpc_atomic_dec_64 (var Target: int64) : int64; assembler; nostackframe;
{$endif VER3_2}
asm
    movq $-1,%rax
    lock
    xaddq %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    decq %rax
end;

{$ifdef VER3_2}
function InterLockedIncrement64 (var Target: int64) : int64; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_INC_64}
function fpc_atomic_inc_64 (var Target: int64) : int64; assembler; nostackframe;
{$endif VER3_2}
asm
    movq $1,%rax
    lock
    xaddq %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    incq %rax
end;

{$ifdef VER3_2}
function InterLockedExchange64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_XCHG_64}
function fpc_atomic_xchg_64 (var Target: int64;Source: int64) : int64; assembler; nostackframe;
{$endif VER3_2}
asm
    xchgq ({$ifdef win64} %rcx {$else} %rdi {$endif}),Source
    movq Source,%rax
end;

{$ifdef VER3_2}
function InterLockedExchangeAdd64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_ADD_64}
function fpc_atomic_add_64 (var Target: int64;Value: int64) : int64; assembler; nostackframe;
{$endif VER3_2}
asm
    lock
    xaddq {$ifdef VER3_2} Source {$else} Value {$endif},({$ifdef win64} %rcx {$else} %rdi {$endif})
    movq {$ifdef VER3_2} Source {$else} Value {$endif},%rax
end;

{$ifdef VER3_2}
function InterLockedCompareExchange64(var Target: int64; NewValue, Comperand : int64): int64; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_64}
function fpc_atomic_cmp_xchg_64 (var Target: int64; NewValue, Comparand : int64) : int64; [public, alias:'FPC_ATOMIC_CMP_XCHG_64']; assembler; nostackframe;
{$endif VER3_2}
asm
    movq {$ifdef VER3_2} Comperand {$else} Comparand {$endif},%rax
    lock
    cmpxchgq NewValue,({$ifdef win64} %rcx {$else} %rdi {$endif})
end;
{****************************************************************************
                                    FPU
****************************************************************************}

const
  { Internal constants for use in system unit }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;

  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20;
  MM_ExceptionMask = $3f;

  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm = %0000000100000000;
  MM_MaskDivZero = %0000001000000000;
  MM_MaskOverflow = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
var
  _eax,cpuid7_ebx,cpuid1_ecx : dword;
begin
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  asm
      xorl %eax,%eax
      cpuid
      movl %eax,_eax
      movl $1,%eax
      xorl %ecx,%ecx
      cpuid
      movl %ecx,cpuid1_ecx
  end ['eax', 'ebx', 'ecx', 'edx'];
  has_sse41_support:=boolean(cpuid1_ecx shr 19 and 1);
  if _eax>=7 then
    begin
      asm
          movl $7,%eax
          xorl %ecx,%ecx
          cpuid
          movl %ebx,cpuid7_ebx
      end ['eax', 'ebx', 'ecx', 'edx'];
{$ifdef use_fast_repmovstos}
      fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
{$endif}
      { XGETBV support? }
      if (cpuid1_ecx and $8000000)<>0 then
        begin
          asm
              xorl %ecx,%ecx
              .byte 0x0f,0x01,0xd0 { xgetbv }
              movl %eax,_eax
          end ['eax', 'rcx', 'edx'];
          if (_eax and 6)=6 then
            begin
              has_avx_support:=(cpuid1_ecx and $10000000)<>0;
              has_avx2_support:=(cpuid7_ebx and $20)<>0;
            end;
        end;
    end;
  fpc_cpuinit_performed:=true;
end;
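{ Feature bits probed in fpc_cpuinit above (standard CPUID/XCR0 bit assignments):
    CPUID(1).ECX bit 19   - SSE4.1   -> has_sse41_support
    CPUID(1).ECX bit 27   - OSXSAVE  ($8000000, XGETBV usable)
    CPUID(1).ECX bit 28   - AVX      ($10000000)
    CPUID(7,0).EBX bit 5  - AVX2     ($20)
    CPUID(7,0).EBX bit 9  - ERMS     -> fast_large_repmovstosb
    XCR0 and 6 = 6        - the OS saves both XMM and YMM state,
  so has_avx_support/has_avx2_support are only set when the OS has enabled the
  YMM state via XSETBV. }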
{$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
begin
end;

{$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;assembler;nostackframe;
asm
    fninit
    fwait
{$ifdef FPC_PIC}
    movq Default8087CW@GOTPCREL(%rip),%rax
    fldcw (%rax)
    movq DefaultMXCSR@GOTPCREL(%rip),%rax
    ldmxcsr (%rax)
{$else FPC_PIC}
    fldcw Default8087CW(%rip)
    ldmxcsr DefaultMXCSR(%rip)
{$endif FPC_PIC}
end;

{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}
procedure ReadBarrier;assembler;nostackframe;
asm
    lfence
end;

procedure ReadDependencyBarrier;assembler;nostackframe;
asm
    { reads imply barrier on earlier reads depended on }
end;

procedure ReadWriteBarrier;assembler;nostackframe;
asm
    mfence
end;

procedure WriteBarrier;assembler;nostackframe;
asm
    sfence
end;
{$endif}
{****************************************************************************
                               Math Routines
****************************************************************************}

{$define FPC_SYSTEM_HAS_SWAPENDIAN}

{ SwapEndian(<16 Bit>) being inlined is faster than using assembler }
function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  { the extra Word type cast is necessary because the "AValue shr 8" }
  { is turned into "longint(AValue) shr 8", so if AValue < 0 then    }
  { the sign bits from the upper 16 bits are shifted in rather than  }
  { zeroes.                                                          }
  Result := SmallInt(((Word(AValue) shr 8) or (Word(AValue) shl 8)) and $ffff);
end;

function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  Result := ((AValue shr 8) or (AValue shl 8)) and $ffff;
end;

function SwapEndian(const AValue: LongInt): LongInt; assembler; nostackframe;
asm
{$ifdef win64}
    movl %ecx, %eax
{$else win64}
    movl %edi, %eax
{$endif win64}
    bswap %eax
end;

function SwapEndian(const AValue: DWord): DWord; assembler; nostackframe;
asm
{$ifdef win64}
    movl %ecx, %eax
{$else win64}
    movl %edi, %eax
{$endif win64}
    bswap %eax
end;

function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
asm
{$ifdef win64}
    movq %rcx, %rax
{$else win64}
    movq %rdi, %rax
{$endif win64}
    bswap %rax
end;

function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
asm
{$ifdef win64}
    movq %rcx, %rax
{$else win64}
    movq %rdi, %rax
{$endif win64}
    bswap %rax
end;
{$ifndef win64}
{$define FPC_SYSTEM_HAS_U128_DIV_U64_TO_U64}
function u128_div_u64_to_u64( const xh, xl: qword; const y: qword; out quotient, remainder: qword ): boolean;nostackframe;assembler;
{
  SysV:
    xh: RDI
    xl: RSI
    y: RDX
    quotient: RCX
    remainder: R8
}
label
  dodiv;
asm
    cmpq %rdi,%rdx
    ja dodiv
    xorl %eax,%eax
    ret
dodiv:
    movq %rdx,%r9
    movq %rsi,%rax
    movq %rdi,%rdx
    divq %r9
    movq %rax,(%rcx)
    movq %rdx,(%r8)
    movl $1,%eax
end;
{$endif win64}
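{ Illustrative note on the guard in u128_div_u64_to_u64 above: DIVQ raises #DE
  when the quotient does not fit in 64 bits, which for (xh:xl) div y happens
  exactly when xh >= y, so the 'cmpq %rdi,%rdx; ja dodiv' check returns False
  instead of faulting.  Example: xh = 1, xl = 0, y = 2 divides 2^64 by 2 and
  yields quotient 2^63 with remainder 0 (True); the same xh with y = 1 would
  need a 65-bit quotient, so the function simply returns False. }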
{$ifndef FPC_SYSTEM_HAS_UMUL64X64_128}
{$define FPC_SYSTEM_HAS_UMUL64X64_128}
function UMul64x64_128(a,b: uint64; out rHi: uint64): uint64; assembler; nostackframe;
{ Win64: rcx = a, rdx = b, r8 = rHi.
  SysV: rdi = a, rsi = b, rdx = rHi. }
asm
{$ifndef win64}
    mov %rdx, %rcx { rcx = rHi, as rdx is used for mul. }
{$endif}
    mov a, %rax
    mul b
    mov %rdx, {$ifdef win64} (%r8) {$else} (%rcx) {$endif}
end;
{$endif FPC_SYSTEM_HAS_UMUL64X64_128}