{
This file is part of the Free Pascal run time library.
Copyright (c) 2002 by Florian Klaempfl and Sergei Gorelkin
Members of the Free Pascal development team

Processor dependent implementation for the system unit for
the x86-64 architecture

See the file COPYING.FPC, included in this distribution,
for details about the copyright.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

**********************************************************************}

{$asmmode GAS}

{****************************************************************************
Primitives
****************************************************************************}

{$ifndef win64}
{$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
{$endif}

{$ifdef use_fast_repmovstos}
var
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
{$endif}

{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
  movq %rsp,%rax
end;

{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
  movq %rbp,%rax
end;
{$ENDIF not INTERNAL_BACKTRACE}

{$define FPC_SYSTEM_HAS_GET_PC_ADDR}
function get_pc_addr:pointer;assembler;nostackframe;
asm
  movq (%rsp),%rax
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  get_caller_addr:=framebp;
  if assigned(framebp) then
    get_caller_addr:=PPointer(framebp)[1];
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  get_caller_frame:=framebp;
  if assigned(framebp) then
    get_caller_frame:=PPointer(framebp)^;
end;

// The following assembler procedures are disabled for FreeBSD due to
// multiple issues with its old GNU assembler (Mantis #19188).
// Even after fixing them, they can be enabled only for the trunk version,
// otherwise bootstrapping won't be possible.
// Modified to use oldbinutils as in the cpu.pp source, to allow easier use for other targets.
{$ifdef freebsd}
{$ifndef overridebinutils}
{$define oldbinutils}
{$endif}
{$endif freebsd}

{$ifndef oldbinutils}
{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{ Linux: rdi source, rsi dest, rdx count
  win64: rcx source, rdx dest, r8 count }
asm
{$ifndef win64}
  mov %rdx, %r8
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif win64}
  cmp $3, %r8
  jle .L3OrLess
  cmp $8, %r8
  jle .L4to8
  cmp $16, %r8
  jle .L9to16
  movdqu (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and the 17–32 branch. }
  movdqu -16(%rcx,%r8), %xmm5
  cmp $32, %r8
  jg .L33OrMore
  movdqu %xmm4, (%rdx) { 17–32 bytes }
  movdqu %xmm5, -16(%rdx,%r8)
  ret

.balign 16
.L3OrLess:
  cmp $1, %r8
  jl .LZero
  movzbl (%rcx), %eax
  je .LOne
  movzwl -2(%rcx,%r8), %r9d
  mov %r9w, -2(%rdx,%r8)
.LOne:
  mov %al, (%rdx)
.LZero:
  ret

.L4to8:
  mov (%rcx), %eax
  mov -4(%rcx,%r8), %r9d
  mov %eax, (%rdx)
  mov %r9d, -4(%rdx,%r8)
  ret

.L9to16:
  mov (%rcx), %rax
  mov -8(%rcx,%r8), %r9
  mov %rax, (%rdx)
  mov %r9, -8(%rdx,%r8)
.Lquit:
  ret
  .byte 0x90,0x90,0x90 { Turns .balign 16 before .Lloop32f into a no-op. }

.L33OrMore:
  sub %rdx, %rcx { rcx = src - dest }
  jz .Lquit { exit if src=dest }
  jnb .LForward { src>dest => forward move }
  mov %r8, %rax
  add %rcx, %rax { rcx is negative => r8+rcx > 0 if regions overlap }
  jb .Lback { if no overlap, still do forward move }

.LForward:
  mov %rdx, %r9 { remember original dest to write first 16 bytes }
  add %rdx, %r8 { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
  add $16, %rdx
  and $-16, %rdx
  sub %rdx, %r8

.LRestAfterNTf:
  sub $32, %r8 { During the N× loop, r8 is N bytes less than what actually remains, allowing 'sub N + jae .LLoop' instead of 'sub N + cmp N + jae .LLoop'. }
  jbe .LPost32f
  cmp $0x40000, %r8 { this limit must be processor-specific (1/2 L2 cache size) }
  jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves, so it's better to take these checks out of here... }

.balign 16 { no-op }
.Lloop32f:
  movdqu (%rcx,%rdx), %xmm0
  movdqa %xmm0, (%rdx)
  movdqu 16(%rcx,%rdx), %xmm0
  movdqa %xmm0, 16(%rdx)
  add $32, %rdx
  sub $32, %r8
  ja .Lloop32f

.LPost32f: { +32 fixup not applied after the 32× loop, r8 = remaining - 32 here. }
  cmp $-16, %r8
  jle .LFirstAndLast16f
  movdqu (%rcx,%rdx), %xmm0
  movdqa %xmm0, (%rdx)
.LFirstAndLast16f:
  movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
  movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
  ret

.Lntf:
  cmp $0x1000, %rcx { Maybe change our mind: don't bother bypassing the cache if src and dest are close to each other }
  jb .Lloop32f { (this check is performed here to not stand in the way of smaller counts) }
  sub $0xFE0, %r8 { r8 = remaining - 0x1000, but 32 was subtracted already, so must subtract only (0x1000 - 32) = 0xFE0. }
.Lntloopf:
  mov $32, %eax
.balign 16
.Lpref:
  prefetchnta (%rcx,%rdx,1)
  prefetchnta 0x40(%rcx,%rdx,1)
  add $0x80, %rdx
  dec %eax
  jnz .Lpref
  sub $0x1000, %rdx
  mov $64, %eax
.balign 16
.Lntloop64f:
  add $64, %rdx
  movdqu -64(%rcx,%rdx,1), %xmm0
  movntdq %xmm0, -64(%rdx)
  movdqu -48(%rcx,%rdx,1), %xmm0
  movntdq %xmm0, -48(%rdx)
  movdqu -32(%rcx,%rdx,1), %xmm0
  movntdq %xmm0, -32(%rdx)
  movdqu -16(%rcx,%rdx,1), %xmm0
  movntdq %xmm0, -16(%rdx)
  dec %eax
  jnz .Lntloop64f
  sub $0x1000, %r8
  jae .Lntloopf
  mfence
  add $0x1000, %r8
  jmpq .LRestAfterNTf { go handle remaining bytes }
  .byte 0x90,0x90,0x90 { Turns .balign 16 before .Lloop32b into a no-op. }

{ backwards move }
.Lback:
  lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
  lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
  and $-16, %r8
  sub %rdx, %r8
  add %r8, %rdx

.LRestAfterNTb:
  sub $32, %r8
  jbe .LPost32b
  cmp $0x40000, %r8
  jae .Lntb

.balign 16 { no-op }
.Lloop32b:
  sub $32, %rdx
  movdqu 16(%rcx,%rdx), %xmm0
  movdqa %xmm0, 16(%rdx)
  movdqu (%rcx,%rdx), %xmm0
  movdqa %xmm0, (%rdx)
  sub $32, %r8
  ja .Lloop32b

.LPost32b:
  cmp $-16, %r8
  jle .LFirstAndLast16b
  movdqu -16(%rcx,%rdx), %xmm0
  movdqa %xmm0, -16(%rdx)
.LFirstAndLast16b:
  sub %r8, %rdx
  movdqu %xmm4, -32(%rdx)
  movdqu %xmm5, -16(%r9)
  ret

.Lntb:
  cmp $0xfffffffffffff000,%rcx { Mirror of the .Lntf check: rcx = src - dest is negative here, so skip non-temporal stores when the regions are closer than 4 KB. }
  jnb .Lloop32b
  sub $0xFE0, %r8
.Lntloopb:
  mov $32, %eax
.balign 16
.Lprefb:
  sub $0x80, %rdx
  prefetchnta (%rcx,%rdx,1)
  prefetchnta 0x40(%rcx,%rdx,1)
  dec %eax
  jnz .Lprefb
  add $0x1000, %rdx
  mov $0x40, %eax
.balign 16
.Lntloop64b:
  sub $64, %rdx
  movdqu 48(%rcx,%rdx,1), %xmm0
  movntdq %xmm0, 48(%rdx)
  movdqu 32(%rcx,%rdx,1), %xmm0
  movntdq %xmm0, 32(%rdx)
  movdqu 16(%rcx,%rdx,1), %xmm0
  movntdq %xmm0, 16(%rdx)
  movdqu (%rcx,%rdx,1), %xmm0
  movntdq %xmm0, (%rdx)
  dec %eax
  jnz .Lntloop64b
  sub $0x1000, %r8
  jae .Lntloopb
  mfence
  add $0x1000, %r8
  jmpq .LRestAfterNTb
end;
{$endif FPC_SYSTEM_HAS_MOVE}
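
{ Usage sketch (illustrative only, not part of the RTL source): Move has
  memmove semantics, i.e. overlapping source and destination are handled;
  the .L33OrMore path above picks the forward or the backward loop from the
  sign of src - dest. Assuming a trivial test program:

  program movedemo;
  var
    a : array[0..9] of byte;
    i : longint;
  begin
    for i := 0 to 9 do
      a[i] := i;
    Move(a[0], a[2], 8);   // dest > src and the ranges overlap: the .Lback path
    for i := 0 to 9 do
      write(a[i], ' ');    // prints 0 1 0 1 2 3 4 5 6 7
    writeln;
  end.
}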
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
or not defined(FPC_SYSTEM_HAS_FILLWORD)
or not defined(FPC_SYSTEM_HAS_FILLDWORD)
or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
{ Input:
  rcx = 'x'
  rdx = byte count
  xmm0 = pattern for unaligned writes
  xmm1 = pattern for aligned writes }
const
{$ifdef use_fast_repmovstos}
  ErmsThreshold = 1536;
{$endif}
  NtThreshold = 4 * 1024 * 1024;
asm
  { x can start and end misaligned on the vector boundary:

    x = ~~][H1][H2][...][T2][T1]~
        [UH]              [UT]

    UH (“unaligned head”) potentially overlaps with H1 and is already written with 'movdqu' by the caller.
    At least 1 of its bytes is exclusive to it, i.e. if x is already aligned, H1 starts at byte 16.
    H1 and so on are called “aligned heads” or just “heads”.
    T1 and so on are called “aligned tails” or just “tails”.
    UT (“unaligned tail”) is written with another 'movdqu' after the loop.
    At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }
  lea -65(%rcx,%rdx), %r8 { r8 = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
  and $-16, %rcx { align rcx to the LEFT (so it needs to be offset by an additional +16 for a while). }
  movdqa %xmm1, 16(%rcx) { Write H1. }
  mov %r8, %rax
  and $-16, %rax { rax = “T4” (possibly fictive) = aligned r8 = loop bound. }
  cmp $49, %rdx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
  jle .LOneAlignedTailWrite
  movdqa %xmm1, 32(%rcx) { Write H2. }
  cmp $81, %rdx { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
  jle .LTwoAlignedTailWrites
  cmp $113, %rdx { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
  jle .LFourAlignedTailWrites
  add $48, %rcx
{$ifdef use_fast_repmovstos}
  cmp $ErmsThreshold, %rdx
  jae .LRepStos
{$else}
  cmp $NtThreshold, %rdx
  jae .L64xNT_Body
{$endif}

.balign 16
.L64x_Body:
  movdqa %xmm1, (%rcx)
  movdqa %xmm1, 16(%rcx)
  movdqa %xmm1, 32(%rcx)
  movdqa %xmm1, 48(%rcx)
  add $64, %rcx
  cmp %rax, %rcx
  jb .L64x_Body

.LFourAlignedTailWrites:
  movdqa %xmm1, (%rax) { T4 }
  movdqa %xmm1, 16(%rax) { T3 }
.LTwoAlignedTailWrites:
  movdqa %xmm1, 32(%rax) { T2 }
.LOneAlignedTailWrite:
  movdqa %xmm1, 48(%rax) { T1 }
  movdqu %xmm0, 65-16(%r8) { UT }
  ret

{$ifdef use_fast_repmovstos}
.LRepStos:
{$ifdef FPC_PIC}
  movq fast_large_repmovstosb@GOTPCREL(%rip), %r9
  cmpb $1, (%r9)
{$else FPC_PIC}
  cmpb $1, fast_large_repmovstosb(%rip)
{$endif FPC_PIC}
  jne .LRepStosIsNotBetter
{$ifdef win64}
  push %rdi { For tests on Windows; however this is not SEH-compliant, so the entire use_fast_repmovstos branch is disabled by default! }
{$endif}
  mov %rcx, %rdi { rdi = REP STOS destination. }
  lea 65-16+8-1(%r8), %rcx
  sub %rdi, %rcx
  shr $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
  movq %xmm1, %rax { recover the pattern for aligned writes back to a GPR :) }
  rep stosq
  movdqu %xmm0, 65-16(%r8) { UT }
{$ifdef win64}
  pop %rdi
{$endif}
  ret
{$endif}

.LRepStosIsNotBetter:
  cmp $NtThreshold, %rdx
  jb .L64x_Body

.balign 16
.L64xNT_Body:
  movntdq %xmm1, (%rcx)
  movntdq %xmm1, 16(%rcx)
  movntdq %xmm1, 32(%rcx)
  movntdq %xmm1, 48(%rcx)
  add $64, %rcx
  cmp %rax, %rcx
  jb .L64xNT_Body
  sfence
  jmp .LFourAlignedTailWrites
end;
{$endif FPC_SYSTEM_HAS_FILLxxxx}

{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
{ win64: rcx dest, rdx count, r8b value
  linux: rdi dest, rsi count, rdx value }
  movzbl {$ifdef win64} %r8b {$else} %dl {$endif}, %eax
  imul $0x01010101, %eax { broadcast the byte into all four bytes of eax }
{$ifndef win64}
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif win64}
  cmp $3, %rdx
  jle .L3OrLess
  cmp $16, %rdx
  jl .L4to15
  movd %eax, %xmm0
  pshufd $0, %xmm0, %xmm0
  movdqu %xmm0, (%rcx)
  movdqa %xmm0, %xmm1
  cmp $32, %rdx
  jg FillXxxx_MoreThanTwoXmms
  movdqu %xmm0, -16(%rcx,%rdx)
  ret
.L4to15:
  mov %eax, (%rcx)
  cmp $8, %edx
  jle .LLast4
  mov %eax, 4(%rcx)
  mov %eax, -8(%rcx,%rdx)
.LLast4:
  mov %eax, -4(%rcx,%rdx)
  ret
.L3OrLess:
  test %rdx, %rdx
  jle .LQuit
  mov %al, (%rcx)
  mov %al, -1(%rcx,%rdx)
  shr $1, %edx
  mov %al, (%rcx,%rdx)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}
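
{ Usage sketch (illustrative only): counts of 16 bytes and up go through the
  XMM code, counts up to 32 bytes are finished with the two (possibly
  overlapping) movdqu stores above, and larger blocks fall through to
  FillXxxx_MoreThanTwoXmms. For example:

  program filldemo;
  var
    buf : array[0..99] of byte;
  begin
    FillChar(buf, SizeOf(buf), $FF); // 100 bytes: FillXxxx_MoreThanTwoXmms
    FillChar(buf, 32, 0);            // 32 bytes: head + tail movdqu pair
    FillChar(buf, 3, 7);             // 3 bytes: the .L3OrLess branch
  end.
}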
{$ifndef FPC_SYSTEM_HAS_FILLWORD}
{$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
{$ifdef win64}
  movzwl %r8w, %eax
  shl $16, %r8d
  or %r8d, %eax
{$else}
  movzwl %dx, %eax
  shl $16, %edx
  or %edx, %eax
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif}
  cmp $3, %rdx
  jle .L3OrLess
  cmp $8, %rdx
  jle .L4to8
  movd %eax, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%rcx)
  cmp $16, %rdx
  jle .LTail
  shl $1, %rdx { rdx = byte count }
  mov %rcx, %r8
  shl $3, %ecx
  rol %cl, %eax { misalign the pattern by the misalignment of x }
  mov %r8, %rcx
  movd %eax, %xmm1
  pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
  jmp FillXxxx_MoreThanTwoXmms
.LTail:
  movdqu %xmm0, -16(%rcx,%rdx,2)
  ret
.L4to8:
  mov %eax, %r8d
  shl $32, %r8
  or %r8, %rax
  mov %rax, (%rcx)
  mov %rax, -8(%rcx,%rdx,2)
  ret
.L3OrLess:
  test %rdx, %rdx
  jle .LQuit
  mov %ax, (%rcx)
  mov %ax, -2(%rcx,%rdx,2)
  shr $1, %edx
  mov %ax, (%rcx,%rdx,2)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}
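
{ A note on the 'rol %cl' trick above (explanatory sketch, not RTL code):
  FillXxxx_MoreThanTwoXmms performs its bulk stores at 16-byte boundaries
  rather than at x itself, so when x is misaligned, element boundaries do
  not coincide with the lanes of the aligned stores. Rotating the 32-bit
  pattern left by 8 * (x mod 4) bits (rol counts are taken mod 32)
  compensates. Worked example for FillWord with value $AABB at x = base + 1:

  //  desired bytes starting at x:      BB AA BB AA ...  (little endian)
  //  an aligned dword lane must hold:  AA BB AA BB  =  $BBAABBAA
  //  and indeed $AABBAABB rol 8 = $BBAABBAA.

  FillQWord rotates its 64-bit pattern by 8 * (x mod 8) bits the same way. }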
{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
{$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
asm
{$ifdef win64}
  mov %r8d, %eax
{$else}
  mov %edx, %eax
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif win64}
  cmp $3, %rdx
  jle .L3OrLess
  cmp $8, %rdx
  jle .L4to8
  movd %eax, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%rcx)
  shl $2, %rdx { rdx = byte count }
  mov %rcx, %r8
  shl $3, %ecx
  rol %cl, %eax { misalign the pattern by the misalignment of x }
  mov %r8, %rcx
  movd %eax, %xmm1
  pshufd $0, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
  jmp FillXxxx_MoreThanTwoXmms
.L4to8:
{$ifndef win64} { on win64, eax = r8d already. }
  mov %eax, %r8d
{$endif}
  shl $32, %r8
  or %r8, %rax
  mov %rax, (%rcx)
  mov %rax, 8(%rcx)
  mov %rax, -16(%rcx,%rdx,4)
  mov %rax, -8(%rcx,%rdx,4)
  ret
.L3OrLess:
  test %rdx, %rdx
  jle .LQuit
  mov %eax, (%rcx)
  mov %eax, -4(%rcx,%rdx,4)
  shr $1, %edx
  mov %eax, (%rcx,%rdx,4)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}

{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
{$define FPC_SYSTEM_HAS_FILLQWORD}
procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
asm
{$ifdef win64}
  mov %r8, %rax
{$else}
  mov %rdx, %rax
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif win64}
  cmp $2, %rdx
  jle .L2OrLess
  cmp $6, %rdx
  jle .L3to6
  movq %rax, %xmm0
  pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%rcx)
  shl $3, %rdx { rdx = byte count }
  mov %rcx, %r8
  shl $3, %ecx
  rol %cl, %rax { misalign the pattern by the misalignment of x }
  mov %r8, %rcx
  movq %rax, %xmm1
  pshufd $0b01000100, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
  jmp FillXxxx_MoreThanTwoXmms
.L3to6:
  mov %rax, (%rcx)
  mov %rax, 8(%rcx)
  mov %rax, 16(%rcx)
  mov %rax, -24(%rcx,%rdx,8)
  mov %rax, -16(%rcx,%rdx,8)
  mov %rax, -8(%rcx,%rdx,8)
  ret
.L2OrLess:
  test %rdx, %rdx
  jle .LQuit
  mov %rax, (%rcx)
  mov %rax, -8(%rcx,%rdx,8)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLQWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8b byte
  linux: rdi buf, rsi len, rdx byte }
asm
  test {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif}
  jz .Lnotfound { exit if len=0 }
{$ifdef win64}
  movd %r8d, %xmm1
{$else}
  movd %edx, %xmm1
  movq %rdi, %rcx
  movq %rsi, %rdx
{$endif}
  mov %rcx, %r8
  punpcklbw %xmm1, %xmm1
  and $-0x10, %rcx { highest aligned address before buf }
  punpcklbw %xmm1, %xmm1
  add $16, %rcx { first aligned address after buf }
  pshufd $0, %xmm1, %xmm1
  movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
  sub %r8, %rcx { rcx=number of valid bytes, r8=original ptr }
  pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
  pmovmskb %xmm0, %eax
  shl %cl, %eax { shift valid bits into high word }
  and $0xffff0000, %eax { clear low word containing invalid bits }
  shr %cl, %eax { shift back }
  jmp .Lcontinue
.balign 16
.Lloop:
  movdqa (%r8,%rcx), %xmm0 { r8 and rcx may have any values, }
  add $16, %rcx { but their sum is evenly divisible by 16. }
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %eax
.Lcontinue:
  test %eax, %eax
  jnz .Lmatch
  cmp %rcx, %rdx
  ja .Lloop
.Lnotfound:
  or $-1, %rax
  retq
.Lmatch:
  bsf %eax, %eax
  lea -16(%rcx,%rax), %rax
  cmp %rax, %rdx { check against the buffer length }
  jbe .Lnotfound
end;
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
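
{ Reference semantics of IndexByte in plain Pascal (a sketch for comparison,
  not the compiled implementation):

  function IndexByteRef(const buf; len: SizeInt; b: byte): SizeInt;
  var
    p : pbyte;
    i : SizeInt;
  begin
    IndexByteRef := -1;
    p := pbyte(@buf);
    for i := 0 to len - 1 do
      if p[i] = b then
      begin
        IndexByteRef := i;
        exit;
      end;
  end;

  The SSE2 version instead reads whole aligned 16-byte blocks (an aligned
  16-byte load can never cross a page boundary, so over-reading is safe) and
  uses the shl/and/shr sequence to clear mask bits belonging to bytes before
  buf in the very first block; matches beyond len are rejected after bsf. }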
{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8w word
  linux: rdi buf, rsi len, rdx word }
asm
  test {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif}
  jz .Lnotfound { exit if len=0 }
{$ifdef win64}
  movd %r8d, %xmm1
{$else}
  movd %edx, %xmm1
  movq %rdi, %rcx
  movq %rsi, %rdx
{$endif}
  mov %rcx, %r8
  punpcklwd %xmm1, %xmm1
  and $-0x10, %rcx
  pshufd $0, %xmm1, %xmm1
  add $16, %rcx
  movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
  sub %r8, %rcx { rcx=number of valid bytes }
  test $1, %r8b { if buffer isn't aligned to word boundary, }
  jnz .Lunaligned { use a different algorithm }
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  shl %cl, %eax
  and $0xffff0000, %eax
  shr %cl, %eax
  shr $1, %ecx { bytes->words }
  jmp .Lcontinue
.balign 16
.Lloop:
  movdqa (%r8,%rcx,2), %xmm0
  add $8, %rcx
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %eax
.Lcontinue:
  test %eax, %eax
  jnz .Lmatch
  cmp %rcx, %rdx
  ja .Lloop
.Lnotfound:
  or $-1, %rax
  retq
.Lmatch:
  bsf %eax, %eax
  shr $1, %eax { in words }
  lea -8(%rcx,%rax), %rax
  cmp %rax, %rdx
  jbe .Lnotfound { if the match is after the specified length, ignore it }
  retq
.Lunaligned:
  movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
  psllw $8, %xmm1 { swap the bytes of each word of the pattern) }
  psrlw $8, %xmm2
  por %xmm2, %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  shl %cl, %eax
  and $0xffff0000, %eax
  shr %cl, %eax
  add %rdx, %rdx { length words -> bytes }
  xor %r10d, %r10d { nothing to merge yet }
  jmp .Lcontinue_u
.balign 16
.Lloop_u:
  movdqa (%r8,%rcx), %xmm0
  add $16, %rcx
  pcmpeqb %xmm1, %xmm0 { compare by bytes }
  shr $16, %r10d { bit 16 shifts into 0 }
  pmovmskb %xmm0, %eax
.Lcontinue_u:
  shl $1, %eax { 15:0 -> 16:1 }
  or %r10d, %eax { merge bit 0 from the previous round }
  mov %eax, %r10d
  shr $1, %eax { now AND together adjacent pairs of bits }
  and %r10d, %eax
  and $0x5555, %eax { also reset odd bits }
  jnz .Lmatch_u
  cmpq %rcx, %rdx
  ja .Lloop_u
.Lnotfound_u:
  or $-1, %rax
  retq
.Lmatch_u:
  bsf %eax, %eax
  lea -16(%rcx,%rax), %rax
  cmp %rax, %rdx
  jbe .Lnotfound_u { if the match is after the specified length, ignore it }
  sar $1, %rax { in words }
end;
{$endif FPC_SYSTEM_HAS_INDEXWORD}
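
{ A note on the .Lunaligned path above (explanatory, not RTL code): when buf
  is not word-aligned, a matching word straddles the byte lanes of the
  aligned blocks, so the code byte-swaps the pattern and compares bytewise.
  A true match then shows up as two adjacent set bits in the pcmpeqb mask.
  The shl/or/shr dance with r10d carries bit 16 from one 16-byte block into
  bit 0 of the next, so a word spanning two blocks is still found, and
  'and $0x5555' keeps only matches starting on the buffer's own word grid. }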
{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
function IndexDWord(Const buf;len:SizeInt;b:dword):SizeInt; assembler; nostackframe;
asm
{$ifdef win64}
  mov %rcx, %rax
{$else}
  mov %rdx, %r8
  mov %rsi, %rdx
  mov %rdi, %rax
{$endif}
  cmp $4, %rdx
  jle .LDwordwise_Prepare
  sub $4, %rdx
  movd %r8d, %xmm1
  pshufd $0, %xmm1, %xmm1
.balign 16
.L4x_Body:
  movdqu (%rax), %xmm0
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %r8d
  test %r8d, %r8d
  jnz .LFoundAtMask
  add $16, %rax
  sub $4, %rdx
  jg .L4x_Body
  lea (%rax,%rdx,4), %rax
  movdqu (%rax), %xmm0
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %r8d
  test %r8d, %r8d
  jnz .LFoundAtMask
  or $-1, %rax
  ret
.balign 16 { no-op }
.LDwordwise_Body:
  cmp (%rax), %r8d
  je .LFoundAtRax
  add $4, %rax
.LDwordwise_Prepare:
  sub $1, %rdx
  jae .LDwordwise_Body
  or $-1, %rax
  ret
.LFoundAtMask:
  bsf %r8d, %r8d
  add %r8, %rax
.LFoundAtRax:
  sub {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
  shr $2, %rax
end;
{$endif FPC_SYSTEM_HAS_INDEXDWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ win64: rcx=buf, rdx=len, r8=b
  else: rdi=buf, rsi=len, rdx=b }
asm
  mov {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
  sub $8, %rax
.balign 16
.LQwordwise_Next:
  add $8, %rax
  sub $1, {$ifdef win64} %rdx {$else} %rsi {$endif}
  jb .LNothing
  cmp {$ifdef win64} %r8 {$else} %rdx {$endif}, (%rax)
  jne .LQwordwise_Next
  sub {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
  shr $3, %rax
  ret
.LNothing:
  mov $-1, %rax
end;
{$endif FPC_SYSTEM_HAS_INDEXQWORD}
{$endif freebsd}

{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ win64: rcx buf1, rdx buf2, r8 len
  linux: rdi buf1, rsi buf2, rdx len }
asm
{$ifndef win64}
  mov %rdx, %r8
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif win64}
  { rcx = buf1, rdx = buf2, r8 = len }
  cmp $1, %r8
  jle .L1OrLess
  cmp $16, %r8
  jae .LVecOrMore
  { 2 to 15 bytes: check for a page cross. Pessimistic variant that has false positives, but is faster. }
  mov %ecx, %eax
  or %edx, %eax
  and $4095, %eax
  cmp $4080, %eax
  ja .LCantOverReadBoth
  { Over-read both as XMMs. }
  movdqu (%rcx), %xmm0
  movdqu (%rdx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jz .LNothing
  bsf %eax, %eax
  cmp %r8d, %eax { Ignore garbage beyond 'len'. }
  jae .LNothing
  movzbl (%rdx,%rax), %edx
  movzbl (%rcx,%rax), %eax
  sub %rdx, %rax
  ret
.balign 16
.LNothing:
  xor %eax, %eax
  ret
.LAligned32xLoop_TwoVectorsDiffer:
  add %rcx, %rdx { restore rdx = buf2 }
  pmovmskb %xmm0, %r8d { Is there a difference in the first vector? }
  inc %r8w
  jz .LVec1Differs { No difference in the first vector; xmm0 is all ones, eax = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
  mov %r8d, %eax
.LVec0Differs:
  bsf %eax, %eax
  movzbl (%rdx,%rax), %edx
  movzbl (%rcx,%rax), %eax
  sub %rdx, %rax
  ret
  .byte 0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turns .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LVecOrMore:
  { Compare first vectors. }
  movdqu (%rcx), %xmm0
  movdqu (%rdx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVec0Differs
  sub $32, %r8
  jbe .LLastVec
  { Compare second vectors. }
  movdqu 16(%rcx), %xmm0
  movdqu 16(%rdx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVec1Differs
  cmp $32, %r8
  jbe .LLastTwoVectors
  { More than four vectors: aligned loop. }
  lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact r8 was still len - 32). }
  sub %rcx, %rdx { rdx = buf2 - buf1 }
  and $-16, %rcx { Align buf1. The first two vectors already analyzed are skipped by +32 on the first loop iteration. }
  sub %rcx, %r8 { r8 = count to be handled with the loop }
.balign 16 { no-op }
.LAligned32xLoop_Body:
  add $32, %rcx
  { Compare two XMMs, reduce the result with 'and'. }
  movdqu (%rdx,%rcx), %xmm0
  pcmpeqb (%rcx), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
  movdqu 16(%rdx,%rcx), %xmm1
  pcmpeqb 16(%rcx), %xmm1
  pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
  pmovmskb %xmm1, %eax
  inc %ax
  jnz .LAligned32xLoop_TwoVectorsDiffer
  sub $32, %r8
  ja .LAligned32xLoop_Body
  add %rcx, %rdx { restore rdx = buf2 }
  add $32, %r8
.LLastTwoVectors:
  movdqu (%rcx,%r8), %xmm0
  movdqu (%rdx,%r8), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVecEm2Differs
.LLastVec:
  movdqu 16(%rcx,%r8), %xmm0
  movdqu 16(%rdx,%r8), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVecEm1Differs
  xor %eax, %eax
  ret
.LVec1Differs:
  xor %r8d, %r8d
.LVecEm1Differs:
  add $16, %r8
.LVecEm2Differs:
  bsf %eax, %eax
  add %r8, %rax
  movzbl (%rdx,%rax), %edx
  movzbl (%rcx,%rax), %eax
  sub %rdx, %rax
  ret
.LCantOverReadBoth:
  cmp $8, %r8d
  ja .L9to15
  cmp $3, %r8d
  jle .L2to3
  mov (%rcx), %eax
  mov (%rdx), %r9d
  cmp %r9d, %eax
  jne .L4xOr8xDiffer
  mov -4(%rcx,%r8), %eax
  mov -4(%rdx,%r8), %r9d
  cmp %r9d, %eax
  jne .L4xOr8xDiffer
  xor %eax, %eax
  ret
.L9to15:
  mov (%rcx), %rax
  mov (%rdx), %r9
  cmp %r9, %rax
  jne .L4xOr8xDiffer
  mov -8(%rcx,%r8), %rax
  mov -8(%rdx,%r8), %r9
  cmp %r9, %rax
  jne .L4xOr8xDiffer
  xor %eax, %eax
  ret
.L4xOr8xDiffer:
  bswap %r9
  bswap %rax
  cmp %r9, %rax
  sbb %rax, %rax
  or $1, %rax
  ret
.L2to3:
  movzwl (%rcx), %eax
  bswap %eax
  shr $1, %eax
  mov -1(%rcx,%r8), %al
  movzwl (%rdx), %ecx
  bswap %ecx
  shr $1, %ecx
  mov -1(%rdx,%r8), %cl
  sub %rcx, %rax
  ret
.L1OrLess:
  jl .LUnbounded_Prepare
  movzbl (%rcx), %eax
  movzbl (%rdx), %edx
  sub %rdx, %rax
  ret
.LUnbounded_Prepare:
  sub %rcx, %rdx { rdx = buf2 - buf1 }
  test %r8, %r8
  jnz .LUnbounded_Body
  xor %eax, %eax
  ret
.balign 16
.LUnbounded_Next:
  add $1, %rcx
.LUnbounded_Body:
  movzbl (%rdx,%rcx), %eax
  cmp %al, (%rcx)
  je .LUnbounded_Next
  sbb %rax, %rax
  or $1, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}
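
{ Usage sketch (illustrative only): CompareByte returns 0 when the buffers
  are equal over len bytes, a negative value when the first mismatching byte
  of buf1 is smaller (unsigned), and a positive value when it is greater:

  program cmpdemo;
  const
    a : array[0..3] of byte = (1,2,3,4);
    b : array[0..3] of byte = (1,2,9,4);
  begin
    writeln(CompareByte(a, b, 4) < 0); // TRUE: a[2] = 3 < 9 = b[2]
    writeln(CompareByte(a, b, 2));     // 0: the first two bytes match
  end.
}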
{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
{$define FPC_SYSTEM_HAS_COMPAREWORD}
function CompareWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
  mov %rdx, %r8
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif win64}
  sub %rcx, %rdx { rdx = buf2 - buf1 }
  cmp $1, %r8
  jle .LWordwise_Prepare
  mov %r8, %rax
  shr $62, %rax
  jnz .LWordwise_Prepare
  cmp $8, %r8
  jge .LVecOrMore
  lea (%rdx,%rcx), %eax
  or %ecx, %eax
  and $4095, %eax
  cmp $4080, %eax
  ja .LWordwise_Prepare
  movdqu (%rdx,%rcx), %xmm0
  movdqu (%rcx), %xmm1
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  shl $1, %r8 { convert to bytes }
  inc %ax
  jz .LNothing
  bsf %eax, %eax
  cmp %r8d, %eax
  jb .LSubtractWords
.LNothing:
  xor %eax, %eax
  ret
.balign 16
.LWordwise_Body:
  movzwl (%rdx,%rcx), %eax
  cmp %ax, (%rcx)
  jne .LDoSbb
  add $2, %rcx
.LWordwise_Prepare:
  sub $1, %r8
  jae .LWordwise_Body
  xor %eax, %eax
  ret
.LDoSbb:
  sbb %rax, %rax
  or $1, %rax
  ret
.LVec0Differs:
  bsf %eax, %eax
.LSubtractWords:
  add %rcx, %rdx { recover rdx = buf2 }
  movzwl (%rdx,%rax), %edx
  movzwl (%rcx,%rax), %eax
  sub %rdx, %rax
  ret
.LVecOrMore:
  movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
  movdqu (%rcx), %xmm1
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVec0Differs
  shl $1, %r8 { convert to bytes }
  sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  jle .LLastVec
  mov %rcx, %r9 { save original buf1 to recover the word position if a byte mismatch is found (the aligned loop works in bytes to support misaligned buf1). }
  add %rcx, %r8
  and $-16, %rcx { align buf1; +16 is performed by the loop. }
  sub %rcx, %r8
.balign 16
.LAligned8xLoop_Body:
  add $16, %rcx
  movdqu (%rdx,%rcx), %xmm0
  pcmpeqb (%rcx), %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LAligned8xLoop_VecDiffers
  sub $16, %r8
  ja .LAligned8xLoop_Body
.LLastVec:
  lea 16(%rcx,%r8), %rcx { point to the last 16 bytes }
  movdqu (%rdx,%rcx), %xmm0
  movdqu (%rcx), %xmm1
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVec0Differs
  xor %eax, %eax
  ret
.LAligned8xLoop_VecDiffers:
  bsf %eax, %eax
  add %rax, %rcx
  sub %r9, %rcx
  and $-2, %rcx
  add %r9, %rcx
  movzwl (%rdx,%rcx), %edx
  movzwl (%rcx), %eax
  sub %rdx, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREWORD}

{$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_COMPAREDWORD}
function CompareDWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
  mov %rdx, %r8
  mov %rsi, %rdx
  mov %rdi, %rcx
{$endif win64}
  sub %rcx, %rdx { rdx = buf2 - buf1 }
  cmp $4, %r8
  jle .LDwordwise_Prepare
  mov %r8, %rax
  shr $61, %rax
  jnz .LDwordwise_Prepare
  movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
  movdqu (%rcx), %xmm1
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVec0Differs
  shl $2, %r8 { convert to bytes }
  sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  jle .LLastVec
  mov %rcx, %r9 { save original buf1 to recover the dword position if a byte mismatch is found (the aligned loop works in bytes to support misaligned buf1). }
  add %rcx, %r8
  and $-16, %rcx { align buf1; +16 is performed by the loop. }
  sub %rcx, %r8
.balign 16
.LAligned4xLoop_Body:
  add $16, %rcx
  movdqu (%rdx,%rcx), %xmm0
  pcmpeqb (%rcx), %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LAligned4xLoop_VecDiffers
  sub $16, %r8
  ja .LAligned4xLoop_Body
.LLastVec:
  lea 16(%rcx,%r8), %rcx { point to the last 16 bytes }
  movdqu (%rdx,%rcx), %xmm0
  movdqu (%rcx), %xmm1
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %eax
  inc %ax
  jnz .LVec0Differs
  xor %eax, %eax
  ret
.LVec0Differs:
  bsf %eax, %eax
  add %rcx, %rdx { recover rdx = buf2 }
  mov (%rdx,%rax), %edx
  cmp %edx, (%rcx,%rax)
  sbb %rax, %rax
  or $1, %rax
  ret
.LAligned4xLoop_VecDiffers:
  bsf %eax, %eax
  add %rax, %rcx
  sub %r9, %rcx
  and $-4, %rcx
  add %r9, %rcx
  mov (%rdx,%rcx), %edx
  cmp %edx, (%rcx)
.LDoSbb:
  sbb %rax, %rax
  or $1, %rax
  ret
.balign 16
.LDwordwise_Body:
  mov (%rdx,%rcx), %eax
  cmp %eax, (%rcx)
  jne .LDoSbb
  add $4, %rcx
.LDwordwise_Prepare:
  sub $1, %r8
  jae .LDwordwise_Body
  xor %eax, %eax
end;
{$endif FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ does a thread-safe inc/dec }
function declocked(var l : longint) : boolean;assembler; nostackframe;
asm
  { this check should be done because a LOCK takes a lot }
  { of time! }
{$ifdef FPC_PIC}
  movq IsMultithread@GOTPCREL(%rip),%rax
  cmpl $0,(%rax)
{$else FPC_PIC}
  cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
  jz .Ldeclockedskiplock
  .byte 0xF0 // LOCK prefix.
.Ldeclockedskiplock:
  decl {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
  setzb %al
end;

{$define FPC_SYSTEM_HAS_DECLOCKED_INT64}
function declocked(var l : int64) : boolean;assembler; nostackframe;
asm
  { this check should be done because a LOCK takes a lot }
  { of time! }
{$ifdef FPC_PIC}
  movq IsMultithread@GOTPCREL(%rip),%rax
  cmpl $0,(%rax)
{$else FPC_PIC}
  cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
  jz .Ldeclockedskiplock
  .byte 0xF0 // LOCK prefix.
.Ldeclockedskiplock:
  decq {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
  setzb %al
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure inclocked(var l : longint);assembler; nostackframe;
asm
  { this check should be done because a LOCK takes a lot }
  { of time! }
{$ifdef FPC_PIC}
  movq IsMultithread@GOTPCREL(%rip),%rax
  cmpl $0,(%rax)
{$else FPC_PIC}
  cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
  jz .Linclockedskiplock
  .byte 0xF0 // LOCK prefix.
.Linclockedskiplock:
  incl {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_INT64}
procedure inclocked(var l : int64);assembler; nostackframe;
asm
  { this check should be done because a LOCK takes a lot }
  { of time! }
{$ifdef FPC_PIC}
  movq IsMultithread@GOTPCREL(%rip),%rax
  cmpl $0,(%rax)
{$else FPC_PIC}
  cmpl $0,IsMultithread(%rip)
{$endif FPC_PIC}
  jz .Linclockedskiplock
  .byte 0xF0 // LOCK prefix.
.Linclockedskiplock:
  incq {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;

function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
  movl $-1,%eax
  lock
  xaddl %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
  decl %eax
end;

function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
  movl $1,%eax
  lock
  xaddl %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
  incl %eax
end;

function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$ifdef win64}
  xchgl (%rcx),%edx
  movl %edx,%eax
{$else win64}
  xchgl (%rdi),%esi
  movl %esi,%eax
{$endif win64}
end;

function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$ifdef win64}
  lock
  xaddl %edx, (%rcx)
  movl %edx,%eax
{$else win64}
  lock
  xaddl %esi, (%rdi)
  movl %esi,%eax
{$endif win64}
end;

function InterLockedCompareExchange(var Target: longint; NewValue, Comperand : longint): longint; assembler; nostackframe;
asm
{$ifdef win64}
  movl %r8d,%eax
  lock
  cmpxchgl %edx,(%rcx)
{$else win64}
  movl %edx,%eax
  lock
  cmpxchgl %esi,(%rdi)
{$endif win64}
end;

function InterLockedDecrement64 (var Target: int64) : int64; assembler; nostackframe;
asm
  movq $-1,%rax
  lock
  xaddq %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
  decq %rax
end;

function InterLockedIncrement64 (var Target: int64) : int64; assembler; nostackframe;
asm
  movq $1,%rax
  lock
  xaddq %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
  incq %rax
end;

function InterLockedExchange64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
asm
{$ifdef win64}
  xchgq (%rcx),%rdx
  movq %rdx,%rax
{$else win64}
  xchgq (%rdi),%rsi
  movq %rsi,%rax
{$endif win64}
end;

function InterLockedExchangeAdd64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
asm
{$ifdef win64}
  lock
  xaddq %rdx, (%rcx)
  movq %rdx,%rax
{$else win64}
  lock
  xaddq %rsi, (%rdi)
  movq %rsi,%rax
{$endif win64}
end;

function InterLockedCompareExchange64(var Target: int64; NewValue, Comperand : int64): int64; assembler; nostackframe;
asm
{$ifdef win64}
  movq %r8,%rax
  lock
  cmpxchgq %rdx,(%rcx)
{$else win64}
  movq %rdx,%rax
  lock
  cmpxchgq %rsi,(%rdi)
{$endif win64}
end;
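
{ Usage sketch (illustrative only): the interlocked primitives above are the
  x86-64 backends for the portable system unit routines. A minimal spinlock
  built on top of them, assuming a longint flag initialised to 0:

  procedure AcquireLock(var flag: longint);
  begin
    // CAS: set flag to 1 only if it is still 0; the return value is the
    // value seen in memory, so 0 means we acquired the lock.
    while InterLockedCompareExchange(flag, 1, 0) <> 0 do
      ThreadSwitch; // yield while another thread holds the lock
  end;

  procedure ReleaseLock(var flag: longint);
  begin
    InterLockedExchange(flag, 0); // atomic store back to 0
  end;
}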
{****************************************************************************
FPU
****************************************************************************}

const
  { Internal constants for use in system unit }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;

  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20;
  MM_ExceptionMask = $3f;

  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm = %0000000100000000;
  MM_MaskDivZero = %0000001000000000;
  MM_MaskOverflow = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;

{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
var
  _eax,cpuid7_ebx,cpuid1_ecx : dword;
begin
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  asm
    xorl %eax,%eax
    cpuid
    movl %eax,_eax
  end;
  if _eax>=7 then
    begin
      asm
        movl $1,%eax
        xorl %ecx,%ecx
        cpuid
        movl %ecx,cpuid1_ecx
        movl $7,%eax
        xorl %ecx,%ecx
        cpuid
        movl %ebx,cpuid7_ebx
      end;
{$ifdef use_fast_repmovstos}
      fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
{$endif}
      { XGETBV support? }
      if (cpuid1_ecx and $8000000)<>0 then
        begin
          asm
            xorl %ecx,%ecx
            .byte 0x0f,0x01,0xd0 { xgetbv }
            movl %eax,_eax
          end;
          if (_eax and 6)=6 then
            begin
              has_avx_support:=(cpuid1_ecx and $10000000)<>0;
              has_avx2_support:=(cpuid7_ebx and $20)<>0;
            end;
        end;
    end;
end;
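
{ For reference, the feature bits tested above: CPUID(1).ecx bit 27
  ($8000000) = OSXSAVE (XGETBV usable) and bit 28 ($10000000) = AVX;
  XGETBV(0) having bits 1 and 2 set means the OS saves/restores both SSE
  and AVX state; CPUID(7).ebx bit 5 ($20) = AVX2 and bit 9 = ERMS (fast
  REP MOVSB/STOSB), which gates fast_large_repmovstosb. }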
{$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
begin
end;

{$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;
var
  { these locals are here so we don't have to hack PIC code in the assembler }
  localmxcsr: dword;
  localfpucw: word;
begin
  localfpucw:=Default8087CW;
  localmxcsr:=DefaultMXCSR;
  asm
    fninit
    fwait
    fldcw localfpucw
    ldmxcsr localmxcsr
  end;
end;

{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}
procedure ReadBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
  lfence
end;

procedure ReadDependencyBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
  { reads imply a barrier on earlier reads they depend on }
end;

procedure ReadWriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
  mfence
end;

procedure WriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
  sfence
end;
{$endif}

{****************************************************************************
Math Routines
****************************************************************************}

{$define FPC_SYSTEM_HAS_SWAPENDIAN}
{ SwapEndian(<16 Bit>) being inlined is faster than using assembler }
function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  { the extra Word type cast is necessary because the "AValue shr 8" }
  { is turned into "longint(AValue) shr 8", so if AValue < 0 then }
  { the sign bits from the upper 16 bits are shifted in rather than }
  { zeroes. }
  Result := SmallInt(((Word(AValue) shr 8) or (Word(AValue) shl 8)) and $ffff);
end;

function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  Result := ((AValue shr 8) or (AValue shl 8)) and $ffff;
end;
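
{ For example, SwapEndian(Word($1234)) = $3412: 'shr 8' moves the high byte
  down, 'shl 8' moves the low byte up, and the final 'and $ffff' discards
  anything shifted above bit 15. }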
function SwapEndian(const AValue: LongInt): LongInt; assembler; nostackframe;
asm
{$ifdef win64}
  movl %ecx, %eax
{$else win64}
  movl %edi, %eax
{$endif win64}
  bswap %eax
end;

function SwapEndian(const AValue: DWord): DWord; assembler; nostackframe;
asm
{$ifdef win64}
  movl %ecx, %eax
{$else win64}
  movl %edi, %eax
{$endif win64}
  bswap %eax
end;

function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
asm
{$ifdef win64}
  movq %rcx, %rax
{$else win64}
  movq %rdi, %rax
{$endif win64}
  bswap %rax
end;

function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
asm
{$ifdef win64}
  movq %rcx, %rax
{$else win64}
  movq %rdi, %rax
{$endif win64}
  bswap %rax
end;

{$ifndef win64}
{$define FPC_SYSTEM_HAS_U128_DIV_U64_TO_U64}
function u128_div_u64_to_u64( const xh, xl: qword; const y: qword; out quotient, remainder: qword ): boolean;nostackframe;assembler;
{
SysV:
  xh: RDI
  xl: RSI
  y: RDX
  quotient: RCX
  remainder: R8
}
label
  dodiv;
asm
  cmpq %rdi,%rdx
  ja dodiv
  xorl %eax,%eax
  ret
dodiv:
  movq %rdx,%r9
  movq %rsi,%rax
  movq %rdi,%rdx
  divq %r9
  movq %rax,(%rcx)
  movq %rdx,(%r8)
  movl $1,%eax
end;
{$endif win64}
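
{ Usage sketch (illustrative only): u128_div_u64_to_u64 divides the 128-bit
  value xh:xl by y with a single DIVQ. It returns false when y <= xh, i.e.
  when the quotient would not fit in 64 bits and the division would fault.
  Assuming it were reachable from user code:

  var
    q, r : qword;
  begin
    if u128_div_u64_to_u64(1, 0, 3, q, r) then  // xh:xl = 2^64
      writeln(q, ' ', r); // prints 6148914691236517205 1, since 2^64 = 3 * q + 1
  end;
}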