{
    This file is part of the Free Pascal run time library.
    Copyright (c) 2002 by Florian Klaempfl and Sergei Gorelkin
    Members of the Free Pascal development team

    Processor dependent implementation for the system unit for
    the x86-64 architecture

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$asmmode GAS}

{****************************************************************************
                               Primitives
****************************************************************************}

{$ifndef win64}
{$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
{$endif}

{$ifdef use_fast_repmovstos}
var
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
{$endif}
var
  has_sse41_support,fpc_cpuinit_performed : boolean;

{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;
asm
    movq   %rsp,%rax
end;

{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;
asm
    movq   %rbp,%rax
end;
{$ENDIF not INTERNAL_BACKTRACE}

{$define FPC_SYSTEM_HAS_GET_PC_ADDR}
function get_pc_addr:pointer;assembler;nostackframe;
asm
    movq   (%rsp),%rax
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
begin
  get_caller_addr:=framebp;
  if assigned(framebp) then
    get_caller_addr:=PPointer(framebp)[1];
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
begin
  get_caller_frame:=framebp;
  if assigned(framebp) then
    get_caller_frame:=PPointer(framebp)^;
end;

// The following assembler procedures are disabled for FreeBSD due to
// multiple issues with its old GNU assembler (Mantis #19188).
// Even after fixing them, it can be enabled only for the trunk version,
// otherwise bootstrapping won't be possible.
// Modified to use oldbinutils as in cpu.pp source, to allow easier use for other targets.
{$ifdef freebsd}
{$ifndef overridebinutils}
{$define oldbinutils}
{$endif}
{$endif freebsd}

{$ifndef oldbinutils}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{ Linux: rdi source, rsi dest, rdx count
  win64: rcx source, rdx dest, r8 count }
const
  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
  PrefetchDistance = 512;
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    cmp    $3, %r8
    jle    .L3OrLess
    cmp    $8, %r8
    jle    .L4to8
    cmp    $16, %r8
    jle    .L9to16
    movups (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
    movups -16(%rcx,%r8), %xmm5
    cmp    $32, %r8
    jg     .L33OrMore
    movups %xmm4, (%rdx) { 17–32 bytes }
    movups %xmm5, -16(%rdx,%r8)
    ret
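    { Example: for count = 20, xmm4 holds bytes 0..15 and xmm5 holds bytes
      4..19; the two overlapping stores cover all 20 bytes, so any
      17..32-byte move needs no loop. }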
    .balign 16
.L3OrLess:
    cmp    $1, %r8
    jl     .LZero
    movzbl (%rcx), %eax
    je     .LOne
    movzwl -2(%rcx,%r8), %r9d
    mov    %r9w, -2(%rdx,%r8)
.LOne:
    mov    %al, (%rdx)
.LZero:
    ret
.L4to8:
    mov    (%rcx), %eax
    mov    -4(%rcx,%r8), %r9d
    mov    %eax, (%rdx)
    mov    %r9d, -4(%rdx,%r8)
    ret
.L9to16:
    mov    (%rcx), %rax
    mov    -8(%rcx,%r8), %r9
    mov    %rax, (%rdx)
    mov    %r9, -8(%rdx,%r8)
.Lquit:
    ret
    .byte  102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.L33OrMore:
    movups -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
                                { but -32(%rcx,%r8) is about to become not accessible so easily, .Lback is rare, and small .Lback is even rarer / matters even less. }
    sub    %rdx, %rcx { rcx = src - dest }
    jz     .Lquit { exit if src=dest }
    mov    %rcx, %rax
    neg    %rax
    cmp    %rax, %r8
    ja     .Lback { count (r8) > unsigned(dest - src) (rax) if regions overlap }
    mov    %rdx, %r9 { remember original dest to write first 16 bytes }
    add    %rdx, %r8 { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
    add    $16, %rdx
    and    $-16, %rdx
    sub    %rdx, %r8
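    { Example: dest = $1007, count = 100: rdx becomes $1010 and r8 = 91;
      the 9 bytes skipped by alignment are covered by the final unaligned
      store of xmm4 to the original dest. }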
.LRestAfterNTf:
    sub    $32, %r8 { During the N× loop, r8 is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
    jbe    .LPost32f
    cmp    $NtThreshold-32, %r8
    jae    .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
    .balign 16 { no-op }
.Lloop32f:
    movups (%rcx,%rdx), %xmm0
    movaps %xmm0, (%rdx)
    movups 16(%rcx,%rdx), %xmm0
    movaps %xmm0, 16(%rdx)
    add    $32, %rdx
    sub    $32, %r8
    ja     .Lloop32f
.LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
    movups %xmm3, (%rdx, %r8)
    movups %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
    movups %xmm4, (%r9) { Important for <16-byte step between src and dest. }
    ret
    .balign 16
.Lntf:
    cmp    $NtThreshold, %rcx { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
    jb     .Lloop32f { (this check is performed here to not stand in the way of smaller counts) }
    sub    $PrefetchDistance+32, %r8 { r8 = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
    .balign 16 { no-op }
.Lntloop64f:
    prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
    movups (%rcx,%rdx,1), %xmm0
    movntps %xmm0, (%rdx)
    movups 16(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 16(%rdx)
    movups 32(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 32(%rdx)
    movups 48(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 48(%rdx)
    add    $64, %rdx
    sub    $64, %r8
    jae    .Lntloop64f
    sfence
    add    $PrefetchDistance+64, %r8
    jmpq   .LRestAfterNTf { go handle remaining bytes }
    .byte  102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
{ backwards move }
.Lback:
    movups 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
    lea    (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
    lea    -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
    and    $-16, %r8
    sub    %rdx, %r8
    add    %r8, %rdx
.LRestAfterNTb:
    sub    $32, %r8
    jbe    .LPost32b
    cmp    $NtThreshold-32, %r8
    jae    .Lntb
    .balign 16 { no-op }
.Lloop32b:
    sub    $32, %rdx
    movups 16(%rcx,%rdx), %xmm0
    movaps %xmm0, 16(%rdx)
    movups (%rcx,%rdx), %xmm0
    movaps %xmm0, (%rdx)
    sub    $32, %r8
    ja     .Lloop32b
.LPost32b:
    sub    %r8, %rdx
    movups %xmm3, -16(%rdx)
    movups %xmm4, -32(%rdx)
    movups %xmm5, -16(%r9)
    ret
    .balign 16
.Lntb:
    cmp    $-NtThreshold,%rcx
    jnb    .Lloop32b
    sub    $PrefetchDistance+32, %r8
    .balign 16 { no-op }
.Lntloop64b:
    prefetchnta -PrefetchDistance(%rcx,%rdx,1)
    sub    $64, %rdx
    movups 48(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 48(%rdx)
    movups 32(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 32(%rdx)
    movups 16(%rcx,%rdx,1), %xmm0
    movntps %xmm0, 16(%rdx)
    movups (%rcx,%rdx,1), %xmm0
    movntps %xmm0, (%rdx)
    sub    $64, %r8
    jae    .Lntloop64b
    sfence
    add    $PrefetchDistance+64, %r8
    jmpq   .LRestAfterNTb
end;
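{ Reminder of Move's contract: overlapping ranges are handled, e.g.
    Move(a[1], a[0], 9);   copies downwards via the forward branch,
    Move(a[0], a[1], 9);   overlaps upwards and takes the .Lback branch,
  so Move behaves like C's memmove, not memcpy. }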
{$endif FPC_SYSTEM_HAS_MOVE}

{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
{ Input:
  rcx = 'x'
  rdx = byte count
  xmm0 = pattern for ALIGNED writes
  First and last 16 bytes are written. }
const
{$ifdef use_fast_repmovstos}
  ErmsThreshold = 1536;
{$endif}
  NtThreshold = 4 * 1024 * 1024;
asm
    { x can start and end misaligned on the vector boundary:

      x = ~~][H1][H2][...][T2][T1]~
          [UH]                [UT]

      UH (“unaligned head”) potentially overlaps with H1 and is already written with 'movdqu' by the caller.
      At least 1 of its bytes is exclusive to it, i.e. if x is already aligned, H1 starts at byte 16.
      H1 and so on are called “aligned heads” or just “heads”.
      T1 and so on are called “aligned tails” or just “tails”.
      UT (“unaligned tail”) is written by the caller as well.
      At least 1 of its bytes is exclusive to it as well, that’s why 65 is subtracted below instead of 64. }
    lea    -65(%rcx,%rdx), %rax
    and    $-16, %rax { rax = “T4” (possibly fictive). }
    mov    %rax, %rdx { Remember T4 to rdx. }
    and    $-16, %rcx { rcx = H1 − 16. }
    sub    %rcx, %rax { rax = aligned byte count − 48. }
    movdqa %xmm0, 16(%rcx) { Write H1. }
    cmp    $32-48, %rax
    jle    .LOneAlignedTailWrite
    movdqa %xmm0, 32(%rcx) { Write H2. }
    cmp    $64-48, %rax
    jle    .LTwoAlignedTailWrites
    sub    $48, %rax { rax = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
    jle    .LFourAlignedTailWrites
    add    $48, %rcx { rcx = H3. }
{$ifdef use_fast_repmovstos}
    cmp    $ErmsThreshold-64, %rax { Need to write aligned byte count − 32 bytes already written. rax = aligned byte count − 96, so compare rax + 64 to ErmsThreshold, or rax to ErmsThreshold − 64. }
    jae    .LRepStos
{$else}
    cmp    $NtThreshold, %rax
    jae    .L64xNT_Body
{$endif}
    .balign 16
.L64x_Body:
    movdqa %xmm0, (%rcx)
    movdqa %xmm0, 16(%rcx)
    movdqa %xmm0, 32(%rcx)
    movdqa %xmm0, 48(%rcx)
    add    $64, %rcx
    sub    $64, %rax
    ja     .L64x_Body
.LFourAlignedTailWrites:
    movdqa %xmm0, (%rdx) { T4 }
    movdqa %xmm0, 16(%rdx) { T3 }
.LTwoAlignedTailWrites:
    movdqa %xmm0, 32(%rdx) { T2 }
.LOneAlignedTailWrite:
    movdqa %xmm0, 48(%rdx) { T1 }
    ret
{$ifdef use_fast_repmovstos}
.LRepStos:
{$ifdef FPC_PIC}
    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r8
    cmpb   $1, (%r8)
{$else FPC_PIC}
    cmpb   $1, fast_large_repmovstosb(%rip)
{$endif FPC_PIC}
    jne    .LRepStosIsNotBetter
{$ifdef win64}
    push   %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
{$endif}
    mov    %rcx, %rdi { rdi = REP STOS destination. }
    lea    64(%rax), %rcx
    shr    $3, %rcx { rcx = count of REP STOSQ blocks up to T1 (might be 1 more than strictly required if T1 and UT overlap is 8 or more, don’t care). }
    movq   %xmm0, %rax { recover pattern for aligned writes back to GPR :) }
    rep stosq
{$ifdef win64}
    pop    %rdi
{$endif}
    ret
.LRepStosIsNotBetter:
    cmp    $NtThreshold-64, %rax
    jb     .L64x_Body
{$endif use_fast_repmovstos}
    .balign 16
.L64xNT_Body:
    movntdq %xmm0, (%rcx)
    movntdq %xmm0, 16(%rcx)
    movntdq %xmm0, 32(%rcx)
    movntdq %xmm0, 48(%rcx)
    add    $64, %rcx
    sub    $64, %rax
    ja     .L64xNT_Body
    sfence
    jmp    .LFourAlignedTailWrites
end;
{$endif FPC_SYSTEM_HAS_FILLxxxx}

{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
{ win64: rcx dest, rdx count, r8b value
  linux: rdi dest, rsi count, rdx value }
    movzbl {$ifdef win64} %r8b {$else} %dl {$endif}, %eax
    imul   $0x01010101, %eax
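    { Broadcasts the byte: e.g. value = $AB gives eax = $ABABABAB. }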
{$ifndef win64}
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    cmp    $3, %rdx
    jle    .L3OrLess
    cmp    $16, %rdx
    jl     .L4to15
    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx)
    cmp    $32, %rdx
    jg     FillXxxx_MoreThanTwoXmms
    ret
.L4to15:
    mov    %eax, (%rcx)
    cmp    $8, %edx
    jle    .LLast4
    mov    %eax, 4(%rcx)
    mov    %eax, -8(%rcx,%rdx)
.LLast4:
    mov    %eax, -4(%rcx,%rdx)
    ret
.L3OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %al, (%rcx)
    mov    %al, -1(%rcx,%rdx)
    shr    $1, %edx
    mov    %al, (%rcx,%rdx)
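    { For count 1..3, writing offsets 0, count-1 and count div 2 covers every
      byte: count=1 -> 0,0,0; count=2 -> 0,1,1; count=3 -> 0,2,1. }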
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}

{$ifndef FPC_SYSTEM_HAS_FILLWORD}
{$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
{$ifdef win64}
    movzwl %r8w, %eax
    shl    $16, %r8d
    or     %r8d, %eax
{$else}
    movzwl %dx, %eax
    shl    $16, %edx
    or     %edx, %eax
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif}
    cmp    $3, %rdx
    jle    .L3OrLess
    cmp    $8, %rdx
    jle    .L4to8
    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx,2)
    cmp    $16, %rdx
    jg     .LMoreThanTwoXMMs
    ret
.LMoreThanTwoXMMs:
    shl    $1, %rdx { rdx = byte count }
    mov    %rcx, %r8
    shl    $3, %ecx
    rol    %cl, %eax { misalign the pattern by the misalignment of x }
    mov    %r8, %rcx
    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
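    { Rotating by 8*(x mod 4) bits re-phases the pattern, so the 16-byte
      aligned stores lay down the same byte sequence in memory that stores
      starting exactly at x would have produced. }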
    jmp    FillXxxx_MoreThanTwoXmms
.L4to8:
    mov    %eax, %r8d
    shl    $32, %r8
    or     %r8, %rax
    mov    %rax, (%rcx)
    mov    %rax, -8(%rcx,%rdx,2)
    ret
.L3OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %ax, (%rcx)
    mov    %ax, -2(%rcx,%rdx,2)
    shr    $1, %edx
    mov    %ax, (%rcx,%rdx,2)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}

{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
{$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
asm
{$ifdef win64}
    mov    %r8d, %eax
{$else}
    mov    %edx, %eax
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    cmp    $3, %rdx
    jle    .L3OrLess
    cmp    $8, %rdx
    jle    .L4to8
    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx,4)
    shl    $2, %rdx { rdx = byte count }
    mov    %rcx, %r8
    shl    $3, %ecx
    rol    %cl, %eax { misalign the pattern by the misalignment of x }
    mov    %r8, %rcx
    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
    jmp    FillXxxx_MoreThanTwoXmms
.L4to8:
{$ifndef win64} { on win64, eax = r8d already. }
    mov    %eax, %r8d
{$endif}
    shl    $32, %r8
    or     %r8, %rax
    mov    %rax, (%rcx)
    mov    %rax, 8(%rcx)
    mov    %rax, -16(%rcx,%rdx,4)
    mov    %rax, -8(%rcx,%rdx,4)
    ret
.L3OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %eax, (%rcx)
    mov    %eax, -4(%rcx,%rdx,4)
    shr    $1, %edx
    mov    %eax, (%rcx,%rdx,4)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}

{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
{$define FPC_SYSTEM_HAS_FILLQWORD}
procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
asm
{$ifdef win64}
    mov    %r8, %rax
{$else}
    mov    %rdx, %rax
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    cmp    $2, %rdx
    jle    .L2OrLess
    cmp    $6, %rdx
    jle    .L3to6
    movq   %rax, %xmm0
    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    movdqu %xmm0, -16(%rcx,%rdx,8)
    shl    $3, %rdx { rdx = byte count }
    mov    %rcx, %r8
    shl    $3, %ecx
    rol    %cl, %rax { misalign the pattern by the misalignment of x }
    mov    %r8, %rcx
    movq   %rax, %xmm0
    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern for aligned writes }
    jmp    FillXxxx_MoreThanTwoXmms
.L3to6:
    mov    %rax, (%rcx)
    mov    %rax, 8(%rcx)
    mov    %rax, 16(%rcx)
    mov    %rax, -24(%rcx,%rdx,8)
    mov    %rax, -16(%rcx,%rdx,8)
    mov    %rax, -8(%rcx,%rdx,8)
    ret
.L2OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %rax, (%rcx)
    mov    %rax, -8(%rcx,%rdx,8)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLQWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8b byte
  linux: rdi buf, rsi len, rdx byte }
asm
    test   len, len
    jz     .Lnotfound { exit if len=0 }
    movd   {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
{$ifdef win64}
    mov    %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
    add    $16, %rcx
{$else}
    lea    16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
{$endif}
    punpcklbw %xmm1, %xmm1
    and    $-0x10, %rcx { first aligned address after buf }
    punpcklbw %xmm1, %xmm1
    pshufd $0, %xmm1, %xmm1
    movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
    sub    {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
    pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
    pmovmskb %xmm0, %eax
    shl    %cl, %eax { shift valid bits into high word }
    and    $0xffff0000, %eax { clear low word containing invalid bits }
    shr    %cl, %eax { shift back }
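    { Example: buf 4 bytes below the alignment boundary gives cl=4; only mask
      bits 12..15 (the 4 valid trailing bytes of the vector) survive the
      shl+and+shr sequence. }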
    jz     .Lcontinue
.Lmatch:
    bsf    %eax, %eax
    lea    -16(%rcx,%rax), %rax
    cmp    %rax, len { check against the buffer length }
    jbe    .Lnotfound
    ret
    .balign 16
.Lloop:
    movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
    add    $16, %rcx { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    test   %eax, %eax
    jnz    .Lmatch
.Lcontinue:
    cmp    %rcx, len
    ja     .Lloop
.Lnotfound:
    or     $-1, %rax
end;
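{ Note: the length comparisons above are unsigned, so a negative len acts as
  an unbounded search that only stops on a match (strlen-style callers rely
  on this). }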
{$endif FPC_SYSTEM_HAS_INDEXBYTE}

{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8w word
  linux: rdi buf, rsi len, rdx word }
asm
    test   len, len
    jz     .Lnotfound { exit if len=0 }
    movd   {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
{$ifdef win64}
    mov    %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
    add    $16, %rcx
{$else}
    lea    16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
{$endif}
    punpcklwd %xmm1, %xmm1
    and    $-0x10, %rcx
    pshufd $0, %xmm1, %xmm1
    movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
    sub    {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes }
    test   $1, {$ifdef win64} %r8b {$else} %dil {$endif} { if buffer isn't aligned to word boundary, }
    jnz    .Lunaligned { use a different algorithm }
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl    %cl, %eax
    and    $0xffff0000, %eax
    shr    %cl, %eax
    shr    $1, %ecx { bytes->words }
    test   %eax, %eax
    jz     .Lcontinue
.Lmatch:
    bsf    %eax, %eax
    shr    $1, %eax { in words }
    lea    -8(%rcx,%rax), %rax
    cmp    %rax, len
    jbe    .Lnotfound { if match is after the specified length, ignore it }
    retq
    .balign 16
.Lloop:
    movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx,2), %xmm0
    add    $8, %rcx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    test   %eax, %eax
    jnz    .Lmatch
.Lcontinue:
    cmp    %rcx, len
    ja     .Lloop
.Lnotfound:
    or     $-1, %rax
    retq
.Lunaligned:
    movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
    psllw  $8, %xmm1    { swap bytes of each word of pattern) }
    psrlw  $8, %xmm2
    por    %xmm2, %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl    %cl, %eax
    and    $0xffff0000, %eax
    shr    %cl, %eax
    add    len, len { length words -> bytes }
    xor    %r10d, %r10d { nothing to merge yet }
    jmp    .Lcontinue_u
    .balign 16
.Lloop_u:
    movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0
    add    $16, %rcx
    pcmpeqb %xmm1, %xmm0 { compare by bytes }
    shr    $16, %r10d { bit 16 shifts into 0 }
    pmovmskb %xmm0, %eax
.Lcontinue_u:
    shl    $1, %eax { 15:0 -> 16:1 }
    or     %r10d, %eax { merge bit 0 from previous round }
    mov    %eax, %r10d
    shr    $1, %eax { now AND together adjacent pairs of bits }
    and    %r10d, %eax
    and    $0x5555, %eax { also reset odd bits }
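    { Two adjacent set bits mean both bytes of a word matched; the shift+AND
      keeps only such pairs, including pairs straddling two 16-byte blocks
      thanks to the bit carried over in r10d. }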
    jnz    .Lmatch_u
    cmpq   %rcx, len
    ja     .Lloop_u
.Lnotfound_u:
    or     $-1, %rax
    retq
.Lmatch_u:
    bsf    %eax, %eax
    lea    -16(%rcx,%rax), %rax
    cmp    %rax, len
    jbe    .Lnotfound_u { if match is after the specified length, ignore it }
    sar    $1, %rax { in words }
end;
{$endif FPC_SYSTEM_HAS_INDEXWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
function IndexDWord(Const buf;len:SizeInt;b:dword):SizeInt; assembler; nostackframe;
asm
{$ifdef win64}
    mov    %rcx, %rax
{$else}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rax
{$endif}
    cmp    $4, %rdx
    jle    .LDwordwise_Prepare
    sub    $4, %rdx
    movd   %r8d, %xmm1
    pshufd $0, %xmm1, %xmm1
    .balign 16
.L4x_Body:
    movdqu (%rax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %r8d
    test   %r8d, %r8d
    jnz    .LFoundAtMask
    add    $16, %rax
    sub    $4, %rdx
    jg     .L4x_Body
    lea    (%rax,%rdx,4), %rax
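    { rdx is now in (-4,0]: step back so the final load covers exactly the
      last 4 dwords, re-testing up to 3 already-checked ones. }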
    movdqu (%rax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %r8d
    test   %r8d, %r8d
    jnz    .LFoundAtMask
    or     $-1, %rax
    ret
    .balign 16 { no-op }
.LDwordwise_Body:
    cmp    (%rax), %r8d
    je     .LFoundAtRax
    add    $4, %rax
.LDwordwise_Prepare:
    sub    $1, %rdx
    jae    .LDwordwise_Body
    or     $-1, %rax
    ret
.LFoundAtMask:
    bsf    %r8d, %r8d
    add    %r8, %rax
.LFoundAtRax:
    sub    {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
    shr    $2, %rax
end;
{$endif FPC_SYSTEM_HAS_INDEXDWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ win64: rcx=buf, rdx=len, r8=b
  else: rdi=buf, rsi=len, rdx=b }
asm
    mov    buf, %rax
    sub    $8, %rax
    .balign 16
.LQwordwise_Next:
    add    $8, %rax
    sub    $1, len
    jb     .LNothing
    cmpq   b, (%rax)
    jne    .LQwordwise_Next
    sub    buf, %rax
    shr    $3, %rax
    ret
.LNothing:
    mov    $-1, %rax
end;

function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ win64: rcx=buf, rdx=len, r8=b
  else: rdi=buf, rsi=len, rdx=b }
asm
    cmp    $6, len
    jle    IndexQWord_Plain
    mov    buf, %rax
    movq   {$ifdef win64} %r8 {$else} %rdx {$endif}, %xmm0
    punpcklqdq %xmm0, %xmm0 { xmm0 = pattern of 'b's. }
    sub    $6, len
    .balign 16
.L6x_Loop:
    movdqu (%rax), %xmm1
    pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
    movdqu 16(%rax), %xmm2
    pcmpeqq %xmm0, %xmm2
    por    %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
    movdqu 32(%rax), %xmm3
    pcmpeqq %xmm0, %xmm3
    por    %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
    ptest  %xmm3, %xmm3
    jnz    .LFound
    add    $48, %rax
    sub    $6, len
    jge    .L6x_Loop
    lea    (%rax,{$ifdef win64} %rdx {$else} %rsi {$endif},8), %rax { Point to last 3 vectors. }
    cmp    $-5, len
    jge    .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
    mov    $-1, %rax
    ret
.LFound:
    sub    buf, %rax
    ptest  %xmm1, %xmm1
    jnz    .LFoundAtXmm1
    ptest  %xmm2, %xmm2
    jnz    .LFoundAtXmm2
    add    $16, %rax
    movdqa %xmm3, %xmm2
.LFoundAtXmm2:
    add    $16, %rax
    movdqa %xmm2, %xmm1
.LFoundAtXmm1:
    pmovmskb %xmm1, %ecx
    bsf    %ecx, %ecx
    add    %rcx, %rax
    shr    $3, %rax
end;

{$ifndef CPUX86_HAS_SSE4_1}
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;

var
  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;

function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  if not fpc_cpuinit_performed then
    exit(IndexQWord_Plain(buf,len,b));
  if has_sse41_support then
    IndexQWord_Impl:=@IndexQWord_SSE41
  else
    IndexQWord_Impl:=@IndexQWord_Plain;
  result:=IndexQWord_Impl(buf,len,b);
end;
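{ IndexQWord_Impl starts out pointing at the dispatcher: the first call
  checks CPU support once, patches the pointer, and every later call goes
  straight to the selected implementation. }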
function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  result:=IndexQWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE4_1}
{$endif FPC_SYSTEM_HAS_INDEXQWORD}
{$endif oldbinutils}

{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx buf, r8 len
  linux: rdi buf, rsi buf, rdx len }
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    { rcx = buf1, rdx = buf2, r8 = len }
    cmp    $1, %r8
    jle    .L1OrLess
    cmp    $16, %r8
    jae    .LVecOrMore
    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but is faster. }
    mov    %ecx, %eax
    or     %edx, %eax
    and    $4095, %eax
    cmp    $4080, %eax
    ja     .LCantOverReadBoth
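    { A 16-byte load starting at offset p within a 4096-byte page stays inside
      the page iff (p and 4095) <= 4080; OR-ing both addresses tests the two
      buffers together, at the cost of false positives. }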
    { Over-read both as XMMs. }
    movdqu (%rcx), %xmm0
    movdqu (%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
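    { pmovmskb yields $FFFF when all 16 bytes are equal; inc %ax wraps that
      to 0, so ZF = "no difference". The same trick recurs below. }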
    jz     .LNothing
    bsf    %eax, %eax
    cmp    %r8d, %eax { Ignore garbage beyond 'len'. }
    jae    .LNothing
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret
    .balign 16
.LNothing:
    xor    %eax, %eax
    ret
.LAligned32xLoop_TwoVectorsDiffer:
    add    %rcx, %rdx { restore rdx = buf2 }
    pmovmskb %xmm0, %r8d { Is there a difference in the first vector? }
    inc    %r8w
    jz     .LVec1Differs { No difference in the first vector, xmm0 is all ones, eax = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    mov    %r8d, %eax
.LVec0Differs:
    bsf    %eax, %eax
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret
    .byte  0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LVecOrMore:
    { Compare first vectors. }
    movdqu (%rcx), %xmm0
    movdqu (%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs
    sub    $32, %r8
    jbe    .LLastVec
    { Compare second vectors. }
    movdqu 16(%rcx), %xmm0
    movdqu 16(%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec1Differs
    cmp    $32, %r8
    jbe    .LLastTwoVectors
    { More than four vectors: aligned loop. }
    lea    -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (possible because r8 was still len - 32 here). }
    sub    %rcx, %rdx { rdx = buf2 - buf1 }
    and    $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub    %rcx, %r8 { r8 = count to be handled with loop }
    .balign 16 { no-op }
.LAligned32xLoop_Body:
    add    $32, %rcx
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu 16(%rdx,%rcx), %xmm1
    pcmpeqb 16(%rcx), %xmm1
    pand   %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %eax
    inc    %ax
    jnz    .LAligned32xLoop_TwoVectorsDiffer
    sub    $32, %r8
    ja     .LAligned32xLoop_Body
    add    %rcx, %rdx { restore rdx = buf2 }
    add    $32, %r8
.LLastTwoVectors:
    movdqu (%rcx,%r8), %xmm0
    movdqu (%rdx,%r8), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVecEm2Differs
.LLastVec:
    movdqu 16(%rcx,%r8), %xmm0
    movdqu 16(%rdx,%r8), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVecEm1Differs
    xor    %eax, %eax
    ret
.LVec1Differs:
    xor    %r8d, %r8d
.LVecEm1Differs:
    add    $16, %r8
.LVecEm2Differs:
    bsf    %eax, %eax
    add    %r8, %rax
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret
.LCantOverReadBoth:
    cmp    $8, %r8d
    ja     .L9to15
    cmp    $3, %r8d
    jle    .L2to3
    mov    (%rcx), %eax
    mov    (%rdx), %r9d
    cmp    %r9d, %eax
    jne    .L4xOr8xDiffer
    mov    -4(%rcx,%r8), %eax
    mov    -4(%rdx,%r8), %r9d
    cmp    %r9d, %eax
    jne    .L4xOr8xDiffer
    xor    %eax, %eax
    ret
.L9to15:
    mov    (%rcx), %rax
    mov    (%rdx), %r9
    cmp    %r9, %rax
    jne    .L4xOr8xDiffer
    mov    -8(%rcx,%r8), %rax
    mov    -8(%rdx,%r8), %r9
    cmp    %r9, %rax
    jne    .L4xOr8xDiffer
    xor    %eax, %eax
    ret
.L4xOr8xDiffer:
    bswap  %r9
    bswap  %rax
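    { The big-endian views make the numeric comparison be decided by the
      first differing byte, as memcmp semantics require; sbb/or then maps
      the carry to -1 or +1. }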
    cmp    %r9, %rax
    sbb    %rax, %rax
    or     $1, %rax
    ret
.L2to3:
    movzwl (%rcx), %eax
    bswap  %eax
    shr    $1, %eax
    mov    -1(%rcx,%r8), %al
    movzwl (%rdx), %ecx
    bswap  %ecx
    shr    $1, %ecx
    mov    -1(%rdx,%r8), %cl
    sub    %rcx, %rax
    ret
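    { The .L2to3 path above packs bytes 0..1 (big-endian order, shifted right
      once so the 64-bit subtraction cannot overflow) plus byte count-1 of
      each buffer into one integer key; subtracting the keys orders 2- and
      3-byte buffers correctly. }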
.L1OrLess:
    jl     .LUnbounded_Prepare
    movzbl (%rcx), %eax
    movzbl (%rdx), %edx
    sub    %rdx, %rax
    ret
.LUnbounded_Prepare:
    sub    %rcx, %rdx { rdx = buf2 - buf1 }
    test   %r8, %r8
    jnz    .LUnbounded_Body
    xor    %eax, %eax
    ret
    .balign 16
.LUnbounded_Next:
    add    $1, %rcx
.LUnbounded_Body:
    movzbl (%rdx,%rcx), %eax
    cmp    %al, (%rcx)
    je     .LUnbounded_Next
    sbb    %rax, %rax
    or     $1, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}

{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
{$define FPC_SYSTEM_HAS_COMPAREWORD}
function CompareWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    sub    %rcx, %rdx { rdx = buf2 - buf1 }
    cmp    $1, %r8
    jle    .LWordwise_Prepare
    mov    %r8, %rax
    shr    $62, %rax
    jnz    .LWordwise_Prepare
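    { Counts that would not survive the later 'shl $1' (negative or >= 2^62)
      take the simple wordwise loop instead. }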
    cmp    $8, %r8
    jge    .LVecOrMore
    lea    (%rdx,%rcx), %eax
    or     %ecx, %eax
    and    $4095, %eax
    cmp    $4080, %eax
    ja     .LWordwise_Prepare
    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl    $1, %r8 { convert to bytes }
    inc    %ax
    jz     .LNothing
    bsf    %eax, %eax
    cmp    %r8d, %eax
    jb     .LSubtractWords
.LNothing:
    xor    %eax, %eax
    ret
    .balign 16
.LWordwise_Body:
    movzwl (%rdx,%rcx), %eax
    cmp    %ax, (%rcx)
    jne    .LDoSbb
    add    $2, %rcx
.LWordwise_Prepare:
    sub    $1, %r8
    jae    .LWordwise_Body
    xor    %eax, %eax
    ret
.LDoSbb:
    sbb    %rax, %rax
    or     $1, %rax
    ret
.LVec0Differs:
    bsf    %eax, %eax
.LSubtractWords:
    add    %rcx, %rdx { recover rdx = buf2 }
    movzwl (%rdx,%rax), %edx
    movzwl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret
.LVecOrMore:
    movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs
    shl    $1, %r8 { convert to bytes }
    sub    $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle    .LLastVec
    mov    %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add    %rcx, %r8
    and    $-16, %rcx { align buf1; +16 is performed by the loop. }
    sub    %rcx, %r8
    .balign 16
.LAligned8xLoop_Body:
    add    $16, %rcx
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LAligned8xLoop_VecDiffers
    sub    $16, %r8
    ja     .LAligned8xLoop_Body
.LLastVec:
    lea    16(%rcx,%r8), %rcx { point to the last 16 bytes }
    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs
    xor    %eax, %eax
    ret
.LAligned8xLoop_VecDiffers:
    bsf    %eax, %eax
    add    %rax, %rcx
    sub    %r9, %rcx
    and    $-2, %rcx
    add    %r9, %rcx
    movzwl (%rdx,%rcx), %edx
    movzwl (%rcx), %eax
    sub    %rdx, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREWORD}

{$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_COMPAREDWORD}
function CompareDWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    sub    %rcx, %rdx { rdx = buf2 - buf1 }
    cmp    $4, %r8
    jle    .LDwordwise_Prepare
    mov    %r8, %rax
    shr    $61, %rax
    jnz    .LDwordwise_Prepare
    movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
    movdqu (%rcx), %xmm1
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs
    shl    $2, %r8 { convert to bytes }
    sub    $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle    .LLastVec
    mov    %rcx, %r9 { save original buf1 to recover dword position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add    %rcx, %r8
    and    $-16, %rcx { align buf1; +16 is performed by the loop. }
    sub    %rcx, %r8
    .balign 16
.LAligned4xLoop_Body:
    add    $16, %rcx
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LAligned4xLoop_VecDiffers
    sub    $16, %r8
    ja     .LAligned4xLoop_Body
.LLastVec:
    lea    16(%rcx,%r8), %rcx { point to the last 16 bytes }
    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs
    xor    %eax, %eax
    ret
.LVec0Differs:
    bsf    %eax, %eax
    add    %rcx, %rdx { recover rdx = buf2 }
    mov    (%rdx,%rax), %edx
    cmp    %edx, (%rcx,%rax)
    sbb    %rax, %rax
    or     $1, %rax
    ret
.LAligned4xLoop_VecDiffers:
    bsf    %eax, %eax
    add    %rax, %rcx
    sub    %r9, %rcx
    and    $-4, %rcx
    add    %r9, %rcx
    mov    (%rdx,%rcx), %edx
    cmp    %edx, (%rcx)
.LDoSbb:
    sbb    %rax, %rax
    or     $1, %rax
    ret
    .balign 16
.LDwordwise_Body:
    mov    (%rdx,%rcx), %eax
    cmp    %eax, (%rcx)
    jne    .LDoSbb
    add    $4, %rcx
.LDwordwise_Prepare:
    sub    $1, %r8
    jae    .LDwordwise_Body
    xor    %eax, %eax
end;
{$endif FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ does a thread-safe inc/dec }
function declocked(var l : longint) : boolean;assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Ldeclockedskiplock
    .byte  0xF0 // LOCK prefix.
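    { When IsMultithread is false, the jz above lands one byte past the F0,
      executing a plain (cheaper) decl; otherwise the prefix and decl form a
      single locked instruction. The inc/dec routines below use the same trick. }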
.Ldeclockedskiplock:
    decl   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    setzb  %al
end;

{$define FPC_SYSTEM_HAS_DECLOCKED_INT64}
function declocked(var l : int64) : boolean;assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Ldeclockedskiplock
    .byte  0xF0 // LOCK prefix.
.Ldeclockedskiplock:
    decq   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    setzb  %al
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure inclocked(var l : longint);assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Linclockedskiplock
    .byte  0xF0 // LOCK prefix.
.Linclockedskiplock:
    incl   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_INT64}
procedure inclocked(var l : int64);assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Linclockedskiplock
    .byte  0xF0 // LOCK prefix.
.Linclockedskiplock:
    incq   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;

function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl   $-1,%eax
    lock
    xaddl  %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    decl   %eax
end;

function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl   $1,%eax
    lock
    xaddl  %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    incl   %eax
end;

function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$ifdef win64}
    xchgl  (%rcx),%edx
    movl   %edx,%eax
{$else win64}
    xchgl  (%rdi),%esi
    movl   %esi,%eax
{$endif win64}
end;

function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$ifdef win64}
    lock
    xaddl  %edx, (%rcx)
    movl   %edx,%eax
{$else win64}
    lock
    xaddl  %esi, (%rdi)
    movl   %esi,%eax
{$endif win64}
end;

function InterLockedCompareExchange(var Target: longint; NewValue, Comperand : longint): longint; assembler; nostackframe;
asm
{$ifdef win64}
    movl   %r8d,%eax
    lock
    cmpxchgl %edx,(%rcx)
{$else win64}
    movl   %edx,%eax
    lock
    cmpxchgl %esi,(%rdi)
{$endif win64}
end;

function InterLockedDecrement64 (var Target: int64) : int64; assembler; nostackframe;
asm
    movq   $-1,%rax
    lock
    xaddq  %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    decq   %rax
end;

function InterLockedIncrement64 (var Target: int64) : int64; assembler; nostackframe;
asm
    movq   $1,%rax
    lock
    xaddq  %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    incq   %rax
end;

function InterLockedExchange64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
asm
{$ifdef win64}
    xchgq  (%rcx),%rdx
    movq   %rdx,%rax
{$else win64}
    xchgq  (%rdi),%rsi
    movq   %rsi,%rax
{$endif win64}
end;

function InterLockedExchangeAdd64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
asm
{$ifdef win64}
    lock
    xaddq  %rdx, (%rcx)
    movq   %rdx,%rax
{$else win64}
    lock
    xaddq  %rsi, (%rdi)
    movq   %rsi,%rax
{$endif win64}
end;

function InterLockedCompareExchange64(var Target: int64; NewValue, Comperand : int64): int64; assembler; nostackframe;
asm
{$ifdef win64}
    movq   %r8,%rax
    lock
    cmpxchgq %rdx,(%rcx)
{$else win64}
    movq   %rdx,%rax
    lock
    cmpxchgq %rsi,(%rdi)
{$endif win64}
end;
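{ InterLockedCompareExchange returns the previous value of Target and only
  stores NewValue when that previous value equals Comperand. Typical
  lock-free usage (sketch; 'old' and 'Flag' are the caller's variables):

    repeat
      old:=Target;
    until InterLockedCompareExchange(Target,old or Flag,old)=old;
}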
{****************************************************************************
                                  FPU
****************************************************************************}

const
  { Internal constants for use in system unit }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;

  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20;
  MM_ExceptionMask = $3f;

  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm    = %0000000100000000;
  MM_MaskDivZero   = %0000001000000000;
  MM_MaskOverflow  = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;

{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
var
  _eax,cpuid7_ebx,cpuid1_ecx : dword;
begin
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  asm
      xorl %eax,%eax
      cpuid
      movl %eax,_eax
      movl $1,%eax
      xorl %ecx,%ecx
      cpuid
      movl %ecx,cpuid1_ecx
  end;
  has_sse41_support:=boolean(cpuid1_ecx shr 19 and 1);
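  { CPUID(1).ECX bit 19 = SSE4.1. }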
  if _eax>=7 then
    begin
      asm
          movl $7,%eax
          xorl %ecx,%ecx
          cpuid
          movl %ebx,cpuid7_ebx
      end;
{$ifdef use_fast_repmovstos}
      fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
{$endif}
      { XGETBV support? }
      if (cpuid1_ecx and $8000000)<>0 then
        begin
          asm
              xorl %ecx,%ecx
              .byte 0x0f,0x01,0xd0 { xgetbv }
              movl %eax,_eax
          end;
          if (_eax and 6)=6 then
            begin
              has_avx_support:=(cpuid1_ecx and $10000000)<>0;
              has_avx2_support:=(cpuid7_ebx and $20)<>0;
            end;
        end;
    end;
  fpc_cpuinit_performed:=true;
end;

{$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
begin
end;

{$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;
var
  { these locals are so we don't have to hack pic code in the assembler }
  localmxcsr: dword;
  localfpucw: word;
begin
  localfpucw:=Default8087CW;
  localmxcsr:=DefaultMXCSR;
  asm
      fninit
      fwait
      fldcw localfpucw
      ldmxcsr localmxcsr
  end;
end;

{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}
procedure ReadBarrier;assembler;nostackframe;
asm
    lfence
end;

procedure ReadDependencyBarrier;assembler;nostackframe;
asm
    { reads imply barrier on earlier reads depended on }
end;

procedure ReadWriteBarrier;assembler;nostackframe;
asm
    mfence
end;

procedure WriteBarrier;assembler;nostackframe;
asm
    sfence
end;
{$endif}

{****************************************************************************
                               Math Routines
****************************************************************************}

{$define FPC_SYSTEM_HAS_SWAPENDIAN}
{ SwapEndian(<16 Bit>) being inlined is faster than using assembler }
function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  { the extra Word type cast is necessary because the "AValue shr 8" }
  { is turned into "longint(AValue) shr 8", so if AValue < 0 then    }
  { the sign bits from the upper 16 bits are shifted in rather than  }
  { zeroes.                                                          }
  Result := SmallInt(((Word(AValue) shr 8) or (Word(AValue) shl 8)) and $ffff);
end;

function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  Result := ((AValue shr 8) or (AValue shl 8)) and $ffff;
end;
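{ e.g. SwapEndian(Word($1234)) = $3412. }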
function SwapEndian(const AValue: LongInt): LongInt; assembler; nostackframe;
asm
{$ifdef win64}
    movl   %ecx, %eax
{$else win64}
    movl   %edi, %eax
{$endif win64}
    bswap  %eax
end;

function SwapEndian(const AValue: DWord): DWord; assembler; nostackframe;
asm
{$ifdef win64}
    movl   %ecx, %eax
{$else win64}
    movl   %edi, %eax
{$endif win64}
    bswap  %eax
end;

function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
asm
{$ifdef win64}
    movq   %rcx, %rax
{$else win64}
    movq   %rdi, %rax
{$endif win64}
    bswap  %rax
end;

function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
asm
{$ifdef win64}
    movq   %rcx, %rax
{$else win64}
    movq   %rdi, %rax
{$endif win64}
    bswap  %rax
end;

{$ifndef win64}
{$define FPC_SYSTEM_HAS_U128_DIV_U64_TO_U64}
function u128_div_u64_to_u64( const xh, xl: qword; const y: qword; out quotient, remainder: qword ): boolean;nostackframe;assembler;
{
  SysV:
    xh: RDI
    xl: RSI
    y: RDX
    quotient: RCX
    remainder: R8
}
label
  dodiv;
asm
    cmpq   %rdi,%rdx
    ja     dodiv
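    { y <= xh: the 128/64-bit quotient would not fit in 64 bits and divq
      would raise #DE, so return false instead. }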
    xorl   %eax,%eax
    ret
dodiv:
    movq   %rdx,%r9
    movq   %rsi,%rax
    movq   %rdi,%rdx
    divq   %r9
    movq   %rax,(%rcx)
    movq   %rdx,(%r8)
    movl   $1,%eax
end;
{$endif win64}