i386.inc 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2000 by the Free Pascal development team.
  4. Processor dependent implementation for the system unit for
  5. intel i386+
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$if not(defined(VER3_0)) and defined(linux)}
  13. {$define FPC_SYSTEM_STACKALIGNMENT16}
  14. {$endif not(defined(VER3_0)) and defined(linux)}
  15. {****************************************************************************
  16. Primitives
  17. ****************************************************************************}
  18. var
  19. os_supports_sse : boolean;
  20. { this variable is set to true while an SSE check is being executed, during which no SIGILL should be generated }
  21. sse_check : boolean;
  22. fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  23. fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
  24. {$asmmode ATT}
  { Returns true when the CPUID instruction is available. This is detected by
    trying to toggle bit 21 (mask 0x200000, the ID flag) of EFLAGS and checking
    whether the change sticks. The caller's EFLAGS are restored before returning. }
  25. function cpuid_support : boolean;assembler;nostackframe;
  26. {
  27. Check if the ID-flag can be changed, if changed then CpuID is supported.
  28. Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
  29. }
  30. asm
  31. pushfl { keep the original EFLAGS on the stack }
  32. movl (%esp),%eax { eax = original EFLAGS }
  33. xorl $0x200000,%eax { flip the ID bit }
  34. pushl %eax
  35. popfl { try to load the modified value into EFLAGS }
  36. pushfl
  37. popl %eax { eax = EFLAGS as actually accepted by the CPU }
  38. xorl (%esp),%eax { eax = bits that differ from the original EFLAGS }
  39. popfl { restore the original EFLAGS }
  40. testl $0x200000,%eax
  41. setnz %al { true when the ID bit could be changed }
  42. end;
  43. {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
  procedure fpc_cpuinit;
  begin
    { Deliberately empty.
      SSE detection on x86 is awkward, so the feature test that used to live
      here is postponed to fpc_cpucodeinit, which must be implemented in an
      OS-dependent way (FK). The former body was:
        has_sse_support:=sse_support;
        has_mmx_support:=mmx_support;
    }
  end;
  52. {$ifndef darwin}
  { Loads into ebx the dword found at the top of the stack on entry. The routine
    has no stack frame, so that dword is the return address pushed by the
    caller's call instruction, i.e. the address of the instruction that follows
    the call.
    NOTE(review): presumably used to obtain the current EIP for
    position-independent code - confirm against the callers. }
  53. procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
  54. asm
  55. movl (%esp),%ebx { ebx = return address }
  56. end;
  { Loads into ecx the dword found at the top of the stack on entry. The routine
    has no stack frame, so that dword is the return address pushed by the
    caller's call instruction, i.e. the address of the instruction that follows
    the call.
    NOTE(review): presumably used to obtain the current EIP for
    position-independent code - confirm against the callers. }
  57. procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
  58. asm
  59. movl (%esp),%ecx { ecx = return address }
  60. end;
  61. {$endif}
  62. {$if not defined(FPC_SYSTEM_HAS_MOVE)
  63. and not defined(OLD_ASSEMBLER)
  64. and not defined(darwin)}
  65. {$i fastmove.inc}
  66. {$endif}
  67. {$ifndef FPC_SYSTEM_HAS_MOVE}
  68. {$define FPC_SYSTEM_HAS_MOVE}
  { Copies count bytes from source to dest; the two regions may overlap.
    Nothing is copied when count <= 0 or when source and dest are the same
    address. The copy runs upwards unless dest lies inside the source range
    above source, in which case it runs downwards so that bytes are read
    before they are overwritten.
    Register use after the set-up: esi = source, edi = dest, edx = count.
    esi and edi are saved in locals on entry and restored on exit. }
  69. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
  70. var
  71. saveesi,saveedi : longint;
  72. asm
  73. movl %edi,saveedi
  74. movl %esi,saveesi
  75. movl %eax,%esi { esi = address of source }
  76. movl %edx,%edi { edi = address of dest }
  77. movl %ecx,%edx { edx = count }
  78. movl %edi,%eax { eax = dest, consumed by the overlap test below }
  79. { check for zero or negative count }
  80. cmpl $0,%edx
  81. jle .LMoveEnd
  82. { Check for back or forward }
  83. sub %esi,%eax { eax = dest - source }
  84. jz .LMoveEnd { Do nothing when source=dest }
  85. jc .LFMove { Do forward, dest<source }
  86. cmp %edx,%eax { is dest - source smaller than count? }
  87. jb .LBMove { Dest is in range of move, do backward }
  88. { Forward Copy }
  89. .LFMove:
  90. {$ifdef FPC_ENABLED_CLD}
  91. cld
  92. {$endif FPC_ENABLED_CLD}
  93. cmpl $15,%edx
  94. jl .LFMove1 { fewer than 15 bytes: plain byte copy }
  95. movl %edi,%ecx { Align on 32bits }
  96. negl %ecx
  97. andl $3,%ecx { ecx = bytes needed to bring edi up to a multiple of 4 }
  98. subl %ecx,%edx
  99. rep
  100. movsb
  101. movl %edx,%ecx
  102. andl $3,%edx { edx = bytes left over after the dword copy }
  103. shrl $2,%ecx { ecx = number of whole dwords }
  104. rep
  105. movsl
  106. .LFMove1:
  107. movl %edx,%ecx { copy the remaining edx bytes one by one }
  108. rep
  109. movsb
  110. jmp .LMoveEnd
  111. { Backward Copy }
  112. .LBMove:
  113. std { direction flag set: string instructions now step downwards }
  114. addl %edx,%esi
  115. addl %edx,%edi
  116. movl %edi,%ecx { ecx = dest + count, one past the last destination byte }
  117. decl %esi { esi and edi now address the last byte of each region }
  118. decl %edi
  119. cmpl $15,%edx
  120. jl .LBMove1 { fewer than 15 bytes: plain byte copy }
  121. negl %ecx { Align on 32bits }
  { NOTE(review): ecx becomes (-(dest + count)) and 3. For a descending copy a
    dword boundary is reached after (dest + count) and 3 single bytes, so this
    only aligns when that value is 0 or 2 - confirm. It affects speed only; the
    bytes copied are the same either way. }
  122. andl $3,%ecx
  123. subl %ecx,%edx
  124. rep
  125. movsb
  126. movl %edx,%ecx
  127. andl $3,%edx { edx = bytes left over after the dword copy }
  128. shrl $2,%ecx { ecx = number of whole dwords }
  129. subl $3,%esi { address the lowest byte of the next dword to move }
  130. subl $3,%edi
  131. rep
  132. movsl
  133. addl $3,%esi { back to addressing single bytes for the tail }
  134. addl $3,%edi
  135. .LBMove1:
  136. movl %edx,%ecx
  137. rep
  138. movsb
  139. cld { leave the direction flag cleared again }
  140. .LMoveEnd:
  141. movl saveedi,%edi
  142. movl saveesi,%esi
  143. end;
  144. {$endif FPC_SYSTEM_HAS_MOVE}
  145. {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  146. {$define FPC_SYSTEM_HAS_FILLCHAR}
  { Stores the byte value into count consecutive bytes starting at x.
    Does nothing when count <= 0.
    As used below, on entry eax = address of x, edx = count and cl = value.
    Up to 22 bytes are written with a simple byte loop; longer fills use
    rep stosl for whole dwords followed by rep stosb for the last 0..3 bytes. }
  147. Procedure FillChar(var x;count:SizeInt;value:byte);assembler; nostackframe;
  148. asm
  149. cmpl $22,%edx { empirically determined value on a Core 2 Duo Conroe }
  150. jg .LFillFull
  151. orl %edx,%edx { sets the flags for count without changing it }
  152. jle .LFillZero { count <= 0: nothing to do }
  153. .LFillLoop:
  154. movb %cl,(%eax)
  155. incl %eax
  156. decl %edx
  157. jne .LFillLoop
  158. .LFillZero:
  159. ret
  160. .LFillFull:
  161. {$ifdef FPC_ENABLED_CLD}
  162. cld
  163. {$endif FPC_ENABLED_CLD}
  164. push %edi { popped again before leaving }
  165. movl %eax,%edi { edi = destination for the stos instructions }
  166. movzbl %cl,%eax
  167. movl %edx,%ecx
  168. imul $0x01010101,%eax { Replicate the byte in al into all 4 bytes of eax }
  169. shrl $2,%ecx { ecx = number of whole dwords }
  170. andl $3,%edx { edx = trailing bytes }
  171. rep
  172. stosl
  173. movl %edx,%ecx
  174. .LFill1:
  175. rep
  176. stosb
  177. .LFillEnd:
  178. pop %edi
  179. end;
  180. {$endif FPC_SYSTEM_HAS_FILLCHAR}
  181. {$ifndef FPC_SYSTEM_HAS_FILLWORD}
  182. {$define FPC_SYSTEM_HAS_FILLWORD}
  { Stores the 16-bit value into count consecutive words starting at x.
    Does nothing when count <= 0.
    As used below, on entry eax = address of x, edx = count and cx = value.
    Pairs of words are written as dwords with rep stosl; an odd count is
    completed with a single stosw. edi is saved in a local and restored. }
  183. procedure fillword(var x;count : SizeInt;value : word);assembler;
  184. var
  185. saveedi : longint;
  186. asm
  187. movl %edi,saveedi
  188. movl %eax,%edi { edi = destination for the stos instructions }
  189. movzwl %cx,%eax { eax = value, zero-extended }
  190. movl %edx,%ecx { ecx = count }
  191. { check for zero or negative count }
  192. cmpl $0,%ecx
  193. jle .LFillWordEnd
  194. movl %eax,%edx
  195. shll $16,%eax
  196. orl %edx,%eax { eax = value in both the low and the high word }
  197. movl %ecx,%edx { edx = count, kept for the odd-word test }
  198. shrl $1,%ecx { ecx = number of word pairs }
  199. {$ifdef FPC_ENABLED_CLD}
  200. cld
  201. {$endif FPC_ENABLED_CLD}
  202. rep
  203. stosl
  204. movl %edx,%ecx
  205. andl $1,%ecx { ecx = 1 when count is odd, else 0 }
  206. rep
  207. stosw
  208. .LFillWordEnd:
  209. movl saveedi,%edi
  210. end;
  211. {$endif FPC_SYSTEM_HAS_FILLWORD}
  212. {$ifndef FPC_SYSTEM_HAS_FILLDWORD}
  213. {$define FPC_SYSTEM_HAS_FILLDWORD}
  { Stores the 32-bit value into count consecutive dwords starting at x.
    Does nothing when count <= 0.
    As used below, on entry eax = address of x, edx = count and ecx = value.
    edi is saved in a local and restored on exit. }
  214. procedure filldword(var x;count : SizeInt;value : dword);assembler;
  215. var
  216. saveedi : longint;
  217. asm
  218. movl %edi,saveedi
  219. movl %eax,%edi { edi = destination for stosl }
  220. movl %ecx,%eax { eax = value to store }
  221. movl %edx,%ecx { ecx = count, the repeat counter }
  222. { check for zero or negative count }
  223. cmpl $0,%ecx
  224. jle .LFillDWordEnd
  225. {$ifdef FPC_ENABLED_CLD}
  226. cld
  227. {$endif FPC_ENABLED_CLD}
  228. rep
  229. stosl
  230. .LFillDWordEnd:
  231. movl saveedi,%edi
  232. end;
  233. {$endif FPC_SYSTEM_HAS_FILLDWORD}
  234. {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  235. {$define FPC_SYSTEM_HAS_INDEXBYTE}
  { Searches the len bytes starting at buf for the byte b, using integer
    instructions only. Returns the offset of the first match relative to buf,
    or -1 when there is none.
    As used below, on entry eax = address of buf, edx = len and cl = b.
    len is compared with unsigned conditions (jb/jae).
    Method: buffers of fewer than 4 bytes are checked byte by byte. Otherwise
    bytes are checked singly until eax is a multiple of 4, then whole dwords are
    XORed with b replicated four times and tested for a zero byte, 16 bytes per
    iteration of the main loop. esi and edi are saved and restored; the original
    buf stays on the stack so that the final address can be turned into an
    offset. }
  236. function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
  237. asm
  238. push %esi
  239. push %edi
  240. push %eax { save initial value of 'buf' }
  241. cmp $4,%edx { less than 4 bytes, just test byte by byte. }
  242. jb .Ltail
  243. mov %cl,%ch { prepare pattern }
  244. movzwl %cx,%esi
  245. shl $16,%ecx
  246. or %esi,%ecx { ecx = b replicated into all four bytes }
  247. .Lalignloop:
  248. test $3,%al { align to 4 bytes if necessary }
  249. je .Laligned
  250. cmp %cl,(%eax)
  251. je .Lexit
  252. inc %eax
  253. dec %edx
  254. jmp .Lalignloop
  255. .balign 16 { Main loop, unrolled 4 times for speed }
  256. .Lloop:
  257. mov (%eax),%esi { load dword }
  258. xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
  259. lea -0x01010101(%esi),%edi
  260. xor %esi,%edi { (x-0x01010101) xor x }
  261. not %esi
  262. and $0x80808080,%esi
  263. and %edi,%esi { ((x-0x01010101) xor x) and (not x) and 0x80808080 }
  264. jnz .Lfound { one of the bytes matches }
  265. mov 4(%eax),%esi { same test for the second dword }
  266. xor %ecx,%esi
  267. lea -0x01010101(%esi),%edi
  268. xor %esi,%edi
  269. not %esi
  270. and $0x80808080,%esi
  271. and %edi,%esi
  272. jnz .Lfound4
  273. mov 8(%eax),%esi { same test for the third dword }
  274. xor %ecx,%esi
  275. lea -0x01010101(%esi),%edi
  276. xor %esi,%edi
  277. not %esi
  278. and $0x80808080,%esi
  279. and %edi,%esi
  280. jnz .Lfound8
  281. mov 12(%eax),%esi { same test for the fourth dword }
  282. xor %ecx,%esi
  283. lea -0x01010101(%esi),%edi
  284. xor %esi,%edi
  285. not %esi
  286. and $0x80808080,%esi
  287. and %edi,%esi
  288. jnz .Lfound12
  289. add $16,%eax
  290. .Laligned:
  291. sub $16,%edx
  292. jae .Lloop { Still more than 16 bytes remaining }
  293. { Process remaining bytes (<16 left at this point) }
  294. { length is offset by -16 at this point }
  295. .Lloop2:
  296. cmp $4-16,%edx { < 4 bytes left? }
  297. jb .Ltail
  298. mov (%eax),%esi
  299. xor %ecx,%esi
  300. lea -0x01010101(%esi),%edi
  301. xor %esi,%edi
  302. not %esi
  303. and $0x80808080,%esi
  304. and %edi,%esi
  305. jne .Lfound
  306. add $4,%eax
  307. sub $4,%edx
  308. jmp .Lloop2
  309. .Ltail: { Less than 4 bytes remaining, check one by one }
  310. and $3, %edx { keeps only the 0..3 byte remainder, dropping the -16 offset }
  311. jz .Lnotfound
  312. .Lloop3:
  313. cmp %cl,(%eax)
  314. je .Lexit
  315. inc %eax
  316. dec %edx
  317. jnz .Lloop3
  318. .Lnotfound:
  319. or $-1,%eax { result = -1 }
  320. jmp .Lexit1
  321. { add missing source pointer increments }
  322. .Lfound12:
  323. add $4,%eax
  324. .Lfound8:
  325. add $4,%eax
  326. .Lfound4:
  327. add $4,%eax
  328. .Lfound: { esi holds the match flags; check its bytes from the lowest address upwards }
  329. test $0xff,%esi
  330. jnz .Lexit
  331. inc %eax
  332. test $0xff00,%esi
  333. jnz .Lexit
  334. inc %eax
  335. test $0xff0000,%esi
  336. jnz .Lexit
  337. inc %eax
  338. .Lexit:
  339. sub (%esp),%eax { result = address of the match - initial 'buf' }
  340. .Lexit1:
  341. pop %ecx { removes initial 'buf' value }
  342. pop %edi
  343. pop %esi
  344. end;
  { SSE2 variant of the byte search: returns the offset of the first byte equal
    to b within the len bytes at buf, or -1 when there is none (including
    len = 0).
    As used below, on entry eax = address of buf, edx = len and cl = b.
    The buffer is read in 16-byte blocks with aligned loads (movdqa), so bytes
    before buf and past buf+len that share a block with the buffer may be
    loaded. Matches before buf are masked out of the first block, and a match
    at an offset >= len is rejected by the length check at .Lmatch.
    ebx is saved and restored. }
  345. function IndexByte_SSE2(const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
  346. asm
  347. test %edx, %edx
  348. jz .Lnotfound { exit if len=0 }
  349. push %ebx
  350. movd %ecx, %xmm1
  351. lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
  352. punpcklbw %xmm1, %xmm1
  353. and $-0x10, %ecx { first aligned address after buf }
  354. punpcklbw %xmm1, %xmm1
  355. pshufd $0, %xmm1, %xmm1 { xmm1 = b in all 16 byte lanes }
  356. movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
  357. sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
  358. pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
  359. pmovmskb %xmm0, %ebx
  360. shl %cl, %ebx { shift valid bits into high word }
  361. and $0xffff0000, %ebx { clear low word containing invalid bits }
  362. shr %cl, %ebx { shift back }
  363. jz .Lcontinue
  364. .Lmatch:
  365. bsf %ebx, %ebx { ebx = position of the first match inside the block }
  366. lea -16(%ecx,%ebx), %eax { eax = offset of the match relative to buf }
  367. pop %ebx
  368. cmp %eax, %edx { check against the buffer length }
  369. jbe .Lnotfound
  370. ret
  371. .balign 16
  372. .Lloop:
  373. movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
  374. add $16, %ecx { but their sum is evenly divisible by 16. }
  375. pcmpeqb %xmm1, %xmm0
  376. pmovmskb %xmm0, %ebx
  377. test %ebx, %ebx
  378. jnz .Lmatch
  379. .Lcontinue:
  380. cmp %ecx, %edx { continue while the bytes covered so far are fewer than len }
  381. ja .Lloop
  382. pop %ebx
  383. .Lnotfound:
  384. or $-1, %eax { result = -1 }
  385. end;
  { Resolver that IndexByte_Impl points at initially: picks the implementation
    matching the CPU, stores it in IndexByte_Impl and forwards the call to it. }
  function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;

  var
    { Implementation that IndexByte calls; starts out as the resolver. }
    IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;
  {$define has_i386_IndexByte_Impl} { used in assembler to manually inline IndexByte }

  function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
  begin
    if not has_sse2_support then
      IndexByte_Impl:=@IndexByte_Plain
    else
      IndexByte_Impl:=@IndexByte_SSE2;
    { Serve the current request through the implementation just selected. }
    IndexByte_Dispatch:=IndexByte_Impl(buf,len,b);
  end;
  { Public entry point: forwards to whatever IndexByte_Impl currently points at. }
  function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
  begin
    IndexByte:=IndexByte_Impl(buf,len,b);
  end;
  402. {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  403. {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  404. {$define FPC_SYSTEM_HAS_INDEXWORD}
  { Searches the len 16-bit words starting at buf for the word b, one word per
    iteration. Returns the index of the first match in words, or -1 when there
    is none (including len = 0).
    As used below, on entry eax = address of buf, edx = len and cx = b. }
  405. function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
  406. asm
  407. test %edx, %edx
  408. jz .LNotFound
  409. push %eax { keep the original buf for the index calculation }
  410. .LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  411. cmp %cx, (%eax)
  412. je .LFound
  413. add $2, %eax
  414. dec %edx
  415. jnz .LWordwise_Body
  416. pop %edx { discard the saved buf }
  417. .LNotFound:
  418. or $-1, %eax { result = -1 }
  419. ret
  420. .LFound:
  421. pop %edx { edx = original buf }
  422. sub %edx, %eax { byte offset of the match }
  423. shr $1, %eax { bytes -> words }
  424. end;
  { SSE2 variant of the word search: returns the index, in words, of the first
    16-bit word equal to b within the len words at buf, or -1 when there is
    none (including len = 0).
    As used below, on entry eax = address of buf, edx = len and cx = b.
    The buffer is read in aligned 16-byte blocks (movdqa), so bytes outside the
    buffer that share a block with it may be loaded. Matches before buf are
    masked out of the first block and matches at or beyond len are rejected.
    Two paths exist:
      - buf at an even address: words are compared directly with pcmpeqw;
      - buf at an odd address (.Lunaligned): every buffer word straddles two
        word lanes, so the comparison is done per byte against a byte-swapped
        pattern, and a match requires two adjacent byte matches; the last byte
        match of each block is carried into the next round through esi.
    ebx is saved and restored on both paths, esi on the unaligned one. }
  425. function IndexWord_SSE2(const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
  426. asm
  427. test %edx, %edx { exit if len=0 }
  428. je .Lnotfound
  429. push %ebx
  430. movd %ecx, %xmm1
  431. punpcklwd %xmm1, %xmm1
  432. pshufd $0, %xmm1, %xmm1 { xmm1 = b in all 8 word lanes }
  433. lea 16(%eax), %ecx
  434. and $-16, %ecx { first 16-byte aligned address after buf }
  435. movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
  436. sub %eax, %ecx { ecx = number of valid bytes in the first block }
  437. test $1, %eax { if buffer isn't aligned to word boundary, }
  438. jnz .Lunaligned { use a different algorithm }
  439. pcmpeqw %xmm1, %xmm0
  440. pmovmskb %xmm0, %ebx
  441. shl %cl, %ebx { same masking as in the byte search: drop the bits }
  442. and $0xffff0000, %ebx { that belong to bytes located before buf }
  443. shr %cl, %ebx
  444. shr $1, %ecx { ecx = number of valid words (it held bytes up to here) }
  445. test %ebx, %ebx
  446. jz .Lcontinue
  447. .Lmatch:
  448. bsf %ebx, %ebx
  449. shr $1, %ebx { in words }
  450. lea -8(%ecx,%ebx), %eax { eax = index of the match in words, relative to buf }
  451. pop %ebx
  452. cmp %eax, %edx
  453. jbe .Lnotfound { if match is after the specified length, ignore it }
  454. ret
  455. .balign 16
  456. .Lloop:
  457. movdqa (%eax,%ecx,2), %xmm0 { ecx counts words, hence the scale factor 2 }
  458. add $8, %ecx
  459. pcmpeqw %xmm1, %xmm0
  460. pmovmskb %xmm0, %ebx
  461. test %ebx, %ebx
  462. jnz .Lmatch
  463. .Lcontinue:
  464. cmp %ecx, %edx
  465. ja .Lloop
  466. pop %ebx
  467. .Lnotfound:
  468. or $-1, %eax { result = -1 }
  469. ret
  470. .Lunaligned:
  471. push %esi
  472. movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
  473. psllw $8, %xmm1 { swap bytes of each word of pattern) }
  474. psrlw $8, %xmm2
  475. por %xmm2, %xmm1
  476. pcmpeqb %xmm1, %xmm0
  477. pmovmskb %xmm0, %ebx
  478. shl %cl, %ebx { drop the bits of bytes located before buf }
  479. and $0xffff0000, %ebx
  480. shr %cl, %ebx
  481. xor %esi, %esi { nothing to merge yet }
  482. add %edx, %edx { length words -> bytes }
  483. jmp .Lcontinue_u
  484. .balign 16
  485. .Lloop_u:
  486. movdqa (%eax,%ecx), %xmm0
  487. add $16, %ecx
  488. pcmpeqb %xmm1, %xmm0 { compare by bytes }
  489. shr $16, %esi { bit 16 shifts into 0 }
  490. pmovmskb %xmm0, %ebx
  491. .Lcontinue_u:
  492. shl $1, %ebx { 15:0 -> 16:1 }
  493. or %esi, %ebx { merge bit 0 from previous round }
  494. mov %ebx, %esi
  495. shr $1, %ebx { now AND together adjacent pairs of bits }
  496. and %esi, %ebx
  497. and $0x5555, %ebx { also reset odd bits }
  498. jnz .Lmatch_u
  499. cmp %ecx, %edx
  500. ja .Lloop_u
  501. .Lnotfound_u:
  502. pop %esi
  503. pop %ebx
  504. or $-1, %eax { result = -1 }
  505. ret
  506. .Lmatch_u:
  507. bsf %ebx, %ebx
  508. lea -16(%ecx,%ebx), %eax { byte position of the match relative to buf }
  509. cmp %eax, %edx
  510. jbe .Lnotfound_u { if match is after the specified length, ignore it }
  511. sar $1, %eax { in words }
  512. pop %esi
  513. pop %ebx
  514. end;
  { Resolver that IndexWord_Impl points at initially: picks the implementation
    matching the CPU, stores it in IndexWord_Impl and forwards the call to it. }
  function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;

  var
    { Implementation that IndexWord calls; starts out as the resolver. }
    IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;

  function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
  begin
    if not has_sse2_support then
      IndexWord_Impl:=@IndexWord_Plain
    else
      IndexWord_Impl:=@IndexWord_SSE2;
    { Serve the current request through the implementation just selected. }
    IndexWord_Dispatch:=IndexWord_Impl(buf,len,b);
  end;
  { Public entry point: forwards to whatever IndexWord_Impl currently points at. }
  function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
  begin
    IndexWord:=IndexWord_Impl(buf,len,b);
  end;
  530. {$endif FPC_SYSTEM_HAS_INDEXWORD}
  531. {$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
  532. {$define FPC_SYSTEM_HAS_INDEXDWORD}
  { Searches the len 32-bit dwords starting at buf for the dword b, one dword
    per iteration. Returns the index of the first match in dwords, or -1 when
    there is none (including len = 0).
    As used below, on entry eax = address of buf, edx = len and ecx = b.
    The loop ends when decrementing edx borrows (jb), i.e. len is counted down
    as an unsigned value. }
  533. function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
  534. asm
  535. push %eax { keep the original buf for the index calculation }
  536. sub $4, %eax { compensates for the add at the top of the loop }
  537. .LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  538. add $4, %eax
  539. sub $1, %edx
  540. jb .LNotFound { all len dwords have been examined }
  541. cmp %ecx, (%eax)
  542. jne .LDWordwise_Next
  543. pop %edx { edx = original buf }
  544. sub %edx, %eax { byte offset of the match }
  545. shr $2, %eax { bytes -> dwords }
  546. ret
  547. .LNotFound:
  548. pop %edx { discard the saved buf }
  549. mov $-1, %eax
  550. end;
  { SSE2 variant of the dword search: returns the index, in dwords, of the first
    32-bit dword equal to b within the len dwords at buf, or -1 when there is
    none (including len = 0).
    As used below, on entry eax = address of buf, edx = len and ecx = b.
    With len <= 4 (signed compare) the dwords are checked one at a time.
    Otherwise four dwords are compared per step with unaligned loads (movdqu);
    the final step re-reads the last four dwords of the buffer, possibly
    overlapping the previous step, so nothing beyond the buffer is loaded. }
  551. function IndexDWord_SSE2(const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
  552. asm
  553. push %eax { keep the original buf for the index calculation }
  554. sub $4, %edx { edx = len - 4 }
  555. jle .LDwordwise_Prepare
  556. movd %ecx, %xmm1
  557. pshufd $0, %xmm1, %xmm1 { xmm1 = b in all 4 dword lanes }
  558. .balign 16 { 1-byte NOP. }
  559. .L4x_Body:
  560. movdqu (%eax), %xmm0
  561. pcmpeqd %xmm1, %xmm0
  562. pmovmskb %xmm0, %ecx
  563. test %ecx, %ecx
  564. jnz .LFoundAtMask
  565. add $16, %eax
  566. sub $4, %edx
  567. jg .L4x_Body
  568. lea (%eax,%edx,4), %eax { edx <= 0 here: step back so the last vector ends at the buffer end }
  569. movdqu (%eax), %xmm0
  570. pcmpeqd %xmm1, %xmm0
  571. pmovmskb %xmm0, %ecx
  572. test %ecx, %ecx
  573. jz .LNothing
  574. .LFoundAtMask:
  575. bsf %ecx, %ecx { byte position of the first matching dword inside the vector }
  576. add %ecx, %eax
  577. .LFoundAtEax:
  578. pop %edx { edx = original buf }
  579. sub %edx, %eax { byte offset of the match }
  580. shr $2, %eax { bytes -> dwords }
  581. ret
  582. nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
  583. .LDwordwise_Prepare:
  584. add $3, %edx { edx = len - 1 }
  585. cmp $-1, %edx
  586. je .LNothing { len = 0 }
  587. .balign 16 { no-op }
  588. .LDwordwise_Body:
  589. cmp (%eax), %ecx
  590. je .LFoundAtEax
  591. add $4, %eax
  592. sub $1, %edx
  593. jae .LDwordwise_Body
  594. .LNothing:
  595. pop %edx { discard the saved buf }
  596. or $-1, %eax { result = -1 }
  597. end;
  { Resolver that IndexDWord_Impl points at initially: picks the implementation
    matching the CPU, stores it in IndexDWord_Impl and forwards the call to it. }
  function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;

  var
    { Implementation that IndexDWord calls; starts out as the resolver. }
    IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;

  function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
  begin
    if not has_sse2_support then
      IndexDWord_Impl:=@IndexDWord_Plain
    else
      IndexDWord_Impl:=@IndexDWord_SSE2;
    { Serve the current request through the implementation just selected. }
    IndexDWord_Dispatch:=IndexDWord_Impl(buf,len,b);
  end;
  { Public entry point: forwards to whatever IndexDWord_Impl currently points at. }
  function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
  begin
    IndexDWord:=IndexDWord_Impl(buf,len,b);
  end;
  613. {$endif FPC_SYSTEM_HAS_INDEXDWORD}
  614. {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
  615. {$define FPC_SYSTEM_HAS_INDEXQWORD}
  { Searches the len 64-bit qwords starting at buf for the qword b, comparing
    the low and the high dword of each element in turn. Returns the index of the
    first match in qwords, or -1 when there is none (including len = 0).
    b is passed on the stack; once both halves have been loaded into registers,
    the low half's stack slot is reused to remember the original buf.
    ebx is saved and restored. The found path returns with ret $8, which also
    removes the 8 bytes of b from the stack. }
  616. function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
  617. { eax = buf, edx = len, [esp+4] = b }
  618. asm
  619. push %ebx { from here on b is at 8(%esp) and 12(%esp) }
  620. mov 8(%esp), %ecx { ecx = b[0:31] }
  621. mov 12(%esp), %ebx { ebx = b[32:63] }
  622. mov %eax, 8(%esp) { remember original buf }
  623. sub $8, %eax { compensates for the add at the top of the loop }
  624. .balign 16 { no-op }
  625. .LQWordwise_Next:
  626. add $8, %eax
  627. sub $1, %edx
  628. jb .LNotFound { all len qwords have been examined }
  629. cmp %ecx, (%eax)
  630. jne .LQWordwise_Next
  631. cmp %ebx, 4(%eax)
  632. jne .LQWordwise_Next
  633. sub 8(%esp), %eax { byte offset of the match }
  634. pop %ebx
  635. shr $3, %eax { bytes -> qwords }
  636. ret $8
  637. .LNotFound:
  638. pop %ebx
  639. mov $-1, %eax
  640. end;
  641. {$endif FPC_SYSTEM_HAS_INDEXQWORD}
  642. {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  643. {$define FPC_SYSTEM_HAS_COMPAREBYTE}
  { Compares the len bytes at buf1 with the len bytes at buf2 using integer
    instructions only. Returns 0 when all bytes are equal (or len <= 0),
    -1 when the first differing byte of buf1 is below that of buf2 (unsigned)
    and +1 when it is above.
    With len <= 3 the bytes are compared one by one. Otherwise the first four
    bytes are compared as they are, buf1 is then rounded down to a multiple of
    4 and the comparison continues four bytes at a time; the last four bytes of
    the buffers are compared separately and may overlap bytes already checked.
    Two differing dwords are byte-swapped before the final compare so that the
    byte at the lowest address decides the result.
    edx holds buf2 - buf1 throughout, so (%edx,%eax) addresses buf2 at the same
    position as (%eax) in buf1. ebx is saved and restored. }
  644. function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  645. asm
  646. { eax = buf1, edx = buf2, ecx = len }
  647. push %ebx
  648. sub %eax, %edx { edx = buf2 - buf1 }
  649. cmp $3, %ecx
  650. jle .LBytewise_Prepare
  651. { Align buf1 on 4 bytes. }
  652. mov (%edx,%eax), %ebx
  653. cmp (%eax), %ebx
  654. jne .L4xDiffer
  655. lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
  656. and $-4, %eax
  657. sub %eax, %ecx
  658. .balign 16
  659. .L4x_Next:
  660. add $4, %eax
  661. sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
  662. jle .LLast4
  663. mov (%edx,%eax), %ebx
  664. cmp (%eax), %ebx
  665. je .L4x_Next
  666. .L4xDiffer: { ebx = differing dword of buf2; the matching dword of buf1 is loaded next }
  667. mov (%eax), %edx
  668. {$ifdef CPUX86_HAS_BSWAP}
  669. bswap %ebx
  670. bswap %edx
  671. {$else}
  672. rol $8, %bx { byte swap of ebx without the bswap instruction }
  673. rol $16, %ebx
  674. rol $8, %bx
  675. rol $8, %dx { byte swap of edx without the bswap instruction }
  676. rol $16, %edx
  677. rol $8, %dx
  678. {$endif}
  679. cmp %ebx, %edx { carry is set when buf1 is below buf2 }
  680. .LDoSbb:
  681. sbb %eax, %eax { eax = -1 when the carry is set, else 0 }
  682. or $1, %eax { -1 stays -1, 0 becomes +1 }
  683. pop %ebx
  684. ret
  685. .LLast4:
  686. add %ecx, %eax { ecx <= 0 here: step back to the last 4 bytes of buf1 }
  687. mov (%edx,%eax), %ebx
  688. cmp (%eax), %ebx
  689. jne .L4xDiffer
  690. xor %eax, %eax { buffers are equal }
  691. pop %ebx
  692. ret
  693. .LBytewise_Prepare:
  694. sub $1, %ecx
  695. jb .LNothing { len = 0 }
  696. .balign 16 { no-op }
  697. .LBytewise_Body:
  698. movzbl (%edx,%eax), %ebx { bl = byte of buf2 }
  699. cmp %bl, (%eax) { carry is set when the buf1 byte is below the buf2 byte }
  700. jne .LDoSbb
  701. add $1, %eax
  702. sub $1, %ecx
  703. jae .LBytewise_Body
  704. .LNothing:
  705. xor %eax, %eax { buffers are equal }
  706. pop %ebx
  707. end;
  708. function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
  709. asm
  710. { eax = buf1, edx = buf2, ecx = len }
  711. cmp $1, %ecx
  712. jle .L1OrLess
  713. push %ebx
  714. cmp $16, %ecx
  715. jae .LVecOrMore
  716. { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
  717. mov %eax, %ebx
  718. or %edx, %ebx
  719. and $4095, %ebx
  720. cmp $4080, %ebx
  721. ja .LCantOverReadBoth
  722. { Over-read both as XMMs. }
  723. movdqu (%eax), %xmm0
  724. movdqu (%edx), %xmm1
  725. pcmpeqb %xmm1, %xmm0
  726. pmovmskb %xmm0, %ebx
  727. inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
  728. jz .LNothing
  729. bsf %ebx, %ebx
  730. cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
  731. jae .LNothing
  732. movzbl (%eax,%ebx), %eax
  733. movzbl (%edx,%ebx), %edx
  734. sub %edx, %eax
  735. pop %ebx
  736. ret
  737. .LNothing:
  738. pop %ebx
  739. xor %eax, %eax
  740. ret
  741. .LVecOrMore:
  742. { Compare first vectors. }
  743. movdqu (%eax), %xmm0
  744. movdqu (%edx), %xmm1
  745. pcmpeqb %xmm1, %xmm0
  746. pmovmskb %xmm0, %ebx
  747. inc %bx
  748. jnz .LVec0Differs
  749. sub $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
  750. jbe .LLastVec
  751. { Compare second vectors. }
  752. movdqu 16(%eax), %xmm0
  753. movdqu 16(%edx), %xmm1
  754. pcmpeqb %xmm1, %xmm0
  755. pmovmskb %xmm0, %ebx
  756. inc %bx
  757. jnz .LVec1Differs
  758. { More than four vectors: aligned loop. }
  759. cmp $32, %ecx
  760. ja .LAligned32xLoop_Prepare
  761. { Compare last two vectors. }
  762. movdqu (%eax,%ecx), %xmm0
  763. movdqu (%edx,%ecx), %xmm1
  764. pcmpeqb %xmm1, %xmm0
  765. pmovmskb %xmm0, %ebx
  766. inc %bx
  767. jnz .LVecEm2Differs
  768. .LLastVec:
  769. movdqu 16(%eax,%ecx), %xmm0
  770. movdqu 16(%edx,%ecx), %xmm1
  771. pcmpeqb %xmm1, %xmm0
  772. pmovmskb %xmm0, %ebx
  773. inc %bx
  774. jnz .LVecEm1Differs
  775. pop %ebx
  776. xor %eax, %eax
  777. ret
  778. .LVecEm2Differs:
  779. sub $16, %ecx
  780. .LVecEm1Differs:
  781. bsf %ebx, %ebx
  782. add %ecx, %ebx
  783. movzbl 16(%eax,%ebx), %eax
  784. movzbl 16(%edx,%ebx), %edx
  785. sub %edx, %eax
  786. pop %ebx
  787. ret
  788. nop { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
  789. .LAligned32xLoop_Prepare:
  790. lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
  791. sub %eax, %edx { edx = buf2 - buf1 }
  792. and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
  793. sub %eax, %ecx { ecx = count to be handled with loop }
  794. .balign 16 { No-op. }
  795. .LAligned32xLoop_Body:
  796. add $32, %eax
  797. { Compare two XMMs, reduce the result with 'and'. }
  798. movdqu (%edx,%eax), %xmm0
  799. pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
  800. movdqu 16(%edx,%eax), %xmm1
  801. pcmpeqb 16(%eax), %xmm1
  802. pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
  803. pmovmskb %xmm1, %ebx
  804. inc %bx
  805. jnz .LAligned32xLoop_TwoVectorsDiffer
  806. sub $32, %ecx
  807. ja .LAligned32xLoop_Body
  808. { Compare last two vectors after the loop by doing one more loop iteration, modified. }
  809. lea 32(%eax,%ecx), %eax
  810. movdqu (%edx,%eax), %xmm0
  811. movdqu (%eax), %xmm2
  812. pcmpeqb %xmm2, %xmm0
  813. movdqu 16(%edx,%eax), %xmm1
  814. movdqu 16(%eax), %xmm2
  815. pcmpeqb %xmm2, %xmm1
  816. pand %xmm0, %xmm1
  817. pmovmskb %xmm1, %ebx
  818. inc %bx
  819. jnz .LAligned32xLoop_TwoVectorsDiffer
  820. pop %ebx
  821. xor %eax, %eax
  822. ret
  823. .LAligned32xLoop_TwoVectorsDiffer:
  824. add %eax, %edx { restore edx = buf2 }
  825. pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
  826. inc %cx
  827. jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
  828. bsf %ecx, %ebx
  829. movzbl (%eax,%ebx), %eax
  830. movzbl (%edx,%ebx), %edx
  831. sub %edx, %eax
  832. pop %ebx
  833. ret
  834. .LVec1Differs:
  835. add $16, %eax
  836. add $16, %edx
  837. .LVec0Differs:
  838. bsf %ebx, %ebx
  839. movzbl (%eax,%ebx), %eax
  840. movzbl (%edx,%ebx), %edx
  841. sub %edx, %eax
  842. pop %ebx
  843. ret
  844. .LCantOverReadBoth:
  845. cmp $3, %ecx
  846. jle .L2to3
  847. push %esi
  848. mov (%eax), %ebx
  849. mov (%edx), %esi
  850. cmp %esi, %ebx
  851. jne .L4xDiffer
  852. cmp $8, %ecx
  853. jbe .LLast4x
  854. mov 4(%eax), %ebx
  855. mov 4(%edx), %esi
  856. cmp %esi, %ebx
  857. jne .L4xDiffer
  858. mov -8(%eax,%ecx), %ebx
  859. mov -8(%edx,%ecx), %esi
  860. cmp %esi, %ebx
  861. jne .L4xDiffer
  862. .LLast4x:
  863. mov -4(%eax,%ecx), %ebx
  864. mov -4(%edx,%ecx), %esi
  865. cmp %esi, %ebx
  866. jne .L4xDiffer
  867. pop %esi
  868. pop %ebx
  869. xor %eax, %eax
  870. ret
  871. .L4xDiffer:
  872. bswap %ebx
  873. bswap %esi
  874. cmp %esi, %ebx
  875. pop %esi
  876. sbb %eax, %eax
  877. or $1, %eax
  878. pop %ebx
  879. ret
  880. .L2to3:
  881. movzwl (%edx), %ebx
  882. bswap %ebx
  883. shr $1, %ebx
  884. mov -1(%edx,%ecx), %bl
  885. movzwl (%eax), %edx
  886. bswap %edx
  887. shr $1, %edx
  888. mov -1(%eax,%ecx), %dl
  889. mov %edx, %eax
  890. sub %ebx, %eax
  891. pop %ebx
  892. ret
  893. .L1OrLess:
  894. jl .LUnbounded_Prepare
  895. movzbl (%eax), %eax
  896. movzbl (%edx), %edx
  897. sub %edx, %eax
  898. ret
  899. .LUnbounded_Prepare:
  900. sub %eax, %edx { edx = buf2 - buf1 }
  901. test %ecx, %ecx
  902. jnz .LUnbounded_Body
  903. xor %eax, %eax
  904. ret
  905. .balign 16
  906. .LUnbounded_Next:
  907. add $1, %eax
  908. .LUnbounded_Body:
  909. movzbl (%edx,%eax), %ecx
  910. cmp %cl, (%eax)
  911. je .LUnbounded_Next
  912. sbb %eax, %eax
  913. or $1, %eax
  914. end;
  915. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  916. var
  917. CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
  918. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  919. begin
  920. if has_sse2_support then
  921. CompareByte_Impl:=@CompareByte_SSE2
  922. else
  923. CompareByte_Impl:=@CompareByte_Plain;
  924. result:=CompareByte_Impl(buf1, buf2, len);
  925. end;
  926. function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
  927. begin
  928. result:=CompareByte_Impl(buf1, buf2, len);
  929. end;
  930. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  931. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  932. {$define FPC_SYSTEM_HAS_COMPAREWORD}
  933. function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  934. asm
  935. push %ebx
  936. sub %eax, %edx { edx = buf2 - buf1 }
  937. lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
  938. cmp $1073741819, %ebx
  939. ja .LWordwise_Prepare
  940. test $2, %al
  941. je .LAlignedToPtrUintOrNaturallyMisaligned
  942. movzwl (%edx,%eax), %ebx
  943. cmp %bx, (%eax)
  944. jne .LDoSbb
  945. add $2, %eax
  946. sub $1, %ecx
  947. .LAlignedToPtrUintOrNaturallyMisaligned:
  948. sub $2, %ecx
  949. .balign 16
  950. .LPtrUintWise_Next:
  951. mov (%edx,%eax), %ebx
  952. cmp %ebx, (%eax)
  953. jne .LPtrUintsDiffer
  954. add $4, %eax
  955. sub $2, %ecx
  956. jg .LPtrUintWise_Next
  957. lea (%eax,%ecx,2), %eax
  958. mov (%edx,%eax), %ebx
  959. cmp %ebx, (%eax)
  960. jne .LPtrUintsDiffer
  961. pop %ebx
  962. xor %eax, %eax
  963. ret
  964. .LPtrUintsDiffer:
  965. cmp %bx, (%eax)
  966. jne .LDoSbb
  967. shr $16, %ebx
  968. cmp %bx, 2(%eax)
  969. .LDoSbb:
  970. sbb %eax, %eax
  971. or $1, %eax
  972. pop %ebx
  973. ret
  974. .balign 16
  975. .LWordwise_Body:
  976. movzwl (%edx,%eax), %ebx
  977. cmp %bx, (%eax)
  978. jne .LDoSbb
  979. add $2, %eax
  980. .LWordwise_Prepare:
  981. sub $1, %ecx
  982. jnb .LWordwise_Body
  983. pop %ebx
  984. xor %eax, %eax
  985. end;
  986. function CompareWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  987. asm
  988. push %ebx
  989. sub %eax, %edx { edx = buf2 - buf1 }
  990. lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
  991. cmp $1073741821, %ebx
  992. ja .LWordwise_Prepare
  993. cmp $8, %ecx
  994. jge .LVecOrMore
  995. lea (%edx,%eax), %ebx
  996. or %eax, %ebx
  997. and $4095, %ebx
  998. cmp $4080, %ebx
  999. ja .LWordwise_Prepare
  1000. movdqu (%edx,%eax), %xmm0
  1001. movdqu (%eax), %xmm1
  1002. pcmpeqw %xmm1, %xmm0
  1003. pmovmskb %xmm0, %ebx
  1004. inc %bx
  1005. jz .LNothing
  1006. shl $1, %ecx { convert to bytes }
  1007. bsf %ebx, %ebx
  1008. cmp %ecx, %ebx
  1009. jb .LSubtractWords
  1010. .LNothing:
  1011. pop %ebx
  1012. xor %eax, %eax
  1013. ret
  1014. .balign 16
  1015. .LWordwise_Body:
  1016. movzwl (%edx,%eax), %ebx
  1017. cmp %bx, (%eax)
  1018. jne .LDoSbb
  1019. add $2, %eax
  1020. .LWordwise_Prepare:
  1021. sub $1, %ecx
  1022. jae .LWordwise_Body
  1023. xor %eax, %eax
  1024. pop %ebx
  1025. ret
  1026. .LDoSbb:
  1027. sbb %eax, %eax
  1028. or $1, %eax
  1029. pop %ebx
  1030. ret
  1031. .LVecOrMore:
  1032. movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
  1033. movdqu (%eax), %xmm1
  1034. pcmpeqw %xmm1, %xmm0
  1035. pmovmskb %xmm0, %ebx
  1036. inc %bx
  1037. jnz .LVec0Differs
  1038. shl $1, %ecx { convert to bytes }
  1039. sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  1040. jle .LLastVec
  1041. push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
  1042. add %eax, %ecx
  1043. and $-16, %eax { align buf1; +16 is performed by the loop. }
  1044. sub %eax, %ecx
  1045. .balign 16
  1046. .LAligned8xLoop_Body:
  1047. add $16, %eax
  1048. movdqu (%edx,%eax), %xmm0
  1049. pcmpeqb (%eax), %xmm0
  1050. pmovmskb %xmm0, %ebx
  1051. inc %bx
  1052. jnz .LAligned8xLoop_VecDiffers
  1053. sub $16, %ecx
  1054. ja .LAligned8xLoop_Body
  1055. pop %ebx { drop original buf1 }
  1056. .LLastVec:
  1057. lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
  1058. movdqu (%edx,%eax), %xmm0
  1059. movdqu (%eax), %xmm1
  1060. pcmpeqw %xmm1, %xmm0
  1061. pmovmskb %xmm0, %ebx
  1062. inc %bx
  1063. jnz .LVec0Differs
  1064. pop %ebx
  1065. xor %eax, %eax
  1066. ret
  1067. .LVec0Differs:
  1068. bsf %ebx, %ebx
  1069. .LSubtractWords:
  1070. add %eax, %edx
  1071. movzwl (%eax,%ebx), %eax
  1072. movzwl (%edx,%ebx), %edx
  1073. sub %edx, %eax
  1074. pop %ebx
  1075. ret
  1076. .LAligned8xLoop_VecDiffers:
  1077. bsf %ebx, %ebx
  1078. add %ebx, %eax
  1079. pop %ecx
  1080. sub %ecx, %eax
  1081. and $-2, %eax
  1082. add %ecx, %eax
  1083. movzwl (%edx,%eax), %edx
  1084. movzwl (%eax), %eax
  1085. sub %edx, %eax
  1086. pop %ebx
  1087. end;
  1088. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1089. var
  1090. CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
  1091. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1092. begin
  1093. if has_sse2_support then
  1094. CompareWord_Impl:=@CompareWord_SSE2
  1095. else
  1096. CompareWord_Impl:=@CompareWord_Plain;
  1097. result:=CompareWord_Impl(buf1, buf2, len);
  1098. end;
  1099. function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
  1100. begin
  1101. result:=CompareWord_Impl(buf1, buf2, len);
  1102. end;
  1103. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1104. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1105. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
  1106. function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  1107. asm
  1108. sub $1, %ecx
  1109. jb .LNothing
  1110. push %ebx
  1111. sub %eax, %edx
  1112. .balign 16
  1113. .LDwordwise_Body:
  1114. mov (%edx,%eax), %ebx
  1115. cmp %ebx, (%eax)
  1116. jne .LDoSbb
  1117. add $4, %eax
  1118. sub $1, %ecx
  1119. jnb .LDwordwise_Body
  1120. pop %ebx
  1121. .LNothing:
  1122. xor %eax, %eax
  1123. ret
  1124. .LDoSbb:
  1125. pop %ebx
  1126. sbb %eax, %eax
  1127. or $1, %eax
  1128. end;
  1129. function CompareDWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  1130. asm
  1131. push %ebx
  1132. sub %eax, %edx { edx = buf2 - buf1 }
  1133. lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
  1134. cmp $536870906, %ebx
  1135. ja .LDwordwise_Prepare
  1136. shl $2, %ecx { convert to bytes }
  1137. movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
  1138. movdqu (%eax), %xmm0
  1139. pcmpeqd %xmm1, %xmm0
  1140. pmovmskb %xmm0, %ebx
  1141. inc %bx
  1142. jnz .LVec0Differs
  1143. sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  1144. jle .LLastVec
  1145. push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
  1146. add %eax, %ecx
  1147. and $-16, %eax { align buf1; +16 is performed by the loop. }
  1148. sub %eax, %ecx
  1149. .balign 16
  1150. .LAligned4xLoop_Body:
  1151. add $16, %eax
  1152. movdqu (%eax,%edx), %xmm0
  1153. pcmpeqb (%eax), %xmm0
  1154. pmovmskb %xmm0, %ebx
  1155. inc %bx
  1156. jnz .LAligned4xLoop_VecDiffers
  1157. sub $16, %ecx
  1158. ja .LAligned4xLoop_Body
  1159. pop %ebx { drop original buf1 }
  1160. .LLastVec:
  1161. lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
  1162. movdqu (%edx,%eax), %xmm1
  1163. movdqu (%eax), %xmm0
  1164. pcmpeqd %xmm1, %xmm0
  1165. pmovmskb %xmm0, %ebx
  1166. inc %bx
  1167. jnz .LVec0Differs
  1168. pop %ebx
  1169. xor %eax, %eax
  1170. ret
  1171. .LVec0Differs:
  1172. bsf %ebx, %ebx
  1173. add %eax, %edx { recover edx = buf2 }
  1174. mov (%edx,%ebx), %edx
  1175. cmp %edx, (%eax,%ebx)
  1176. sbb %eax, %eax
  1177. or $1, %eax
  1178. pop %ebx
  1179. ret
  1180. .LAligned4xLoop_VecDiffers:
  1181. bsf %ebx, %ebx
  1182. add %ebx, %eax
  1183. pop %ecx
  1184. sub %ecx, %eax
  1185. and $-4, %eax
  1186. add %ecx, %eax
  1187. mov (%edx,%eax), %edx
  1188. cmp %edx, (%eax)
  1189. .LDoSbb:
  1190. sbb %eax, %eax
  1191. or $1, %eax
  1192. pop %ebx
  1193. ret
  1194. .balign 16
  1195. .LDwordwise_Body:
  1196. mov (%edx,%eax), %ebx
  1197. cmp %ebx, (%eax)
  1198. jne .LDoSbb
  1199. add $4, %eax
  1200. .LDwordwise_Prepare:
  1201. sub $1, %ecx
  1202. jnb .LDwordwise_Body
  1203. pop %ebx
  1204. xor %eax, %eax
  1205. end;
  1206. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1207. var
  1208. CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
  1209. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1210. begin
  1211. if has_sse2_support then
  1212. CompareDWord_Impl:=@CompareDWord_SSE2
  1213. else
  1214. CompareDWord_Impl:=@CompareDWord_Plain;
  1215. result:=CompareDWord_Impl(buf1, buf2, len);
  1216. end;
  1217. function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
  1218. begin
  1219. result:=CompareDWord_Impl(buf1, buf2, len);
  1220. end;
  1221. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1222. {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
  1223. {$define FPC_SYSTEM_HAS_INDEXCHAR0}
  1224. function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
  1225. var
  1226. saveesi,saveebx : longint;
  1227. asm
  1228. movl %esi,saveesi
  1229. movl %ebx,saveebx
  1230. // Can't use scasb, or will have to do it twice, think this
  1231. // is faster for small "len"
  1232. movl %eax,%esi // Load address
  1233. movzbl %cl,%ebx // Load searchpattern
  1234. testl %edx,%edx
  1235. je .LFound
  1236. xorl %ecx,%ecx // zero index in Buf
  1237. xorl %eax,%eax // To make DWord compares possible
  1238. .balign 4
  1239. .LLoop:
  1240. movb (%esi),%al // Load byte
  1241. cmpb %al,%bl
  1242. je .LFound // byte the same?
  1243. incl %ecx
  1244. incl %esi
  1245. cmpl %edx,%ecx // Maximal distance reached?
  1246. je .LNotFound
  1247. testl %eax,%eax // Nullchar = end of search?
  1248. jne .LLoop
  1249. .LNotFound:
  1250. movl $-1,%ecx // Not found return -1
  1251. .LFound:
  1252. movl %ecx,%eax
  1253. movl saveesi,%esi
  1254. movl saveebx,%ebx
  1255. end;
  1256. {$endif FPC_SYSTEM_HAS_INDEXCHAR0}
  1257. {****************************************************************************
  1258. String
  1259. ****************************************************************************}
  1260. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1261. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1262. procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
  1263. var
  1264. saveesi,saveedi : longint;
  1265. asm
  1266. {$ifdef FPC_PROFILE}
  1267. push %eax
  1268. push %edx
  1269. push %ecx
  1270. call mcount
  1271. pop %ecx
  1272. pop %edx
  1273. pop %eax
  1274. {$endif FPC_PROFILE}
  1275. movl %edi,saveedi
  1276. movl %esi,saveesi
  1277. {$ifdef FPC_ENABLED_CLD}
  1278. cld
  1279. {$endif FPC_ENABLED_CLD}
  1280. movl res,%edi
  1281. movl sstr,%esi
  1282. movl %edx,%ecx
  1283. xorl %eax,%eax
  1284. lodsb
  1285. cmpl %ecx,%eax
  1286. jbe .LStrCopy1
  1287. movl %ecx,%eax
  1288. .LStrCopy1:
  1289. stosb
  1290. cmpl $7,%eax
  1291. jl .LStrCopy2
  1292. movl %edi,%ecx { Align on 32bits }
  1293. negl %ecx
  1294. andl $3,%ecx
  1295. subl %ecx,%eax
  1296. rep
  1297. movsb
  1298. movl %eax,%ecx
  1299. andl $3,%eax
  1300. shrl $2,%ecx
  1301. rep
  1302. movsl
  1303. .LStrCopy2:
  1304. movl %eax,%ecx
  1305. rep
  1306. movsb
  1307. movl saveedi,%edi
  1308. movl saveesi,%esi
  1309. end;
  1310. procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
  1311. begin
  1312. asm
  1313. {$ifdef FPC_PROFILE}
  1314. push %eax
  1315. push %edx
  1316. push %ecx
  1317. call mcount
  1318. pop %ecx
  1319. pop %edx
  1320. pop %eax
  1321. {$endif FPC_PROFILE}
  1322. pushl %eax
  1323. pushl %ecx
  1324. {$ifdef FPC_ENABLED_CLD}
  1325. cld
  1326. {$endif FPC_ENABLED_CLD}
  1327. movl dstr,%edi
  1328. movl sstr,%esi
  1329. xorl %eax,%eax
  1330. movl len,%ecx
  1331. lodsb
  1332. cmpl %ecx,%eax
  1333. jbe .LStrCopy1
  1334. movl %ecx,%eax
  1335. .LStrCopy1:
  1336. stosb
  1337. cmpl $7,%eax
  1338. jl .LStrCopy2
  1339. movl %edi,%ecx { Align on 32bits }
  1340. negl %ecx
  1341. andl $3,%ecx
  1342. subl %ecx,%eax
  1343. rep
  1344. movsb
  1345. movl %eax,%ecx
  1346. andl $3,%eax
  1347. shrl $2,%ecx
  1348. rep
  1349. movsl
  1350. .LStrCopy2:
  1351. movl %eax,%ecx
  1352. rep
  1353. movsb
  1354. popl %ecx
  1355. popl %eax
  1356. end ['ESI','EDI'];
  1357. end;
  1358. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1359. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1360. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1361. function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
  1362. var
  1363. saveesi,saveedi,saveebx : longint;
  1364. asm
  1365. {$ifdef FPC_PROFILE}
  1366. push %eax
  1367. push %edx
  1368. push %ecx
  1369. call mcount
  1370. pop %ecx
  1371. pop %edx
  1372. pop %eax
  1373. {$endif FPC_PROFILE}
  1374. movl %edi,saveedi
  1375. movl %esi,saveesi
  1376. movl %ebx,saveebx
  1377. {$ifdef FPC_ENABLED_CLD}
  1378. cld
  1379. {$endif FPC_ENABLED_CLD}
  1380. movl right,%esi
  1381. movl left,%edi
  1382. movzbl (%esi),%eax
  1383. movzbl (%edi),%ebx
  1384. movl %eax,%edx
  1385. incl %esi
  1386. incl %edi
  1387. cmpl %ebx,%eax
  1388. jbe .LStrCmp1
  1389. movl %ebx,%eax
  1390. .LStrCmp1:
  1391. cmpl $7,%eax
  1392. jl .LStrCmp2
  1393. movl %edi,%ecx { Align on 32bits }
  1394. negl %ecx
  1395. andl $3,%ecx
  1396. subl %ecx,%eax
  1397. orl %ecx,%ecx
  1398. repe
  1399. cmpsb
  1400. jne .LStrCmp3
  1401. movl %eax,%ecx
  1402. andl $3,%eax
  1403. shrl $2,%ecx
  1404. orl %ecx,%ecx
  1405. repe
  1406. cmpsl
  1407. je .LStrCmp2
  1408. movl $4,%eax
  1409. subl %eax,%esi
  1410. subl %eax,%edi
  1411. .LStrCmp2:
  1412. movl %eax,%ecx
  1413. orl %eax,%eax
  1414. repe
  1415. cmpsb
  1416. je .LStrCmp4
  1417. .LStrCmp3:
  1418. movzbl -1(%esi),%edx // Compare failing (or equal) position
  1419. movzbl -1(%edi),%ebx
  1420. .LStrCmp4:
  1421. movl %ebx,%eax // Compare length or position
  1422. subl %edx,%eax
  1423. movl saveedi,%edi
  1424. movl saveesi,%esi
  1425. movl saveebx,%ebx
  1426. end;
  1427. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1428. {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1429. {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1430. procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
  1431. {$ifndef FPC_PROFILE}
  1432. nostackframe;
  1433. {$endif}
  1434. // eax = res, edx = high(res), ecx = p
  1435. asm
  1436. {$ifdef FPC_PROFILE}
  1437. push %eax
  1438. push %edx
  1439. push %ecx
  1440. call mcount
  1441. pop %ecx
  1442. pop %edx
  1443. pop %eax
  1444. {$endif FPC_PROFILE}
  1445. test %ecx, %ecx
  1446. jz .LEmpty
  1447. push %eax { save res }
  1448. push %ecx { save p }
  1449. push %edx { save high(res) }
  1450. mov %ecx, %eax { eax = IndexByte.buf }
  1451. { edx is already high(res) = IndexByte.count.
  1452. Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
  1453. but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’.
  1454. Generic and x86 versions are “safe”. }
  1455. xor %ecx, %ecx { ecx = 0 = IndexByte.value }
  1456. { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
  1457. With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
  1458. {$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  1459. leal -12(%esp), %esp
  1460. {$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  1461. {$if defined(FPC_PIC) or not defined(has_i386_IndexByte_Impl)}
  1462. call IndexByte
  1463. {$else}
  1464. call IndexByte_Impl { manually inline IndexByte }
  1465. {$endif}
  1466. {$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  1467. leal 12(%esp), %esp
  1468. {$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  1469. pop %ecx { ecx = high(res) = Move.len }
  1470. test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
  1471. {$ifdef CPUX86_HAS_CMOV}
  1472. cmovns %eax, %ecx
  1473. {$else}
  1474. js .LEcxIsLen
  1475. mov %eax, %ecx
  1476. .LEcxIsLen:
  1477. {$endif}
  1478. pop %eax { pop p to eax = Move.src }
  1479. pop %edx { pop res to edx }
  1480. mov %cl, (%edx) { res[0] := len }
  1481. inc %edx { res[1] = Move.dst }
  1482. {$ifdef FPC_PROFILE}
  1483. {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  1484. leal -12(%esp), %esp
  1485. {$endif FPC_SYSTEM_STACKALIGNMENT16}
  1486. call Move
  1487. {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  1488. leal 12(%esp), %esp
  1489. {$endif FPC_SYSTEM_STACKALIGNMENT16}
  1490. jmp .LReturn
  1491. {$else FPC_PROFILE}
  1492. jmp Move { can perform a tail call }
  1493. {$endif FPC_PROFILE}
  1494. .LEmpty:
  1495. movb $0, (%eax)
  1496. {$ifdef FPC_PROFILE}
  1497. .LReturn:
  1498. {$endif}
  1499. end;
  1500. {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1501. {$undef has_i386_IndexByte_Impl} { no longer required }
  1502. {$IFNDEF INTERNAL_BACKTRACE}
  1503. {$define FPC_SYSTEM_HAS_GET_FRAME}
  1504. function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  1505. asm
  1506. movl %ebp,%eax
  1507. end;
  1508. {$ENDIF not INTERNAL_BACKTRACE}
  1509. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
  1510. Function Get_pc_addr : Pointer;assembler;nostackframe;
  1511. asm
  1512. movl (%esp),%eax
  1513. end;
  1514. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
  1515. function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
  1516. {$if defined(win32)}
  1517. { Windows has StackTop always properly set }
  1518. begin
  1519. if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
  1520. Result:=PPointer(framebp+4)^
  1521. else
  1522. Result:=nil;
  1523. end;
  1524. {$else defined(win32)}
  1525. nostackframe;assembler;
  1526. asm
  1527. orl %eax,%eax
  1528. jz .Lg_a_null
  1529. movl 4(%eax),%eax
  1530. .Lg_a_null:
  1531. end;
  1532. {$endif defined(win32)}
  1533. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
  1534. function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
  1535. {$if defined(win32)}
  1536. { Windows has StackTop always properly set }
  1537. begin
  1538. if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
  1539. Result:=PPointer(framebp)^
  1540. else
  1541. Result:=nil;
  1542. end;
  1543. {$else defined(win32)}
  1544. nostackframe;assembler;
  1545. asm
  1546. orl %eax,%eax
  1547. jz .Lgnf_null
  1548. movl (%eax),%eax
  1549. .Lgnf_null:
  1550. end;
  1551. {$endif defined(win32)}
  1552. {$define FPC_SYSTEM_HAS_SPTR}
  1553. Function Sptr : Pointer;assembler;nostackframe;
  1554. asm
  1555. movl %esp,%eax
  1556. end;
  1557. {****************************************************************************
  1558. Str()
  1559. ****************************************************************************}
  1560. {$if defined(disabled) and defined(regcall) }
  1561. {$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
  1562. {$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
  1563. label str_int_shortcut;
  1564. procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
  1565. asm
  1566. pushl %esi
  1567. pushl %edi
  1568. pushl %ebx
  1569. mov %edx,%edi
  1570. xor %edx,%edx
  1571. jmp str_int_shortcut
  1572. end;
  1573. procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
  1574. {Optimized for speed, but balanced with size.}
  1575. const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
  1576. 100000,1000000,10000000,
  1577. 100000000,1000000000);
  1578. asm
  1579. {$ifdef FPC_PROFILE}
  1580. push %eax
  1581. push %edx
  1582. push %ecx
  1583. call mcount
  1584. pop %ecx
  1585. pop %edx
  1586. pop %eax
  1587. {$endif FPC_PROFILE}
  1588. push %esi
  1589. push %edi
  1590. push %ebx
  1591. movl %edx,%edi
  1592. { Calculate absolute value and put sign in edx}
  1593. cltd
  1594. xorl %edx,%eax
  1595. subl %edx,%eax
  1596. negl %edx
  1597. str_int_shortcut:
  1598. movl %ecx,%esi
  1599. {Calculate amount of digits in ecx.}
  1600. xorl %ecx,%ecx
  1601. bsrl %eax,%ecx
  1602. incl %ecx
  1603. imul $1233,%ecx
  1604. shr $12,%ecx
  1605. {$ifdef FPC_PIC}
  1606. call fpc_geteipasebx
  1607. {$ifdef darwin}
  1608. movl digits-.Lpic(%ebx),%ebx
  1609. {$else}
  1610. addl $_GLOBAL_OFFSET_TABLE_,%ebx
  1611. movl digits@GOT(%ebx),%ebx
  1612. {$endif}
  1613. cmpl (%ebx,%ecx,4),%eax
  1614. {$else}
  1615. cmpl digits(,%ecx,4),%eax
  1616. {$endif}
  1617. cmc
  1618. adcl $0,%ecx {Nr. digits ready in ecx.}
  1619. {Write length & sign.}
  1620. lea (%edx,%ecx),%ebx
  1621. movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
  1622. movw %bx,(%edi)
  1623. addl %edx,%edi
  1624. subl %edx,%esi
  1625. {Skip digits beyond string length.}
  1626. movl %eax,%edx
  1627. subl %ecx,%esi
  1628. jae .Lloop_write
  1629. .balign 4
  1630. .Lloop_skip:
  1631. movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
  1632. mull %edx
  1633. shrl $3,%edx
  1634. decl %ecx
  1635. jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
  1636. incl %esi
  1637. jnz .Lloop_skip
  1638. {Write out digits.}
  1639. .balign 4
  1640. .Lloop_write:
  1641. movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
  1642. {Pre-add '0'}
  1643. leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
  1644. mull %edx
  1645. shrl $3,%edx
  1646. leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
  1647. subl %edx,%ebx
  1648. subl %eax,%ebx
  1649. movb %bl,(%edi,%ecx)
  1650. decl %ecx
  1651. jnz .Lloop_write
  1652. .Ldone:
  1653. popl %ebx
  1654. popl %edi
  1655. popl %esi
  1656. end;
  1657. {$endif}
  1658. {****************************************************************************
  1659. Bounds Check
  1660. ****************************************************************************}
  1661. { do a thread-safe inc/dec }
  1662. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
  1663. function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
  1664. asm
  1665. lock
  1666. decl (%eax)
  1667. setzb %al
  1668. end;
  1669. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
  1670. procedure cpuinclocked(var l : longint);assembler;nostackframe;
  1671. asm
  1672. lock
  1673. incl (%eax)
  1674. end;
  1675. // inline SMP check and normal lock.
  1676. // the locked one is so slow, inlining doesn't matter.
  1677. function declocked(var l : longint) : boolean; inline;
  1678. begin
  1679. if not ismultithread then
  1680. begin
  1681. dec(l);
  1682. declocked:=l=0;
  1683. end
  1684. else
  1685. declocked:=cpudeclocked(l);
  1686. end;
  1687. procedure inclocked(var l : longint); inline;
  1688. begin
  1689. if not ismultithread then
  1690. inc(l)
  1691. else
  1692. cpuinclocked(l);
  1693. end;
  1694. function InterLockedDecrement (var Target: longint) : longint; assembler;
  1695. asm
  1696. movl $-1,%edx
  1697. xchgl %edx,%eax
  1698. lock
  1699. xaddl %eax, (%edx)
  1700. decl %eax
  1701. end;
  1702. function InterLockedIncrement (var Target: longint) : longint; assembler;
  1703. asm
  1704. movl $1,%edx
  1705. xchgl %edx,%eax
  1706. lock
  1707. xaddl %eax, (%edx)
  1708. incl %eax
  1709. end;
  1710. function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler;
  1711. asm
  1712. xchgl (%eax),%edx
  1713. movl %edx,%eax
  1714. end;
  1715. function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler;
  1716. asm
  1717. xchgl %eax,%edx
  1718. lock
  1719. xaddl %eax, (%edx)
  1720. end;
  1721. function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler;
  1722. asm
  1723. xchgl %eax,%ecx
  1724. lock
  1725. cmpxchgl %edx, (%ecx)
  1726. end;
  1727. function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
  1728. asm
  1729. pushl %ebx
  1730. pushl %edi
  1731. movl %eax,%edi
  1732. movl Comperand+4,%edx
  1733. movl Comperand+0,%eax
  1734. movl NewValue+4,%ecx
  1735. movl NewValue+0,%ebx
  1736. lock cmpxchg8b (%edi)
  1737. pop %edi
  1738. pop %ebx
  1739. end;
  1740. {****************************************************************************
  1741. FPU
  1742. ****************************************************************************}
  1743. const
  1744. { Internal constants for use in system unit }
  1745. FPU_Invalid = 1;
  1746. FPU_Denormal = 2;
  1747. FPU_DivisionByZero = 4;
  1748. FPU_Overflow = 8;
  1749. FPU_Underflow = $10;
  1750. FPU_StackUnderflow = $20;
  1751. FPU_StackOverflow = $40;
  1752. FPU_ExceptionMask = $ff;
  1753. MM_Invalid = 1;
  1754. MM_Denormal = 2;
  1755. MM_DivisionByZero = 4;
  1756. MM_Overflow = 8;
  1757. MM_Underflow = $10;
  1758. MM_Precicion = $20;
  1759. MM_ExceptionMask = $3f;
  1760. MM_MaskInvalidOp = %0000000010000000;
  1761. MM_MaskDenorm = %0000000100000000;
  1762. MM_MaskDivZero = %0000001000000000;
  1763. MM_MaskOverflow = %0000010000000000;
  1764. MM_MaskUnderflow = %0000100000000000;
  1765. MM_MaskPrecision = %0001000000000000;
  1766. {$define FPC_SYSTEM_HAS_SYSINITFPU}
  1767. Procedure SysInitFPU;
  1768. begin
  1769. end;
  1770. {$define FPC_SYSTEM_HAS_SYSRESETFPU}
  1771. Procedure SysResetFPU;
  1772. var
  1773. { these locals are so we don't have to hack pic code in the assembler }
  1774. localmxcsr: dword;
  1775. localfpucw: word;
  1776. begin
  1777. localfpucw:=Default8087CW;
  1778. asm
  1779. fninit
  1780. fwait
  1781. fldcw localfpucw
  1782. end;
  1783. if has_sse_support then
  1784. begin
  1785. localmxcsr:=DefaultMXCSR;
  1786. asm
  1787. { setup sse exceptions }
  1788. {$ifndef OLD_ASSEMBLER}
  1789. ldmxcsr localmxcsr
  1790. {$else OLD_ASSEMBLER}
  1791. mov localmxcsr,%eax
  1792. subl $4,%esp
  1793. mov %eax,(%esp)
  1794. //ldmxcsr (%esp)
  1795. .byte 0x0f,0xae,0x14,0x24
  1796. addl $4,%esp
  1797. {$endif OLD_ASSEMBLER}
  1798. end;
  1799. end;
  1800. end;
{ because of the brain dead sse detection on x86, this test is postponed }
  1802. procedure fpc_cpucodeinit;
  1803. var
  1804. _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
  1805. begin
  1806. if cpuid_support then
  1807. begin
  1808. asm
  1809. movl $1,%eax
  1810. xorl %ecx,%ecx
  1811. cpuid
  1812. movl %edx,_edx_cpuid1
  1813. movl %ecx,_ecx_cpuid1
  1814. end ['ebx'];
  1815. has_mmx_support:=(_edx_cpuid1 and $800000)<>0;
  1816. if ((_edx_cpuid1 and $2000000)<>0) then
  1817. begin
  1818. os_supports_sse:=true;
  1819. sse_check:=true;
  1820. asm
  1821. { force an sse exception if no sse is supported, the exception handler sets
  1822. os_supports_sse to false then }
  1823. { don't change this instruction, the code above depends on its size }
  1824. {$ifdef OLD_ASSEMBLER}
  1825. .byte 0x0f,0x28,0xf7
  1826. {$else}
  1827. movaps %xmm7, %xmm6
  1828. {$endif not EMX}
  1829. end;
  1830. sse_check:=false;
  1831. has_sse_support:=os_supports_sse;
  1832. end;
  1833. if has_sse_support then
  1834. begin
  1835. has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
  1836. has_sse3_support:=((_ecx_cpuid1 and $200)<>0);
  1837. { now avx }
  1838. asm
  1839. xorl %eax,%eax
  1840. cpuid
  1841. movl %eax,_eax
  1842. end;
  1843. if _eax>=7 then
  1844. begin
  1845. asm
  1846. movl $7,%eax
  1847. xorl %ecx,%ecx
  1848. cpuid
  1849. movl %ebx,_ebx_cpuid7
  1850. end;
  1851. fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
  1852. if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
  1853. begin
  1854. asm
  1855. xorl %ecx,%ecx
  1856. .byte 0x0f,0x01,0xd0 { xgetbv }
  1857. movl %eax,_eax
  1858. end;
  1859. if (_eax and 6)=6 then
  1860. begin
  1861. has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
  1862. has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
  1863. end;
  1864. end;
  1865. end;
  1866. end;
  1867. end;
  1868. { don't let libraries influence the FPU cw set by the host program }
  1869. if IsLibrary then
  1870. begin
  1871. Default8087CW:=Get8087CW;
  1872. if has_sse_support then
  1873. DefaultMXCSR:=GetMXCSR;
  1874. end;
  1875. SysResetFPU;
  1876. fpc_cpucodeinit_performed:=true;
  1877. end;
  1878. {$if not defined(darwin) and defined(regcall) }
  1879. { darwin requires that the stack is aligned to 16 bytes when calling another function }
  1880. {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
  1881. {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
  Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
  { Releases one reference to the ansistring that S points to.

    In:  eax = address of S.
    S is set to nil whenever it was non-nil.  The reference count stored at
    offset -8 from the string data is decremented unless it is negative, and
    the block starting at offset -12 is handed to FPC_FREEMEM once the count
    reaches zero. }
  asm
          movl    (%eax),%edx                     // edx := pointer(S)
          testl   %edx,%edx
          jz      .Ldone                          // S is nil: nothing to release
          movl    $0,(%eax)                       // S := nil
          cmpl    $0,-8(%edx)                     // a negative refcount is left untouched
          jl      .Ldone
  {$ifdef FPC_PIC}
          // position independent code: reach ismultithread through the GOT
          call    fpc_geteipasecx
          addl    $_GLOBAL_OFFSET_TABLE_,%ecx
          movl    ismultithread@GOT(%ecx),%ecx
          cmpl    $0,(%ecx)
  {$else FPC_PIC}
          cmpl    $0,ismultithread
  {$endif FPC_PIC}
          je      .Lnolock
          // LOCK prefix for the decl below; skipped when IsMultiThread = false.
          // Emitted as a raw byte because the FPC assembler does not accept a
          // LOCK mnemonic that is separated from its instruction.
          .byte   0xF0
  .Lnolock:
          decl    -8(%edx)
          jz      .Lrelease                       // count reached zero: free the block
  .Ldone:
          ret
  .Lrelease:
          leal    -12(%edx),%eax                  // eax := start of the allocation
          { unlike the assembler leaf routine fpc_geteipasecx, freemem has to be
            entered with a properly aligned stack }
  {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
          leal    -12(%esp),%esp
          call    FPC_FREEMEM
          leal    12(%esp),%esp
  {$else FPC_SYSTEM_STACKALIGNMENT16}
          jmp     FPC_FREEMEM                     // no alignment needed: tail call
  {$endif FPC_SYSTEM_STACKALIGNMENT16}
  end;
  1917. function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
  1918. {$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
  Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
  { Returns pointer(S); when S is non-nil and its reference count (stored at
    offset -8 from the string data) differs from 1, the result comes from
    fpc_truely_ansistr_unique(S) instead.

    In:  eax = address of S.
    Out: eax = result. }
  asm
          movl    %eax,%edx                       // edx keeps the address of S
          movl    (%eax),%eax                     // result := pointer(S)
          testl   %eax,%eax
          je      .Lreturn                        // nil string is returned as is
          movl    -8(%eax),%ecx                   // ecx := reference count
          cmpl    $1,%ecx
          je      .Lreturn                        // sole owner already
          movl    %edx,%eax                       // pass S on to the helper
  {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
          leal    -12(%esp),%esp
  {$endif FPC_SYSTEM_STACKALIGNMENT16}
          call    fpc_truely_ansistr_unique
  {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
          leal    12(%esp),%esp
  {$endif FPC_SYSTEM_STACKALIGNMENT16}
  .Lreturn:
  end;
  1947. {$endif FPC_HAS_FEATURE_ANSISTRINGS}
  1948. {$endif ndef darwin and defined(regcall) }
  1949. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  1950. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
  procedure ReadBarrier;assembler;nostackframe;
  { Load barrier: lfence when CPUX86_HAS_SSE2 is defined, otherwise a locked
    add of zero to the dword at the top of the stack. }
  asm
  {$ifndef CPUX86_HAS_SSE2}
          lock
          addl    $0,0(%esp)                      // adds 0: memory contents stay unchanged
  {$else CPUX86_HAS_SSE2}
          lfence
  {$endif CPUX86_HAS_SSE2}
  end;
  procedure ReadDependencyBarrier;{$ifdef SYSTEMINLINE}inline;{$endif}
  { Intentionally empty: on this target a read already acts as a barrier for
    the earlier reads it depends on, so no instruction is emitted. }
  begin
  end;
  procedure ReadWriteBarrier;assembler;nostackframe;
  { Full barrier: mfence when CPUX86_HAS_SSE2 is defined, otherwise a locked
    add of zero to the dword at the top of the stack. }
  asm
  {$ifndef CPUX86_HAS_SSE2}
          lock
          addl    $0,0(%esp)                      // adds 0: memory contents stay unchanged
  {$else CPUX86_HAS_SSE2}
          mfence
  {$endif CPUX86_HAS_SSE2}
  end;
  procedure WriteBarrier;assembler;nostackframe;
  { Store barrier: emits sfence when CPUX86_HAS_SSEUNIT is defined; without
    that define the routine contains no instruction of its own. }
  asm
  {$ifdef CPUX86_HAS_SSEUNIT}
          sfence
  {$endif CPUX86_HAS_SSEUNIT}
  end;
  1979. {$endif}
  1980. {$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
  1981. {$define FPC_SYSTEM_HAS_BSF_QWORD}
  function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
  { Index (0..63) of the lowest set bit of AValue, or 255 when AValue = 0.

    In:  AValue on the stack, low dword at 4(%esp), high dword at 8(%esp).
    Out: eax = result. }
  asm
          bsfl    4(%esp),%eax                    // scan the low dword first
          jnz     .Lfound                         // hit: eax already holds 0..31
          bsfl    8(%esp),%eax                    // low half empty, scan the high dword
          jnz     .Lhighhalf
          movl    $223,%eax                       // no bit set at all: 223 + 32 = 255
  .Lhighhalf:
          addl    $32,%eax                        // rebase a high-dword index to 32..63
  .Lfound:
  end;
  1994. {$endif FPC_SYSTEM_HAS_BSF_QWORD}
  1995. {$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
  1996. {$define FPC_SYSTEM_HAS_BSR_QWORD}
  function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
  { Index (0..63) of the highest set bit of AValue, or 255 when AValue = 0.

    In:  AValue on the stack, low dword at 4(%esp), high dword at 8(%esp).
    Out: eax = result. }
  asm
          bsrl    8(%esp),%eax                    // scan the high dword first
          jz      .Llowhalf                       // high half empty
          addl    $32,%eax                        // rebase a high-dword index to 32..63
          jmp     .Ldone
  .Llowhalf:
          bsrl    4(%esp),%eax                    // scan the low dword
          jnz     .Ldone                          // hit: eax holds 0..31
          movl    $255,%eax                       // no bit set at all
  .Ldone:
  end;
  2009. {$endif FPC_SYSTEM_HAS_BSR_QWORD}
  2010. {$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
  2011. {$define FPC_SYSTEM_HAS_SAR_QWORD}
  function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
  { Arithmetic (sign-propagating) right shift of AValue by Shift and 63 bits.

    In:  al = Shift; AValue on the stack, low dword at 4(%esp), high dword
         at 8(%esp).
    Out: edx:eax = result. }
  asm
          movl    8(%esp),%edx                    // edx := high dword of AValue
          movb    %al,%cl                         // save Shift before eax is overwritten
          movl    4(%esp),%eax                    // eax := low dword of AValue
          andb    $63,%cl                         // only the low six bits of Shift count
          cmpb    $32,%cl
          jnb     .Lwide                          // 32..63: handled separately
          shrdl   %cl,%edx,%eax                   // low dword takes bits from the high dword
          sarl    %cl,%edx                        // high dword shifts with sign fill
          jmp     .Lfinish
  .Lwide:
          movl    %edx,%eax                       // low dword := old high dword (shift by 32)
          sarl    $31,%edx                        // high dword := sign extension
          andb    $31,%cl                         // remaining shift count
          sarl    %cl,%eax
  .Lfinish:
  end;
  2030. {$endif FPC_SYSTEM_HAS_SAR_QWORD}