{ i386.inc — extraction artifact (page line-number gutter) removed. }
{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team.

    Processor dependent implementation for the system unit for
    intel i386+

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$if not(defined(VER3_0)) and defined(linux)}
  {$define FPC_SYSTEM_STACKALIGNMENT16}
{$endif not(defined(VER3_0)) and defined(linux)}

{****************************************************************************
                               Primitives
****************************************************************************}
  18. var
  19. os_supports_sse : boolean;
  20. { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
  21. sse_check : boolean;
  22. fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  23. has_sse41_support : boolean;
  24. fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
  25. {$asmmode ATT}
  26. function cpuid_support : boolean;assembler;nostackframe;
  27. {
  28. Check if the ID-flag can be changed, if changed then CpuID is supported.
  29. Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
  30. }
  31. asm
  32. pushfl
  33. movl (%esp),%eax
  34. xorl $0x200000,%eax
  35. pushl %eax
  36. popfl
  37. pushfl
  38. popl %eax
  39. xorl (%esp),%eax
  40. popfl
  41. testl $0x200000,%eax
  42. setnz %al
  43. end;
  44. {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
  45. procedure fpc_cpuinit;
  46. begin
  47. { because of the brain dead sse detection on x86, this test is post poned to fpc_cpucodeinit which
  48. must be implemented OS dependend (FK)
  49. has_sse_support:=sse_support;
  50. has_mmx_support:=mmx_support;
  51. }
  52. end;
  53. {$ifndef darwin}
  54. procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
  55. asm
  56. movl (%esp),%ebx
  57. end;
  58. procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
  59. asm
  60. movl (%esp),%ecx
  61. end;
  62. {$endif}
  63. {$if not defined(FPC_SYSTEM_HAS_MOVE)
  64. and not defined(OLD_ASSEMBLER)
  65. and not defined(darwin)}
  66. {$i fastmove.inc}
  67. {$endif}
  68. {$ifndef FPC_SYSTEM_HAS_MOVE}
  69. {$define FPC_SYSTEM_HAS_MOVE}
  70. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
  71. var
  72. saveesi,saveedi : longint;
  73. asm
  74. movl %edi,saveedi
  75. movl %esi,saveesi
  76. movl %eax,%esi
  77. movl %edx,%edi
  78. movl %ecx,%edx
  79. movl %edi,%eax
  80. { check for zero or negative count }
  81. cmpl $0,%edx
  82. jle .LMoveEnd
  83. { Check for back or forward }
  84. sub %esi,%eax
  85. jz .LMoveEnd { Do nothing when source=dest }
  86. jc .LFMove { Do forward, dest<source }
  87. cmp %edx,%eax
  88. jb .LBMove { Dest is in range of move, do backward }
  89. { Forward Copy }
  90. .LFMove:
  91. {$ifdef FPC_ENABLED_CLD}
  92. cld
  93. {$endif FPC_ENABLED_CLD}
  94. cmpl $15,%edx
  95. jl .LFMove1
  96. movl %edi,%ecx { Align on 32bits }
  97. negl %ecx
  98. andl $3,%ecx
  99. subl %ecx,%edx
  100. rep
  101. movsb
  102. movl %edx,%ecx
  103. andl $3,%edx
  104. shrl $2,%ecx
  105. rep
  106. movsl
  107. .LFMove1:
  108. movl %edx,%ecx
  109. rep
  110. movsb
  111. jmp .LMoveEnd
  112. { Backward Copy }
  113. .LBMove:
  114. std
  115. addl %edx,%esi
  116. addl %edx,%edi
  117. movl %edi,%ecx
  118. decl %esi
  119. decl %edi
  120. cmpl $15,%edx
  121. jl .LBMove1
  122. negl %ecx { Align on 32bits }
  123. andl $3,%ecx
  124. subl %ecx,%edx
  125. rep
  126. movsb
  127. movl %edx,%ecx
  128. andl $3,%edx
  129. shrl $2,%ecx
  130. subl $3,%esi
  131. subl $3,%edi
  132. rep
  133. movsl
  134. addl $3,%esi
  135. addl $3,%edi
  136. .LBMove1:
  137. movl %edx,%ecx
  138. rep
  139. movsb
  140. cld
  141. .LMoveEnd:
  142. movl saveedi,%edi
  143. movl saveesi,%esi
  144. end;
  145. {$endif FPC_SYSTEM_HAS_MOVE}
  146. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  147. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  148. or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  149. or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  150. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  151. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  152. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
  153. const
  154. FillXxxx_RepStosThreshold_ERMS = 1024;
  155. FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;
  156. procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
  157. { eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
  158. asm
  159. {$ifdef FPC_ENABLED_CLD}
  160. cld
  161. {$endif FPC_ENABLED_CLD}
  162. mov %ecx, (%eax) { Write first 4 bytes unaligned. }
  163. push %ecx { pattern }
  164. push %edi
  165. mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
  166. xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
  167. shl $3, %ecx { ecx = misalignment of x in bits. }
  168. rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
  169. add %edi, %edx { edx = x end }
  170. lea -1(%edx), %ecx { ecx = x end - 1. }
  171. add $4, %edi
  172. and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
  173. and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
  174. sub %edi, %ecx { ecx = byte count between them. }
  175. shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
  176. rep stosl
  177. pop %edi
  178. pop %ecx
  179. mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
  180. end;
  181. {$endif FillChar/Word/DWord required.}
  182. label
  183. FillXxxx_MoreThanTwoXMMs;
  184. procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
  185. { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
  186. const
  187. NtThreshold = 4 * 1024 * 1024;
  188. asm
  189. movd %ecx, %xmm0
  190. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  191. movdqu %xmm0, (%eax)
  192. movdqu %xmm0, -16(%eax,%edx)
  193. cmp $32, %edx
  194. ja .LMoreThanTwoVectors
  195. ret
  196. .byte 144 { Turn .balign 16 before .L64x_Body into a no-op. }
  197. { x can start and end misaligned on the vector boundary:
  198. x = ~~][H1][H2][...][T2][T1]~
  199. [UH] [UT]
  200. UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
  201. .LMoreThanTwoVectors:
  202. push %esi
  203. mov %ecx, %esi { esi = pattern }
  204. mov %eax, %ecx
  205. shl $3, %ecx { ecx = misalignment of x in bits }
  206. rol %cl, %esi { misalign the pattern }
  207. movd %esi, %xmm0
  208. pshufd $0, %xmm0, %xmm0
  209. pop %esi
  210. { FillChar (to skip the misaligning above) and FillQWord jump here.
  211. eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
  212. FillXxxx_MoreThanTwoXMMs:
  213. lea -65(%eax,%edx), %ecx
  214. and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
  215. mov %ecx, %edx { Remember T4 to edx. }
  216. and $-16, %eax { eax = H1 − 16. }
  217. sub %eax, %ecx { ecx = aligned byte count − 48. }
  218. movdqa %xmm0, 16(%eax) { Write H1. }
  219. cmp $32-48, %ecx
  220. jle .LOneAlignedTailWrite
  221. movdqa %xmm0, 32(%eax) { Write H2. }
  222. cmp $64-48, %ecx
  223. jle .LTwoAlignedTailWrites
  224. sub $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
  225. jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
  226. add $48, %eax { eax = H3. }
  227. cmp $NtThreshold, %ecx
  228. jae .L64xNT_Body
  229. .balign 16 { no-op }
  230. .L64x_Body:
  231. movdqa %xmm0, (%eax)
  232. movdqa %xmm0, 16(%eax)
  233. movdqa %xmm0, 32(%eax)
  234. movdqa %xmm0, 48(%eax)
  235. add $64, %eax
  236. sub $64, %ecx
  237. ja .L64x_Body
  238. .LFourAlignedTailWrites:
  239. movdqa %xmm0, (%edx) { T4 }
  240. movdqa %xmm0, 16(%edx) { T3 }
  241. .LTwoAlignedTailWrites:
  242. movdqa %xmm0, 32(%edx) { T2 }
  243. .LOneAlignedTailWrite:
  244. movdqa %xmm0, 48(%edx) { T1 }
  245. ret
  246. .balign 16
  247. .L64xNT_Body:
  248. movntdq %xmm0, (%eax)
  249. movntdq %xmm0, 16(%eax)
  250. movntdq %xmm0, 32(%eax)
  251. movntdq %xmm0, 48(%eax)
  252. add $64, %eax
  253. sub $64, %ecx
  254. ja .L64xNT_Body
  255. sfence
  256. jmp .LFourAlignedTailWrites
  257. end;
  258. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  259. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  260. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
  261. {$ifndef CPUX86_HAS_SSE2}
  262. procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
  263. { eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
  264. asm
  265. mov %ecx, (%eax) { Write first 4 bytes. }
  266. lea -9(%eax,%edx), %edx
  267. mov %ecx, 5(%edx) { Write last 4 bytes. }
  268. and $-4, %edx { edx = loop bound. }
  269. push %esi
  270. mov %ecx, %esi { esi = pattern }
  271. mov %eax, %ecx
  272. shl $3, %ecx { ecx = misalignment of x in bits }
  273. rol %cl, %esi { misalign the pattern }
  274. add $4, %eax
  275. and $-4, %eax
  276. .balign 16
  277. .L8xLoop:
  278. mov %esi, (%eax)
  279. mov %esi, 4(%eax)
  280. add $8, %eax
  281. cmp %edx, %eax
  282. jb .L8xLoop
  283. mov %esi, (%edx)
  284. mov %esi, 4(%edx)
  285. pop %esi
  286. end;
  287. {$endif ndef CPUX86_HAS_SSE2 (need Fill*_Plain)}
  288. procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
  289. { eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16. }
  290. asm
  291. mov %ecx, (%eax)
  292. cmp $8, %edx
  293. jle .LLast4
  294. mov %ecx, 4(%eax)
  295. mov %ecx, -8(%eax,%edx)
  296. .LLast4:
  297. mov %ecx, -4(%eax,%edx)
  298. end;
  299. {$endif FillChar/Word/DWord required.}
  300. {$endif FillChar/Word/DWord/QWord required.}
  301. {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  302. {$define FPC_SYSTEM_HAS_FILLCHAR}
  303. procedure FillChar_3OrLess; assembler; nostackframe;
  304. { cl — x, edx — byte count, Low(int32) <= edx <= 3. }
  305. asm
  306. test %edx, %edx
  307. jle .LQuit
  308. mov %cl, (%eax)
  309. mov %cl, -1(%eax,%edx)
  310. shr $1, %edx
  311. mov %cl, (%eax,%edx)
  312. .LQuit:
  313. end;
  314. {$ifndef CPUX86_HAS_SSE2}
  315. procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
  316. asm
  317. cmp $3, %edx
  318. jle FillChar_3OrLess
  319. movzbl %cl, %ecx
  320. imul $0x01010101, %ecx
  321. cmp $16, %edx
  322. jbe FillXxxx_U32Pattern_Ladder_4to16
  323. jmp FillXxxx_U32Pattern_Plain_16OrMore
  324. end;
  325. {$endif ndef CPUX86_HAS_SSE2}
  326. procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
  327. asm
  328. cmp $3, %edx
  329. jle FillChar_3OrLess
  330. movzbl %cl, %ecx
  331. imul $0x01010101, %ecx
  332. cmp $16, %edx
  333. jbe FillXxxx_U32Pattern_Ladder_4to16
  334. cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  335. jae FillXxxx_U32Pattern_RepStos_8OrMore
  336. movd %ecx, %xmm0
  337. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  338. movdqu %xmm0, (%eax)
  339. movdqu %xmm0, -16(%eax,%edx)
  340. cmp $32, %edx
  341. ja FillXxxx_MoreThanTwoXMMs
  342. end;
  343. procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
  344. asm
  345. cmp $3, %edx
  346. jle FillChar_3OrLess
  347. movzbl %cl, %ecx
  348. imul $0x01010101, %ecx
  349. cmp $16, %edx
  350. jbe FillXxxx_U32Pattern_Ladder_4to16
  351. cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  352. jae FillXxxx_U32Pattern_RepStos_8OrMore
  353. movd %ecx, %xmm0
  354. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  355. movdqu %xmm0, (%eax)
  356. movdqu %xmm0, -16(%eax,%edx)
  357. cmp $32, %edx
  358. ja FillXxxx_MoreThanTwoXMMs
  359. end;
  360. procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
  361. var
  362. FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;
  363. procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
  364. begin
  365. if not fpc_cpucodeinit_performed then
  366. begin
  367. {$ifdef CPUX86_HAS_SSE2} FillChar_SSE2 {$else} FillChar_Plain {$endif} (x, count, value);
  368. exit;
  369. end;
  370. if fast_large_repmovstosb then
  371. FillChar_Impl := @FillChar_SSE2_ERMS
  372. else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
  373. FillChar_Impl := @FillChar_SSE2
  374. {$ifndef CPUX86_HAS_SSE2}
  375. else
  376. FillChar_Impl := @FillChar_Plain
  377. {$endif ndef CPUX86_HAS_SSE2};
  378. FillChar_Impl(x, count, value);
  379. end;
  380. procedure FillChar(var x;count:SizeInt;value:byte);
  381. begin
  382. FillChar_Impl(x, count, value);
  383. end;
  384. {$endif FPC_SYSTEM_HAS_FILLCHAR}
  385. {$ifndef FPC_SYSTEM_HAS_FILLWORD}
  386. {$define FPC_SYSTEM_HAS_FILLWORD}
  387. procedure FillWord_3OrLess; assembler; nostackframe;
  388. asm
  389. test %edx, %edx
  390. jle .LQuit
  391. mov %cx, (%eax)
  392. mov %cx, -2(%eax,%edx,2)
  393. shr $1, %edx
  394. mov %cx, (%eax,%edx,2)
  395. .LQuit:
  396. end;
  397. {$ifndef CPUX86_HAS_SSE2}
  398. procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
  399. asm
  400. cmp $3, %edx
  401. jle FillWord_3OrLess
  402. shl $1, %edx
  403. movzwl %cx, %ecx
  404. imul $0x00010001, %ecx
  405. cmp $16, %edx
  406. jbe FillXxxx_U32Pattern_Ladder_4to16
  407. jmp FillXxxx_U32Pattern_Plain_16OrMore
  408. end;
  409. {$endif ndef CPUX86_HAS_SSE2}
  410. procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
  411. asm
  412. cmp $3, %edx
  413. jle FillWord_3OrLess
  414. shl $1, %edx
  415. movzwl %cx, %ecx
  416. imul $0x00010001, %ecx
  417. cmp $16, %edx
  418. jbe FillXxxx_U32Pattern_Ladder_4to16
  419. cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  420. jb FillXxxx_U32Pattern_SSE2_16OrMore
  421. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  422. end;
  423. procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
  424. asm
  425. cmp $3, %edx
  426. jle FillWord_3OrLess
  427. shl $1, %edx
  428. movzwl %cx, %ecx
  429. imul $0x00010001, %ecx
  430. cmp $16, %edx
  431. jbe FillXxxx_U32Pattern_Ladder_4to16
  432. cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  433. jb FillXxxx_U32Pattern_SSE2_16OrMore
  434. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  435. end;
  436. procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;
  437. var
  438. FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;
  439. procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
  440. begin
  441. if not fpc_cpucodeinit_performed then
  442. begin
  443. {$ifdef CPUX86_HAS_SSE2} FillWord_SSE2 {$else} FillWord_Plain {$endif} (x, count, value);
  444. exit;
  445. end;
  446. if fast_large_repmovstosb then
  447. FillWord_Impl := @FillWord_SSE2_ERMS
  448. else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
  449. FillWord_Impl := @FillWord_SSE2
  450. {$ifndef CPUX86_HAS_SSE2}
  451. else
  452. FillWord_Impl := @FillWord_Plain
  453. {$endif ndef CPUX86_HAS_SSE2};
  454. FillWord_Impl(x, count, value);
  455. end;
  456. procedure FillWord(var x;count:SizeInt;value:word);
  457. begin
  458. FillWord_Impl(x, count, value);
  459. end;
  460. {$endif FPC_SYSTEM_HAS_FILLWORD}
  461. {$ifndef FPC_SYSTEM_HAS_FILLDWORD}
  462. {$define FPC_SYSTEM_HAS_FILLDWORD}
  463. procedure FillDWord_4OrLess; assembler; nostackframe;
  464. asm
  465. cmp $1, %edx
  466. jl .LQuit
  467. mov %ecx, (%eax)
  468. je .LQuit
  469. mov %ecx, 4(%eax)
  470. mov %ecx, -8(%eax,%edx,4)
  471. mov %ecx, -4(%eax,%edx,4)
  472. .LQuit:
  473. end;
  474. {$ifndef CPUX86_HAS_SSE2}
  475. procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
  476. asm
  477. cmp $4, %edx
  478. jle FillDWord_4OrLess
  479. shl $2, %edx
  480. jmp FillXxxx_U32Pattern_Plain_16OrMore
  481. end;
  482. {$endif ndef CPUX86_HAS_SSE2}
  483. procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
  484. asm
  485. cmp $4, %edx
  486. jle FillDWord_4OrLess
  487. shl $2, %edx
  488. cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  489. jb FillXxxx_U32Pattern_SSE2_16OrMore
  490. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  491. end;
  492. procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
  493. asm
  494. cmp $4, %edx
  495. jle FillDWord_4OrLess
  496. shl $2, %edx
  497. cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  498. jb FillXxxx_U32Pattern_SSE2_16OrMore
  499. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  500. end;
  501. procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;
  502. var
  503. FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;
  504. procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
  505. begin
  506. if not fpc_cpucodeinit_performed then
  507. begin
  508. {$ifdef CPUX86_HAS_SSE2} FillDWord_SSE2 {$else} FillDWord_Plain {$endif}(x, count, value);
  509. exit;
  510. end;
  511. if fast_large_repmovstosb then
  512. FillDWord_Impl := @FillDWord_SSE2_ERMS
  513. else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
  514. FillDWord_Impl := @FillDWord_SSE2
  515. {$ifndef CPUX86_HAS_SSE2}
  516. else
  517. FillDWord_Impl := @FillDWord_Plain
  518. {$endif ndef CPUX86_HAS_SSE2};
  519. FillDWord_Impl(x, count, value);
  520. end;
  521. procedure FillDWord(var x;count:SizeInt;value:dword);
  522. begin
  523. FillDWord_Impl(x, count, value);
  524. end;
  525. {$endif FPC_SYSTEM_HAS_FILLDWORD}
  526. {$ifndef FPC_SYSTEM_HAS_FILLQWORD}
  527. {$define FPC_SYSTEM_HAS_FILLQWORD}
  528. {$ifndef CPUX86_HAS_SSE2}
  529. procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
  530. { eax = x, edx = count, [esp + 4] = value }
  531. asm
  532. test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
  533. jle .LQuit
  534. push %esi
  535. mov 4+4(%esp), %esi { esi = value[0:31] }
  536. mov 4+8(%esp), %ecx { ecx = value[32:63] }
  537. .balign 16
  538. .LLoop:
  539. mov %esi, (%eax)
  540. mov %ecx, 4(%eax)
  541. add $8, %eax
  542. sub $1, %edx
  543. jnz .LLoop
  544. pop %esi
  545. .LQuit:
  546. end;
  547. {$endif ndef CPUX86_HAS_SSE2}
  548. procedure {$ifdef CPUX86_HAS_SSE2} FillQWord {$else} FillQWord_SSE2 {$endif}(var x;count:SizeInt;value:QWord);assembler;nostackframe;
  549. { eax = x, edx = count, [esp + 4] = value }
  550. asm
  551. cmp $4, %edx
  552. jle .L4OrLess
  553. movq 4(%esp), %xmm0
  554. punpcklqdq %xmm0, %xmm0
  555. { Stack is 12 bytes:
  556. [esp] = return address, [esp + 4] = value (not required anymore).
  557. Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
  558. [esp] = return address. }
  559. mov (%esp), %ecx
  560. add $8, %esp
  561. mov %ecx, (%esp)
  562. shl $3, %edx
  563. movdqu %xmm0, (%eax)
  564. movdqu %xmm0, -16(%eax,%edx)
  565. test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
  566. jz FillXxxx_MoreThanTwoXMMs
  567. mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
  568. shl $3, %ecx
  569. and $63, %ecx
  570. movd %ecx, %xmm2
  571. movdqa %xmm0, %xmm1
  572. psllq %xmm2, %xmm1
  573. neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
  574. and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
  575. movd %ecx, %xmm2
  576. psrlq %xmm2, %xmm0
  577. por %xmm1, %xmm0
  578. jmp FillXxxx_MoreThanTwoXMMs
  579. .L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
  580. cmp $1, %edx
  581. jl .LQuit
  582. mov 4(%esp), %ecx
  583. mov %ecx, (%eax)
  584. je .LSecondHalfOf1
  585. mov %ecx, 8(%eax)
  586. mov %ecx, -16(%eax,%edx,8)
  587. mov %ecx, -8(%eax,%edx,8)
  588. mov 8(%esp), %ecx
  589. mov %ecx, 4(%eax)
  590. mov %ecx, 12(%eax)
  591. mov %ecx, -12(%eax,%edx,8)
  592. mov %ecx, -4(%eax,%edx,8)
  593. .LQuit:
  594. ret $8
  595. .LSecondHalfOf1:
  596. mov 8(%esp), %ecx
  597. mov %ecx, 4(%eax)
  598. end;
  599. {$ifndef CPUX86_HAS_SSE2}
  600. procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;
  601. var
  602. FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;
  603. procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
  604. begin
  605. if not fpc_cpucodeinit_performed then
  606. begin
  607. FillQWord_Plain(x, count, value);
  608. exit;
  609. end;
  610. if has_sse2_support then
  611. FillQWord_Impl := @FillQWord_SSE2
  612. else
  613. FillQWord_Impl := @FillQWord_Plain;
  614. FillQWord_Impl(x, count, value);
  615. end;
  616. procedure FillQWord(var x;count:SizeInt;value:qword);
  617. begin
  618. FillQWord_Impl(x, count, value);
  619. end;
  620. {$endif ndef CPUX86_HAS_SSE2 (need FillQWord dispatcher)}
  621. {$endif FPC_SYSTEM_HAS_FILLQWORD}
  622. {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  623. {$define FPC_SYSTEM_HAS_INDEXBYTE}
  624. {$ifndef CPUX86_HAS_SSE2}
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of byte b in buf[0..len-1], or -1.
  Pre-SSE2 fallback: scans bytewise until eax is 4-aligned, then processes
  8 bytes per iteration using the SWAR zero-byte test
  (x - $01010101) and (not x) and $80808080 on the XORed data.
  eax = buf, edx = len, cl = b }
asm
test %edx,%edx
jz .Lnothing0 { len = 0: nothing to search }
push %eax { save initial value of 'buf' }
test $3,%al
jz .Laligned4
.Lalignloop: { align to 4 bytes }
cmp %cl,(%eax)
je .Lfoundateax
inc %eax
dec %edx
jz .Lnothing1
test $3,%al
jnz .Lalignloop
.Laligned4: { align to 8 bytes }
push %esi
push %edi
mov %cl,%ch { prepare pattern }
movzwl %cx,%esi
shl $16,%ecx
or %esi,%ecx { ecx = b replicated into all 4 bytes }
test $7,%al
jz .Lloop
test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = −1). }
jl .Ldontfixuplen
add $4,%edx
.Ldontfixuplen:
sub $4,%eax
jmp .Lalignfrom4to8
.balign 16
.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
mov (%eax),%esi { load dword }
xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
lea -0x01010101(%esi),%edi
not %esi
and $0x80808080,%esi
and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080 }
jnz .Lfound0 { one of the bytes matches }
.Lalignfrom4to8: { second unroll: same SWAR test on the following dword }
mov 4(%eax),%esi
xor %ecx,%esi
lea -0x01010101(%esi),%edi
not %esi
and $0x80808080,%esi
and %edi,%esi
jnz .Lfound1
add $8,%eax
sub $8,%edx
ja .Lloop
.Lnothing3:
pop %edi
pop %esi
.Lnothing1:
pop %edx { drop saved 'buf' }
.Lnothing0:
or $-1,%eax { result := -1 }
ret
.Lfound1: { match is in the second dword of the pair }
sub $4,%edx
jbe .Lnothing3
add $4,%eax
.Lfound0:
bsf %esi,%esi { lowest set bit = 8 * (first matching byte within the dword) }
shr $3,%esi { bit index -> byte index }
cmp %edx,%esi { Garbage after remaining length? }
jae .Lnothing3
add %esi,%eax
pop %edi
pop %esi
.Lfoundateax:
pop %ecx { ecx = original 'buf' }
sub %ecx,%eax { result = match address - buf }
end;
  700. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of byte b in buf[0..len-1], or -1.
  eax = buf, edx = len, ecx = b. Compares 16 bytes per step with pcmpeqb;
  the first load is unaligned only when it provably cannot cross a page. }
asm
test %edx, %edx
jz .Lnotfound { exit if len=0 }
movd %ecx, %xmm1
mov %eax, %ecx
punpcklbw %xmm1, %xmm1 { broadcast b into all 16 bytes of xmm1 }
punpcklbw %xmm1, %xmm1
and $4095, %ecx { ecx = buf mod 4096 }
pshufd $0, %xmm1, %xmm1
cmp $4080, %ecx { within 16 bytes of a page end? }
ja .LCrossPage
movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ecx
test %ecx, %ecx
jz .LContinueAligned
bsf %ecx, %eax { index of first matching byte }
cmp %edx, %eax { a match beyond len does not count }
jae .Lnotfound
ret
.byte 144 { Make .balign 16 before .Lloop a no-op. }
.LContinueAligned:
cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
push %ebx
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
and $-0x10, %ecx { first aligned address after buf }
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
.balign 16
.Lloop:
movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
add $16, %ecx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
test %ebx, %ebx
jnz .Lmatch
.Lcontinue:
cmp %ecx, %edx
ja .Lloop
pop %ebx
.Lnotfound:
or $-1, %eax
ret
.LCrossPage: { unaligned 16-byte read would cross a page: read aligned instead }
push %ebx
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
and $-0x10, %ecx { first aligned address after buf }
movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
pmovmskb %xmm0, %ebx
shl %cl, %ebx { shift valid bits into high word }
and $0xffff0000, %ebx { clear low word containing invalid bits }
shr %cl, %ebx { shift back }
jz .Lcontinue
.Lmatch:
bsf %ebx, %ebx
lea -16(%ecx,%ebx), %eax { index = bytes already handled - 16 + bit position }
pop %ebx
cmp %eax, %edx { check against the buffer length }
jbe .Lnotfound
end;
  764. {$ifndef CPUX86_HAS_SSE2}
  765. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;
  766. var
  767. IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;
  768. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
  769. begin
  770. if not fpc_cpucodeinit_performed then
  771. exit(IndexByte_Plain(buf,len,b));
  772. if has_sse2_support then
  773. IndexByte_Impl:=@IndexByte_SSE2
  774. else
  775. IndexByte_Impl:=@IndexByte_Plain;
  776. result:=IndexByte_Impl(buf,len,b);
  777. end;
  778. function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
  779. begin
  780. result:=IndexByte_Impl(buf,len,b);
  781. end;
  782. {$endif ndef CPUX86_HAS_SSE2}
  783. {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  784. {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  785. {$define FPC_SYSTEM_HAS_INDEXWORD}
  786. {$ifndef CPUX86_HAS_SSE2}
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of word b in buf[0..len-1], or -1.
  Pre-SSE2 fallback: simple wordwise scan. eax = buf, edx = len, cx = b. }
asm
test %edx, %edx
jz .LNotFound { len = 0 }
push %eax { save initial 'buf' to compute the index later }
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
cmp %cx, (%eax)
je .LFound
add $2, %eax
dec %edx
jnz .LWordwise_Body
pop %edx { drop saved 'buf' }
.LNotFound:
or $-1, %eax { result := -1 }
ret
.LFound:
pop %edx { edx = original 'buf' }
sub %edx, %eax
shr $1, %eax { byte offset -> word index }
end;
  807. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexWord {$else} IndexWord_SSE2 {$endif} (const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of word b in buf[0..len-1], or -1.
  eax = buf, edx = len, ecx = b. Word-aligned buffers use pcmpeqw on aligned
  16-byte vectors; misaligned buffers fall back to a bytewise compare that
  ANDs adjacent mask bits, carrying one bit between iterations via esi. }
asm
test %edx, %edx { exit if len=0 }
je .Lnotfound
push %ebx
movd %ecx, %xmm1
punpcklwd %xmm1, %xmm1 { broadcast b into all 8 words of xmm1 }
pshufd $0, %xmm1, %xmm1
lea 16(%eax), %ecx
and $-16, %ecx { ecx = first aligned address after buf }
movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
sub %eax, %ecx { ecx = number of valid bytes in the first vector }
test $1, %eax { if buffer isn't aligned to word boundary, }
jnz .Lunaligned { use a different algorithm }
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
shl %cl, %ebx { shift valid bits into high word, }
and $0xffff0000, %ebx { clear low word containing invalid bits, }
shr %cl, %ebx { shift back }
shr $1, %ecx { bytes -> words: ecx = number of valid words }
test %ebx, %ebx
jz .Lcontinue
.Lmatch:
bsf %ebx, %ebx
shr $1, %ebx { in words }
lea -8(%ecx,%ebx), %eax { index = words already handled - 8 + match word }
pop %ebx
cmp %eax, %edx
jbe .Lnotfound { if match is after the specified length, ignore it }
ret
.balign 16
.Lloop:
movdqa (%eax,%ecx,2), %xmm0 { ecx counts words, hence scale 2 }
add $8, %ecx
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
test %ebx, %ebx
jnz .Lmatch
.Lcontinue:
cmp %ecx, %edx
ja .Lloop
pop %ebx
.Lnotfound:
or $-1, %eax
ret
.Lunaligned:
push %esi
movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
psllw $8, %xmm1 { swap bytes of each word of pattern) }
psrlw $8, %xmm2
por %xmm2, %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
shl %cl, %ebx { mask out invalid leading bytes, as in the aligned path }
and $0xffff0000, %ebx
shr %cl, %ebx
xor %esi, %esi { nothing to merge yet }
add %edx, %edx { length words -> bytes }
jmp .Lcontinue_u
.balign 16
.Lloop_u:
movdqa (%eax,%ecx), %xmm0
add $16, %ecx
pcmpeqb %xmm1, %xmm0 { compare by bytes }
shr $16, %esi { bit 16 shifts into 0 }
pmovmskb %xmm0, %ebx
.Lcontinue_u:
shl $1, %ebx { 15:0 -> 16:1 }
or %esi, %ebx { merge bit 0 from previous round }
mov %ebx, %esi
shr $1, %ebx { now AND together adjacent pairs of bits }
and %esi, %ebx
and $0x5555, %ebx { also reset odd bits }
jnz .Lmatch_u
cmp %ecx, %edx
ja .Lloop_u
.Lnotfound_u:
pop %esi
pop %ebx
or $-1, %eax
ret
.Lmatch_u:
bsf %ebx, %ebx
lea -16(%ecx,%ebx), %eax { byte index of the match }
cmp %eax, %edx
jbe .Lnotfound_u { if match is after the specified length, ignore it }
sar $1, %eax { in words }
pop %esi
pop %ebx
end;
  898. {$ifndef CPUX86_HAS_SSE2}
  899. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;
  900. var
  901. IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;
  902. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
  903. begin
  904. if not fpc_cpucodeinit_performed then
  905. exit(IndexWord_Plain(buf,len,b));
  906. if has_sse2_support then
  907. IndexWord_Impl:=@IndexWord_SSE2
  908. else
  909. IndexWord_Impl:=@IndexWord_Plain;
  910. result:=IndexWord_Impl(buf,len,b);
  911. end;
  912. function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
  913. begin
  914. result:=IndexWord_Impl(buf,len,b);
  915. end;
  916. {$endif ndef CPUX86_HAS_SSE2}
  917. {$endif FPC_SYSTEM_HAS_INDEXWORD}
  918. {$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
  919. {$define FPC_SYSTEM_HAS_INDEXDWORD}
  920. {$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of dword b in buf[0..len-1], or -1.
  Pre-SSE2 fallback: simple dwordwise scan. eax = buf, edx = len, ecx = b. }
asm
push %eax { save initial 'buf' }
sub $4, %eax { compensate for the 'add' at the top of the loop }
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
add $4, %eax
sub $1, %edx
jb .LNotFound { handles len <= 0 as well }
cmp %ecx, (%eax)
jne .LDWordwise_Next
pop %edx { edx = original 'buf' }
sub %edx, %eax
shr $2, %eax { byte offset -> dword index }
ret
.LNotFound:
pop %edx { drop saved 'buf' }
mov $-1, %eax
end;
  939. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexDWord {$else} IndexDWord_SSE2 {$endif} (const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of dword b in buf[0..len-1], or -1.
  eax = buf, edx = len, ecx = b. Compares 4 dwords at a time with unaligned
  loads; the final, possibly overlapping vector re-reads the last 16 bytes.
  Lengths of 4 dwords or fewer go through a scalar loop instead. }
asm
push %eax { save initial 'buf' }
sub $4, %edx { edx = len - 4 }
jle .LDwordwise_Prepare
movd %ecx, %xmm1
pshufd $0, %xmm1, %xmm1 { broadcast b into all 4 dwords of xmm1 }
.balign 16 { 1-byte NOP. }
.L4x_Body:
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ecx
test %ecx, %ecx
jnz .LFoundAtMask
add $16, %eax
sub $4, %edx
jg .L4x_Body
lea (%eax,%edx,4), %eax { step back to the last full vector (may overlap) }
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ecx
test %ecx, %ecx
jz .LNothing
.LFoundAtMask:
bsf %ecx, %ecx { byte position of the match within the vector }
add %ecx, %eax
.LFoundAtEax:
pop %edx { edx = original 'buf' }
sub %edx, %eax
shr $2, %eax { byte offset -> dword index }
ret
nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
add $3, %edx { edx = len - 1 }
cmp $-1, %edx
je .LNothing { len = 0 }
.balign 16 { no-op }
.LDwordwise_Body:
cmp (%eax), %ecx
je .LFoundAtEax
add $4, %eax
sub $1, %edx
jae .LDwordwise_Body
.LNothing:
pop %edx { drop saved 'buf' }
or $-1, %eax
end;
  987. {$ifndef CPUX86_HAS_SSE2}
  988. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;
  989. var
  990. IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;
  991. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
  992. begin
  993. if not fpc_cpucodeinit_performed then
  994. exit(IndexDWord_Plain(buf,len,b));
  995. if has_sse2_support then
  996. IndexDWord_Impl:=@IndexDWord_SSE2
  997. else
  998. IndexDWord_Impl:=@IndexDWord_Plain;
  999. result:=IndexDWord_Impl(buf,len,b);
  1000. end;
  1001. function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
  1002. begin
  1003. result:=IndexDWord_Impl(buf,len,b);
  1004. end;
  1005. {$endif CPUX86_HAS_SSE2}
  1006. {$endif FPC_SYSTEM_HAS_INDEXDWORD}
  1007. {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
  1008. {$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of qword b in buf[0..len-1], or -1.
  Scalar fallback comparing each qword as two 32-bit halves.
  eax = buf, edx = len, [esp+4] = b }
asm
push %ebx
mov 8(%esp), %ecx { ecx = b[0:31] }
mov 12(%esp), %ebx { ebx = b[32:63] }
mov %eax, 8(%esp) { remember original buf }
sub $8, %eax { compensate for the 'add' at the top of the loop }
.balign 16 { no-op }
.LQWordwise_Next:
add $8, %eax
sub $1, %edx
jb .LNotFound { handles len <= 0 as well }
cmp %ecx, (%eax)
jne .LQWordwise_Next
cmp %ebx, 4(%eax) { low halves equal; compare high halves }
jne .LQWordwise_Next
sub 8(%esp), %eax { byte offset = match address - original buf }
pop %ebx
shr $3, %eax { byte offset -> qword index }
ret $8 { callee pops the 8-byte value parameter }
.LNotFound:
pop %ebx
mov $-1, %eax
end;
function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of qword b in buf[0..len-1], or -1.
  Requires SSE4.1 (pcmpeqq, ptest, movddup). Processes 3 vectors = 6 qwords
  per iteration; short buffers (len <= 6) tail-jump to the scalar version.
  eax = buf, edx = len, [esp+4] = b }
asm
cmp $6, len
jle IndexQWord_Plain { too short for the 6x loop: use the scalar routine }
movddup 4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
mov %eax, %ecx { ecx = original buf }
sub $6, len
.balign 16
.L6x_Loop:
movdqu (%eax), %xmm1
pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
movdqu 16(%eax), %xmm2
pcmpeqq %xmm0, %xmm2
por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
movdqu 32(%eax), %xmm3
pcmpeqq %xmm0, %xmm3
por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
ptest %xmm3, %xmm3
jnz .LFound
add $48, %eax
sub $6, len
jge .L6x_Loop
lea (%eax,%edx,8), %eax { Point to last 3 vectors. }
cmp $-5, len
jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
mov $-1, %eax
ret $8 { callee pops the 8-byte value parameter }
.LFound: { narrow down which of the three vectors matched }
sub %ecx, %eax
ptest %xmm1, %xmm1
jnz .LFoundAtXmm1
ptest %xmm2, %xmm2
jnz .LFoundAtXmm2
add $16, %eax
movdqa %xmm3, %xmm2
.LFoundAtXmm2:
add $16, %eax
movdqa %xmm2, %xmm1
.LFoundAtXmm1:
pmovmskb %xmm1, %ecx
bsf %ecx, %ecx { byte position of the match within the vector }
add %ecx, %eax
shr $3, %eax { byte offset -> qword index }
end;
  1079. {$ifndef CPUX86_HAS_SSE4_1}
  1080. function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;
  1081. var
  1082. IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;
  1083. function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
  1084. begin
  1085. if not fpc_cpucodeinit_performed then
  1086. exit(IndexQWord_Plain(buf,len,b));
  1087. if has_sse41_support then
  1088. IndexQWord_Impl:=@IndexQWord_SSE41
  1089. else
  1090. IndexQWord_Impl:=@IndexQWord_Plain;
  1091. result:=IndexQWord_Impl(buf,len,b);
  1092. end;
  1093. function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
  1094. begin
  1095. result:=IndexQWord_Impl(buf,len,b);
  1096. end;
  1097. {$endif ndef CPUX86_HAS_SSE4_1}
  1098. {$endif FPC_SYSTEM_HAS_INDEXQWORD}
  1099. {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  1100. {$define FPC_SYSTEM_HAS_COMPAREBYTE}
  1101. {$ifndef CPUX86_HAS_SSE2}
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Compares len bytes of buf1 and buf2; returns 0 if equal, else -1/+1 by the
  unsigned ordering of the first differing byte (dwords are byte-swapped to
  big-endian before comparing, making the dword compare lexicographic).
  Pre-SSE2 fallback: 4 bytes per iteration after the first dword check. }
asm
{ eax = buf1, edx = buf2, ecx = len }
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
cmp $3, %ecx
jle .LBytewise_Prepare { too short for dwordwise compare }
{ Align buf1 on 4 bytes. }
mov (%edx,%eax), %ebx
cmp (%eax), %ebx
jne .L4xDiffer
lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
and $-4, %eax
sub %eax, %ecx
.balign 16
.L4x_Next:
add $4, %eax
sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
jle .LLast4
mov (%edx,%eax), %ebx
cmp (%eax), %ebx
je .L4x_Next
.L4xDiffer: { ebx = differing buf2 dword; byte-swap both before comparing }
mov (%eax), %edx
{$ifdef CPUX86_HAS_BSWAP}
bswap %ebx
bswap %edx
{$else}
rol $8, %bx { byte-swap without BSWAP (pre-486): three rotates }
rol $16, %ebx
rol $8, %bx
rol $8, %dx
rol $16, %edx
rol $8, %dx
{$endif}
cmp %ebx, %edx
.LDoSbb: { carry set iff buf1 element < buf2 element }
sbb %eax, %eax { eax = 0 or -1 }
or $1, %eax { eax = +1 or -1 }
pop %ebx
ret
.LLast4: { compare the final, possibly overlapping dword }
add %ecx, %eax
mov (%edx,%eax), %ebx
cmp (%eax), %ebx
jne .L4xDiffer
xor %eax, %eax { all equal }
pop %ebx
ret
.LBytewise_Prepare:
sub $1, %ecx
jb .LNothing { len = 0 }
.balign 16 { no-op }
.LBytewise_Body:
movzbl (%edx,%eax), %ebx
cmp %bl, (%eax)
jne .LDoSbb
add $1, %eax
sub $1, %ecx
jae .LBytewise_Body
.LNothing:
xor %eax, %eax
pop %ebx
end;
  1166. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} CompareByte {$else} CompareByte_SSE2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
{ Compares len bytes of buf1 and buf2; returns 0 if equal, else the (signed)
  difference of the first differing bytes in the XMM paths, or -1/+1 in the
  scalar fallback paths. A negative len scans unbounded until a difference
  (see .LUnbounded_Prepare). Uses pcmpeqb on 16-byte vectors; the main loop
  handles 32 bytes per iteration with buf1 aligned. }
asm
{ eax = buf1, edx = buf2, ecx = len }
cmp $1, %ecx
jle .L1OrLess
push %ebx
cmp $16, %ecx
jae .LVecOrMore
{ 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
mov %eax, %ebx
or %edx, %ebx
and $4095, %ebx
cmp $4080, %ebx
ja .LCantOverReadBoth
{ Over-read both as XMMs. }
movdqu (%eax), %xmm0
movdqu (%edx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
jz .LNothing
bsf %ebx, %ebx
cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
jae .LNothing
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax { result = buf1 byte - buf2 byte }
pop %ebx
ret
.LNothing:
pop %ebx
xor %eax, %eax
ret
.LVecOrMore:
{ Compare first vectors. }
movdqu (%eax), %xmm0
movdqu (%edx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
sub $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
jbe .LLastVec
{ Compare second vectors. }
movdqu 16(%eax), %xmm0
movdqu 16(%edx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec1Differs
{ More than four vectors: aligned loop. }
cmp $32, %ecx
ja .LAligned32xLoop_Prepare
{ Compare last two vectors. }
movdqu (%eax,%ecx), %xmm0
movdqu (%edx,%ecx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVecEm2Differs
.LLastVec:
movdqu 16(%eax,%ecx), %xmm0 { ecx = len - 32, so this reads the final 16 bytes }
movdqu 16(%edx,%ecx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVecEm1Differs
pop %ebx
xor %eax, %eax
ret
.LVecEm2Differs: { difference in the vector 32 bytes before the end }
sub $16, %ecx
.LVecEm1Differs: { difference in the vector 16 bytes before the end }
bsf %ebx, %ebx
add %ecx, %ebx
movzbl 16(%eax,%ebx), %eax
movzbl 16(%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
nop { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LAligned32xLoop_Prepare:
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
sub %eax, %edx { edx = buf2 - buf1 }
and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned32xLoop_Body:
add $32, %eax
{ Compare two XMMs, reduce the result with 'and'. }
movdqu (%edx,%eax), %xmm0
pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
movdqu 16(%edx,%eax), %xmm1
pcmpeqb 16(%eax), %xmm1
pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
pmovmskb %xmm1, %ebx
inc %bx
jnz .LAligned32xLoop_TwoVectorsDiffer
sub $32, %ecx
ja .LAligned32xLoop_Body
{ Compare last two vectors after the loop by doing one more loop iteration, modified. }
lea 32(%eax,%ecx), %eax
movdqu (%edx,%eax), %xmm0
movdqu (%eax), %xmm2
pcmpeqb %xmm2, %xmm0
movdqu 16(%edx,%eax), %xmm1
movdqu 16(%eax), %xmm2
pcmpeqb %xmm2, %xmm1
pand %xmm0, %xmm1
pmovmskb %xmm1, %ebx
inc %bx
jnz .LAligned32xLoop_TwoVectorsDiffer
pop %ebx
xor %eax, %eax
ret
.LAligned32xLoop_TwoVectorsDiffer:
add %eax, %edx { restore edx = buf2 }
pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
inc %cx
jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
bsf %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LVec1Differs: { difference in the second of a pair of vectors }
add $16, %eax
add $16, %edx
.LVec0Differs: { ebx = inverted-equality mask for the vector at eax/edx }
bsf %ebx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LCantOverReadBoth: { 2 to 15 bytes near a page end: scalar dword/byte compares }
cmp $3, %ecx
jle .L2to3
push %esi
mov (%eax), %ebx
mov (%edx), %esi
cmp %esi, %ebx
jne .L4xDiffer
cmp $8, %ecx
jbe .LLast4x
mov 4(%eax), %ebx
mov 4(%edx), %esi
cmp %esi, %ebx
jne .L4xDiffer
mov -8(%eax,%ecx), %ebx
mov -8(%edx,%ecx), %esi
cmp %esi, %ebx
jne .L4xDiffer
.LLast4x: { final, possibly overlapping dword }
mov -4(%eax,%ecx), %ebx
mov -4(%edx,%ecx), %esi
cmp %esi, %ebx
jne .L4xDiffer
pop %esi
pop %ebx
xor %eax, %eax
ret
.L4xDiffer: { byte-swap so the dword compare is lexicographic }
bswap %ebx
bswap %esi
cmp %esi, %ebx
pop %esi
sbb %eax, %eax { -1 if buf1 < buf2 }
or $1, %eax { else +1 }
pop %ebx
ret
.L2to3: { build big-endian 3-byte keys: (first two bytes) shl 7 + last byte }
movzwl (%edx), %ebx
bswap %ebx
shr $1, %ebx
mov -1(%edx,%ecx), %bl
movzwl (%eax), %edx
bswap %edx
shr $1, %edx
mov -1(%eax,%ecx), %dl
mov %edx, %eax
sub %ebx, %eax
pop %ebx
ret
.L1OrLess:
jl .LUnbounded_Prepare { len < 1: negative means unbounded compare }
movzbl (%eax), %eax { len = 1 }
movzbl (%edx), %edx
sub %edx, %eax
ret
.LUnbounded_Prepare:
sub %eax, %edx { edx = buf2 - buf1 }
test %ecx, %ecx
jnz .LUnbounded_Body
xor %eax, %eax { len = 0 }
ret
.balign 16
.LUnbounded_Next:
add $1, %eax
.LUnbounded_Body: { scan until the first differing byte }
movzbl (%edx,%eax), %ecx
cmp %cl, (%eax)
je .LUnbounded_Next
sbb %eax, %eax
or $1, %eax
end;
  1374. {$ifndef CPUX86_HAS_SSE2}
  1375. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1376. var
  1377. CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
  1378. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1379. begin
  1380. if not fpc_cpucodeinit_performed then
  1381. exit(CompareByte_Plain(buf1, buf2, len));
  1382. if has_sse2_support then
  1383. CompareByte_Impl:=@CompareByte_SSE2
  1384. else
  1385. CompareByte_Impl:=@CompareByte_Plain;
  1386. result:=CompareByte_Impl(buf1, buf2, len);
  1387. end;
  1388. function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
  1389. begin
  1390. result:=CompareByte_Impl(buf1, buf2, len);
  1391. end;
  1392. {$endif ndef CPUX86_HAS_SSE2 (need CompareByte dispatcher)}
  1393. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  1394. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  1395. {$define FPC_SYSTEM_HAS_COMPAREWORD}
  1396. {$ifndef CPUX86_HAS_SSE2}
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Compares len words of buf1 and buf2; returns 0 if equal, else -1/+1 by the
  unsigned ordering of the first differing word. Pre-SSE2 fallback: aligns
  buf1 to 4 bytes and compares dwordwise, resolving a differing dword by
  re-checking its low and high words. eax = buf1, edx = buf2, ecx = len. }
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
cmp $1073741819, %ebx
ja .LWordwise_Prepare
test $2, %al
je .LAlignedToPtrUintOrNaturallyMisaligned
movzwl (%edx,%eax), %ebx { compare one word to align buf1 on 4 bytes }
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
sub $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
sub $2, %ecx { ecx counts words; 2 words handled per iteration }
.balign 16
.LPtrUintWise_Next:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LPtrUintsDiffer
add $4, %eax
sub $2, %ecx
jg .LPtrUintWise_Next
lea (%eax,%ecx,2), %eax { step back to the final, possibly overlapping dword }
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LPtrUintsDiffer
pop %ebx
xor %eax, %eax { all equal }
ret
.LPtrUintsDiffer: { which of the two words differs? }
cmp %bx, (%eax)
jne .LDoSbb
shr $16, %ebx
cmp %bx, 2(%eax)
.LDoSbb: { carry set iff buf1 word < buf2 word }
sbb %eax, %eax { eax = 0 or -1 }
or $1, %eax { eax = +1 or -1 }
pop %ebx
ret
.balign 16
.LWordwise_Body:
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
.LWordwise_Prepare:
sub $1, %ecx
jnb .LWordwise_Body
pop %ebx
xor %eax, %eax
end;
  1450. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} CompareWord {$else} CompareWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Compares len words of buf1 and buf2; returns 0 if equal, else the (signed)
  difference of the first differing words in the vector paths, or -1/+1 in
  the scalar fallback. Uses pcmpeqw on 16-byte vectors; the aligned loop
  compares bytewise and recovers the word position afterwards.
  eax = buf1, edx = buf2, ecx = len. }
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
cmp $1073741821, %ebx
ja .LWordwise_Prepare
cmp $8, %ecx
jge .LVecOrMore
lea (%edx,%eax), %ebx { 2..7 words: over-read both unless near a page end }
or %eax, %ebx
and $4095, %ebx
cmp $4080, %ebx
ja .LWordwise_Prepare
movdqu (%edx,%eax), %xmm0
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx { all-ones mask (all equal) wraps to 0 }
jz .LNothing
shl $1, %ecx { convert to bytes }
bsf %ebx, %ebx
cmp %ecx, %ebx { Ignore differences beyond 'len'. }
jb .LSubtractWords
.LNothing:
pop %ebx
xor %eax, %eax
ret
.balign 16
.LWordwise_Body:
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
.LWordwise_Prepare:
sub $1, %ecx
jae .LWordwise_Body
xor %eax, %eax
pop %ebx
ret
.LDoSbb: { carry set iff buf1 word < buf2 word }
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.LVecOrMore:
movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
shl $1, %ecx { convert to bytes }
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
sub %eax, %ecx
.balign 16
.LAligned8xLoop_Body:
add $16, %eax
movdqu (%edx,%eax), %xmm0
pcmpeqb (%eax), %xmm0 { bytewise compare: buf1 side is aligned }
pmovmskb %xmm0, %ebx
inc %bx
jnz .LAligned8xLoop_VecDiffers
sub $16, %ecx
ja .LAligned8xLoop_Body
pop %ebx { drop original buf1 }
.LLastVec:
lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
movdqu (%edx,%eax), %xmm0
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
pop %ebx
xor %eax, %eax
ret
.LVec0Differs:
bsf %ebx, %ebx { ebx = byte offset of the difference within the vector }
.LSubtractWords:
add %eax, %edx { restore edx = buf2 }
movzwl (%eax,%ebx), %eax
movzwl (%edx,%ebx), %edx
sub %edx, %eax { result = buf1 word - buf2 word }
pop %ebx
ret
.LAligned8xLoop_VecDiffers:
bsf %ebx, %ebx
add %ebx, %eax { eax = address of differing byte }
pop %ecx { ecx = original buf1 }
sub %ecx, %eax
and $-2, %eax { round byte offset down to a word boundary relative to buf1 }
add %ecx, %eax
movzwl (%edx,%eax), %edx
movzwl (%eax), %eax
sub %edx, %eax
pop %ebx
end;
  1553. {$ifndef CPUX86_HAS_SSE2}
  1554. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1555. var
  1556. CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
  1557. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1558. begin
  1559. if not fpc_cpucodeinit_performed then
  1560. exit(CompareWord_Plain(buf1, buf2, len));
  1561. if has_sse2_support then
  1562. CompareWord_Impl:=@CompareWord_SSE2
  1563. else
  1564. CompareWord_Impl:=@CompareWord_Plain;
  1565. result:=CompareWord_Impl(buf1, buf2, len);
  1566. end;
  1567. function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
  1568. begin
  1569. result:=CompareWord_Impl(buf1, buf2, len);
  1570. end;
  1571. {$endif ndef CPUX86_HAS_SSE2 (need CompareWord dispatcher)}
  1572. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1573. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1574. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
  1575. {$ifndef CPUX86_HAS_SSE2}
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Compares len dwords of buf1 and buf2; returns 0 if equal, else -1/+1 by
  the unsigned ordering of the first differing dword. Pre-SSE2 fallback:
  simple dwordwise scan. eax = buf1, edx = buf2, ecx = len. }
asm
sub $1, %ecx
jb .LNothing { len <= 0 }
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
.balign 16
.LDwordwise_Body:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LDoSbb
add $4, %eax
sub $1, %ecx
jnb .LDwordwise_Body
pop %ebx
.LNothing:
xor %eax, %eax { all equal }
ret
.LDoSbb: { carry set iff buf1 dword < buf2 dword }
pop %ebx
sbb %eax, %eax { eax = 0 or -1 }
or $1, %eax { eax = +1 or -1 }
end;
  1599. {$endif}
function {$ifdef CPUX86_HAS_SSE2} CompareDWord {$else} CompareDWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Compares len dwords of buf1 and buf2; returns 0 if equal, else -1/+1 by
  the unsigned ordering of the first differing dword. Uses pcmpeqd on
  16-byte vectors; the aligned loop compares bytewise and recovers the
  dword position afterwards. eax = buf1, edx = buf2, ecx = len. }
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
cmp $536870906, %ebx
ja .LDwordwise_Prepare
shl $2, %ecx { convert to bytes }
movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx { all-ones mask (all equal) wraps to 0 }
jnz .LVec0Differs
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
sub %eax, %ecx
.balign 16
.LAligned4xLoop_Body:
add $16, %eax
movdqu (%eax,%edx), %xmm0
pcmpeqb (%eax), %xmm0 { bytewise compare: buf1 side is aligned }
pmovmskb %xmm0, %ebx
inc %bx
jnz .LAligned4xLoop_VecDiffers
sub $16, %ecx
ja .LAligned4xLoop_Body
pop %ebx { drop original buf1 }
.LLastVec:
lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
movdqu (%edx,%eax), %xmm1
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
pop %ebx
xor %eax, %eax
ret
.LVec0Differs:
bsf %ebx, %ebx { ebx = byte offset of the differing dword (pcmpeqd: multiple of 4) }
add %eax, %edx { recover edx = buf2 }
mov (%edx,%ebx), %edx
cmp %edx, (%eax,%ebx)
sbb %eax, %eax { -1 if buf1 dword < buf2 dword, }
or $1, %eax { else +1 }
pop %ebx
ret
.LAligned4xLoop_VecDiffers:
bsf %ebx, %ebx
add %ebx, %eax { eax = address of differing byte }
pop %ecx { ecx = original buf1 }
sub %ecx, %eax
and $-4, %eax { round byte offset down to a dword boundary relative to buf1 }
add %ecx, %eax
mov (%edx,%eax), %edx
cmp %edx, (%eax)
.LDoSbb:
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.balign 16
.LDwordwise_Body:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LDoSbb
add $4, %eax
.LDwordwise_Prepare:
sub $1, %ecx
jnb .LDwordwise_Body
pop %ebx
xor %eax, %eax
end;
  1677. {$ifndef CPUX86_HAS_SSE2}
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
  { Procedural variable holding the active CompareDWord implementation.
    Starts pointing at the dispatcher; the first dispatched call replaces it
    with the SSE2 or plain version once CPU detection has completed. }
  CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;

{ One-time dispatcher: selects the implementation from runtime SSE2 support. }
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
  { Before fpc_cpucodeinit has run, has_sse2_support is not yet trustworthy:
    answer with the plain version and do NOT cache a choice yet. }
  if not fpc_cpucodeinit_performed then
    exit(CompareDWord_Plain(buf1, buf2, len));
  if has_sse2_support then
    CompareDWord_Impl:=@CompareDWord_SSE2
  else
    CompareDWord_Impl:=@CompareDWord_Plain;
  result:=CompareDWord_Impl(buf1, buf2, len);
end;

{ Public CompareDWord: always forwards through the implementation pointer. }
function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareDWord_Impl(buf1, buf2, len);
end;
  1695. {$endif ndef CPUX86_HAS_SSE2 (need CompareDWord dispatcher)}
  1696. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1697. {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
  1698. {$define FPC_SYSTEM_HAS_INDEXCHAR0}
{ IndexChar0: scan at most len bytes of buf for character b, stopping early
  when a #0 terminator is reached. Returns the byte index of the first match,
  -1 when not found within len bytes (or before the terminator).
  Register convention: eax = buf, edx = len, cl = b.
  NOTE(review): when len = 0 the code jumps straight to .LFound while ecx
  still holds the raw third argument (b plus whatever was in the upper bits),
  so the result is neither 0 nor -1 — TODO confirm callers never pass len = 0. }
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
  saveesi,saveebx : longint;
asm
    movl %esi,saveesi
    movl %ebx,saveebx
    // Can't use scasb, or will have to do it twice, think this
    // is faster for small "len"
    movl %eax,%esi     // Load address
    movzbl %cl,%ebx    // Load searchpattern
    testl %edx,%edx
    je .LFound
    xorl %ecx,%ecx     // zero index in Buf
    xorl %eax,%eax     // To make DWord compares possible
    .balign 4
.LLoop:
    movb (%esi),%al    // Load byte
    cmpb %al,%bl
    je .LFound         // byte the same?
    incl %ecx
    incl %esi
    cmpl %edx,%ecx     // Maximal distance reached?
    je .LNotFound
    testl %eax,%eax    // Nullchar = end of search?
    jne .LLoop
.LNotFound:
    movl $-1,%ecx      // Not found return -1
.LFound:
    movl %ecx,%eax
    movl saveesi,%esi
    movl saveebx,%ebx
end;
  1731. {$endif FPC_SYSTEM_HAS_INDEXCHAR0}
  1732. {****************************************************************************
  1733. String
  1734. ****************************************************************************}
  1735. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1736. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{ Copy sstr into res, truncating to high(res) when necessary.
  Writes the (possibly clamped) length byte, then tail-calls Move for the body. }
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    cmp (%ecx), %dl { length(sstr) fits into res? }
    jbe .LEdxIsLen { use high(res) if length(sstr) does not fit }
    movzbl (%ecx), %edx { use length(sstr) }
.LEdxIsLen:
    mov %dl, (%eax) { store length to res[0] }
    xchg %ecx, %edx { ecx = length = Move count, edx = sstr }
    xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest }
    inc %eax { skip the length bytes of both strings }
    inc %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea -8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea 8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
    jmp Move { tail call: Move(sstr[1], res[1], len) }
{$endif FPC_PROFILE}
end;
{ Copy shortstring sstr to dstr, truncating to len (capacity of dstr).
  Uses rep movsb/movsl: byte-copies up to a 4-byte boundary of the
  destination, then dword-copies, then the remaining 0..3 bytes. }
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
  asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    pushl %eax
    pushl %ecx
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    movl dstr,%edi
    movl sstr,%esi
    xorl %eax,%eax
    movl len,%ecx
    lodsb              { al = length(sstr), esi now points at sstr[1] }
    cmpl %ecx,%eax
    jbe .LStrCopy1
    movl %ecx,%eax     { clamp copied length to len }
.LStrCopy1:
    stosb              { store length byte to dstr[0] }
    cmpl $7,%eax
    jl .LStrCopy2      { short strings: plain byte copy is cheaper }
    movl %edi,%ecx { Align on 32bits }
    negl %ecx
    andl $3,%ecx       { ecx = bytes until edi is dword-aligned }
    subl %ecx,%eax
    rep
    movsb
    movl %eax,%ecx
    andl $3,%eax       { eax = trailing bytes after the dword run }
    shrl $2,%ecx
    rep
    movsl
.LStrCopy2:
    movl %eax,%ecx
    rep
    movsb
    popl %ecx
    popl %eax
  end ['ESI','EDI'];
end;
  1821. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1822. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1823. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{ Three-way shortstring comparison: compares min(len(left),len(right)) body
  bytes via CompareByte; when those are equal, the length difference decides. }
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ eax = left, edx = right }
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    push %ebx
    movzbl (%eax), %ecx { ecx = len(left) }
    movzbl (%edx), %ebx { ebx = len(right) }
    cmp %ebx, %ecx
{$ifdef CPUX86_HAS_CMOV}
    cmovg %ebx, %ecx { ecx = min(len(left), len(right)) }
{$else}
    jle .LEcxIsLen
    mov %ebx, %ecx
.LEcxIsLen:
{$endif}
    push %eax { save left }
    inc %eax { skip length bytes }
    inc %edx
    { stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
    call CompareByte
{$else}
    call CompareByte_Impl { manually inline CompareByte }
{$endif}
    pop %edx { restore left }
    test %eax, %eax
    jnz .LReturn { bodies differ: CompareByte result decides }
    movzbl (%edx), %eax
    sub %ebx, %eax { equal prefixes: len(left) - len(right) }
.LReturn:
    pop %ebx
end;
  1864. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1865. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1866. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1867. function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc; nostackframe;
  1868. { eax = left, edx = right }
  1869. asm
  1870. movzbl (%eax), %ecx
  1871. cmp (%edx), %cl
  1872. jne .LNotEqual
  1873. inc %eax
  1874. inc %edx
  1875. {$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  1876. jmp CompareByte
  1877. {$else}
  1878. jmp CompareByte_Impl { manually inline CompareByte }
  1879. {$endif}
  1880. .LNotEqual:
  1881. or $-1, %eax
  1882. end;
  1883. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1884. {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1885. {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{ Convert a null-terminated PAnsiChar to a shortstring, truncating to
  high(res). Finds the terminator with IndexByte (scan bounded by high(res)),
  stores the length byte, then Moves the body. p = nil yields an empty res. }
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    test %ecx, %ecx
    jz .LEmpty
    push %eax { save res }
    push %ecx { save p }
    push %edx { save high(res) }
    mov %ecx, %eax { eax = IndexByte.buf }
    { edx is already high(res) = IndexByte.count.
      Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
      but assumes that IndexByte is "safe" and won't read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by 'count'.
      Generic and x86 versions are "safe". }
    xor %ecx, %ecx { ecx = 0 = IndexByte.value }
    { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
      With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    leal -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
    call IndexByte
{$else}
    call IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    leal 12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    pop %ecx { ecx = high(res) = Move.len }
    test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
    cmovns %eax, %ecx
{$else}
    js .LEcxIsLen
    mov %eax, %ecx
.LEcxIsLen:
{$endif}
    pop %eax { pop p to eax = Move.src }
    pop %edx { pop res to edx }
    mov %cl, (%edx) { res[0] := len }
    inc %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal 12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    jmp .LReturn
{$else FPC_PROFILE}
    jmp Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
    movb $0, (%eax) { p = nil: res := '' }
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
  1956. {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1957. {$IFNDEF INTERNAL_BACKTRACE}
  1958. {$define FPC_SYSTEM_HAS_GET_FRAME}
{ Return the caller's frame pointer (current ebp). }
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    movl %ebp,%eax
end;
  1963. {$ENDIF not INTERNAL_BACKTRACE}
  1964. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
{ Return the address of the instruction following the call to this function
  (i.e. the return address currently on top of the stack). }
Function Get_pc_addr : Pointer;assembler;nostackframe;
asm
    movl (%esp),%eax
end;
  1969. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
{ Given a frame pointer, return the return address stored in that frame,
  or nil when framebp is nil (or, on win32, outside the current stack). }
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp+4)^
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
    orl %eax,%eax      { framebp = nil? }
    jz .Lg_a_null
    movl 4(%eax),%eax  { return address lies just above the saved ebp }
.Lg_a_null:
end;
{$endif defined(win32)}
  1987. {$endif defined(win32)}
  1988. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
{ Given a frame pointer, return the caller's saved frame pointer, or nil when
  framebp is nil (or, on win32, outside the current stack). }
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp)^
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
    orl %eax,%eax      { framebp = nil? }
    jz .Lgnf_null
    movl (%eax),%eax   { frame starts with the saved ebp of the caller }
.Lgnf_null:
end;
{$endif defined(win32)}
  2006. {$endif defined(win32)}
  2007. {$define FPC_SYSTEM_HAS_SPTR}
{ Return the current stack pointer (esp as seen by the caller). }
Function Sptr : Pointer;assembler;nostackframe;
asm
    movl %esp,%eax
end;
  2012. {****************************************************************************
  2013. Str()
  2014. ****************************************************************************}
  2015. {$if defined(disabled) and defined(regcall) }
  2016. {$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
  2017. {$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
label str_int_shortcut;

{ Unsigned variant: sets sign (edx) to 0 and jumps into the shared body of the
  signed variant below — the two procedures form one unit. }
procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
asm
    pushl %esi
    pushl %edi
    pushl %ebx
    mov %edx,%edi
    xor %edx,%edx      { sign = 0 }
    jmp str_int_shortcut
end;

{ Convert a longint to its decimal shortstring representation.
  eax = l, edx = @s, ecx = high(s). Digits beyond high(s) are skipped. }
procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
const
  digits:array[0..9] of cardinal=(0,10,100,1000,10000,
                                  100000,1000000,10000000,
                                  100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    push %esi
    push %edi
    push %ebx
    movl %edx,%edi
    { Calculate absolute value and put sign in edx}
    cltd
    xorl %edx,%eax
    subl %edx,%eax
    negl %edx          { edx = 1 when l was negative, else 0 }
str_int_shortcut:
    movl %ecx,%esi
    {Calculate amount of digits in ecx.}
    xorl %ecx,%ecx
    bsrl %eax,%ecx
    incl %ecx
    imul $1233,%ecx    { digits ~= bits * 1233 / 4096 ~= bits * log10(2) }
    shr $12,%ecx
{$ifdef FPC_PIC}
    call fpc_geteipasebx
{$ifdef darwin}
    movl digits-.Lpic(%ebx),%ebx
{$else}
    addl $_GLOBAL_OFFSET_TABLE_,%ebx
    movl digits@GOT(%ebx),%ebx
{$endif}
    cmpl (%ebx,%ecx,4),%eax
{$else}
    cmpl digits(,%ecx,4),%eax
{$endif}
    cmc
    adcl $0,%ecx {Nr. digits ready in ecx.}
    {Write length & sign.}
    lea (%edx,%ecx),%ebx
    movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
    movw %bx,(%edi)    { stores length byte and (conditionally overwritten) '-' }
    addl %edx,%edi
    subl %edx,%esi
    {Skip digits beyond string length.}
    movl %eax,%edx
    subl %ecx,%esi
    jae .Lloop_write
    .balign 4
.Lloop_skip:
    movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
    mull %edx
    shrl $3,%edx
    decl %ecx
    jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
    incl %esi
    jnz .Lloop_skip
    {Write out digits.}
    .balign 4
.Lloop_write:
    movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
    {Pre-add '0'}
    leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
    mull %edx
    shrl $3,%edx
    leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
    subl %edx,%ebx
    subl %eax,%ebx
    movb %bl,(%edi,%ecx)
    decl %ecx
    jnz .Lloop_write
.Ldone:
    popl %ebx
    popl %edi
    popl %esi
end;
  2112. {$endif}
  2113. {****************************************************************************
  2114. Bounds Check
  2115. ****************************************************************************}
  2116. { do a thread-safe inc/dec }
  2117. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ Atomically decrement l; True when the new value is zero.
  eax = @l; the lock prefix makes this SMP-safe. }
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
asm
    lock
    decl (%eax)
    setzb %al
end;
  2124. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
{ Atomically increment l. eax = @l. }
procedure cpuinclocked(var l : longint);assembler;nostackframe;
asm
    lock
    incl (%eax)
end;
  2130. // inline SMP check and normal lock.
  2131. // the locked one is so slow, inlining doesn't matter.
  2132. function declocked(var l : longint) : boolean; inline;
  2133. begin
  2134. if not ismultithread then
  2135. begin
  2136. dec(l);
  2137. declocked:=l=0;
  2138. end
  2139. else
  2140. declocked:=cpudeclocked(l);
  2141. end;
  2142. procedure inclocked(var l : longint); inline;
  2143. begin
  2144. if not ismultithread then
  2145. inc(l)
  2146. else
  2147. cpuinclocked(l);
  2148. end;
{ Atomically decrement Target and return the NEW value.
  xadd leaves the old value in edx; lea subtracts 1 without touching flags. }
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl $-1,%edx
    lock
    xaddl %edx, (%eax)
    lea -1(%edx),%eax
end;
{ Atomically increment Target and return the NEW value.
  xadd leaves the old value in edx; lea adds 1 without touching flags. }
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl $1,%edx
    lock
    xaddl %edx, (%eax)
    lea 1(%edx),%eax
end;
{ Atomically store Source into Target and return the previous value.
  xchg with a memory operand is implicitly locked. }
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
    xchgl (%eax),%edx
    movl %edx,%eax
end;
{ Atomically add Source to Target and return the PREVIOUS value. }
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
    lock
    xaddl %edx, (%eax)
    movl %edx,%eax
end;
{ Atomic compare-and-swap: if Target = Comperand then Target := NewValue.
  Always returns the previous value of Target.
  cmpxchg wants the comparand in eax, so Target's address moves to ecx first. }
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
    xchgl %eax,%ecx
    lock
    cmpxchgl %edx, (%ecx)
end;
{ 64-bit atomic compare-and-swap via cmpxchg8b.
  Comperand goes to edx:eax, NewValue to ecx:ebx; returns the previous
  value of Target in edx:eax (the Int64 result). }
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
asm
    pushl %ebx { ebx/edi are callee-saved }
    pushl %edi
    movl %eax,%edi { edi = @Target }
    movl Comperand+4,%edx
    movl Comperand+0,%eax
    movl NewValue+4,%ecx
    movl NewValue+0,%ebx
    lock cmpxchg8b (%edi)
    pop %edi
    pop %ebx
end;
  2193. {****************************************************************************
  2194. FPU
  2195. ****************************************************************************}
const
  { Internal constants for use in system unit }
  { x87 FPU status/control word exception bits }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;
  { SSE MXCSR exception flag bits (low six bits) }
  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20; { NOTE(review): identifier misspells "Precision" — kept, likely referenced elsewhere }
  MM_ExceptionMask = $3f;
  { MXCSR exception mask bits (bits 7..12) }
  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm = %0000000100000000;
  MM_MaskDivZero = %0000001000000000;
  MM_MaskOverflow = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;
  2219. {$define FPC_SYSTEM_HAS_SYSINITFPU}
{ Intentionally empty on i386: FPU state is (re)established by SysResetFPU
  below — presumably called where needed; nothing to do at init time here. }
Procedure SysInitFPU;
begin
end;
  2223. {$define FPC_SYSTEM_HAS_SYSRESETFPU}
{ Reinitialize the x87 FPU (fninit + configured control word) and, when SSE
  is available, reload MXCSR from DefaultMXCSR. }
Procedure SysResetFPU;
var
  { these locals are so we don't have to hack pic code in the assembler }
  localmxcsr: dword;
  localfpucw: word;
begin
  localfpucw:=Default8087CW;
  asm
    fninit            { reset x87 without waiting for pending exceptions }
    fwait
    fldcw localfpucw  { restore the configured control word }
  end;
  if has_sse_support then
    begin
      localmxcsr:=DefaultMXCSR;
      asm
        { setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
        ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
        { old assemblers lack ldmxcsr: copy the value to the stack and emit
          the opcode bytes for 'ldmxcsr (%esp)' by hand }
        mov localmxcsr,%eax
        subl $4,%esp
        mov %eax,(%esp)
        //ldmxcsr (%esp)
        .byte 0x0f,0xae,0x14,0x24
        addl $4,%esp
{$endif OLD_ASSEMBLER}
      end;
    end;
end;
{ because of the brain dead sse detection on x86, this test is post poned }
{ Detect CPU features via CPUID/XGETBV and fill the has_*_support globals,
  then capture the host's FPU/MXCSR defaults when running as a library and
  reset the FPU. Sets fpc_cpucodeinit_performed when done. }
procedure fpc_cpucodeinit;
var
  _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
  if cpuid_support then
    begin
      asm
        movl $1,%eax
        xorl %ecx,%ecx
        cpuid
        movl %edx,_edx_cpuid1
        movl %ecx,_ecx_cpuid1
      end ['ebx'];
      has_mmx_support:=(_edx_cpuid1 and $800000)<>0;   { CPUID.1:EDX bit 23 = MMX }
      if ((_edx_cpuid1 and $2000000)<>0) then          { CPUID.1:EDX bit 25 = SSE }
        begin
          os_supports_sse:=true;
          sse_check:=true;
          asm
            { force an sse exception if no sse is supported, the exception handler sets
              os_supports_sse to false then }
            { don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
            .byte 0x0f,0x28,0xf7
{$else}
            movaps %xmm7, %xmm6
{$endif not EMX}
          end;
          sse_check:=false;
          has_sse_support:=os_supports_sse;
        end;
      if has_sse_support then
        begin
          has_sse2_support:=((_edx_cpuid1 and $4000000)<>0); { EDX bit 26 }
          has_sse3_support:=((_ecx_cpuid1 and $200)<>0);     { ECX bit 9 }
          has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1); { ECX bit 19 }
          { now avx }
          asm
            xorl %eax,%eax
            cpuid
            movl %eax,_eax    { _eax = highest supported CPUID leaf }
          end;
          if _eax>=7 then
            begin
              asm
                movl $7,%eax
                xorl %ecx,%ecx
                cpuid
                movl %ebx,_ebx_cpuid7
              end;
              fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0; { ERMSB }
              if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                begin
                  asm
                    xorl %ecx,%ecx
                    .byte 0x0f,0x01,0xd0 { xgetbv }
                    movl %eax,_eax
                  end;
                  if (_eax and 6)=6 then  { OS saves xmm and ymm state? }
                    begin
                      has_avx_support:=(_ecx_cpuid1 and $10000000)<>0; { ECX bit 28 }
                      has_avx2_support:=(_ebx_cpuid7 and $20)<>0;      { EBX bit 5 }
                    end;
                end;
            end;
        end;
    end;
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      if has_sse_support then
        DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  fpc_cpucodeinit_performed:=true;
end;
  2332. {$if not defined(darwin) and defined(regcall) }
  2333. { darwin requires that the stack is aligned to 16 bytes when calling another function }
  2334. {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
  2335. {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
{ Decrement the reference count of ansistring S (eax = @S) and free the
  allocation when it reaches zero. S is set to nil first; refcounts < 0 mark
  constant strings and are never touched. The decrement is locked only when
  IsMultiThread is true (the LOCK prefix byte is conditionally jumped over). }
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
    movl (%eax),%edx
    testl %edx,%edx
    jz .Lquit
    movl $0,(%eax) // s:=nil
    cmpl $0,-8(%edx) // exit if refcount<0
    jl .Lquit
{$ifdef FPC_PIC}
    call fpc_geteipasecx
    addl $_GLOBAL_OFFSET_TABLE_,%ecx
    movl ismultithread@GOT(%ecx),%ecx
    cmpl $0,(%ecx)
{$else FPC_PIC}
    cmpl $0,ismultithread
{$endif FPC_PIC}
    je .Lskiplock
    .byte 0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
    decl -8(%edx)
    jz .Lfree
.Lquit:
    ret
.Lfree:
    leal -12(%edx),%eax // points to start of allocation
    { freemem is not an assembler leaf function like fpc_geteipasecx, so it
      needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal -12(%esp),%esp
    call FPC_FREEMEM
    leal 12(%esp),%esp
{$else FPC_SYSTEM_STACKALIGNMENT16}
    jmp FPC_FREEMEM // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;
function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;

{$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
{ Fast path of UniqueString: returns S unchanged when it is nil or already has
  refcount 1 (stored at offset -8); otherwise tail-calls the full copying
  routine with eax still holding @S. }
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
    movl (%eax),%edx
    testl %edx,%edx
    jz .Lunchanged
    cmpl $1,-8(%edx)
    jne fpc_truely_ansistr_unique
.Lunchanged:
    movl %edx,%eax
end;
  2383. {$endif FPC_HAS_FEATURE_ANSISTRINGS}
  2384. {$endif ndef darwin and defined(regcall) }
  2385. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  2386. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
{ Load-load memory barrier: lfence where SSE2 is guaranteed, otherwise a
  locked RMW on the stack top (serializes the memory system on older CPUs). }
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
    lfence
{$else CPUX86_HAS_SSE2}
    lock
    addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ No-op on x86: dependent loads are already ordered by the architecture. }
procedure ReadDependencyBarrier;
begin
  { reads imply barrier on earlier reads depended on }
end;
{ Full memory barrier: mfence where SSE2 is guaranteed, otherwise a locked
  RMW on the stack top. }
procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
    mfence
{$else CPUX86_HAS_SSE2}
    lock
    addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ Store-store barrier: sfence when the SSE unit is guaranteed; plain stores
  are already ordered on x86 otherwise, so no instruction is emitted. }
procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
    sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
  2415. {$endif}
  2416. {$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
  2417. {$define FPC_SYSTEM_HAS_BSF_QWORD}
{ Bit-scan-forward over a 64-bit stack argument: index (0..63) of the lowest
  set bit, or 255 when AValue = 0. bsf sets ZF when its source is zero. }
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
    bsfl 4(%esp),%eax  { low dword first }
    jz .L1
    ret $8             { callee pops the 8-byte argument }
.L1:
    bsfl 8(%esp),%eax  { low dword was zero: scan the high dword }
    jz .L2
    add $32,%eax
    ret $8
.L2:
    movl $255,%eax     { whole value was zero }
end;
  2431. {$endif FPC_SYSTEM_HAS_BSF_QWORD}
  2432. {$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
  2433. {$define FPC_SYSTEM_HAS_BSR_QWORD}
{ Bit-scan-reverse over a 64-bit stack argument: index (0..63) of the highest
  set bit, or 255 when AValue = 0. bsr sets ZF when its source is zero. }
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
    bsrl 8(%esp),%eax  { high dword first }
    jz .L1
    add $32,%eax
    ret $8             { callee pops the 8-byte argument }
.L1:
    bsrl 4(%esp),%eax  { high dword was zero: scan the low dword }
    jz .L2
    ret $8
.L2:
    movl $255,%eax     { whole value was zero }
end;
  2447. {$endif FPC_SYSTEM_HAS_BSR_QWORD}
  2448. {$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
  2449. {$define FPC_SYSTEM_HAS_SAR_QWORD}
{ 64-bit arithmetic shift right. AValue is on the stack (low dword at
  4(%esp), high at 8(%esp)); Shift is in al. Result in edx:eax. }
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
    movl 8(%esp),%edx
    movzbl %al,%ecx
    cmpb $32,%al
    jnb .L1
    movl 4(%esp),%eax
    shrdl %cl,%edx,%eax { shift bits from the high dword into the low }
    sarl %cl,%edx
    ret $8
.L1:
    { shift >= 32: low result = high dword shifted, high result = sign bits }
    movl %edx,%eax
    sarl $31,%edx
    sarl %cl,%eax // uses 5 lower bits of cl.
end;
  2465. {$endif FPC_SYSTEM_HAS_SAR_QWORD}