i386.inc 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2000 by the Free Pascal development team.
  4. Processor dependent implementation for the system unit for
  5. intel i386+
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$if not(defined(VER3_0)) and defined(linux)}
  13. {$define FPC_SYSTEM_STACKALIGNMENT16}
  14. {$endif not(defined(VER3_0)) and defined(linux)}
  15. {****************************************************************************
  16. Primitives
  17. ****************************************************************************}
var
  { NOTE(review): presumably set during the OS-dependent SSE probe in fpc_cpucodeinit — confirm there. }
  os_supports_sse : boolean;
  { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
  sse_check : boolean;
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  has_sse41_support : boolean;
  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
  25. {$asmmode ATT}
function cpuid_support : boolean;assembler;nostackframe;
{
  Check if the ID-flag can be changed, if changed then CpuID is supported.
  Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
}
asm
    pushfl                  { save current EFLAGS on the stack }
    movl (%esp),%eax        { eax = current EFLAGS }
    xorl $0x200000,%eax     { toggle the ID bit (bit 21) }
    pushl %eax
    popfl                   { attempt to write the toggled value into EFLAGS }
    pushfl
    popl %eax               { eax = EFLAGS after the write attempt }
    xorl (%esp),%eax        { diff against the originally saved EFLAGS }
    popfl                   { restore the original EFLAGS }
    testl $0x200000,%eax    { did the ID bit actually change? }
    setnz %al               { al = 1 (true) if the toggle stuck -> CPUID is supported }
end;
  44. {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
{ Per-thread CPU initialization hook; intentionally empty on i386 — see the
  original note below explaining why the SSE test lives in fpc_cpucodeinit. }
procedure fpc_cpuinit;
begin
  { because of the brain dead sse detection on x86, this test is post poned to fpc_cpucodeinit which
    must be implemented OS dependend (FK)
    has_sse_support:=sse_support;
    has_mmx_support:=mmx_support;
  }
end;
  53. {$ifndef darwin}
{ PIC helpers: load the caller's return address (the EIP at the call site)
  into ebx / ecx respectively. nostackframe keeps (%esp) pointing straight at
  the return address. }
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
    movl (%esp),%ebx        { ebx = return address = EIP just after the call }
end;

procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
    movl (%esp),%ecx        { ecx = return address = EIP just after the call }
end;
  62. {$endif}
  63. {$if not defined(FPC_SYSTEM_HAS_MOVE)
  64. and not defined(OLD_ASSEMBLER)
  65. and not defined(darwin)}
  66. {$i fastmove.inc}
  67. {$endif}
  68. {$ifndef FPC_SYSTEM_HAS_MOVE}
  69. {$define FPC_SYSTEM_HAS_MOVE}
{ Overlap-safe memory move (fallback when fastmove.inc is not used).
  Register-call convention: eax = @source, edx = @dest, ecx = count.
  Inside the body: esi = source, edi = dest, edx = count. }
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
var
  saveesi,saveedi : longint;
asm
    movl %edi,saveedi       { esi/edi are callee-saved in this ABI — preserve them }
    movl %esi,saveesi
    movl %eax,%esi          { esi = source }
    movl %edx,%edi          { edi = dest }
    movl %ecx,%edx          { edx = count }
    movl %edi,%eax
    { check for zero or negative count }
    cmpl $0,%edx
    jle .LMoveEnd
    { Check for back or forward }
    sub %esi,%eax           { eax = dest - source }
    jz .LMoveEnd { Do nothing when source=dest }
    jc .LFMove { Do forward, dest<source }
    cmp %edx,%eax
    jb .LBMove { Dest is in range of move, do backward }
    { Forward Copy }
.LFMove:
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    cmpl $15,%edx           { tiny moves skip the alignment dance }
    jl .LFMove1
    movl %edi,%ecx { Align on 32bits }
    negl %ecx
    andl $3,%ecx            { ecx = bytes until dest is 4-byte aligned }
    subl %ecx,%edx
    rep
    movsb
    movl %edx,%ecx
    andl $3,%edx            { edx = leftover tail bytes }
    shrl $2,%ecx            { ecx = dword count }
    rep
    movsl
.LFMove1:
    movl %edx,%ecx          { copy the remaining 0..3 (or 0..14) bytes }
    rep
    movsb
    jmp .LMoveEnd
    { Backward Copy }
.LBMove:
    std                     { string ops now run downwards }
    addl %edx,%esi
    addl %edx,%edi
    movl %edi,%ecx
    decl %esi               { point at the last byte of each block }
    decl %edi
    cmpl $15,%edx
    jl .LBMove1
    negl %ecx { Align on 32bits }
    andl $3,%ecx
    subl %ecx,%edx
    rep
    movsb
    movl %edx,%ecx
    andl $3,%edx
    shrl $2,%ecx
    subl $3,%esi            { movsl reads a dword ending at esi+3, so step back }
    subl $3,%edi
    rep
    movsl
    addl $3,%esi
    addl $3,%edi
.LBMove1:
    movl %edx,%ecx
    rep
    movsb
    cld                     { restore the ABI-expected direction flag }
.LMoveEnd:
    movl saveedi,%edi
    movl saveesi,%esi
end;
  145. {$endif FPC_SYSTEM_HAS_MOVE}
  146. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  147. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  148. or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  149. or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  150. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  151. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  152. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
const
  { Byte-count thresholds above which 'rep stosl' beats the SIMD/plain loops,
    depending on whether the CPU reports ERMSB (fast_large_repmovstosb). }
  FillXxxx_RepStosThreshold_ERMS = 1024;
  FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;

procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    mov %ecx, (%eax) { Write first 4 bytes unaligned. }
    push %ecx { pattern }
    push %edi
    mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
    xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
    shl $3, %ecx { ecx = misalignment of x in bits. }
    rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
    add %edi, %edx { edx = x end }
    lea -1(%edx), %ecx { ecx = x end - 1. }
    add $4, %edi
    and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
    and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
    sub %edi, %ecx { ecx = byte count between them. }
    shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
    rep stosl
    pop %edi
    pop %ecx { restore the unrotated pattern }
    mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
  181. {$endif FillChar/Word/DWord required.}
label
  { Cross-procedure entry point: FillChar_* and FillQWord jump here directly. }
  FillXxxx_MoreThanTwoXMMs;

procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
const
  NtThreshold = 4 * 1024 * 1024; { Above this, use non-temporal stores to avoid trashing the cache. }
asm
    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    cmp $32, %edx
    ja .LMoreThanTwoVectors
    ret                     { <= 32 bytes: the two (possibly overlapping) unaligned stores covered everything }
    .byte 144 { Turn .balign 16 before .L64x_Body into a no-op. }

    { x can start and end misaligned on the vector boundary:
      x = ~~][H1][H2][...][T2][T1]~
            [UH]            [UT]
      UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
.LMoreThanTwoVectors:
    push %esi
    mov %ecx, %esi { esi = pattern }
    mov %eax, %ecx
    shl $3, %ecx { ecx = misalignment of x in bits }
    rol %cl, %esi { misalign the pattern }
    movd %esi, %xmm0
    pshufd $0, %xmm0, %xmm0
    pop %esi

{ FillChar (to skip the misaligning above) and FillQWord jump here.
  eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs:
    lea -65(%eax,%edx), %ecx
    and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
    mov %ecx, %edx { Remember T4 to edx. }
    and $-16, %eax { eax = H1 − 16. }
    sub %eax, %ecx { ecx = aligned byte count − 48. }
    movdqa %xmm0, 16(%eax) { Write H1. }
    cmp $32-48, %ecx
    jle .LOneAlignedTailWrite
    movdqa %xmm0, 32(%eax) { Write H2. }
    cmp $64-48, %ecx
    jle .LTwoAlignedTailWrites
    sub $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
    jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
    add $48, %eax { eax = H3. }
    cmp $NtThreshold, %ecx
    jae .L64xNT_Body { Huge fill: switch to non-temporal stores. }

.balign 16 { no-op }
.L64x_Body:
    movdqa %xmm0, (%eax)
    movdqa %xmm0, 16(%eax)
    movdqa %xmm0, 32(%eax)
    movdqa %xmm0, 48(%eax)
    add $64, %eax
    sub $64, %ecx
    ja .L64x_Body
.LFourAlignedTailWrites:
    movdqa %xmm0, (%edx) { T4 }
    movdqa %xmm0, 16(%edx) { T3 }
.LTwoAlignedTailWrites:
    movdqa %xmm0, 32(%edx) { T2 }
.LOneAlignedTailWrite:
    movdqa %xmm0, 48(%edx) { T1 }
    ret

.balign 16
.L64xNT_Body:
    movntdq %xmm0, (%eax)
    movntdq %xmm0, 16(%eax)
    movntdq %xmm0, 32(%eax)
    movntdq %xmm0, 48(%eax)
    add $64, %eax
    sub $64, %ecx
    ja .L64xNT_Body
    sfence                  { order the non-temporal stores before the tail writes/return }
    jmp .LFourAlignedTailWrites
end;
  258. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  259. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  260. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
  261. {$ifndef CPUX86_HAS_SSE2}
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16).
  Pre-SSE2 fallback: unaligned head/tail dwords plus an aligned 8-bytes-per-iteration loop. }
asm
    mov %ecx, (%eax) { Write first 4 bytes. }
    lea -9(%eax,%edx), %edx
    mov %ecx, 5(%edx) { Write last 4 bytes. }
    and $-4, %edx { edx = loop bound. }
    push %esi
    mov %ecx, %esi { esi = pattern }
    mov %eax, %ecx
    shl $3, %ecx { ecx = misalignment of x in bits }
    rol %cl, %esi { misalign the pattern }
    add $4, %eax
    and $-4, %eax { eax = first 4-byte aligned position strictly after the start }
.balign 16
.L8xLoop:
    mov %esi, (%eax)
    mov %esi, 4(%eax)
    add $8, %eax
    cmp %edx, %eax
    jb .L8xLoop
    mov %esi, (%edx)        { final 8 bytes at the loop bound (may overlap the loop's last stores) }
    mov %esi, 4(%edx)
    pop %esi
end;
  287. {$endif ndef CPUX86_HAS_SSE2 (need Fill*_Plain)}
procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16.
  Branch-light "ladder" of overlapping dword stores; overlaps are harmless. }
asm
    mov %ecx, (%eax)        { bytes 0..3 }
    cmp $8, %edx
    jle .LLast4             { count <= 8: first + last dword cover everything }
    mov %ecx, 4(%eax)       { bytes 4..7 }
    mov %ecx, -8(%eax,%edx) { bytes count-8 .. count-5 }
.LLast4:
    mov %ecx, -4(%eax,%edx) { last 4 bytes }
end;
  299. {$endif FillChar/Word/DWord required.}
  300. {$endif FillChar/Word/DWord/QWord required.}
  301. {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  302. {$define FPC_SYSTEM_HAS_FILLCHAR}
procedure FillChar_3OrLess; assembler; nostackframe;
{ eax — x, cl — value, edx — byte count, Low(int32) <= edx <= 3. }
asm
    test %edx, %edx
    jle .LQuit              { count <= 0: nothing to do }
    mov %cl, (%eax)         { byte 0 }
    mov %cl, -1(%eax,%edx)  { last byte (aliases byte 0 when count = 1) }
    shr $1, %edx
    mov %cl, (%eax,%edx)    { middle byte (covers count = 3); overlaps are harmless }
.LQuit:
end;
  314. {$ifndef CPUX86_HAS_SSE2}
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
{ FillChar variant for CPUs without guaranteed SSE2; dispatches by count. }
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx  { replicate the byte into all four lanes of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
  325. {$endif ndef CPUX86_HAS_SSE2}
procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
{ FillChar for SSE2 CPUs without ERMSB: SSE2 up to the NoERMS threshold, then rep stos. }
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx  { replicate the byte into all four lanes of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jae FillXxxx_U32Pattern_RepStos_8OrMore
    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    cmp $32, %edx
    ja FillXxxx_MoreThanTwoXMMs { > 32 bytes: continue in the shared aligned-fill tail }
end;                            { <= 32 bytes: the two stores above covered everything }
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
{ FillChar for SSE2 CPUs with ERMSB: identical to FillChar_SSE2 except for the
  lower rep-stos threshold. }
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx  { replicate the byte into all four lanes of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jae FillXxxx_U32Pattern_RepStos_8OrMore
    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    cmp $32, %edx
    ja FillXxxx_MoreThanTwoXMMs { > 32 bytes: continue in the shared aligned-fill tail }
end;                            { <= 32 bytes: the two stores above covered everything }
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;

var
  { Active FillChar implementation; starts at the dispatcher and is repointed
    to the best variant once CPU features are known. }
  FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
begin
  { Before fpc_cpucodeinit: feature flags are not valid yet, so call a safe
    variant directly without caching the choice. }
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillChar_SSE2 {$else} FillChar_Plain {$endif} (x, count, value);
      exit;
    end;
  { Select and cache the best variant, then delegate this call to it. }
  if fast_large_repmovstosb then
    FillChar_Impl := @FillChar_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillChar_Impl := @FillChar_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillChar_Impl := @FillChar_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillChar_Impl(x, count, value);
end;

{ Public entry point: a single indirect call through the cached pointer. }
procedure FillChar(var x;count:SizeInt;value:byte);
begin
  FillChar_Impl(x, count, value);
end;
  384. {$endif FPC_SYSTEM_HAS_FILLCHAR}
  385. {$ifndef FPC_SYSTEM_HAS_FILLWORD}
  386. {$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord_3OrLess; assembler; nostackframe;
{ eax — x, cx — value, edx — word count, Low(int32) <= edx <= 3. }
asm
    test %edx, %edx
    jle .LQuit                { count <= 0: nothing to do }
    mov %cx, (%eax)           { word 0 }
    mov %cx, -2(%eax,%edx,2)  { last word (aliases word 0 when count = 1) }
    shr $1, %edx
    mov %cx, (%eax,%edx,2)    { middle word (covers count = 3); overlaps are harmless }
.LQuit:
end;
  397. {$ifndef CPUX86_HAS_SSE2}
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
{ FillWord variant for CPUs without guaranteed SSE2. }
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx              { edx = byte count }
    movzwl %cx, %ecx
    imul $0x00010001, %ecx    { replicate the word into both halves of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
  409. {$endif ndef CPUX86_HAS_SSE2}
procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
{ FillWord for SSE2 CPUs without ERMSB. }
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx              { edx = byte count }
    movzwl %cx, %ecx
    imul $0x00010001, %ecx    { replicate the word into both halves of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
{ FillWord for SSE2 CPUs with ERMSB: same as FillWord_SSE2 with the lower
  rep-stos threshold. }
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx              { edx = byte count }
    movzwl %cx, %ecx
    imul $0x00010001, %ecx    { replicate the word into both halves of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;

var
  { Active FillWord implementation; starts at the dispatcher and is repointed
    to the best variant once CPU features are known. }
  FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
begin
  { Before fpc_cpucodeinit: feature flags are not valid yet, so call a safe
    variant directly without caching the choice. }
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillWord_SSE2 {$else} FillWord_Plain {$endif} (x, count, value);
      exit;
    end;
  { Select and cache the best variant, then delegate this call to it. }
  if fast_large_repmovstosb then
    FillWord_Impl := @FillWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillWord_Impl := @FillWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillWord_Impl := @FillWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillWord_Impl(x, count, value);
end;

{ Public entry point: a single indirect call through the cached pointer. }
procedure FillWord(var x;count:SizeInt;value:word);
begin
  FillWord_Impl(x, count, value);
end;
  460. {$endif FPC_SYSTEM_HAS_FILLWORD}
  461. {$ifndef FPC_SYSTEM_HAS_FILLDWORD}
  462. {$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord_4OrLess; assembler; nostackframe;
{ eax — x, ecx — value, edx — dword count, Low(int32) <= edx <= 4. }
asm
    cmp $1, %edx
    jl .LQuit                  { count <= 0: nothing to do }
    mov %ecx, (%eax)           { dword 0 }
    je .LQuit                  { count = 1: done }
    mov %ecx, 4(%eax)          { dword 1 }
    mov %ecx, -8(%eax,%edx,4)  { dword count-2 }
    mov %ecx, -4(%eax,%edx,4)  { dword count-1; overlaps are harmless }
.LQuit:
end;
  474. {$ifndef CPUX86_HAS_SSE2}
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
{ FillDWord variant for CPUs without guaranteed SSE2; value already fills ecx. }
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx               { edx = byte count }
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
  482. {$endif ndef CPUX86_HAS_SSE2}
procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
{ FillDWord for SSE2 CPUs without ERMSB. }
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx               { edx = byte count }
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
{ FillDWord for SSE2 CPUs with ERMSB: same as FillDWord_SSE2 with the lower
  rep-stos threshold. }
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx               { edx = byte count }
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;

var
  { Active FillDWord implementation; starts at the dispatcher and is repointed
    to the best variant once CPU features are known. }
  FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
begin
  { Before fpc_cpucodeinit: feature flags are not valid yet, so call a safe
    variant directly without caching the choice. }
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillDWord_SSE2 {$else} FillDWord_Plain {$endif}(x, count, value);
      exit;
    end;
  { Select and cache the best variant, then delegate this call to it. }
  if fast_large_repmovstosb then
    FillDWord_Impl := @FillDWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillDWord_Impl := @FillDWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillDWord_Impl := @FillDWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillDWord_Impl(x, count, value);
end;

{ Public entry point: a single indirect call through the cached pointer. }
procedure FillDWord(var x;count:SizeInt;value:dword);
begin
  FillDWord_Impl(x, count, value);
end;
  525. {$endif FPC_SYSTEM_HAS_FILLDWORD}
  526. {$ifndef FPC_SYSTEM_HAS_FILLQWORD}
  527. {$define FPC_SYSTEM_HAS_FILLQWORD}
  528. {$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value.
  Simple dword-pair store loop; only used when SSE2 is unavailable at run time. }
asm
    test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
    jle .LQuit
    push %esi
    mov 4+4(%esp), %esi { esi = value[0:31] }
    mov 4+8(%esp), %ecx { ecx = value[32:63] }
.balign 16
.LLoop:
    mov %esi, (%eax)    { low dword of the qword }
    mov %ecx, 4(%eax)   { high dword }
    add $8, %eax
    sub $1, %edx
    jnz .LLoop
    pop %esi
.LQuit:
end;
  547. {$endif ndef CPUX86_HAS_SSE2}
procedure {$ifdef CPUX86_HAS_SSE2} FillQWord {$else} FillQWord_SSE2 {$endif}(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value (8 bytes, passed on the stack). }
asm
    cmp $4, %edx
    jle .L4OrLess
    movq 4(%esp), %xmm0
    punpcklqdq %xmm0, %xmm0 { xmm0 = value:value }
    { Stack is 12 bytes:
      [esp] = return address, [esp + 4] = value (not required anymore).
      Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
      [esp] = return address. }
    mov (%esp), %ecx
    add $8, %esp
    mov %ecx, (%esp)
    shl $3, %edx            { edx = byte count }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
    jz FillXxxx_MoreThanTwoXMMs
    mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
    shl $3, %ecx
    and $63, %ecx           { ecx = misalignment in bits, 8..56 here }
    movd %ecx, %xmm2
    movdqa %xmm0, %xmm1
    psllq %xmm2, %xmm1
    neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
    and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
    movd %ecx, %xmm2
    psrlq %xmm2, %xmm0
    por %xmm1, %xmm0        { xmm0 = rotated pattern for aligned stores }
    jmp FillXxxx_MoreThanTwoXMMs
.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
    cmp $1, %edx
    jl .LQuit
    mov 4(%esp), %ecx       { ecx = low dword of value }
    mov %ecx, (%eax)
    je .LSecondHalfOf1      { count = 1: only the high dword remains }
    mov %ecx, 8(%eax)
    mov %ecx, -16(%eax,%edx,8)
    mov %ecx, -8(%eax,%edx,8)
    mov 8(%esp), %ecx       { ecx = high dword of value }
    mov %ecx, 4(%eax)
    mov %ecx, 12(%eax)
    mov %ecx, -12(%eax,%edx,8)
    mov %ecx, -4(%eax,%edx,8)
.LQuit:
    ret $8                  { pop the 8-byte value argument }
.LSecondHalfOf1:
    mov 8(%esp), %ecx
    mov %ecx, 4(%eax)
end;
  599. {$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;

var
  { Active FillQWord implementation; starts at the dispatcher and is repointed
    to the best variant once CPU features are known. }
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;

procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
begin
  { Before fpc_cpucodeinit: feature flags are not valid yet, so call the plain
    variant directly without caching the choice. }
  if not fpc_cpucodeinit_performed then
    begin
      FillQWord_Plain(x, count, value);
      exit;
    end;
  { Select and cache the best variant, then delegate this call to it. }
  if has_sse2_support then
    FillQWord_Impl := @FillQWord_SSE2
  else
    FillQWord_Impl := @FillQWord_Plain;
  FillQWord_Impl(x, count, value);
end;

{ Public entry point: a single indirect call through the cached pointer. }
procedure FillQWord(var x;count:SizeInt;value:qword);
begin
  FillQWord_Impl(x, count, value);
end;
  620. {$endif ndef CPUX86_HAS_SSE2 (need FillQWord dispatcher)}
  621. {$endif FPC_SYSTEM_HAS_FILLQWORD}
  622. {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  623. {$define FPC_SYSTEM_HAS_INDEXBYTE}
  624. {$ifndef CPUX86_HAS_SSE2}
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, cl = b.
  Returns the zero-based index of the first occurrence of b in buf, or -1.
  Uses the SWAR "has-zero-byte" trick ((x-0x01010101) and (not x) and 0x80808080)
  on dword pairs after aligning to 8 bytes. }
asm
    test %edx,%edx
    jz .Lnothing0           { len = 0: nothing to scan }
    push %eax { save initial value of 'buf' }
    test $3,%al
    jz .Laligned4
.Lalignloop: { align to 4 bytes }
    cmp %cl,(%eax)
    je .Lfoundateax
    inc %eax
    dec %edx
    jz .Lnothing1
    test $3,%al
    jnz .Lalignloop
.Laligned4: { align to 8 bytes }
    push %esi
    push %edi
    mov %cl,%ch { prepare pattern }
    movzwl %cx,%esi
    shl $16,%ecx
    or %esi,%ecx            { ecx = b replicated into all 4 bytes }
    test $7,%al
    jz .Lloop
    test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = −1). }
    jl .Ldontfixuplen
    add $4,%edx
.Ldontfixuplen:
    sub $4,%eax
    jmp .Lalignfrom4to8
.balign 16
.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
    mov (%eax),%esi { load dword }
    xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
    lea -0x01010101(%esi),%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080 }
    jnz .Lfound0 { one of the bytes matches }
.Lalignfrom4to8:
    mov 4(%eax),%esi        { second unroll: same test on the next dword }
    xor %ecx,%esi
    lea -0x01010101(%esi),%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi
    jnz .Lfound1
    add $8,%eax
    sub $8,%edx
    ja .Lloop
.Lnothing3:
    pop %edi
    pop %esi
.Lnothing1:
    pop %edx                { discard the saved 'buf' }
.Lnothing0:
    or $-1,%eax             { result = -1: not found }
    ret
.Lfound1:
    sub $4,%edx             { the match is in the second dword of the pair }
    jbe .Lnothing3
    add $4,%eax
.Lfound0:
    bsf %esi,%esi           { bit index of lowest set flag bit = 8 * matching byte index }
    shr $3,%esi
    cmp %edx,%esi { Garbage after remaining length? }
    jae .Lnothing3
    add %esi,%eax
    pop %edi
    pop %esi
.Lfoundateax:
    pop %ecx                { ecx = original 'buf' }
    sub %ecx,%eax           { result = match address - buf }
end;
  700. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ SSE2 IndexByte: eax = buf, edx = len, ecx = b.
  Returns the index of the first byte equal to b in buf[0..len-1], -1 if none.
  Uses aligned 16-byte loads throughout; an aligned load never crosses a page
  boundary, so over-reading before/after the buffer is safe. The first
  (possibly partial) vector's invalid low bits are masked off via shl/shr. }
asm
    test %edx, %edx
    jz .Lnotfound { exit if len=0 }
    push %ebx
    movd %ecx, %xmm1
    lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
    punpcklbw %xmm1, %xmm1
    and $-0x10, %ecx { first aligned address after buf }
    punpcklbw %xmm1, %xmm1
    pshufd $0, %xmm1, %xmm1        { xmm1 = b broadcast to all 16 bytes }
    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
    sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
    pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx { shift valid bits into high word }
    and $0xffff0000, %ebx { clear low word containing invalid bits }
    shr %cl, %ebx { shift back }
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx                 { bit number = byte offset inside the vector }
    lea -16(%ecx,%ebx), %eax       { index = bytes consumed - 16 + offset }
    pop %ebx
    cmp %eax, %edx { check against the buffer length }
    jbe .Lnotfound
    ret
.balign 16
.Lloop:
    movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
    add $16, %ecx { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx                 { more bytes than already consumed? }
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
end;
  742. {$ifndef CPUX86_HAS_SSE2}
  743. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;
  744. var
  745. IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;
  746. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
  747. begin
  748. if not fpc_cpucodeinit_performed then
  749. exit(IndexByte_Plain(buf,len,b));
  750. if has_sse2_support then
  751. IndexByte_Impl:=@IndexByte_SSE2
  752. else
  753. IndexByte_Impl:=@IndexByte_Plain;
  754. result:=IndexByte_Impl(buf,len,b);
  755. end;
  756. function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
  757. begin
  758. result:=IndexByte_Impl(buf,len,b);
  759. end;
  760. {$endif ndef CPUX86_HAS_SSE2}
  761. {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  762. {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  763. {$define FPC_SYSTEM_HAS_INDEXWORD}
  764. {$ifndef CPUX86_HAS_SSE2}
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ Pre-SSE2 IndexWord: eax = buf, edx = len, cx = b.
  Simple wordwise scan; returns the word index of the first match, -1 if none. }
asm
    test %edx, %edx
    jz .LNotFound                  { len = 0 }
    push %eax                      { save original buf to compute index later }
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    cmp %cx, (%eax)
    je .LFound
    add $2, %eax
    dec %edx
    jnz .LWordwise_Body
    pop %edx                       { discard saved buf }
.LNotFound:
    or $-1, %eax
    ret
.LFound:
    pop %edx                       { edx = original buf }
    sub %edx, %eax                 { byte offset of the match }
    shr $1, %eax                   { convert to word index }
end;
  785. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexWord {$else} IndexWord_SSE2 {$endif} (const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ SSE2 IndexWord: eax = buf, edx = len, ecx = b.
  Returns the word index of the first word equal to b in buf[0..len-1], -1 if
  none. Aligned 16-byte loads, so over-reads never cross a page. If buf is odd
  (not word-aligned), matches can straddle two vectors; the .Lunaligned path
  compares bytewise against a byte-swapped pattern and ANDs adjacent mask bits
  across iterations to detect straddling matches. }
asm
    test %edx, %edx { exit if len=0 }
    je .Lnotfound
    push %ebx
    movd %ecx, %xmm1
    punpcklwd %xmm1, %xmm1
    pshufd $0, %xmm1, %xmm1        { xmm1 = b broadcast to all 8 words }
    lea 16(%eax), %ecx
    and $-16, %ecx                 { ecx = first aligned address after buf }
    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
    sub %eax, %ecx                 { ecx = number of valid bytes in first vector }
    test $1, %eax { if buffer isn't aligned to word boundary, }
    jnz .Lunaligned { use a different algorithm }
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx                  { mask off bits belonging to bytes before buf: }
    and $0xffff0000, %ebx          { shift valid bits into the high word, }
    shr %cl, %ebx                  { clear the low word, shift back }
    shr $1, %ecx { ecx=number of valid bytes }
    test %ebx, %ebx
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx                 { byte position of match inside vector }
    shr $1, %ebx { in words }
    lea -8(%ecx,%ebx), %eax        { words consumed - 8 + offset }
    pop %ebx
    cmp %eax, %edx
    jbe .Lnotfound { if match is after the specified length, ignore it }
    ret
.balign 16
.Lloop:
    movdqa (%eax,%ecx,2), %xmm0    { ecx counts words here }
    add $8, %ecx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
    ret
.Lunaligned:
    push %esi
    movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
    psllw $8, %xmm1 { swap bytes of each word of pattern) }
    psrlw $8, %xmm2
    por %xmm2, %xmm1
    pcmpeqb %xmm1, %xmm0           { bytewise compare from here on }
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx                  { same masking of pre-buf bytes as above }
    and $0xffff0000, %ebx
    shr %cl, %ebx
    xor %esi, %esi { nothing to merge yet }
    add %edx, %edx { length words -> bytes }
    jmp .Lcontinue_u
.balign 16
.Lloop_u:
    movdqa (%eax,%ecx), %xmm0      { ecx counts bytes here }
    add $16, %ecx
    pcmpeqb %xmm1, %xmm0 { compare by bytes }
    shr $16, %esi { bit 16 shifts into 0 }
    pmovmskb %xmm0, %ebx
.Lcontinue_u:
    shl $1, %ebx { 15:0 -> 16:1 }
    or %esi, %ebx { merge bit 0 from previous round }
    mov %ebx, %esi
    shr $1, %ebx { now AND together adjacent pairs of bits }
    and %esi, %ebx
    and $0x5555, %ebx { also reset odd bits }
    jnz .Lmatch_u
    cmp %ecx, %edx
    ja .Lloop_u
.Lnotfound_u:
    pop %esi
    pop %ebx
    or $-1, %eax
    ret
.Lmatch_u:
    bsf %ebx, %ebx
    lea -16(%ecx,%ebx), %eax       { byte index of the match }
    cmp %eax, %edx
    jbe .Lnotfound_u { if match is after the specified length, ignore it }
    sar $1, %eax { in words }
    pop %esi
    pop %ebx
end;
  876. {$ifndef CPUX86_HAS_SSE2}
  877. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;
  878. var
  879. IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;
  880. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
  881. begin
  882. if not fpc_cpucodeinit_performed then
  883. exit(IndexWord_Plain(buf,len,b));
  884. if has_sse2_support then
  885. IndexWord_Impl:=@IndexWord_SSE2
  886. else
  887. IndexWord_Impl:=@IndexWord_Plain;
  888. result:=IndexWord_Impl(buf,len,b);
  889. end;
  890. function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
  891. begin
  892. result:=IndexWord_Impl(buf,len,b);
  893. end;
  894. {$endif ndef CPUX86_HAS_SSE2}
  895. {$endif FPC_SYSTEM_HAS_INDEXWORD}
  896. {$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
  897. {$define FPC_SYSTEM_HAS_INDEXDWORD}
  898. {$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
{ Pre-SSE2 IndexDWord: eax = buf, edx = len, ecx = b.
  Dwordwise scan; returns the dword index of the first match, -1 if none. }
asm
    push %eax                      { save original buf }
    sub $4, %eax                   { compensated by the first 'add $4' below }
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    add $4, %eax
    sub $1, %edx
    jb .LNotFound                  { borrow: count exhausted }
    cmp %ecx, (%eax)
    jne .LDWordwise_Next
    pop %edx                       { edx = original buf }
    sub %edx, %eax                 { byte offset of the match }
    shr $2, %eax                   { convert to dword index }
    ret
.LNotFound:
    pop %edx                       { discard saved buf }
    mov $-1, %eax
end;
  917. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexDWord {$else} IndexDWord_SSE2 {$endif} (const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
{ SSE2 IndexDWord: eax = buf, edx = len, ecx = b.
  Compares 4 dwords per iteration with unaligned loads; the final partial
  group is handled by stepping back and re-reading the last full vector, so
  no memory beyond buf[len-1] decides the result. Short inputs (len <= 4)
  use the scalar loop at .LDwordwise_Body. Returns dword index or -1. }
asm
    push %eax                      { save original buf }
    sub $4, %edx
    jle .LDwordwise_Prepare        { fewer than 5 dwords: go scalar }
    movd %ecx, %xmm1
    pshufd $0, %xmm1, %xmm1        { xmm1 = b broadcast to all 4 dwords }
.balign 16 { 1-byte NOP. }
.L4x_Body:
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jnz .LFoundAtMask
    add $16, %eax
    sub $4, %edx
    jg .L4x_Body
    lea (%eax,%edx,4), %eax        { edx <= 0: step back onto the exact last 4 dwords }
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jz .LNothing
.LFoundAtMask:
    bsf %ecx, %ecx                 { byte position of match inside the vector }
    add %ecx, %eax
.LFoundAtEax:
    pop %edx                       { edx = original buf }
    sub %edx, %eax                 { byte offset of the match }
    shr $2, %eax                   { convert to dword index }
    ret
    nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
    add $3, %edx                   { edx = len - 1 again }
    cmp $-1, %edx
    je .LNothing                   { len = 0 }
.balign 16 { no-op }
.LDwordwise_Body:
    cmp (%eax), %ecx
    je .LFoundAtEax
    add $4, %eax
    sub $1, %edx
    jae .LDwordwise_Body
.LNothing:
    pop %edx                       { discard saved buf }
    or $-1, %eax
end;
  965. {$ifndef CPUX86_HAS_SSE2}
  966. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;
  967. var
  968. IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;
  969. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
  970. begin
  971. if not fpc_cpucodeinit_performed then
  972. exit(IndexDWord_Plain(buf,len,b));
  973. if has_sse2_support then
  974. IndexDWord_Impl:=@IndexDWord_SSE2
  975. else
  976. IndexDWord_Impl:=@IndexDWord_Plain;
  977. result:=IndexDWord_Impl(buf,len,b);
  978. end;
  979. function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
  980. begin
  981. result:=IndexDWord_Impl(buf,len,b);
  982. end;
{$endif ndef CPUX86_HAS_SSE2}
  984. {$endif FPC_SYSTEM_HAS_INDEXDWORD}
  985. {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
  986. {$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
{ Scalar IndexQWord: compares each qword as two 32-bit halves.
  Returns the qword index of the first match, -1 if none.
  The stack slot of b's low dword is reused as scratch to remember the
  original buf; callee pops the 8 bytes of b (ret $8). }
asm
    push %ebx
    mov 8(%esp), %ecx { ecx = b[0:31] }
    mov 12(%esp), %ebx { ebx = b[32:63] }
    mov %eax, 8(%esp) { remember original buf }
    sub $8, %eax                   { compensated by the first 'add $8' below }
.balign 16 { no-op }
.LQWordwise_Next:
    add $8, %eax
    sub $1, %edx
    jb .LNotFound                  { borrow: count exhausted }
    cmp %ecx, (%eax)               { low half }
    jne .LQWordwise_Next
    cmp %ebx, 4(%eax)              { high half }
    jne .LQWordwise_Next
    sub 8(%esp), %eax              { byte offset from original buf }
    pop %ebx
    shr $3, %eax                   { convert to qword index }
    ret $8
.LNotFound:
    pop %ebx
    mov $-1, %eax
    { NOTE(review): falls through to the compiler-emitted epilogue, which is
      expected to perform the final 'ret $8' — confirm against codegen. }
end;
function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
{ SSE4.1 IndexQWord: scans 6 qwords (3 XMM vectors) per iteration using
  pcmpeqq + ptest. len <= 6 is delegated to IndexQWord_Plain via a tail
  jump (arguments are still in place). The final partial group re-reads the
  last 3 vectors. Returns the qword index of the first match, -1 if none. }
asm
    cmp $6, len
    jle IndexQWord_Plain           { tail-call the scalar version for short inputs }
    movddup 4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
    mov %eax, %ecx { ecx = original buf }
    sub $6, len
.balign 16
.L6x_Loop:
    movdqu (%eax), %xmm1
    pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
    movdqu 16(%eax), %xmm2
    pcmpeqq %xmm0, %xmm2
    por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
    movdqu 32(%eax), %xmm3
    pcmpeqq %xmm0, %xmm3
    por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
    ptest %xmm3, %xmm3             { ZF set iff no vector matched }
    jnz .LFound
    add $48, %eax
    sub $6, len
    jge .L6x_Loop
    lea (%eax,%edx,8), %eax { Point to last 3 vectors. }
    cmp $-5, len
    jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
    mov $-1, %eax
    ret $8
.LFound:
    { Narrow down which of the three vectors contained the first match. }
    sub %ecx, %eax                 { eax = byte offset of vector 0 }
    ptest %xmm1, %xmm1
    jnz .LFoundAtXmm1
    ptest %xmm2, %xmm2
    jnz .LFoundAtXmm2
    add $16, %eax
    movdqa %xmm3, %xmm2
.LFoundAtXmm2:
    add $16, %eax
    movdqa %xmm2, %xmm1
.LFoundAtXmm1:
    pmovmskb %xmm1, %ecx
    bsf %ecx, %ecx                 { byte position inside the vector }
    add %ecx, %eax
    shr $3, %eax                   { byte offset -> qword index }
end;
  1057. {$ifndef CPUX86_HAS_SSE4_1}
  1058. function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;
  1059. var
  1060. IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;
  1061. function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
  1062. begin
  1063. if not fpc_cpucodeinit_performed then
  1064. exit(IndexQWord_Plain(buf,len,b));
  1065. if has_sse41_support then
  1066. IndexQWord_Impl:=@IndexQWord_SSE41
  1067. else
  1068. IndexQWord_Impl:=@IndexQWord_Plain;
  1069. result:=IndexQWord_Impl(buf,len,b);
  1070. end;
  1071. function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
  1072. begin
  1073. result:=IndexQWord_Impl(buf,len,b);
  1074. end;
  1075. {$endif ndef CPUX86_HAS_SSE4_1}
  1076. {$endif FPC_SYSTEM_HAS_INDEXQWORD}
  1077. {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  1078. {$define FPC_SYSTEM_HAS_COMPAREBYTE}
  1079. {$ifndef CPUX86_HAS_SSE2}
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Pre-SSE2 CompareByte: returns <0/0/>0 as buf1 compares below/equal/above
  buf2 over len bytes (unsigned, memcmp-style). Compares dword-at-a-time;
  on a differing dword, both are byte-swapped to big-endian so a single
  unsigned compare yields the memcmp ordering of the first differing byte. }
asm
    { eax = buf1, edx = buf2, ecx = len }
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    cmp $3, %ecx
    jle .LBytewise_Prepare
    { Align buf1 on 4 bytes. }
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
    and $-4, %eax
    sub %eax, %ecx
.balign 16
.L4x_Next:
    add $4, %eax
    sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
    jle .LLast4
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    je .L4x_Next
.L4xDiffer:
    mov (%eax), %edx               { edx = buf1 dword, ebx = buf2 dword }
{$ifdef CPUX86_HAS_BSWAP}
    bswap %ebx
    bswap %edx
{$else}
    { bswap-free byte reversal: rotate halves, then the whole register }
    rol $8, %bx
    rol $16, %ebx
    rol $8, %bx
    rol $8, %dx
    rol $16, %edx
    rol $8, %dx
{$endif}
    cmp %ebx, %edx                 { big-endian compare = memcmp order }
.LDoSbb:
    sbb %eax, %eax                 { eax = -1 if borrow (buf1 < buf2), else 0 }
    or $1, %eax                    { -1 stays -1, 0 becomes +1 }
    pop %ebx
    ret
.LLast4:
    add %ecx, %eax                 { step back to the exact last 4 bytes }
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    xor %eax, %eax
    pop %ebx
    ret
.LBytewise_Prepare:
    sub $1, %ecx
    jb .LNothing
.balign 16 { no-op }
.LBytewise_Body:
    movzbl (%edx,%eax), %ebx
    cmp %bl, (%eax)
    jne .LDoSbb
    add $1, %eax
    sub $1, %ecx
    jae .LBytewise_Body
.LNothing:
    xor %eax, %eax
    pop %ebx
end;
  1144. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} CompareByte {$else} CompareByte_SSE2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
{ SSE2 CompareByte (memcmp-style): returns <0/0/>0 over len bytes.
  Strategy by size: len <= 1 scalar; 2..15 bytes via one over-reading XMM
  compare when neither buffer is within 16 bytes of a page end, otherwise
  overlapping dword/word scalar compares; >= 16 bytes via XMM pairs with a
  32-byte aligned main loop. 'inc %bx' turns the all-equal mask 0xffff into
  0 so ZF means "vectors equal". }
asm
    { eax = buf1, edx = buf2, ecx = len }
    cmp $1, %ecx
    jle .L1OrLess
    push %ebx
    cmp $16, %ecx
    jae .LVecOrMore
    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
    mov %eax, %ebx
    or %edx, %ebx
    and $4095, %ebx
    cmp $4080, %ebx
    ja .LCantOverReadBoth
    { Over-read both as XMMs. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
    jz .LNothing
    bsf %ebx, %ebx
    cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
    jae .LNothing
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax                 { difference of first differing bytes }
    pop %ebx
    ret
.LNothing:
    pop %ebx
    xor %eax, %eax
    ret
.LVecOrMore:
    { Compare first vectors. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    sub $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
    jbe .LLastVec
    { Compare second vectors. }
    movdqu 16(%eax), %xmm0
    movdqu 16(%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec1Differs
    { More than four vectors: aligned loop. }
    cmp $32, %ecx
    ja .LAligned32xLoop_Prepare
    { Compare last two vectors. }
    movdqu (%eax,%ecx), %xmm0
    movdqu (%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm2Differs
.LLastVec:
    movdqu 16(%eax,%ecx), %xmm0    { last 16 bytes: ecx = len - 32 here }
    movdqu 16(%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm1Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVecEm2Differs:
    sub $16, %ecx                  { difference was one vector earlier }
.LVecEm1Differs:
    bsf %ebx, %ebx
    add %ecx, %ebx                 { ebx = (len-32|len-48) + in-vector offset }
    movzbl 16(%eax,%ebx), %eax
    movzbl 16(%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
    nop { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LAligned32xLoop_Prepare:
    lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
    sub %eax, %edx { edx = buf2 - buf1 }
    and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned32xLoop_Body:
    add $32, %eax
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu 16(%edx,%eax), %xmm1
    pcmpeqb 16(%eax), %xmm1
    pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %ebx
    inc %bx
    jnz .LAligned32xLoop_TwoVectorsDiffer
    sub $32, %ecx
    ja .LAligned32xLoop_Body
    { Compare last two vectors after the loop by doing one more loop iteration, modified. }
    lea 32(%eax,%ecx), %eax
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm2
    pcmpeqb %xmm2, %xmm0
    movdqu 16(%edx,%eax), %xmm1
    movdqu 16(%eax), %xmm2
    pcmpeqb %xmm2, %xmm1
    pand %xmm0, %xmm1
    pmovmskb %xmm1, %ebx
    inc %bx
    jnz .LAligned32xLoop_TwoVectorsDiffer
    pop %ebx
    xor %eax, %eax
    ret
.LAligned32xLoop_TwoVectorsDiffer:
    add %eax, %edx { restore edx = buf2 }
    pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
    inc %cx
    jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    bsf %ecx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
.LVec1Differs:
    add $16, %eax                  { difference is in the second vector of the pair }
    add $16, %edx
.LVec0Differs:
    bsf %ebx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
.LCantOverReadBoth:
    { 2..15 bytes near a page end: overlapping scalar compares instead. }
    cmp $3, %ecx
    jle .L2to3
    push %esi
    mov (%eax), %ebx
    mov (%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    cmp $8, %ecx
    jbe .LLast4x
    mov 4(%eax), %ebx
    mov 4(%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    mov -8(%eax,%ecx), %ebx
    mov -8(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
.LLast4x:
    mov -4(%eax,%ecx), %ebx
    mov -4(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    pop %esi
    pop %ebx
    xor %eax, %eax
    ret
.L4xDiffer:
    bswap %ebx                     { big-endian compare gives memcmp order }
    bswap %esi
    cmp %esi, %ebx
    pop %esi
    sbb %eax, %eax
    or $1, %eax                    { -1 if buf1 < buf2, else +1 }
    pop %ebx
    ret
.L2to3:
    { Build 24-bit big-endian values: (first two bytes) shl 7 + last byte. }
    movzwl (%edx), %ebx
    bswap %ebx
    shr $1, %ebx
    mov -1(%edx,%ecx), %bl
    movzwl (%eax), %edx
    bswap %edx
    shr $1, %edx
    mov -1(%eax,%ecx), %dl
    mov %edx, %eax
    sub %ebx, %eax
    pop %ebx
    ret
.L1OrLess:
    jl .LUnbounded_Prepare         { len < 1: treated as unbounded scan until difference }
    movzbl (%eax), %eax
    movzbl (%edx), %edx
    sub %edx, %eax
    ret
.LUnbounded_Prepare:
    sub %eax, %edx { edx = buf2 - buf1 }
    test %ecx, %ecx
    jnz .LUnbounded_Body
    xor %eax, %eax                 { len = 0: equal }
    ret
.balign 16
.LUnbounded_Next:
    add $1, %eax
.LUnbounded_Body:
    movzbl (%edx,%eax), %ecx
    cmp %cl, (%eax)
    je .LUnbounded_Next
    sbb %eax, %eax
    or $1, %eax
end;
  1352. {$ifndef CPUX86_HAS_SSE2}
  1353. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1354. var
  1355. CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
  1356. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1357. begin
  1358. if not fpc_cpucodeinit_performed then
  1359. exit(CompareByte_Plain(buf1, buf2, len));
  1360. if has_sse2_support then
  1361. CompareByte_Impl:=@CompareByte_SSE2
  1362. else
  1363. CompareByte_Impl:=@CompareByte_Plain;
  1364. result:=CompareByte_Impl(buf1, buf2, len);
  1365. end;
  1366. function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
  1367. begin
  1368. result:=CompareByte_Impl(buf1, buf2, len);
  1369. end;
  1370. {$endif ndef CPUX86_HAS_SSE2 (need CompareByte dispatcher)}
  1371. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  1372. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  1373. {$define FPC_SYSTEM_HAS_COMPAREWORD}
  1374. {$ifndef CPUX86_HAS_SSE2}
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Pre-SSE2 CompareWord: returns <0/0/>0 comparing len words (unsigned word
  order). Aligns buf1 to 4 bytes with at most one leading word compare, then
  compares dword-at-a-time; a differing dword is re-examined word by word
  to find which of its two words differs. }
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
    cmp $1073741819, %ebx
    ja .LWordwise_Prepare
    test $2, %al
    je .LAlignedToPtrUintOrNaturallyMisaligned
    movzwl (%edx,%eax), %ebx       { one leading word to reach 4-byte alignment }
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
    sub $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
    sub $2, %ecx                   { last dword handled separately after the loop }
.balign 16
.LPtrUintWise_Next:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LPtrUintsDiffer
    add $4, %eax
    sub $2, %ecx
    jg .LPtrUintWise_Next
    lea (%eax,%ecx,2), %eax        { ecx <= 0: step back onto the exact last dword }
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LPtrUintsDiffer
    pop %ebx
    xor %eax, %eax
    ret
.LPtrUintsDiffer:
    cmp %bx, (%eax)                { first word of the differing dword }
    jne .LDoSbb
    shr $16, %ebx                  { else it is the second word }
    cmp %bx, 2(%eax)
.LDoSbb:
    sbb %eax, %eax                 { -1 if buf1 word < buf2 word, else 0 }
    or $1, %eax                    { -1 stays -1, 0 becomes +1 }
    pop %ebx
    ret
.balign 16
.LWordwise_Body:
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
.LWordwise_Prepare:
    sub $1, %ecx
    jnb .LWordwise_Body
    pop %ebx
    xor %eax, %eax
end;
  1428. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} CompareWord {$else} CompareWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ SSE2 CompareWord: returns <0/0/>0 comparing len words (unsigned word order).
  len < 8 uses one over-reading XMM compare when away from a page end,
  otherwise a scalar wordwise loop. len >= 8 compares XMM vectors with a
  16-byte loop aligned on buf1; the loop compares bytewise, and the mismatch
  byte offset is rounded down to a word boundary relative to the original
  buf1 (the 'and $-2' at .LAligned8xLoop_VecDiffers). }
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
    cmp $1073741821, %ebx
    ja .LWordwise_Prepare
    cmp $8, %ecx
    jge .LVecOrMore
    { 2..7 words: over-read both as one XMM if no page-end risk. }
    lea (%edx,%eax), %ebx
    or %eax, %ebx
    and $4095, %ebx
    cmp $4080, %ebx
    ja .LWordwise_Prepare
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx                        { 0xffff (all equal) wraps to 0 }
    jz .LNothing
    shl $1, %ecx { convert to bytes }
    bsf %ebx, %ebx
    cmp %ecx, %ebx                 { ignore differences beyond len }
    jb .LSubtractWords
.LNothing:
    pop %ebx
    xor %eax, %eax
    ret
.balign 16
.LWordwise_Body:
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
.LWordwise_Prepare:
    sub $1, %ecx
    jae .LWordwise_Body
    xor %eax, %eax
    pop %ebx
    ret
.LDoSbb:
    sbb %eax, %eax                 { -1 if buf1 word < buf2 word, else 0 }
    or $1, %eax                    { -1 stays -1, 0 becomes +1 }
    pop %ebx
    ret
.LVecOrMore:
    movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    shl $1, %ecx { convert to bytes }
    sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec
    push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %eax, %ecx
    and $-16, %eax { align buf1; +16 is performed by the loop. }
    sub %eax, %ecx
.balign 16
.LAligned8xLoop_Body:
    add $16, %eax
    movdqu (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0          { bytewise: buf1 is aligned, not necessarily word-aligned }
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LAligned8xLoop_VecDiffers
    sub $16, %ecx
    ja .LAligned8xLoop_Body
    pop %ebx { drop original buf1 }
.LLastVec:
    lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVec0Differs:
    bsf %ebx, %ebx                 { byte offset of the first differing word }
.LSubtractWords:
    add %eax, %edx                 { restore edx = buf2 }
    movzwl (%eax,%ebx), %eax
    movzwl (%edx,%ebx), %edx
    sub %edx, %eax                 { difference of the first differing words }
    pop %ebx
    ret
.LAligned8xLoop_VecDiffers:
    bsf %ebx, %ebx
    add %ebx, %eax                 { eax = address of the differing byte }
    pop %ecx                       { ecx = original buf1 }
    sub %ecx, %eax
    and $-2, %eax                  { round offset down to a word boundary }
    add %ecx, %eax
    movzwl (%edx,%eax), %edx
    movzwl (%eax), %eax
    sub %edx, %eax
    pop %ebx
end;
  1531. {$ifndef CPUX86_HAS_SSE2}
  1532. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1533. var
  1534. CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
  1535. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1536. begin
  1537. if not fpc_cpucodeinit_performed then
  1538. exit(CompareWord_Plain(buf1, buf2, len));
  1539. if has_sse2_support then
  1540. CompareWord_Impl:=@CompareWord_SSE2
  1541. else
  1542. CompareWord_Impl:=@CompareWord_Plain;
  1543. result:=CompareWord_Impl(buf1, buf2, len);
  1544. end;
  1545. function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
  1546. begin
  1547. result:=CompareWord_Impl(buf1, buf2, len);
  1548. end;
  1549. {$endif ndef CPUX86_HAS_SSE2 (need CompareWord dispatcher)}
  1550. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1551. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1552. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
  1553. {$ifndef CPUX86_HAS_SSE2}
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Pre-SSE2 CompareDWord: returns <0/0/>0 comparing len dwords
  (unsigned dword order). Simple dwordwise loop. }
asm
    sub $1, %ecx
    jb .LNothing                   { len <= 0: equal }
    push %ebx
    sub %eax, %edx                 { edx = buf2 - buf1 }
.balign 16
.LDwordwise_Body:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LDoSbb
    add $4, %eax
    sub $1, %ecx
    jnb .LDwordwise_Body
    pop %ebx
.LNothing:
    xor %eax, %eax
    ret
.LDoSbb:
    pop %ebx
    sbb %eax, %eax                 { -1 if buf1 dword < buf2 dword, else 0 }
    or $1, %eax                    { -1 stays -1, 0 becomes +1 }
end;
  1577. {$endif}
function {$ifdef CPUX86_HAS_SSE2} CompareDWord {$else} CompareDWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ SSE2 CompareDWord: returns <0/0/>0 comparing len dwords (unsigned dword
  order). len <= 4 (or out-of-range) uses a scalar dwordwise loop; otherwise
  XMM vectors with a 16-byte loop aligned on buf1; a bytewise mismatch from
  the loop is rounded down to a dword boundary relative to the original buf1
  ('and $-4' at .LAligned4xLoop_VecDiffers). }
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
    cmp $536870906, %ebx
    ja .LDwordwise_Prepare
    shl $2, %ecx { convert to bytes }
    movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx                        { 0xffff (all equal) wraps to 0 }
    jnz .LVec0Differs
    sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec
    push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %eax, %ecx
    and $-16, %eax { align buf1; +16 is performed by the loop. }
    sub %eax, %ecx
.balign 16
.LAligned4xLoop_Body:
    add $16, %eax
    movdqu (%eax,%edx), %xmm0
    pcmpeqb (%eax), %xmm0          { bytewise: buf1 is aligned, not necessarily dword-aligned }
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LAligned4xLoop_VecDiffers
    sub $16, %ecx
    ja .LAligned4xLoop_Body
    pop %ebx { drop original buf1 }
.LLastVec:
    lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
    movdqu (%edx,%eax), %xmm1
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVec0Differs:
    bsf %ebx, %ebx                 { byte offset of the first differing dword }
    add %eax, %edx { recover edx = buf2 }
    mov (%edx,%ebx), %edx
    cmp %edx, (%eax,%ebx)          { unsigned compare of the differing dwords }
    sbb %eax, %eax
    or $1, %eax                    { -1 if buf1 < buf2, else +1 }
    pop %ebx
    ret
.LAligned4xLoop_VecDiffers:
    bsf %ebx, %ebx
    add %ebx, %eax                 { eax = address of the differing byte }
    pop %ecx                       { ecx = original buf1 }
    sub %ecx, %eax
    and $-4, %eax                  { round offset down to a dword boundary }
    add %ecx, %eax
    mov (%edx,%eax), %edx
    cmp %edx, (%eax)
.LDoSbb:
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
.balign 16
.LDwordwise_Body:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LDoSbb
    add $4, %eax
.LDwordwise_Prepare:
    sub $1, %ecx
    jnb .LDwordwise_Body
    pop %ebx
    xor %eax, %eax
end;
  1655. {$ifndef CPUX86_HAS_SSE2}
  1656. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1657. var
  1658. CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
  1659. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1660. begin
  1661. if not fpc_cpucodeinit_performed then
  1662. exit(CompareDWord_Plain(buf1, buf2, len));
  1663. if has_sse2_support then
  1664. CompareDWord_Impl:=@CompareDWord_SSE2
  1665. else
  1666. CompareDWord_Impl:=@CompareDWord_Plain;
  1667. result:=CompareDWord_Impl(buf1, buf2, len);
  1668. end;
  1669. function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
  1670. begin
  1671. result:=CompareDWord_Impl(buf1, buf2, len);
  1672. end;
  1673. {$endif ndef CPUX86_HAS_SSE2 (need CompareDWord dispatcher)}
  1674. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1675. {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
  1676. {$define FPC_SYSTEM_HAS_INDEXCHAR0}
{ IndexChar0(buf, len, b): scans at most len bytes of buf for byte b,
  stopping early at a #0 terminator.  Returns the 0-based index of the
  first match, or -1 when a #0 or the len limit is reached first.
  Register use (regcall): eax = @buf, edx = len, cl = b.
  NOTE(review): when len = 0 the code jumps straight to .LFound and returns
  whatever the caller left in ecx (the b argument register) instead of -1 —
  confirm callers never pass len = 0. }
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
saveesi,saveebx : longint; { manual spill slots for callee-saved esi/ebx }
asm
movl %esi,saveesi
movl %ebx,saveebx
// Can't use scasb, or will have to do it twice, think this
// is faster for small "len"
movl %eax,%esi // Load address
movzbl %cl,%ebx // Load searchpattern
testl %edx,%edx
je .LFound
xorl %ecx,%ecx // zero index in Buf
xorl %eax,%eax // To make DWord compares possible
.balign 4
.LLoop:
movb (%esi),%al // Load byte
cmpb %al,%bl
je .LFound // byte the same?
incl %ecx
incl %esi
cmpl %edx,%ecx // Maximal distance reached?
je .LNotFound
testl %eax,%eax // Nullchar = end of search?
jne .LLoop
.LNotFound:
movl $-1,%ecx // Not found return -1
.LFound:
movl %ecx,%eax
movl saveesi,%esi // restore callee-saved registers
movl saveebx,%ebx
end;
  1709. {$endif FPC_SYSTEM_HAS_INDEXCHAR0}
  1710. {****************************************************************************
  1711. String
  1712. ****************************************************************************}
  1713. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1714. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{ FPC_SHORTSTR_TO_SHORTSTR: copies shortstring sstr into res, truncating
  the length to high(res) when sstr is longer than the destination.
  Stores the clamped length into res[0], then (tail-)calls Move to copy
  the character data. }
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
cmp (%ecx), %dl { length(sstr) fits into res? }
jbe .LEdxIsLen { use high(res) if length(sstr) does not fit }
movzbl (%ecx), %edx { use length(sstr) }
.LEdxIsLen:
mov %dl, (%eax) { store length to res[0] }
xchg %ecx, %edx { ecx = length = Move count, edx = sstr }
xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest }
inc %eax { skip the length byte of both strings }
inc %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
lea -8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
lea 8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
jmp Move { tail call: Move(sstr[1], res[1], len) }
{$endif FPC_PROFILE}
end;
{ FPC_SHORTSTR_ASSIGN: copies the shortstring at sstr to dstr, clamping the
  stored length to len (the destination capacity).  Uses rep movsb/movsl
  with a 4-byte destination alignment pass for copies of 7+ bytes. }
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
pushl %eax
pushl %ecx
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
movl dstr,%edi
movl sstr,%esi
xorl %eax,%eax
movl len,%ecx
lodsb { al = length(sstr), esi now points at sstr[1] }
cmpl %ecx,%eax
jbe .LStrCopy1
movl %ecx,%eax { clamp copy length to len }
.LStrCopy1:
stosb { store clamped length into dstr[0] }
cmpl $7,%eax { short copies: plain byte loop is cheaper }
jl .LStrCopy2
movl %edi,%ecx { Align on 32bits }
negl %ecx
andl $3,%ecx { ecx = bytes needed to align edi }
subl %ecx,%eax
rep
movsb
movl %eax,%ecx
andl $3,%eax { eax = trailing bytes after dword copy }
shrl $2,%ecx { ecx = dword count }
rep
movsl
.LStrCopy2:
movl %eax,%ecx { copy remaining bytes }
rep
movsb
popl %ecx
popl %eax
end ['ESI','EDI'];
end;
  1799. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1800. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1801. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{ FPC_SHORTSTR_COMPARE: three-way lexicographic comparison of two
  shortstrings.  Compares the first min(len(left),len(right)) characters
  via CompareByte; if that prefix is equal, the length difference decides.
  Returns <0, 0 or >0.  eax = left, edx = right. }
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ eax = left, edx = right }
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
push %ebx
movzbl (%eax), %ecx { ecx = len(left) }
movzbl (%edx), %ebx { ebx = len(right); ebx is callee-saved so it survives the CompareByte call below }
cmp %ebx, %ecx
{$ifdef CPUX86_HAS_CMOV}
cmovg %ebx, %ecx { ecx = min(len(left), len(right)) }
{$else}
jle .LEcxIsLen
mov %ebx, %ecx
.LEcxIsLen:
{$endif}
push %eax { save left }
inc %eax { skip length bytes }
inc %edx
{ stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
call CompareByte
{$else}
call CompareByte_Impl { manually inline CompareByte }
{$endif}
pop %edx { restore left }
test %eax, %eax { prefix decided the comparison? }
jnz .LReturn
movzbl (%edx), %eax { equal prefixes: result = len(left) - len(right) }
sub %ebx, %eax
.LReturn:
pop %ebx
end;
  1842. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1843. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1844. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1845. function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc; nostackframe;
  1846. { eax = left, edx = right }
  1847. asm
  1848. movzbl (%eax), %ecx
  1849. cmp (%edx), %cl
  1850. jne .LNotEqual
  1851. inc %eax
  1852. inc %edx
  1853. {$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  1854. jmp CompareByte
  1855. {$else}
  1856. jmp CompareByte_Impl { manually inline CompareByte }
  1857. {$endif}
  1858. .LNotEqual:
  1859. or $-1, %eax
  1860. end;
  1861. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1862. {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1863. {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{ FPC_PCHAR_TO_SHORTSTR: converts a #0-terminated PAnsiChar into a
  shortstring, truncating at high(res) characters.  A nil p yields the
  empty string.  Finds the terminator with IndexByte (scan bounded by
  high(res)), stores the resulting length into res[0], then (tail-)calls
  Move to copy the characters. }
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
test %ecx, %ecx { p = nil -> empty result }
jz .LEmpty
push %eax { save res }
push %ecx { save p }
push %edx { save high(res) }
mov %ecx, %eax { eax = IndexByte.buf }
{ edx is already high(res) = IndexByte.count.
Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’.
Generic and x86 versions are “safe”. }
xor %ecx, %ecx { ecx = 0 = IndexByte.value }
{ Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
leal -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
call IndexByte
{$else}
call IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
leal 12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
pop %ecx { ecx = high(res) = Move.len }
test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
cmovns %eax, %ecx
{$else}
js .LEcxIsLen
mov %eax, %ecx
.LEcxIsLen:
{$endif}
pop %eax { pop p to eax = Move.src }
pop %edx { pop res to edx }
mov %cl, (%edx) { res[0] := len }
inc %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal 12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
jmp .LReturn
{$else FPC_PROFILE}
jmp Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
movb $0, (%eax) { res := '' }
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
  1934. {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1935. {$IFNDEF INTERNAL_BACKTRACE}
  1936. {$define FPC_SYSTEM_HAS_GET_FRAME}
{ Returns the current frame pointer (ebp).  Being nostackframe, ebp still
  holds the caller's frame on entry. }
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
movl %ebp,%eax
end;
  1941. {$ENDIF not INTERNAL_BACKTRACE}
  1942. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
{ Returns the caller's program counter: with nostackframe, (%esp) is the
  return address on entry. }
Function Get_pc_addr : Pointer;assembler;nostackframe;
asm
movl (%esp),%eax
end;
  1947. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
{ Returns the return address saved in stack frame framebp (nil-safe).
  On win32 the frame pointer is additionally range-checked against the
  stack bounds before dereferencing. }
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
Result:=PPointer(framebp+4)^
else
Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
orl %eax,%eax { framebp = nil? then return nil }
jz .Lg_a_null
movl 4(%eax),%eax { return address lies just above the saved ebp }
.Lg_a_null:
end;
{$endif defined(win32)}
  1966. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
{ Returns the parent frame pointer saved in stack frame framebp (nil-safe).
  On win32 the frame pointer is additionally range-checked against the
  stack bounds before dereferencing. }
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
Result:=PPointer(framebp)^
else
Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
orl %eax,%eax { framebp = nil? then return nil }
jz .Lgnf_null
movl (%eax),%eax { saved caller ebp is at (framebp) }
.Lgnf_null:
end;
{$endif defined(win32)}
  1985. {$define FPC_SYSTEM_HAS_SPTR}
{ Returns the current stack pointer (esp as seen by the caller, plus the
  pushed return address — nostackframe, so no frame setup happens). }
Function Sptr : Pointer;assembler;nostackframe;
asm
movl %esp,%eax
end;
  1990. {****************************************************************************
  1991. Str()
  1992. ****************************************************************************}
  1993. {$if defined(disabled) and defined(regcall) }
  1994. {$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
  1995. {$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
  1996. label str_int_shortcut;
{ int_str(longword): unsigned variant.  Clears the sign register (edx) and
  jumps into the middle of the signed variant at str_int_shortcut, sharing
  all of its digit-generation code. }
procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
asm
pushl %esi
pushl %edi
pushl %ebx
mov %edx,%edi { edi = @s }
xor %edx,%edx { sign = 0 (value is unsigned) }
jmp str_int_shortcut
end;
{ int_str(longint): converts l to decimal text in s, truncating to high(s).
  The unsigned variant above enters at str_int_shortcut with edx (sign)
  already cleared.  Digit count is estimated as (bsr(l)+1)*1233 shr 12
  (1233/4096 ~ log10(2)) and corrected by one table lookup. }
procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
100000,1000000,10000000,
100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
push %esi
push %edi
push %ebx
movl %edx,%edi { edi = @s }
{ Calculate absolute value and put sign in edx}
cltd
xorl %edx,%eax
subl %edx,%eax
negl %edx { edx = 1 if l was negative, else 0 }
str_int_shortcut:
movl %ecx,%esi { esi = high(s) }
{Calculate amount of digits in ecx.}
xorl %ecx,%ecx
bsrl %eax,%ecx
incl %ecx
imul $1233,%ecx
shr $12,%ecx { ecx ~ floor(bits * log10(2)) }
{$ifdef FPC_PIC}
call fpc_geteipasebx
{$ifdef darwin}
movl digits-.Lpic(%ebx),%ebx
{$else}
addl $_GLOBAL_OFFSET_TABLE_,%ebx
movl digits@GOT(%ebx),%ebx
{$endif}
cmpl (%ebx,%ecx,4),%eax
{$else}
cmpl digits(,%ecx,4),%eax
{$endif}
cmc
adcl $0,%ecx {Nr. digits ready in ecx.}
{Write length & sign.}
lea (%edx,%ecx),%ebx { bl = digits + sign = length(s) }
movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
movw %bx,(%edi) { store length byte and, tentatively, '-' }
addl %edx,%edi { skip the '-' slot if negative }
subl %edx,%esi
{Skip digits beyond string length.}
movl %eax,%edx
subl %ecx,%esi
jae .Lloop_write
.balign 4
.Lloop_skip:
movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
mull %edx
shrl $3,%edx
decl %ecx
jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
incl %esi
jnz .Lloop_skip
{Write out digits.}
.balign 4
.Lloop_write:
movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
{Pre-add '0'}
leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
mull %edx
shrl $3,%edx
leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
subl %edx,%ebx
subl %eax,%ebx
movb %bl,(%edi,%ecx) { store digits from the least significant end }
decl %ecx
jnz .Lloop_write
.Ldone:
popl %ebx
popl %edi
popl %esi
end;
  2090. {$endif}
  2091. {****************************************************************************
  2092. Bounds Check
  2093. ****************************************************************************}
  2094. { do a thread-safe inc/dec }
  2095. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ Atomically decrements l; returns true when the new value is zero. }
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
asm
lock
decl (%eax)
setzb %al { al := (new value = 0) }
end;
  2102. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
{ Atomically increments l. }
procedure cpuinclocked(var l : longint);assembler;nostackframe;
asm
lock
incl (%eax)
end;
  2108. // inline SMP check and normal lock.
  2109. // the locked one is so slow, inlining doesn't matter.
  2110. function declocked(var l : longint) : boolean; inline;
  2111. begin
  2112. if not ismultithread then
  2113. begin
  2114. dec(l);
  2115. declocked:=l=0;
  2116. end
  2117. else
  2118. declocked:=cpudeclocked(l);
  2119. end;
  2120. procedure inclocked(var l : longint); inline;
  2121. begin
  2122. if not ismultithread then
  2123. inc(l)
  2124. else
  2125. cpuinclocked(l);
  2126. end;
{ Atomically decrements Target and returns the NEW value. }
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
movl $-1,%edx
lock
xaddl %edx, (%eax) { edx := old value; Target := old - 1 }
lea -1(%edx),%eax { return old - 1 = new value }
end;
{ Atomically increments Target and returns the NEW value. }
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
movl $1,%edx
lock
xaddl %edx, (%eax) { edx := old value; Target := old + 1 }
lea 1(%edx),%eax { return old + 1 = new value }
end;
{ Atomically stores Source into Target and returns the previous value.
  xchg with a memory operand is implicitly locked. }
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
xchgl (%eax),%edx { swap Target and Source atomically }
movl %edx,%eax { return the old Target }
end;
{ Atomically adds Source to Target and returns the PREVIOUS value. }
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
lock
xaddl %edx, (%eax) { edx := old Target; Target := old + Source }
movl %edx,%eax
end;
{ Atomic compare-and-swap: if Target = Comperand then Target := NewValue.
  Always returns the previous value of Target (cmpxchg leaves it in eax). }
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
xchgl %eax,%ecx { eax := Comperand, ecx := @Target }
lock
cmpxchgl %edx, (%ecx)
end;
{ 64-bit atomic compare-and-swap via cmpxchg8b:
  if Target = Comperand then Target := NewValue; returns the previous value.
  cmpxchg8b requires edx:eax = Comperand, ecx:ebx = NewValue and leaves the
  old value in edx:eax, which is exactly the int64 result convention. }
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
asm
pushl %ebx { ebx/edi are callee-saved }
pushl %edi
movl %eax,%edi { edi = @Target }
movl Comperand+4,%edx
movl Comperand+0,%eax
movl NewValue+4,%ecx
movl NewValue+0,%ebx
lock cmpxchg8b (%edi)
pop %edi
pop %ebx
end;
  2171. {****************************************************************************
  2172. FPU
  2173. ****************************************************************************}
const
{ Internal constants for use in system unit }
{ x87 FPU status-word exception flag bits }
FPU_Invalid = 1;
FPU_Denormal = 2;
FPU_DivisionByZero = 4;
FPU_Overflow = 8;
FPU_Underflow = $10;
FPU_StackUnderflow = $20;
FPU_StackOverflow = $40;
FPU_ExceptionMask = $ff;
{ SSE MXCSR exception flag bits }
MM_Invalid = 1;
MM_Denormal = 2;
MM_DivisionByZero = 4;
MM_Overflow = 8;
MM_Underflow = $10;
MM_Precicion = $20;
MM_ExceptionMask = $3f;
{ SSE MXCSR exception MASK bits (set = exception suppressed) }
MM_MaskInvalidOp = %0000000010000000;
MM_MaskDenorm = %0000000100000000;
MM_MaskDivZero = %0000001000000000;
MM_MaskOverflow = %0000010000000000;
MM_MaskUnderflow = %0000100000000000;
MM_MaskPrecision = %0001000000000000;
  2197. {$define FPC_SYSTEM_HAS_SYSINITFPU}
{ Intentionally empty on i386: FPU state is established per-thread by
  SysResetFPU from Default8087CW/DefaultMXCSR instead. }
Procedure SysInitFPU;
begin
end;
  2201. {$define FPC_SYSTEM_HAS_SYSRESETFPU}
{ Re-initialises the x87 FPU (fninit + reload control word from
  Default8087CW) and, when SSE is available, reloads MXCSR from
  DefaultMXCSR. }
Procedure SysResetFPU;
var
{ these locals are so we don't have to hack pic code in the assembler }
localmxcsr: dword;
localfpucw: word;
begin
localfpucw:=Default8087CW;
asm
fninit { reset x87 state without raising pending exceptions }
fwait
fldcw localfpucw
end;
if has_sse_support then
begin
localmxcsr:=DefaultMXCSR;
asm
{ setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
mov localmxcsr,%eax
subl $4,%esp
mov %eax,(%esp)
//ldmxcsr (%esp)
.byte 0x0f,0xae,0x14,0x24
addl $4,%esp
{$endif OLD_ASSEMBLER}
end;
end;
end;
  2232. { because of the brain dead sse detection on x86, this test is post poned }
{ Detects CPU features via CPUID and fills the has_*_support globals,
  probes OS support for SSE by executing an SSE instruction under
  sse_check, snapshots the FPU defaults for libraries, resets the FPU,
  and finally marks detection as done (fpc_cpucodeinit_performed). }
procedure fpc_cpucodeinit;
var
_eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
if cpuid_support then
begin
asm
movl $1,%eax { CPUID leaf 1: feature flags in edx/ecx }
xorl %ecx,%ecx
cpuid
movl %edx,_edx_cpuid1
movl %ecx,_ecx_cpuid1
end ['ebx'];
has_mmx_support:=(_edx_cpuid1 and $800000)<>0; { leaf1 EDX bit 23 = MMX }
if ((_edx_cpuid1 and $2000000)<>0) then { leaf1 EDX bit 25 = SSE }
begin
os_supports_sse:=true;
sse_check:=true;
asm
{ force an sse exception if no sse is supported, the exception handler sets
os_supports_sse to false then }
{ don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
.byte 0x0f,0x28,0xf7
{$else}
movaps %xmm7, %xmm6
{$endif not EMX}
end;
sse_check:=false;
has_sse_support:=os_supports_sse;
end;
if has_sse_support then
begin
has_sse2_support:=((_edx_cpuid1 and $4000000)<>0); { leaf1 EDX bit 26 }
has_sse3_support:=((_ecx_cpuid1 and $200)<>0); { leaf1 ECX bit 9 }
has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1); { leaf1 ECX bit 19 }
{ now avx }
asm
xorl %eax,%eax { CPUID leaf 0: max supported leaf in eax }
cpuid
movl %eax,_eax
end;
if _eax>=7 then
begin
asm
movl $7,%eax { CPUID leaf 7 subleaf 0: extended features in ebx }
xorl %ecx,%ecx
cpuid
movl %ebx,_ebx_cpuid7
end;
fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0; { leaf7 EBX bit 9 = ERMSB }
if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then { leaf1 ECX bit 27 = OSXSAVE }
begin
asm
xorl %ecx,%ecx { XCR0 }
.byte 0x0f,0x01,0xd0 { xgetbv }
movl %eax,_eax
end;
if (_eax and 6)=6 then { OS preserves both XMM and YMM state? }
begin
has_avx_support:=(_ecx_cpuid1 and $10000000)<>0; { leaf1 ECX bit 28 }
has_avx2_support:=(_ebx_cpuid7 and $20)<>0; { leaf7 EBX bit 5 }
end;
end;
end;
end;
end;
{ don't let libraries influence the FPU cw set by the host program }
if IsLibrary then
begin
Default8087CW:=Get8087CW;
if has_sse_support then
DefaultMXCSR:=GetMXCSR;
end;
SysResetFPU;
fpc_cpucodeinit_performed:=true;
end;
  2310. {$if not defined(darwin) and defined(regcall) }
  2311. { darwin requires that the stack is aligned to 16 bytes when calling another function }
  2312. {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
  2313. {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
{ FPC_ANSISTR_DECR_REF: sets S to nil and decrements the refcount stored at
  S-8.  Strings with refcount < 0 are treated as immortal (e.g. constants)
  and left untouched.  When the count hits zero the whole allocation
  (starting 12 bytes before the character data) is freed.  The decrement is
  made atomic only when IsMultiThread, by conditionally skipping a raw LOCK
  prefix byte. }
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
movl (%eax),%edx
testl %edx,%edx
jz .Lquit { S was already nil }
movl $0,(%eax) // s:=nil
cmpl $0,-8(%edx) // exit if refcount<0
jl .Lquit
{$ifdef FPC_PIC}
call fpc_geteipasecx
addl $_GLOBAL_OFFSET_TABLE_,%ecx
movl ismultithread@GOT(%ecx),%ecx
cmpl $0,(%ecx)
{$else FPC_PIC}
cmpl $0,ismultithread
{$endif FPC_PIC}
je .Lskiplock
.byte 0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
decl -8(%edx)
jz .Lfree
.Lquit:
ret
.Lfree:
leal -12(%edx),%eax // points to start of allocation
{ freemem is not an assembler leaf function like fpc_geteipasecx, so it
needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal -12(%esp),%esp
call FPC_FREEMEM
leal 12(%esp),%esp
{$else FPC_SYSTEM_STACKALIGNMENT16}
jmp FPC_FREEMEM // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;
function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
{$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
{ FPC_ANSISTR_UNIQUE fast path: returns S unchanged when it is nil or its
  refcount (at S-8) is exactly 1; otherwise tail-jumps to the full copying
  implementation fpc_truely_ansistr_unique (same register signature). }
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
movl (%eax),%edx
testl %edx,%edx
jz .Lunchanged { nil is trivially unique }
cmpl $1,-8(%edx)
jne fpc_truely_ansistr_unique { shared (or immortal) -> make a copy }
.Lunchanged:
movl %edx,%eax
end;
  2361. {$endif FPC_HAS_FEATURE_ANSISTRINGS}
  2362. {$endif ndef darwin and defined(regcall) }
  2363. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  2364. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
{ Load memory barrier: lfence on SSE2 CPUs, otherwise a locked RMW on the
  stack top, which is a full barrier on x86. }
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
lfence
{$else CPUX86_HAS_SSE2}
lock
addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ No-op on x86: dependent loads are always ordered by the hardware. }
procedure ReadDependencyBarrier;
begin
{ reads imply barrier on earlier reads depended on }
end;
{ Full memory barrier: mfence on SSE2 CPUs, otherwise a locked RMW on the
  stack top, which is a full barrier on x86. }
procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
mfence
{$else CPUX86_HAS_SSE2}
lock
addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ Store memory barrier: sfence when the SSE unit exists; without it the
  procedure is empty — ordinary x86 stores are already ordered among
  themselves. }
procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
  2393. {$endif}
  2394. {$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
  2395. {$define FPC_SYSTEM_HAS_BSF_QWORD}
{ Index (0..63) of the LOWEST set bit of AValue; 255 when AValue = 0.
  AValue is passed on the stack (low dword at 4(%esp), high at 8(%esp)),
  hence the ret $8 that pops it. }
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
bsfl 4(%esp),%eax { scan low dword; ZF set when it is zero }
jz .L1
ret $8
.L1:
bsfl 8(%esp),%eax { low dword was zero: scan high dword }
jz .L2
add $32,%eax { bit found in the high half }
ret $8
.L2:
movl $255,%eax { whole qword is zero }
end;
  2409. {$endif FPC_SYSTEM_HAS_BSF_QWORD}
  2410. {$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
  2411. {$define FPC_SYSTEM_HAS_BSR_QWORD}
{ Index (0..63) of the HIGHEST set bit of AValue; 255 when AValue = 0.
  AValue is passed on the stack (low dword at 4(%esp), high at 8(%esp)),
  hence the ret $8 that pops it. }
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
bsrl 8(%esp),%eax { scan high dword first; ZF set when it is zero }
jz .L1
add $32,%eax { bit found in the high half }
ret $8
.L1:
bsrl 4(%esp),%eax { high dword was zero: scan low dword }
jz .L2
ret $8
.L2:
movl $255,%eax { whole qword is zero }
end;
  2425. {$endif FPC_SYSTEM_HAS_BSR_QWORD}
  2426. {$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
  2427. {$define FPC_SYSTEM_HAS_SAR_QWORD}
{ FPC_SARINT64: 64-bit arithmetic shift right.
  AValue on the stack (low dword at 4(%esp), high at 8(%esp)), Shift in al;
  result in edx:eax.  ret $8 pops the stacked value.
  For Shift >= 32 the low result dword is high >> (Shift mod 32) and the
  high result dword is the replicated sign bit. }
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
movl 8(%esp),%edx { edx = high dword }
movzbl %al,%ecx
cmpb $32,%al
jnb .L1
movl 4(%esp),%eax { shift < 32: 64-bit shift via shrd }
shrdl %cl,%edx,%eax
sarl %cl,%edx
ret $8
.L1:
movl %edx,%eax { shift >= 32: result low = high >> (cl mod 32) }
sarl $31,%edx { result high = sign extension }
sarl %cl,%eax // uses 5 lower bits of cl.
end;
  2443. {$endif FPC_SYSTEM_HAS_SAR_QWORD}