{ i386.inc }
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2000 by the Free Pascal development team.
  4. Processor dependent implementation for the system unit for
  5. intel i386+
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$if defined(linux)}
  13. {$define FPC_SYSTEM_STACKALIGNMENT16}
  14. {$endif defined(linux)}
  15. {****************************************************************************
  16. Primitives
  17. ****************************************************************************}
  18. var
  { Globals read by the CPU-dispatch machinery below; presumably written by
    the OS-dependent fpc_cpucodeinit — TODO confirm against that code. }
  19. os_supports_sse : boolean;
  20. { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
  21. sse_check : boolean;
  22. fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  23. has_sse41_support : boolean;
  24. fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
  25. {$asmmode ATT}
  26. function cpuid_support : boolean;assembler;nostackframe;
  27. {
  28. Check if the ID-flag can be changed, if changed then CpuID is supported.
  29. Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
  30. }
  { Returns True (AL = 1) when EFLAGS bit 21 (the ID flag, mask $200000) can
    be toggled — the architectural test for CPUID availability. }
  31. asm
  { Save EFLAGS, toggle the ID bit in a copy, write the copy back, re-read
    EFLAGS, then XOR the re-read value against the saved original (still on
    the stack): bit 21 of EAX ends up set iff the toggle stuck.
    The original EFLAGS is restored (final popfl) before returning. }
  32. pushfl
  33. movl (%esp),%eax
  34. xorl $0x200000,%eax
  35. pushl %eax
  36. popfl
  37. pushfl
  38. popl %eax
  39. xorl (%esp),%eax
  40. popfl
  { ZF is clear iff bit 21 differed; materialize the boolean into AL. }
  41. testl $0x200000,%eax
  42. setnz %al
  43. end;
  44. {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
  { Intentionally empty: as the commented-out code below explains, CPU
    feature detection is deferred to the OS-dependent fpc_cpucodeinit. }
  45. procedure fpc_cpuinit;
  46. begin
  47. { because of the brain dead sse detection on x86, this test is post poned to fpc_cpucodeinit which
  48. must be implemented OS dependend (FK)
  49. has_sse_support:=sse_support;
  50. has_mmx_support:=mmx_support;
  51. }
  52. end;
  53. {$ifndef darwin}
  { Each helper loads its own return address (i.e. the caller's EIP) from the
    top of the stack into EBX resp. ECX and returns; presumably emitted by the
    compiler for PIC/GOT address computation — TODO confirm with the i386
    code generator. }
  54. procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
  55. asm
  56. movl (%esp),%ebx
  57. end;
  58. procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
  59. asm
  60. movl (%esp),%ecx
  61. end;
  62. {$endif}
  63. {$if not defined(FPC_SYSTEM_HAS_MOVE)
  64. and not defined(OLD_ASSEMBLER)
  65. and not defined(darwin)}
  66. {$i fastmove.inc}
  67. {$endif}
  68. {$ifndef FPC_SYSTEM_HAS_MOVE}
  69. {$define FPC_SYSTEM_HAS_MOVE}
  { Fallback overlap-safe Move (memmove semantics).
    Register arguments as used below: eax = @source, edx = @dest,
    ecx = count (bytes); negative or zero counts do nothing.
    EDI/ESI are preserved via the saveedi/saveesi locals. }
  70. procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
  71. var
  72. saveesi,saveedi : longint;
  73. asm
  74. movl %edi,saveedi
  75. movl %esi,saveesi
  76. movl %eax,%esi
  77. movl %edx,%edi
  78. movl %ecx,%edx
  79. movl %edi,%eax
  { From here: esi = source, edi = dest, edx = count, eax = dest (scratch). }
  80. { check for zero or negative count }
  81. cmpl $0,%edx
  82. jle .LMoveEnd
  83. { Check for back or forward }
  { eax := dest - source (unsigned): zero → nothing to do; carry → dest is
    below source, forward copy is always safe; otherwise a backward copy is
    needed only when dest - source < count (dest overlaps source's tail). }
  84. sub %esi,%eax
  85. jz .LMoveEnd { Do nothing when source=dest }
  86. jc .LFMove { Do forward, dest<source }
  87. cmp %edx,%eax
  88. jb .LBMove { Dest is in range of move, do backward }
  89. { Forward Copy }
  90. .LFMove:
  91. {$ifdef FPC_ENABLED_CLD}
  92. cld
  93. {$endif FPC_ENABLED_CLD}
  { For >= 15 bytes: byte-copy up to a 4-byte dest boundary, then copy
    dwords, then the <= 3 remaining bytes at .LFMove1. }
  94. cmpl $15,%edx
  95. jl .LFMove1
  96. movl %edi,%ecx { Align on 32bits }
  97. negl %ecx
  98. andl $3,%ecx
  99. subl %ecx,%edx
  100. rep
  101. movsb
  102. movl %edx,%ecx
  103. andl $3,%edx
  104. shrl $2,%ecx
  105. rep
  106. movsl
  107. .LFMove1:
  108. movl %edx,%ecx
  109. rep
  110. movsb
  111. jmp .LMoveEnd
  112. { Backward Copy }
  { Sets DF (std) so the string ops run downward, and clears it again (cld)
    before falling through to the exit. }
  113. .LBMove:
  114. std
  115. addl %edx,%esi
  116. addl %edx,%edi
  117. movl %edi,%ecx
  118. decl %esi
  119. decl %edi
  120. cmpl $15,%edx
  121. jl .LBMove1
  122. negl %ecx { Align on 32bits }
  123. andl $3,%ecx
  124. subl %ecx,%edx
  125. rep
  126. movsb
  127. movl %edx,%ecx
  128. andl $3,%edx
  129. shrl $2,%ecx
  { Backward movsl expects esi/edi to point at the LAST dword, i.e. 3 bytes
    lower than the last byte; adjusted before and undone after the loop. }
  130. subl $3,%esi
  131. subl $3,%edi
  132. rep
  133. movsl
  134. addl $3,%esi
  135. addl $3,%edi
  136. .LBMove1:
  137. movl %edx,%ecx
  138. rep
  139. movsb
  140. cld
  141. .LMoveEnd:
  142. movl saveedi,%edi
  143. movl saveesi,%esi
  144. end;
  145. {$endif FPC_SYSTEM_HAS_MOVE}
  146. { Darwin uses Clang to assemble. Recent Clang versions (rightly) give an error when you add global labels in
  147. the middle of .cfi_startproc / .cfi_endproc pairs, since this means you could jump into it from other code
  148. whose CFI state is completely different without the compiler even having the theoretical ability to analyse
  149. all code and generate balanced information.
  150. Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
  151. }
  152. {$ifndef darwin}
  153. {$define can_jump_into_the_middle_of_a_procedure}
  154. {$endif darwin}
  155. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  156. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  157. or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  158. or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  159. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  160. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  161. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
  162. const
  { Byte counts from which the Fill* front ends take the rep-stos path
    (see FillChar_SSE2 / FillChar_SSE2_ERMS etc.). The ERMS threshold is
    much lower — presumably because ERMSB-capable CPUs make rep stos
    profitable earlier; tuning rationale not visible here. }
  163. FillXxxx_RepStosThreshold_ERMS = 1024;
  164. FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;
  { Large-block fill via ‘rep stosl’: writes the first and last 4 bytes
    unaligned, and fills the 4-byte-aligned interior with the pattern
    rotated by the misalignment of x so the aligned stores still produce
    the caller's byte sequence. }
  165. procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
  166. { eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
  167. asm
  168. {$ifdef FPC_ENABLED_CLD}
  169. cld
  170. {$endif FPC_ENABLED_CLD}
  171. mov %ecx, (%eax) { Write first 4 bytes unaligned. }
  172. push %ecx { pattern }
  173. push %edi
  174. mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
  175. xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
  176. shl $3, %ecx { ecx = misalignment of x in bits. }
  177. rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
  178. add %edi, %edx { edx = x end }
  179. lea -1(%edx), %ecx { ecx = x end - 1. }
  180. add $4, %edi
  181. and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
  182. and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
  183. sub %edi, %ecx { ecx = byte count between them. }
  184. shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
  185. rep stosl
  186. pop %edi
  187. pop %ecx
  188. mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
  189. end;
  190. {$endif FillChar/Word/DWord required.}
  191. {$ifdef can_jump_into_the_middle_of_a_procedure}
  { FillXxxx_MoreThanTwoXMMs is either a global label inside the SSE2
    procedure below (jumped into from FillChar/FillQWord), or — where jumping
    into the middle of a procedure is not allowed (Darwin/Clang, see the
    comment above) — a separate procedure reached by a tail jmp. }
  192. label
  193. FillXxxx_MoreThanTwoXMMs;
  194. {$else can_jump_into_the_middle_of_a_procedure}
  195. procedure FillXxxx_MoreThanTwoXMMs; forward;
  196. {$endif can_jump_into_the_middle_of_a_procedure}
  197. procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
  198. { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
  199. asm
  200. movd %ecx, %xmm0
  201. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  202. movdqu %xmm0, (%eax)
  203. movdqu %xmm0, -16(%eax,%edx)
  { <= 32 bytes: the two unaligned vectors above cover everything. }
  204. cmp $32, %edx
  205. ja .LMoreThanTwoVectors
  206. ret
  207. .byte 144 { Turn .balign 16 before .L64x_Body into a no-op. }
  208. { x can start and end misaligned on the vector boundary:
  209. x = ~~][H1][H2][...][T2][T1]~
  210. [UH] [UT]
  211. UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
  212. .LMoreThanTwoVectors:
  213. push %esi
  214. mov %ecx, %esi { esi = pattern }
  215. mov %eax, %ecx
  216. shl $3, %ecx { ecx = misalignment of x in bits }
  217. rol %cl, %esi { misalign the pattern }
  218. movd %esi, %xmm0
  219. pshufd $0, %xmm0, %xmm0
  220. pop %esi
  221. {$ifdef can_jump_into_the_middle_of_a_procedure}
  222. { FillChar (to skip the misaligning above) and FillQWord jump here.
  223. eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
  224. FillXxxx_MoreThanTwoXMMs:
  225. {$else can_jump_into_the_middle_of_a_procedure}
  226. jmp FillXxxx_MoreThanTwoXMMs
  227. end;
  228. procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe;
  229. asm
  230. {$endif can_jump_into_the_middle_of_a_procedure}
  { H1..H3 / T1..T4 follow the diagram above: aligned 16-byte slots counted
    from the head resp. the tail. edx keeps T4; stores below 4 MB use
    movdqa, above it non-temporal movntdq + sfence. }
  231. lea -65(%eax,%edx), %ecx
  232. and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
  233. mov %ecx, %edx { Remember T4 to edx. }
  234. and $-16, %eax { eax = H1 − 16. }
  235. sub %eax, %ecx { ecx = aligned byte count − 48. }
  236. movdqa %xmm0, 16(%eax) { Write H1. }
  237. cmp $32-48, %ecx
  238. jle .LOneAlignedTailWrite
  239. movdqa %xmm0, 32(%eax) { Write H2. }
  240. cmp $64-48, %ecx
  241. jle .LTwoAlignedTailWrites
  242. sub $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
  243. jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
  244. add $48, %eax { eax = H3. }
  245. cmp $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. }
  246. jae .L64xNT_Body
  247. .balign 16 { no-op }
  248. .L64x_Body:
  249. movdqa %xmm0, (%eax)
  250. movdqa %xmm0, 16(%eax)
  251. movdqa %xmm0, 32(%eax)
  252. movdqa %xmm0, 48(%eax)
  253. add $64, %eax
  254. sub $64, %ecx
  255. ja .L64x_Body
  256. .LFourAlignedTailWrites:
  257. movdqa %xmm0, (%edx) { T4 }
  258. movdqa %xmm0, 16(%edx) { T3 }
  259. .LTwoAlignedTailWrites:
  260. movdqa %xmm0, 32(%edx) { T2 }
  261. .LOneAlignedTailWrite:
  262. movdqa %xmm0, 48(%edx) { T1 }
  263. ret
  264. .balign 16
  265. .L64xNT_Body:
  266. movntdq %xmm0, (%eax)
  267. movntdq %xmm0, 16(%eax)
  268. movntdq %xmm0, 32(%eax)
  269. movntdq %xmm0, 48(%eax)
  270. add $64, %eax
  271. sub $64, %ecx
  272. ja .L64xNT_Body
  { Non-temporal stores are weakly ordered; fence before normal stores. }
  273. sfence
  274. jmp .LFourAlignedTailWrites
  275. end;
  276. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  277. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  278. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
  279. {$ifndef CPUX86_HAS_SSE2}
  { Pre-SSE2 fallback: unaligned 4-byte writes at both ends, then an
    8-bytes-per-iteration aligned loop using the pattern rotated by the
    misalignment of x (same trick as the RepStos variant above). }
  280. procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
  281. { eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
  282. asm
  283. mov %ecx, (%eax) { Write first 4 bytes. }
  284. lea -9(%eax,%edx), %edx
  285. mov %ecx, 5(%edx) { Write last 4 bytes. }
  286. and $-4, %edx { edx = loop bound. }
  287. push %esi
  288. mov %ecx, %esi { esi = pattern }
  289. mov %eax, %ecx
  290. shl $3, %ecx { ecx = misalignment of x in bits }
  291. rol %cl, %esi { misalign the pattern }
  { eax = 4-byte aligned pointer strictly to the right of the start. }
  292. add $4, %eax
  293. and $-4, %eax
  294. .balign 16
  295. .L8xLoop:
  296. mov %esi, (%eax)
  297. mov %esi, 4(%eax)
  298. add $8, %eax
  299. cmp %edx, %eax
  300. jb .L8xLoop
  { Final (possibly overlapping) 8-byte write at the loop bound. }
  301. mov %esi, (%edx)
  302. mov %esi, 4(%edx)
  303. pop %esi
  304. end;
  305. {$endif ndef CPUX86_HAS_SSE2 (need Fill*_Plain)}
  { Fills 4..16 bytes with at most four possibly-overlapping 4-byte stores;
    no alignment handling needed at these sizes. }
  306. procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
  307. { eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16. }
  308. asm
  309. mov %ecx, (%eax)
  310. cmp $8, %edx
  311. jle .LLast4
  312. mov %ecx, 4(%eax)
  313. mov %ecx, -8(%eax,%edx)
  314. .LLast4:
  315. mov %ecx, -4(%eax,%edx)
  316. end;
  317. {$endif FillChar/Word/DWord required.}
  318. {$endif FillChar/Word/DWord/QWord required.}
  319. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)}
  320. {$define FPC_SYSTEM_HAS_FILLCHAR}
  { Handles counts <= 3 (including zero/negative, which just quit) with three
    possibly-overlapping byte stores: first, last, and middle byte. }
  321. procedure FillChar_3OrLess; assembler; nostackframe;
  322. { cl — x, edx — byte count, Low(int32) <= edx <= 3. }
  323. asm
  324. test %edx, %edx
  325. jle .LQuit
  326. mov %cl, (%eax)
  327. mov %cl, -1(%eax,%edx)
  328. shr $1, %edx
  329. mov %cl, (%eax,%edx)
  330. .LQuit:
  331. end;
  332. {$ifndef CPUX86_HAS_SSE2}
  { Pre-SSE2 FillChar front end: broadcasts the byte to a uint32 pattern
    ($01010101 multiply) and tail-jumps to the size-appropriate filler. }
  333. procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
  334. asm
  335. cmp $3, %edx
  336. jle FillChar_3OrLess
  337. movzbl %cl, %ecx
  338. imul $0x01010101, %ecx
  339. cmp $16, %edx
  340. jbe FillXxxx_U32Pattern_Ladder_4to16
  341. jmp FillXxxx_U32Pattern_Plain_16OrMore
  342. end;
  343. {$endif ndef CPUX86_HAS_SSE2}
  { SSE2 FillChar front end: tiny counts → FillChar_3OrLess / Ladder;
    counts >= FillXxxx_RepStosThreshold_NoERMS → rep stos; otherwise write
    first/last 16 bytes unaligned here and, for > 32 bytes, jump straight to
    FillXxxx_MoreThanTwoXMMs (the pattern-misaligning step there can be
    skipped because all bytes of a FillChar pattern are equal). }
  344. procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
  345. asm
  346. cmp $3, %edx
  347. jle FillChar_3OrLess
  348. movzbl %cl, %ecx
  349. imul $0x01010101, %ecx
  350. cmp $16, %edx
  351. jbe FillXxxx_U32Pattern_Ladder_4to16
  352. cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  353. jae FillXxxx_U32Pattern_RepStos_8OrMore
  354. movd %ecx, %xmm0
  355. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  356. movdqu %xmm0, (%eax)
  357. movdqu %xmm0, -16(%eax,%edx)
  358. cmp $32, %edx
  359. ja FillXxxx_MoreThanTwoXMMs
  360. end;
  { Same as FillChar_SSE2 but selected on ERMSB-capable CPUs
    (fast_large_repmovstosb): only the rep-stos threshold differs
    (FillXxxx_RepStosThreshold_ERMS instead of _NoERMS). }
  361. procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
  362. asm
  363. cmp $3, %edx
  364. jle FillChar_3OrLess
  365. movzbl %cl, %ecx
  366. imul $0x01010101, %ecx
  367. cmp $16, %edx
  368. jbe FillXxxx_U32Pattern_Ladder_4to16
  369. cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  370. jae FillXxxx_U32Pattern_RepStos_8OrMore
  371. movd %ecx, %xmm0
  372. pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  373. movdqu %xmm0, (%eax)
  374. movdqu %xmm0, -16(%eax,%edx)
  375. cmp $32, %edx
  376. ja FillXxxx_MoreThanTwoXMMs
  377. end;
  { Lazy dispatch: FillChar_Impl starts at @FillChar_Dispatch. Before
    fpc_cpucodeinit has run, Dispatch calls a safe default WITHOUT caching
    (CPU features are not known yet); afterwards it caches the best
    implementation into FillChar_Impl, so later FillChar calls skip
    dispatching entirely. }
  378. procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
  379. var
  380. FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;
  381. procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
  382. begin
  383. if not fpc_cpucodeinit_performed then
  384. begin
  385. {$ifdef CPUX86_HAS_SSE2} FillChar_SSE2 {$else} FillChar_Plain {$endif} (x, count, value);
  386. exit;
  387. end;
  388. if fast_large_repmovstosb then
  389. FillChar_Impl := @FillChar_SSE2_ERMS
  390. else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
  391. FillChar_Impl := @FillChar_SSE2
  392. {$ifndef CPUX86_HAS_SSE2}
  393. else
  394. FillChar_Impl := @FillChar_Plain
  395. {$endif ndef CPUX86_HAS_SSE2};
  396. FillChar_Impl(x, count, value);
  397. end;
  { Public entry point: always just one indirect call. }
  398. procedure FillChar(var x;count:SizeInt;value:byte);
  399. begin
  400. FillChar_Impl(x, count, value);
  401. end;
  402. {$endif FPC_SYSTEM_HAS_FILLCHAR}
  403. {$if not defined(FPC_SYSTEM_HAS_FILLWORD)}
  404. {$define FPC_SYSTEM_HAS_FILLWORD}
  { cx — value, eax — x, edx — word count <= 3 (<= 0 quits); three
    possibly-overlapping word stores: first, last, middle. }
  405. procedure FillWord_3OrLess; assembler; nostackframe;
  406. asm
  407. test %edx, %edx
  408. jle .LQuit
  409. mov %cx, (%eax)
  410. mov %cx, -2(%eax,%edx,2)
  411. shr $1, %edx
  412. mov %cx, (%eax,%edx,2)
  413. .LQuit:
  414. end;
  415. {$ifndef CPUX86_HAS_SSE2}
  { Pre-SSE2 FillWord front end: converts word count to byte count (shl 1),
    broadcasts the word to a uint32 pattern, and tail-jumps to the
    size-appropriate filler. }
  416. procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
  417. asm
  418. cmp $3, %edx
  419. jle FillWord_3OrLess
  420. shl $1, %edx
  421. movzwl %cx, %ecx
  422. imul $0x00010001, %ecx
  423. cmp $16, %edx
  424. jbe FillXxxx_U32Pattern_Ladder_4to16
  425. jmp FillXxxx_U32Pattern_Plain_16OrMore
  426. end;
  427. {$endif ndef CPUX86_HAS_SSE2}
  { SSE2 FillWord front end: word count → byte count, word → uint32 pattern,
    then Ladder / SSE2 / rep-stos depending on size (non-ERMS threshold). }
  428. procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
  429. asm
  430. cmp $3, %edx
  431. jle FillWord_3OrLess
  432. shl $1, %edx
  433. movzwl %cx, %ecx
  434. imul $0x00010001, %ecx
  435. cmp $16, %edx
  436. jbe FillXxxx_U32Pattern_Ladder_4to16
  437. cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  438. jb FillXxxx_U32Pattern_SSE2_16OrMore
  439. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  440. end;
  { Same as FillWord_SSE2, but with the lower ERMS rep-stos threshold;
    selected on CPUs with fast_large_repmovstosb. }
  441. procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
  442. asm
  443. cmp $3, %edx
  444. jle FillWord_3OrLess
  445. shl $1, %edx
  446. movzwl %cx, %ecx
  447. imul $0x00010001, %ecx
  448. cmp $16, %edx
  449. jbe FillXxxx_U32Pattern_Ladder_4to16
  450. cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  451. jb FillXxxx_U32Pattern_SSE2_16OrMore
  452. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  453. end;
  { Lazy dispatch for FillWord — same pattern as FillChar_Dispatch:
    safe default before fpc_cpucodeinit, cached best implementation after. }
  454. procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;
  455. var
  456. FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;
  457. procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
  458. begin
  459. if not fpc_cpucodeinit_performed then
  460. begin
  461. {$ifdef CPUX86_HAS_SSE2} FillWord_SSE2 {$else} FillWord_Plain {$endif} (x, count, value);
  462. exit;
  463. end;
  464. if fast_large_repmovstosb then
  465. FillWord_Impl := @FillWord_SSE2_ERMS
  466. else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
  467. FillWord_Impl := @FillWord_SSE2
  468. {$ifndef CPUX86_HAS_SSE2}
  469. else
  470. FillWord_Impl := @FillWord_Plain
  471. {$endif ndef CPUX86_HAS_SSE2};
  472. FillWord_Impl(x, count, value);
  473. end;
  { Public entry point: always just one indirect call. }
  474. procedure FillWord(var x;count:SizeInt;value:word);
  475. begin
  476. FillWord_Impl(x, count, value);
  477. end;
  478. {$endif FPC_SYSTEM_HAS_FILLWORD}
  479. {$if not defined(FPC_SYSTEM_HAS_FILLDWORD)}
  480. {$define FPC_SYSTEM_HAS_FILLDWORD}
  { ecx — value, eax — x, edx — dword count <= 4 (< 1 quits). Count 1 quits
    after the first store (je uses flags from cmp $1); counts 2..4 are
    covered by four possibly-overlapping dword stores. }
  481. procedure FillDWord_4OrLess; assembler; nostackframe;
  482. asm
  483. cmp $1, %edx
  484. jl .LQuit
  485. mov %ecx, (%eax)
  486. je .LQuit
  487. mov %ecx, 4(%eax)
  488. mov %ecx, -8(%eax,%edx,4)
  489. mov %ecx, -4(%eax,%edx,4)
  490. .LQuit:
  491. end;
  492. {$ifndef CPUX86_HAS_SSE2}
  { Pre-SSE2 FillDWord front end: the value already IS the uint32 pattern,
    so only the byte count (shl 2) is needed before tail-jumping. }
  493. procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
  494. asm
  495. cmp $4, %edx
  496. jle FillDWord_4OrLess
  497. shl $2, %edx
  498. jmp FillXxxx_U32Pattern_Plain_16OrMore
  499. end;
  500. {$endif ndef CPUX86_HAS_SSE2}
  { SSE2 FillDWord front end: SSE2 path below the non-ERMS threshold,
    rep stos above it. }
  501. procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
  502. asm
  503. cmp $4, %edx
  504. jle FillDWord_4OrLess
  505. shl $2, %edx
  506. cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  507. jb FillXxxx_U32Pattern_SSE2_16OrMore
  508. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  509. end;
  { Same as FillDWord_SSE2 but with the lower ERMS rep-stos threshold. }
  510. procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
  511. asm
  512. cmp $4, %edx
  513. jle FillDWord_4OrLess
  514. shl $2, %edx
  515. cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  516. jb FillXxxx_U32Pattern_SSE2_16OrMore
  517. jmp FillXxxx_U32Pattern_RepStos_8OrMore
  518. end;
  { Lazy dispatch for FillDWord — same pattern as FillChar_Dispatch:
    safe default before fpc_cpucodeinit, cached best implementation after. }
  519. procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;
  520. var
  521. FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;
  522. procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
  523. begin
  524. if not fpc_cpucodeinit_performed then
  525. begin
  526. {$ifdef CPUX86_HAS_SSE2} FillDWord_SSE2 {$else} FillDWord_Plain {$endif}(x, count, value);
  527. exit;
  528. end;
  529. if fast_large_repmovstosb then
  530. FillDWord_Impl := @FillDWord_SSE2_ERMS
  531. else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
  532. FillDWord_Impl := @FillDWord_SSE2
  533. {$ifndef CPUX86_HAS_SSE2}
  534. else
  535. FillDWord_Impl := @FillDWord_Plain
  536. {$endif ndef CPUX86_HAS_SSE2};
  537. FillDWord_Impl(x, count, value);
  538. end;
  { Public entry point: always just one indirect call. }
  539. procedure FillDWord(var x;count:SizeInt;value:dword);
  540. begin
  541. FillDWord_Impl(x, count, value);
  542. end;
  543. {$endif FPC_SYSTEM_HAS_FILLDWORD}
  544. {$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  545. {$define FPC_SYSTEM_HAS_FILLQWORD}
  546. {$ifndef CPUX86_HAS_SSE2}
  { Pre-SSE2 FillQWord: a plain loop storing the low and high dword of the
    value per element. The 64-bit value arrives on the stack (esp+4 low,
    esp+8 high); the "4+" offsets below account for the pushed esi. }
  547. procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
  548. { eax = x, edx = count, [esp + 4] = value }
  549. asm
  550. test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
  551. jle .LQuit
  552. push %esi
  553. mov 4+4(%esp), %esi { esi = value[0:31] }
  554. mov 4+8(%esp), %ecx { ecx = value[32:63] }
  555. .balign 16
  556. .LLoop:
  557. mov %esi, (%eax)
  558. mov %ecx, 4(%eax)
  559. add $8, %eax
  560. sub $1, %edx
  561. jnz .LLoop
  562. pop %esi
  563. .LQuit:
  564. end;
  { SSE2 FillQWord (named FillQWord directly when SSE2 is baseline).
    The 8-byte value is passed on the stack and is popped by the callee
    (note ‘ret $8’ on the small path; on the large path the return address
    is moved up 8 bytes before tail-jumping, which removes the value slot
    and leaves the 4-byte frame FillXxxx_MoreThanTwoXMMs expects). }
  566. procedure {$ifdef CPUX86_HAS_SSE2} FillQWord {$else} FillQWord_SSE2 {$endif}(var x;count:SizeInt;value:QWord);assembler;nostackframe;
  567. { eax = x, edx = count, [esp + 4] = value }
  568. asm
  569. cmp $4, %edx
  570. jle .L4OrLess
  571. movq 4(%esp), %xmm0
  572. punpcklqdq %xmm0, %xmm0
  573. { Stack is 12 bytes:
  574. [esp] = return address, [esp + 4] = value (not required anymore).
  575. Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
  576. [esp] = return address. }
  577. mov (%esp), %ecx
  578. add $8, %esp
  579. mov %ecx, (%esp)
  580. shl $3, %edx
  581. movdqu %xmm0, (%eax)
  582. movdqu %xmm0, -16(%eax,%edx)
  583. test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
  584. jz FillXxxx_MoreThanTwoXMMs
  { Rotate the two uint64 lanes of xmm0 by the misalignment of x in bits:
    (pattern << k) | (pattern >> (64-k)), built with psllq/psrlq/por. }
  585. mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
  586. shl $3, %ecx
  587. and $63, %ecx
  588. movd %ecx, %xmm2
  589. movdqa %xmm0, %xmm1
  590. psllq %xmm2, %xmm1
  591. neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
  592. and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
  593. movd %ecx, %xmm2
  594. psrlq %xmm2, %xmm0
  595. por %xmm1, %xmm0
  596. jmp FillXxxx_MoreThanTwoXMMs
  597. .L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
  { Counts 1..4 via overlapping dword stores: low halves first, then high
    halves; count 1 branches off after the first low-half store. }
  598. cmp $1, %edx
  599. jl .LQuit
  600. mov 4(%esp), %ecx
  601. mov %ecx, (%eax)
  602. je .LSecondHalfOf1
  603. mov %ecx, 8(%eax)
  604. mov %ecx, -16(%eax,%edx,8)
  605. mov %ecx, -8(%eax,%edx,8)
  606. mov 8(%esp), %ecx
  607. mov %ecx, 4(%eax)
  608. mov %ecx, 12(%eax)
  609. mov %ecx, -12(%eax,%edx,8)
  610. mov %ecx, -4(%eax,%edx,8)
  611. .LQuit:
  612. ret $8 { Callee pops the 8-byte value argument. }
  613. .LSecondHalfOf1:
  614. mov 8(%esp), %ecx
  615. mov %ecx, 4(%eax)
  616. end;
  617. {$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;
var
  { Indirect-call slot: starts at the dispatcher, which replaces it with the
    concrete implementation on the first call after CPU detection. }
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;
  621. procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
  622. begin
  623. if not fpc_cpucodeinit_performed then
  624. begin
  625. FillQWord_Plain(x, count, value);
  626. exit;
  627. end;
  628. if has_sse2_support then
  629. FillQWord_Impl := @FillQWord_SSE2
  630. else
  631. FillQWord_Impl := @FillQWord_Plain;
  632. FillQWord_Impl(x, count, value);
  633. end;
procedure FillQWord(var x;count:SizeInt;value:qword);
{ Public FillQWord entry point: calls indirectly through FillQWord_Impl
  (dispatcher on the first call, then the CPU-selected implementation). }
begin
  FillQWord_Impl(x, count, value);
end;
  638. {$endif ndef CPUX86_HAS_SSE2 (need FillQWord dispatcher)}
  639. {$endif FPC_SYSTEM_HAS_FILLQWORD}
  640. {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  641. {$define FPC_SYSTEM_HAS_INDEXBYTE}
  642. {$ifndef CPUX86_HAS_SSE2}
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ Pre-SSE2 IndexByte: returns the index of the first occurrence of byte 'b'
  in 'buf' (first 'len' bytes; len < 0 means unbounded), or -1 if absent.
  Aligns to 8 bytes, then scans dword-wise with the classic SWAR
  zero-byte test (x-0x01010101) and (not x) and 0x80808080.
  eax = buf, edx = len, cl = b }
asm
    test %edx,%edx
    jz .Lnothing0
    push %eax { save initial value of 'buf' }
    test $3,%al
    jz .Laligned4
.Lalignloop: { align to 4 bytes }
    cmp %cl,(%eax)
    je .Lfoundateax
    inc %eax
    dec %edx
    jz .Lnothing1
    test $3,%al
    jnz .Lalignloop
.Laligned4: { align to 8 bytes }
    push %esi
    push %edi
    mov %cl,%ch { prepare pattern }
    movzwl %cx,%esi
    shl $16,%ecx
    or %esi,%ecx { ecx = b replicated into all 4 bytes }
    test $7,%al
    jz .Lloop
    test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = −1). }
    jl .Ldontfixuplen
    add $4,%edx
.Ldontfixuplen:
    sub $4,%eax
    jmp .Lalignfrom4to8
    .balign 16
.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
    mov (%eax),%esi { load dword }
    xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
    lea -0x01010101(%esi),%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080 }
    jnz .Lfound0 { one of the bytes matches }
.Lalignfrom4to8:
    mov 4(%eax),%esi { second unroll: same test on the next dword }
    xor %ecx,%esi
    lea -0x01010101(%esi),%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi
    jnz .Lfound1
    add $8,%eax
    sub $8,%edx
    ja .Lloop
.Lnothing3:
    pop %edi
    pop %esi
.Lnothing1:
    pop %edx { discard saved buf }
.Lnothing0:
    or $-1,%eax
    ret
.Lfound1:
    sub $4,%edx
    jbe .Lnothing3 { match lies entirely beyond 'len' }
    add $4,%eax
.Lfound0:
    bsf %esi,%esi { lowest set bit selects the matching byte }
    shr $3,%esi { bit position -> byte position }
    cmp %edx,%esi { Garbage after remaining length? }
    jae .Lnothing3
    add %esi,%eax
    pop %edi
    pop %esi
.Lfoundateax:
    pop %ecx { ecx = original buf }
    sub %ecx,%eax { result = match position - buf }
end;
  718. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ SSE2 IndexByte: returns the index of the first 'b' within the first 'len'
  bytes of 'buf', or -1.  Over-reads up to 15 bytes, which is made safe by
  either staying 4080 bytes below a page boundary (unaligned first load) or
  by using an aligned load that cannot cross a page (.LCrossPage path).
  eax = buf, edx = len, ecx = b }
asm
    test %edx, %edx
    jz .Lnotfound { exit if len=0 }
    movd %ecx, %xmm1
    mov %eax, %ecx
    punpcklbw %xmm1, %xmm1 { broadcast b to all 16 bytes of xmm1 }
    punpcklbw %xmm1, %xmm1
    and $4095, %ecx { ecx = buf mod 4096 (page offset) }
    pshufd $0, %xmm1, %xmm1
    cmp $4080, %ecx
    ja .LCrossPage { an unaligned 16-byte load could cross the page }
    movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jz .LContinueAligned
    bsf %ecx, %eax
    cmp %edx, %eax { match beyond 'len' is garbage }
    jae .Lnotfound
    ret
    .byte 144 { Make .balign 16 before .Lloop a no-op. }
.LContinueAligned:
    cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
    jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
    push %ebx
    lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
    and $-0x10, %ecx { first aligned address after buf }
    sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
    .balign 16
.Lloop:
    movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
    add $16, %ecx { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
    ret
.LCrossPage:
    push %ebx
    lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
    and $-0x10, %ecx { first aligned address after buf }
    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
    sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
    pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx { shift valid bits into high word }
    and $0xffff0000, %ebx { clear low word containing invalid bits }
    shr %cl, %ebx { shift back }
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx
    lea -16(%ecx,%ebx), %eax { index = valid-bytes-so-far - 16 + bit }
    pop %ebx
    cmp %eax, %edx { check against the buffer length }
    jbe .Lnotfound
end;
  782. {$ifndef CPUX86_HAS_SSE2}
function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;
var
  { Indirect-call slot: starts at the dispatcher, which replaces it with the
    concrete implementation on the first call after CPU detection. }
  IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;
  786. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
  787. begin
  788. if not fpc_cpucodeinit_performed then
  789. exit(IndexByte_Plain(buf,len,b));
  790. if has_sse2_support then
  791. IndexByte_Impl:=@IndexByte_SSE2
  792. else
  793. IndexByte_Impl:=@IndexByte_Plain;
  794. result:=IndexByte_Impl(buf,len,b);
  795. end;
function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
{ Public IndexByte entry point: calls indirectly through IndexByte_Impl
  (dispatcher on the first call, then the CPU-selected implementation). }
begin
  result:=IndexByte_Impl(buf,len,b);
end;
  800. {$endif ndef CPUX86_HAS_SSE2}
  801. {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  802. {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  803. {$define FPC_SYSTEM_HAS_INDEXWORD}
  804. {$ifndef CPUX86_HAS_SSE2}
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ Pre-SSE2 IndexWord: simple word-by-word scan; returns the index (in
  words) of the first occurrence of 'b', or -1.
  eax = buf, edx = len, cx = b }
asm
    test %edx, %edx
    jz .LNotFound { len = 0: nothing to search }
    push %eax { save original buf to compute the index at the end }
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    cmp %cx, (%eax)
    je .LFound
    add $2, %eax
    dec %edx
    jnz .LWordwise_Body
    pop %edx { discard saved buf }
.LNotFound:
    or $-1, %eax
    ret
.LFound:
    pop %edx { edx = original buf }
    sub %edx, %eax
    shr $1, %eax { byte offset -> word index }
end;
  825. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexWord {$else} IndexWord_SSE2 {$endif} (const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ SSE2 IndexWord: returns the word index of the first occurrence of 'b' in
  the first 'len' words of 'buf', or -1.  Word-aligned buffers use PCMPEQW
  on aligned vectors; misaligned buffers fall back to a byte-compare scheme
  that ANDs adjacent mask bits (carrying one bit between iterations) to
  detect a full 2-byte match straddling vector loads.
  eax = buf, edx = len, ecx = b }
asm
    test %edx, %edx { exit if len=0 }
    je .Lnotfound
    push %ebx
    movd %ecx, %xmm1
    punpcklwd %xmm1, %xmm1 { broadcast b to all 8 words of xmm1 }
    pshufd $0, %xmm1, %xmm1
    lea 16(%eax), %ecx
    and $-16, %ecx { ecx = first aligned address after buf }
    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
    sub %eax, %ecx
    test $1, %eax { if buffer isn't aligned to word boundary, }
    jnz .Lunaligned { use a different algorithm }
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx { shift the bits belonging to bytes before buf out the top, }
    and $0xffff0000, %ebx
    shr %cl, %ebx { ...and back, clearing them }
    shr $1, %ecx { ecx = number of valid bytes -> number of valid words }
    test %ebx, %ebx
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx
    shr $1, %ebx { in words }
    lea -8(%ecx,%ebx), %eax
    pop %ebx
    cmp %eax, %edx
    jbe .Lnotfound { if match is after the specified length, ignore it }
    ret
    .balign 16
.Lloop:
    movdqa (%eax,%ecx,2), %xmm0 { ecx counts words, hence scale 2 }
    add $8, %ecx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
    ret
.Lunaligned:
    push %esi
    movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
    psllw $8, %xmm1 { swap bytes of each word of pattern) }
    psrlw $8, %xmm2
    por %xmm2, %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx { mask off comparisons for bytes before buf, as above }
    and $0xffff0000, %ebx
    shr %cl, %ebx
    xor %esi, %esi { nothing to merge yet }
    add %edx, %edx { length words -> bytes }
    jmp .Lcontinue_u
    .balign 16
.Lloop_u:
    movdqa (%eax,%ecx), %xmm0
    add $16, %ecx
    pcmpeqb %xmm1, %xmm0 { compare by bytes }
    shr $16, %esi { bit 16 shifts into 0 }
    pmovmskb %xmm0, %ebx
.Lcontinue_u:
    shl $1, %ebx { 15:0 -> 16:1 }
    or %esi, %ebx { merge bit 0 from previous round }
    mov %ebx, %esi
    shr $1, %ebx { now AND together adjacent pairs of bits }
    and %esi, %ebx
    and $0x5555, %ebx { also reset odd bits }
    jnz .Lmatch_u
    cmp %ecx, %edx
    ja .Lloop_u
.Lnotfound_u:
    pop %esi
    pop %ebx
    or $-1, %eax
    ret
.Lmatch_u:
    bsf %ebx, %ebx
    lea -16(%ecx,%ebx), %eax { byte offset of the match }
    cmp %eax, %edx
    jbe .Lnotfound_u { if match is after the specified length, ignore it }
    sar $1, %eax { in words }
    pop %esi
    pop %ebx
end;
  916. {$ifndef CPUX86_HAS_SSE2}
function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;
var
  { Indirect-call slot: starts at the dispatcher, which replaces it with the
    concrete implementation on the first call after CPU detection. }
  IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;
  920. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
  921. begin
  922. if not fpc_cpucodeinit_performed then
  923. exit(IndexWord_Plain(buf,len,b));
  924. if has_sse2_support then
  925. IndexWord_Impl:=@IndexWord_SSE2
  926. else
  927. IndexWord_Impl:=@IndexWord_Plain;
  928. result:=IndexWord_Impl(buf,len,b);
  929. end;
function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
{ Public IndexWord entry point: calls indirectly through IndexWord_Impl
  (dispatcher on the first call, then the CPU-selected implementation). }
begin
  result:=IndexWord_Impl(buf,len,b);
end;
  934. {$endif ndef CPUX86_HAS_SSE2}
  935. {$endif FPC_SYSTEM_HAS_INDEXWORD}
  936. {$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
  937. {$define FPC_SYSTEM_HAS_INDEXDWORD}
  938. {$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
{ Pre-SSE2 IndexDWord: simple dword-by-dword scan; returns the index (in
  dwords) of the first occurrence of 'b', or -1.
  eax = buf, edx = len, ecx = b }
asm
    push %eax { save original buf to compute the index at the end }
    sub $4, %eax { compensate for the add at the top of the loop }
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    add $4, %eax
    sub $1, %edx
    jb .LNotFound { len exhausted (also handles len <= 0 on entry) }
    cmp %ecx, (%eax)
    jne .LDWordwise_Next
    pop %edx { edx = original buf }
    sub %edx, %eax
    shr $2, %eax { byte offset -> dword index }
    ret
.LNotFound:
    pop %edx { discard saved buf }
    mov $-1, %eax
end;
  957. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} IndexDWord {$else} IndexDWord_SSE2 {$endif} (const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
{ SSE2 IndexDWord: scans 4 dwords per iteration with PCMPEQD; the final
  partial vector is re-read overlapping the previous one, so no tail loop
  and no over-read.  Lengths of 4 or less use the dword-wise loop instead.
  Returns the dword index of the first match, or -1.
  eax = buf, edx = len, ecx = b }
asm
    push %eax { save original buf to compute the index at the end }
    sub $4, %edx
    jle .LDwordwise_Prepare { len <= 4: not worth a vector }
    movd %ecx, %xmm1
    pshufd $0, %xmm1, %xmm1 { broadcast b to all 4 dwords }
    .balign 16 { 1-byte NOP. }
.L4x_Body:
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jnz .LFoundAtMask
    add $16, %eax
    sub $4, %edx
    jg .L4x_Body
    lea (%eax,%edx,4), %eax { step back to the last full vector (overlaps) }
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jz .LNothing
.LFoundAtMask:
    bsf %ecx, %ecx { byte position of match within the vector }
    add %ecx, %eax
.LFoundAtEax:
    pop %edx { edx = original buf }
    sub %edx, %eax
    shr $2, %eax { byte offset -> dword index }
    ret
    nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
    add $3, %edx { edx = len - 1 }
    cmp $-1, %edx
    je .LNothing { len = 0 }
    .balign 16 { no-op }
.LDwordwise_Body:
    cmp (%eax), %ecx
    je .LFoundAtEax
    add $4, %eax
    sub $1, %edx
    jae .LDwordwise_Body
.LNothing:
    pop %edx { discard saved buf }
    or $-1, %eax
end;
  1005. {$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;
var
  { Indirect-call slot: starts at the dispatcher, which replaces it with the
    concrete implementation on the first call after CPU detection. }
  IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;
  1009. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
  1010. begin
  1011. if not fpc_cpucodeinit_performed then
  1012. exit(IndexDWord_Plain(buf,len,b));
  1013. if has_sse2_support then
  1014. IndexDWord_Impl:=@IndexDWord_SSE2
  1015. else
  1016. IndexDWord_Impl:=@IndexDWord_Plain;
  1017. result:=IndexDWord_Impl(buf,len,b);
  1018. end;
function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
{ Public IndexDWord entry point: calls indirectly through IndexDWord_Impl
  (dispatcher on the first call, then the CPU-selected implementation). }
begin
  result:=IndexDWord_Impl(buf,len,b);
end;
  1023. {$endif CPUX86_HAS_SSE2}
  1024. {$endif FPC_SYSTEM_HAS_INDEXDWORD}
  1025. {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
  1026. {$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ Generic IndexQWord: qword-by-qword scan comparing both 32-bit halves;
  returns the index (in qwords) of the first occurrence of 'b', or -1.
  Also used as the tail handler by the SSE4.1 variant.
  eax = buf, edx = len, [esp+4] = b }
asm
    push %ebx
    { Offsets below account for the push of ebx. }
    mov 8(%esp), %ecx { ecx = b[0:31] }
    mov 12(%esp), %ebx { ebx = b[32:63] }
    mov %eax, 8(%esp) { remember original buf } { reuses the now-free b[0:31] slot }
    sub $8, %eax { compensate for the add at the top of the loop }
    .balign 16 { no-op }
.LQWordwise_Next:
    add $8, %eax
    sub $1, %edx
    jb .LNotFound { len exhausted (also handles len <= 0 on entry) }
    cmp %ecx, (%eax)
    jne .LQWordwise_Next
    cmp %ebx, 4(%eax) { low halves equal; check high halves too }
    jne .LQWordwise_Next
    sub 8(%esp), %eax { subtract original buf }
    pop %ebx
    shr $3, %eax { byte offset -> qword index }
    ret $8 { callee pops the 8-byte value parameter }
.LNotFound:
    pop %ebx
    mov $-1, %eax
end;
function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ SSE4.1 IndexQWord: processes 6 qwords (3 XMM vectors) per iteration with
  PCMPEQQ, OR-reducing the three compare results and testing once with
  PTEST.  Lengths of 6 or less are delegated to IndexQWord_Plain.  The
  final partial group is re-read overlapping the previous one.
  eax = buf, edx = len ('len' alias), [esp+4] = b }
asm
    cmp $6, len
    jle IndexQWord_Plain { tail-call: same arguments, same stack layout }
    movddup 4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
    mov %eax, %ecx { ecx = original buf }
    sub $6, len
    .balign 16
.L6x_Loop:
    movdqu (%eax), %xmm1
    pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
    movdqu 16(%eax), %xmm2
    pcmpeqq %xmm0, %xmm2
    por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
    movdqu 32(%eax), %xmm3
    pcmpeqq %xmm0, %xmm3
    por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
    ptest %xmm3, %xmm3
    jnz .LFound
    add $48, %eax
    sub $6, len
    jge .L6x_Loop
    lea (%eax,%edx,8), %eax { Point to last 3 vectors. }
    cmp $-5, len
    jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
    mov $-1, %eax
    ret $8 { callee pops the 8-byte value parameter }
.LFound:
    sub %ecx, %eax { eax = byte offset of the matching 48-byte group }
    ptest %xmm1, %xmm1 { narrow down which of the three vectors hit }
    jnz .LFoundAtXmm1
    ptest %xmm2, %xmm2
    jnz .LFoundAtXmm2
    add $16, %eax
    movdqa %xmm3, %xmm2
.LFoundAtXmm2:
    add $16, %eax
    movdqa %xmm2, %xmm1
.LFoundAtXmm1:
    pmovmskb %xmm1, %ecx
    bsf %ecx, %ecx { byte position within the vector }
    add %ecx, %eax
    shr $3, %eax { byte offset -> qword index }
end;
  1097. {$ifndef CPUX86_HAS_SSE4_1}
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;
var
  { Indirect-call slot: starts at the dispatcher, which replaces it with the
    concrete implementation on the first call after CPU detection. }
  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;
  1101. function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
  1102. begin
  1103. if not fpc_cpucodeinit_performed then
  1104. exit(IndexQWord_Plain(buf,len,b));
  1105. if has_sse41_support then
  1106. IndexQWord_Impl:=@IndexQWord_SSE41
  1107. else
  1108. IndexQWord_Impl:=@IndexQWord_Plain;
  1109. result:=IndexQWord_Impl(buf,len,b);
  1110. end;
function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
{ Public IndexQWord entry point: calls indirectly through IndexQWord_Impl
  (dispatcher on the first call, then the CPU-selected implementation). }
begin
  result:=IndexQWord_Impl(buf,len,b);
end;
  1115. {$endif ndef CPUX86_HAS_SSE4_1}
  1116. {$endif FPC_SYSTEM_HAS_INDEXQWORD}
  1117. {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  1118. {$define FPC_SYSTEM_HAS_COMPAREBYTE}
  1119. {$ifndef CPUX86_HAS_SSE2}
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Pre-SSE2 CompareByte: memcmp-style three-way compare; returns a negative
  value, zero, or a positive value.  Compares dword-wise (byte order fixed
  up via BSWAP, or ROLs on pre-486 targets, before the final compare);
  lengths of 3 or less go byte-wise. }
asm
    { eax = buf1, edx = buf2, ecx = len }
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    cmp $3, %ecx
    jle .LBytewise_Prepare
    { Align buf1 on 4 bytes. }
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
    and $-4, %eax
    sub %eax, %ecx
    .balign 16
.L4x_Next:
    add $4, %eax
    sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
    jle .LLast4
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    je .L4x_Next
.L4xDiffer:
    mov (%eax), %edx { edx = differing dword of buf1, ebx = of buf2 }
{$ifdef CPUX86_HAS_BSWAP}
    bswap %ebx
    bswap %edx
{$else}
    { No BSWAP before the 486: swap bytes with rotates instead. }
    rol $8, %bx
    rol $16, %ebx
    rol $8, %bx
    rol $8, %dx
    rol $16, %edx
    rol $8, %dx
{$endif}
    cmp %ebx, %edx { big-endian compare gives memcmp semantics }
.LDoSbb:
    sbb %eax, %eax { eax = -1 if borrow (buf1 < buf2), else 0 }
    or $1, %eax { -1 stays -1, 0 becomes +1 }
    pop %ebx
    ret
.LLast4:
    add %ecx, %eax { step back so the final dword ends exactly at len }
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    xor %eax, %eax
    pop %ebx
    ret
.LBytewise_Prepare:
    sub $1, %ecx
    jb .LNothing { len = 0 }
    .balign 16 { no-op }
.LBytewise_Body:
    movzbl (%edx,%eax), %ebx
    cmp %bl, (%eax)
    jne .LDoSbb
    add $1, %eax
    sub $1, %ecx
    jae .LBytewise_Body
.LNothing:
    xor %eax, %eax
    pop %ebx
end;
  1184. {$endif ndef CPUX86_HAS_SSE2}
label
  { Global labels inside CompareByte_SSE2, reused by CompareByte_AVX2:
    tiny-length handling and the no-overread fallback are shared between
    the two variants instead of being duplicated. }
  CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;
function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
{ SSE2 CompareByte: memcmp-style three-way compare.  Strategy:
  - len <= 1: scalar (shared CompareByte_1OrLess, also handles len < 0 as
    an unbounded scan);
  - 2..15 bytes: over-read both buffers as XMMs unless too close to a page
    boundary, in which case fall back to exact dword/word loads;
  - >= 16 bytes: first two vectors unaligned, then a buf1-aligned loop over
    two vectors at a time, then the last two vectors re-read from the end. }
asm
    { eax = buf1, edx = buf2, ecx = len }
    cmp $1, %ecx
    jle CompareByte_1OrLess
    push %ebx
    cmp $16, %ecx
    jae .LVecOrMore
    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
    mov %eax, %ebx
    or %edx, %ebx
    and $4095, %ebx
    cmp $4080, %ebx
    ja .LCantOverReadBoth
    { Over-read both as XMMs. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
    jz .LNothing
    bsf %ebx, %ebx
    cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
    jae .LNothing
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax { difference of the first differing bytes }
    pop %ebx
    ret
.LNothing:
    pop %ebx
    xor %eax, %eax
    ret
.LAligned32xLoop_TwoVectorsDiffer:
    add %eax, %edx { restore edx = buf2 }
    pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
    inc %cx
    jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    mov %ecx, %ebx
.LVec0Differs:
    bsf %ebx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
    .byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
CompareByte_CantOverReadBoth_AVX2: { entered from CompareByte_AVX2 with 2..31 bytes near a page end }
    cmp $16, %ecx
    jb .LCantOverReadBoth
.LVecOrMore:
    { Compare first vectors. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    sub $32, %ecx { now ecx is len - 32. }
    jbe .LLastVec
    { Compare second vectors. }
    movdqu 16(%eax), %xmm0
    movdqu 16(%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec1Differs
    cmp $32, %ecx
    jbe .LLastTwoVectors
    { More than four vectors: aligned loop. }
    lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
    sub %eax, %edx { edx = buf2 - buf1 }
    and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub %eax, %ecx { ecx = count to be handled with loop }
    .balign 16 { No-op. }
.LAligned32xLoop_Body:
    add $32, %eax
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu 16(%edx,%eax), %xmm1
    pcmpeqb 16(%eax), %xmm1
    pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %ebx
    inc %bx
    jnz .LAligned32xLoop_TwoVectorsDiffer
    sub $32, %ecx
    ja .LAligned32xLoop_Body
    add %eax, %edx { restore edx = buf2 }
    add $32, %ecx
.LLastTwoVectors: { re-read the last 32 bytes, overlapping already-compared data }
    movdqu (%eax,%ecx), %xmm0
    movdqu (%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm2Differs
.LLastVec:
    movdqu 16(%eax,%ecx), %xmm0
    movdqu 16(%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm1Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVec1Differs:
    xor %ecx, %ecx
.LVecEm1Differs:
    add $16, %ecx
.LVecEm2Differs:
    bsf %ebx, %ebx
    add %ecx, %ebx { ebx = offset of the first differing byte }
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
.LCantOverReadBoth: { 2..15 bytes too close to a page end: exact loads only }
    cmp $3, %ecx
    jle .L2to3
    push %esi
    mov (%eax), %ebx
    mov (%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    cmp $8, %ecx
    jbe .LLast4x
    mov 4(%eax), %ebx
    mov 4(%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    mov -8(%eax,%ecx), %ebx { overlapping loads cover 4..16 bytes in 4 dwords }
    mov -8(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
.LLast4x:
    mov -4(%eax,%ecx), %ebx
    mov -4(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    pop %esi
    pop %ebx
    xor %eax, %eax
    ret
.L4xDiffer:
    bswap %ebx { big-endian compare gives memcmp semantics }
    bswap %esi
    cmp %esi, %ebx
    pop %esi
    sbb %eax, %eax { -1 if buf1 < buf2 at the difference, else 0 }
    or $1, %eax { -1 stays -1, 0 becomes +1 }
    pop %ebx
    ret
.L2to3: { build 3-byte big-endian keys: two leading bytes (shifted) + last byte }
    movzwl (%edx), %ebx
    bswap %ebx
    shr $1, %ebx
    mov -1(%edx,%ecx), %bl
    movzwl (%eax), %edx
    bswap %edx
    shr $1, %edx
    mov -1(%eax,%ecx), %dl
    mov %edx, %eax
    sub %ebx, %eax
    pop %ebx
    ret
CompareByte_1OrLess: { shared with CompareByte_AVX2 }
    jl .LUnbounded_Prepare { len < 1: len = 0 returns 0, len < 0 scans unbounded }
    movzbl (%eax), %eax
    movzbl (%edx), %edx
    sub %edx, %eax
    ret
.LUnbounded_Prepare:
    sub %eax, %edx { edx = buf2 - buf1 }
    test %ecx, %ecx
    jnz .LUnbounded_Body
    xor %eax, %eax
    ret
    .balign 16
.LUnbounded_Next:
    add $1, %eax
.LUnbounded_Body:
    movzbl (%edx,%eax), %ecx
    cmp %cl, (%eax)
    je .LUnbounded_Next
    sbb %eax, %eax
    or $1, %eax
end;
function {$ifdef CPUX86_HAS_BMI2} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
{ AVX2+BMI2 CompareByte: same structure as CompareByte_SSE2 but with 32-byte
  YMM vectors; shares CompareByte_1OrLess and the no-overread fallback with
  the SSE2 variant via global labels.  VZEROUPPER is issued on every exit
  path that touched YMM registers. }
asm
    { eax = buf1, edx = buf2, ecx = len }
    cmp $1, %ecx
    jle CompareByte_1OrLess { shared scalar path in CompareByte_SSE2 }
    push %ebx
    cmp $32, %ecx
    jae .LVecOrMore
    { 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
    mov %eax, %ebx
    or %edx, %ebx
    and $4095, %ebx
    cmp $4064, %ebx
    ja CompareByte_CantOverReadBoth_AVX2 { shared fallback in CompareByte_SSE2 }
    { Over-read both as YMMs. }
    vmovdqu (%eax), %ymm0
    vpcmpeqb (%edx), %ymm0, %ymm0
    vpmovmskb %ymm0, %ebx
    inc %ebx
    { bzhi %ecx, %ebx, %ecx }
    .byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi } { zero mask bits at index >= len }
    jnz .LVec0Differs
    vzeroupper
    pop %ebx
    xor %eax, %eax
    ret
    .byte 144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
.LAligned64xLoop_TwoVectorsDiffer:
    add %eax, %edx { restore edx = buf2 }
    vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
    inc %ecx
    jz .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) from the loop body. }
    mov %ecx, %ebx
.LVec0Differs:
    vzeroupper
    tzcnt %ebx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax { difference of the first differing bytes }
    pop %ebx
    ret
.LVecOrMore:
    { Compare first vectors. }
    vmovdqu (%eax), %ymm0
    vpcmpeqb (%edx), %ymm0, %ymm0
    vpmovmskb %ymm0, %ebx
    inc %ebx
    jnz .LVec0Differs
    sub $64, %ecx { now ecx is len - 64. }
    jbe .LLastVec
    { Compare second vectors. }
    vmovdqu 32(%eax), %ymm0
    vpcmpeqb 32(%edx), %ymm0, %ymm0
    vpmovmskb %ymm0, %ebx
    inc %ebx
    jnz .LVec1Differs
    cmp $64, %ecx
    jbe .LLastTwoVectors
    { More than four vectors: aligned loop. }
    lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
    sub %eax, %edx { edx = buf2 - buf1 }
    and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
    sub %eax, %ecx { ecx = count to be handled with loop }
    .balign 16 { No-op. }
.LAligned64xLoop_Body:
    add $64, %eax
    { Compare two YMMs, reduce the result with 'and'. }
    vmovdqu (%edx,%eax), %ymm0
    vpcmpeqb (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
    vmovdqu 32(%edx,%eax), %ymm1
    vpcmpeqb 32(%eax), %ymm1, %ymm1
    vpand %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
    vpmovmskb %ymm1, %ebx
    inc %ebx
    jnz .LAligned64xLoop_TwoVectorsDiffer
    sub $64, %ecx
    ja .LAligned64xLoop_Body
    add %eax, %edx { restore edx = buf2 }
    add $64, %ecx
.LLastTwoVectors: { re-read the last 64 bytes, overlapping already-compared data }
    vmovdqu (%eax,%ecx), %ymm0
    vpcmpeqb (%edx,%ecx), %ymm0, %ymm0
    vpmovmskb %ymm0, %ebx
    inc %ebx
    jnz .LVecEm2Differs
.LLastVec:
    vmovdqu 32(%eax,%ecx), %ymm0
    vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0
    vpmovmskb %ymm0, %ebx
    inc %ebx
    jnz .LVecEm1Differs
    vzeroupper
    pop %ebx
    xor %eax, %eax
    ret
.LVec1Differs:
    xor %ecx, %ecx
.LVecEm1Differs:
    add $32, %ecx
.LVecEm2Differs:
    vzeroupper
    tzcnt %ebx, %ebx
    add %ecx, %ebx { ebx = offset of the first differing byte }
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
end;
  1485. {$ifndef CPUX86_HAS_BMI2}
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
var
  { Indirect-call slot: starts at the dispatcher, which replaces it with the
    concrete implementation on the first call after CPU detection. }
  CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
{ First-call dispatcher: selects AVX2, SSE2 or Plain (the latter only when
  SSE2 is not guaranteed by the target CPU level) and memoizes the choice
  into CompareByte_Impl. }
begin
  if not fpc_cpucodeinit_performed then
    { CPU features not probed yet: answer with the baseline implementation
      without caching, so a later call can still pick a faster variant. }
    exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len));
  if has_avx2_support then
    CompareByte_Impl:=@CompareByte_AVX2
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    CompareByte_Impl:=@CompareByte_SSE2
{$ifndef CPUX86_HAS_SSE2}
  else
    CompareByte_Impl:=@CompareByte_Plain
{$endif};
  result:=CompareByte_Impl(buf1, buf2, len);
end;
function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
{ Public CompareByte entry point: calls indirectly through CompareByte_Impl
  (dispatcher on the first call, then the CPU-selected implementation). }
begin
  result:=CompareByte_Impl(buf1, buf2, len);
end;
  1507. {$endif ndef CPUX86_HAS_BMI2 (need CompareByte dispatcher)}
  1508. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  1509. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  1510. {$define FPC_SYSTEM_HAS_COMPAREWORD}
  1511. {$ifndef CPUX86_HAS_SSE2}
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ Pre-SSE2 CompareWord: three-way compare of 'len' words; returns a
  negative value, zero, or a positive value.  Compares dword-wise (2 words
  at a time) when buf1 can be brought to 4-byte alignment; tiny or
  overlong/negative lengths go word-wise.
  eax = buf1, edx = buf2, ecx = len }
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
    cmp $1073741819, %ebx
    ja .LWordwise_Prepare
    test $2, %al
    je .LAlignedToPtrUintOrNaturallyMisaligned
    { buf1 is odd-word aligned: compare one word to reach dword alignment. }
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
    sub $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
    sub $2, %ecx { last dword is handled separately after the loop }
    .balign 16
.LPtrUintWise_Next:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LPtrUintsDiffer
    add $4, %eax
    sub $2, %ecx
    jg .LPtrUintWise_Next
    lea (%eax,%ecx,2), %eax { step back so the final dword ends exactly at len (overlaps) }
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LPtrUintsDiffer
    pop %ebx
    xor %eax, %eax
    ret
.LPtrUintsDiffer:
    { A dword differed: find whether the low or the high word decides. }
    cmp %bx, (%eax)
    jne .LDoSbb
    shr $16, %ebx
    cmp %bx, 2(%eax)
.LDoSbb:
    sbb %eax, %eax { -1 if buf1's word < buf2's word, else 0 }
    or $1, %eax { -1 stays -1, 0 becomes +1 }
    pop %ebx
    ret
    .balign 16
.LWordwise_Body:
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
.LWordwise_Prepare:
    sub $1, %ecx
    jnb .LWordwise_Body
    pop %ebx
    xor %eax, %eax
end;
  1565. {$endif ndef CPUX86_HAS_SSE2}
{ CompareWord, SSE2 version (compiled as CompareWord itself when SSE2 is baseline).
  In:  eax = buf1, edx = buf2, ecx = len (count of 2-byte words).
  Out: eax: sign encodes the relation of the first differing word (unsigned); 0 if equal.
  pcmpeqw + pmovmskb yield 0xFFFF on a full 16-byte match, so 'inc %bx' sets ZF
  exactly when the vectors are equal. }
function {$ifdef CPUX86_HAS_SSE2} CompareWord {$else} CompareWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
    cmp $1073741821, %ebx
    ja .LWordwise_Prepare
    cmp $8, %ecx
    jge .LVecOrMore
    { 2..7 words: a single 16-byte load overreads; allow it only when neither
      buffer's load can cross a page boundary. }
    lea (%edx,%eax), %ebx
    or %eax, %ebx
    and $4095, %ebx
    cmp $4080, %ebx
    ja .LWordwise_Prepare
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jz .LNothing
    shl $1, %ecx { convert to bytes }
    bsf %ebx, %ebx
    cmp %ecx, %ebx { A mismatch at or beyond 'len' words lies in the overread tail: ignore it. }
    jb .LSubtractWords
.LNothing:
    pop %ebx
    xor %eax, %eax
    ret
    .balign 16
.LWordwise_Body:
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
.LWordwise_Prepare:
    sub $1, %ecx
    jae .LWordwise_Body
    xor %eax, %eax
    pop %ebx
    ret
.LDoSbb:
    { CF set iff buf1 word < buf2 word: -1, else +1. }
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
.LVecOrMore:
    movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    shl $1, %ecx { convert to bytes }
    sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec
    push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %eax, %ecx
    and $-16, %eax { align buf1; +16 is performed by the loop. }
    sub %eax, %ecx
    .balign 16
.LAligned8xLoop_Body:
    add $16, %eax
    movdqu (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0 { bytewise compare: buf1 is 16-aligned here but word pairing may be shifted. }
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LAligned8xLoop_VecDiffers
    sub $16, %ecx
    ja .LAligned8xLoop_Body
    pop %ebx { drop original buf1 }
.LLastVec:
    lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVec0Differs:
    bsf %ebx, %ebx { byte offset of first mismatch; even, because pcmpeqw mask bits come in pairs. }
.LSubtractWords:
    add %eax, %edx { recover edx = buf2 cursor }
    movzwl (%eax,%ebx), %eax
    movzwl (%edx,%ebx), %edx
    sub %edx, %eax { word1 - word2; only the sign is meaningful. }
    pop %ebx
    ret
.LAligned8xLoop_VecDiffers:
    bsf %ebx, %ebx
    add %ebx, %eax
    pop %ecx { original buf1 }
    sub %ecx, %eax
    and $-2, %eax { round the byte position down to a word boundary relative to buf1 }
    add %ecx, %eax
    movzwl (%edx,%eax), %edx
    movzwl (%eax), %eax
    sub %edx, %eax
    pop %ebx
end;
  1668. {$ifndef CPUX86_HAS_SSE2}
{ Indirect dispatch: CompareWord_Impl initially points at the dispatcher below,
  which re-points it at the best available implementation on first use. }
function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
  CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
  1672. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1673. begin
  1674. if not fpc_cpucodeinit_performed then
  1675. exit(CompareWord_Plain(buf1, buf2, len));
  1676. if has_sse2_support then
  1677. CompareWord_Impl:=@CompareWord_SSE2
  1678. else
  1679. CompareWord_Impl:=@CompareWord_Plain;
  1680. result:=CompareWord_Impl(buf1, buf2, len);
  1681. end;
  1682. function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
  1683. begin
  1684. result:=CompareWord_Impl(buf1, buf2, len);
  1685. end;
  1686. {$endif ndef CPUX86_HAS_SSE2 (need CompareWord dispatcher)}
  1687. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1688. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1689. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
  1690. {$ifndef CPUX86_HAS_SSE2}
{ CompareDWord for CPUs without SSE2: simple dwordwise loop.
  In:  eax = buf1, edx = buf2, ecx = len (count of 4-byte dwords).
  Out: eax = -1 / 0 / +1 according to the first differing dword (unsigned). }
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    sub $1, %ecx
    jb .LNothing { len <= 0: report equal without touching memory. }
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    .balign 16
.LDwordwise_Body:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LDoSbb
    add $4, %eax
    sub $1, %ecx
    jnb .LDwordwise_Body
    pop %ebx
.LNothing:
    xor %eax, %eax
    ret
.LDoSbb:
    { CF set iff buf1 dword < buf2 dword (unsigned): -1, else +1. }
    pop %ebx
    sbb %eax, %eax
    or $1, %eax
end;
  1714. {$endif}
{ CompareDWord, SSE2 version (compiled as CompareDWord itself when SSE2 is baseline).
  In:  eax = buf1, edx = buf2, ecx = len (count of 4-byte dwords).
  Out: eax = -1 / 0 / +1 for the first differing dword (unsigned compare).
  Same structure as the SSE2 CompareWord: pcmpeqd + pmovmskb produce 0xFFFF on a
  full 16-byte match, so 'inc %bx' sets ZF exactly on equality. }
function {$ifdef CPUX86_HAS_SSE2} CompareDWord {$else} CompareDWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
    cmp $536870906, %ebx
    ja .LDwordwise_Prepare
    shl $2, %ecx { convert to bytes }
    movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec
    push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %eax, %ecx
    and $-16, %eax { align buf1; +16 is performed by the loop. }
    sub %eax, %ecx
    .balign 16
.LAligned4xLoop_Body:
    add $16, %eax
    movdqu (%eax,%edx), %xmm0
    pcmpeqb (%eax), %xmm0 { bytewise compare: buf1 is 16-aligned here but dword pairing may be shifted. }
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LAligned4xLoop_VecDiffers
    sub $16, %ecx
    ja .LAligned4xLoop_Body
    pop %ebx { drop original buf1 }
.LLastVec:
    lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
    movdqu (%edx,%eax), %xmm1
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVec0Differs:
    bsf %ebx, %ebx { byte offset of first mismatch; multiple of 4 by the pcmpeqd mask structure. }
    add %eax, %edx { recover edx = buf2 }
    mov (%edx,%ebx), %edx
    cmp %edx, (%eax,%ebx)
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
.LAligned4xLoop_VecDiffers:
    bsf %ebx, %ebx
    add %ebx, %eax
    pop %ecx { original buf1 }
    sub %ecx, %eax
    and $-4, %eax { round the byte position down to a dword boundary relative to buf1 }
    add %ecx, %eax
    mov (%edx,%eax), %edx
    cmp %edx, (%eax)
.LDoSbb:
    { CF set iff buf1 dword < buf2 dword: -1, else +1. }
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
    .balign 16
.LDwordwise_Body:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LDoSbb
    add $4, %eax
.LDwordwise_Prepare:
    sub $1, %ecx
    jnb .LDwordwise_Body
    pop %ebx
    xor %eax, %eax
end;
  1792. {$ifndef CPUX86_HAS_SSE2}
{ Indirect dispatch: CompareDWord_Impl initially points at the dispatcher below,
  which re-points it at the best available implementation on first use. }
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
  CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
  1796. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1797. begin
  1798. if not fpc_cpucodeinit_performed then
  1799. exit(CompareDWord_Plain(buf1, buf2, len));
  1800. if has_sse2_support then
  1801. CompareDWord_Impl:=@CompareDWord_SSE2
  1802. else
  1803. CompareDWord_Impl:=@CompareDWord_Plain;
  1804. result:=CompareDWord_Impl(buf1, buf2, len);
  1805. end;
  1806. function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
  1807. begin
  1808. result:=CompareDWord_Impl(buf1, buf2, len);
  1809. end;
  1810. {$endif ndef CPUX86_HAS_SSE2 (need CompareDWord dispatcher)}
  1811. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1812. {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
  1813. {$define FPC_SYSTEM_HAS_INDEXCHAR0}
{ IndexChar0(buf, len, b): index of the first occurrence of b in buf, scanning at
  most len bytes and stopping early at a #0 terminator; returns -1 when b is not
  found before the terminator or the limit.
  NOTE(review): when len = 0 the code jumps to .LFound with ecx still holding the
  third argument, so the result is neither 0 nor -1 in that case — confirm the
  intended len = 0 contract against the generic implementation. }
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
  saveesi,saveebx : longint;
asm
    { esi/ebx are callee-saved: spill to locals instead of the stack (this
      function has a normal stack frame). }
    movl %esi,saveesi
    movl %ebx,saveebx
// Can't use scasb, or will have to do it twice, think this
// is faster for small "len"
    movl %eax,%esi // Load address
    movzbl %cl,%ebx // Load searchpattern
    testl %edx,%edx
    je .LFound
    xorl %ecx,%ecx // zero index in Buf
    xorl %eax,%eax // To make DWord compares possible
    .balign 4
.LLoop:
    movb (%esi),%al // Load byte
    cmpb %al,%bl
    je .LFound // byte the same?
    incl %ecx
    incl %esi
    cmpl %edx,%ecx // Maximal distance reached?
    je .LNotFound
    testl %eax,%eax // Nullchar = end of search?
    jne .LLoop
.LNotFound:
    movl $-1,%ecx // Not found return -1
.LFound:
    movl %ecx,%eax
    movl saveesi,%esi
    movl saveebx,%ebx
end;
  1846. {$endif FPC_SYSTEM_HAS_INDEXCHAR0}
  1847. {****************************************************************************
  1848. String
  1849. ****************************************************************************}
  1850. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1851. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{ FPC_SHORTSTR_TO_SHORTSTR: res := sstr, truncated to high(res).
  eax = res, edx = high(res), ecx = sstr; the payload copy is delegated to Move
  (as a tail call when not profiling). }
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
    { Profiling hook: preserve the argument registers around mcount. }
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    cmp (%ecx), %dl { length(sstr) fits into res? }
    jbe .LEdxIsLen { use high(res) if length(sstr) does not fit }
    movzbl (%ecx), %edx { use length(sstr) }
.LEdxIsLen:
    mov %dl, (%eax) { store length to res[0] }
    xchg %ecx, %edx { ecx = length = Move count, edx = sstr }
    xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest }
    inc %eax { step past the length bytes: copy sstr[1..] to res[1..] }
    inc %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea -8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea 8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
    jmp Move { tail call: Move(sstr[1], res[1], len) }
{$endif FPC_PROFILE}
end;
{ FPC_SHORTSTR_ASSIGN: copy the shortstring at sstr to dstr, truncating its
  length to 'len' (the destination capacity). Copies the length byte plus the
  payload, using rep movsl on a dword-aligned destination for longer strings. }
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
  asm
{$ifdef FPC_PROFILE}
    { Profiling hook: preserve the argument registers around mcount. }
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    pushl %eax
    pushl %ecx
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    movl dstr,%edi
    movl sstr,%esi
    xorl %eax,%eax
    movl len,%ecx
    lodsb { al = length(sstr); esi now points at the payload }
    cmpl %ecx,%eax
    jbe .LStrCopy1
    movl %ecx,%eax { clamp copied length to the destination capacity }
.LStrCopy1:
    stosb { store the (possibly clamped) length byte }
    cmpl $7,%eax
    jl .LStrCopy2 { short payloads: plain bytewise copy }
    movl %edi,%ecx { Align on 32bits }
    negl %ecx
    andl $3,%ecx
    subl %ecx,%eax
    rep
    movsb { copy up to 3 bytes until edi is dword-aligned }
    movl %eax,%ecx
    andl $3,%eax { remaining tail bytes after the dword copy }
    shrl $2,%ecx
    rep
    movsl { bulk copy, one dword at a time }
.LStrCopy2:
    movl %eax,%ecx
    rep
    movsb { copy the remaining bytes }
    popl %ecx
    popl %eax
  end ['ESI','EDI'];
end;
  1936. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1937. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1938. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{ FPC_SHORTSTR_COMPARE: lexicographic shortstring comparison.
  eax = left, edx = right; result sign orders left against right, 0 = equal.
  Compares min(length(left), length(right)) bytes via CompareByte; if that
  prefix is equal, the length difference decides. ebx (callee-saved) carries
  length(right) across the CompareByte call. }
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ eax = left, edx = right }
asm
{$ifdef FPC_PROFILE}
    { Profiling hook: preserve the argument registers around mcount. }
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    push %ebx
    movzbl (%eax), %ecx { ecx = len(left) }
    movzbl (%edx), %ebx { ebx = len(right) }
    cmp %ebx, %ecx
{$ifdef CPUX86_HAS_CMOV}
    cmovg %ebx, %ecx { ecx = min(len(left), len(right)) }
{$else}
    jle .LEcxIsLen
    mov %ebx, %ecx
.LEcxIsLen:
{$endif}
    push %eax { save left }
    inc %eax { skip the length bytes }
    inc %edx
    { stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
    call CompareByte
{$else}
    call CompareByte_Impl { manually inline CompareByte }
{$endif}
    pop %edx { restore left }
    test %eax, %eax
    jnz .LReturn { common prefixes differ: CompareByte's sign is the answer }
    movzbl (%edx), %eax { prefix equal: result = len(left) - len(right) }
    sub %ebx, %eax
.LReturn:
    pop %ebx
end;
  1979. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1980. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1981. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1982. function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc; nostackframe;
  1983. { eax = left, edx = right }
  1984. asm
  1985. movzbl (%eax), %ecx
  1986. cmp (%edx), %cl
  1987. jne .LNotEqual
  1988. inc %eax
  1989. inc %edx
  1990. {$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  1991. jmp CompareByte
  1992. {$else}
  1993. jmp CompareByte_Impl { manually inline CompareByte }
  1994. {$endif}
  1995. .LNotEqual:
  1996. or $-1, %eax
  1997. end;
  1998. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1999. {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  2000. {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{ FPC_PCHAR_TO_SHORTSTR: res := shortstring(p), truncated to high(res); res is
  empty when p = nil. Finds the #0 terminator with IndexByte (bounded by
  high(res)) and copies the payload with Move.
  eax = res, edx = high(res), ecx = p. }
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
    { Profiling hook: preserve the argument registers around mcount. }
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    test %ecx, %ecx
    jz .LEmpty { p = nil: produce an empty string }
    push %eax { save res }
    push %ecx { save p }
    push %edx { save high(res) }
    mov %ecx, %eax { eax = IndexByte.buf }
    { edx is already high(res) = IndexByte.count.
      Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
      but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’.
      Generic and x86 versions are “safe”. }
    xor %ecx, %ecx { ecx = 0 = IndexByte.value }
    { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
      With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    leal -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
    call IndexByte
{$else}
    call IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    leal 12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    pop %ecx { ecx = high(res) = Move.len }
    test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
    cmovns %eax, %ecx
{$else}
    js .LEcxIsLen
    mov %eax, %ecx
.LEcxIsLen:
{$endif}
    pop %eax { pop p to eax = Move.src }
    pop %edx { pop res to edx }
    mov %cl, (%edx) { res[0] := len }
    inc %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal 12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    jmp .LReturn
{$else FPC_PROFILE}
    jmp Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
    movb $0, (%eax) { res[0] := 0 }
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
  2071. {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  2072. {$IFNDEF INTERNAL_BACKTRACE}
  2073. {$define FPC_SYSTEM_HAS_GET_FRAME}
{ Returns the caller's frame pointer: nostackframe leaves ebp untouched. }
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    movl %ebp,%eax
end;
  2078. {$ENDIF not INTERNAL_BACKTRACE}
  2079. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
{ Returns this call's return address: with nostackframe it sits at (%esp). }
Function Get_pc_addr : Pointer;assembler;nostackframe;
asm
    movl (%esp),%eax
end;
  2084. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
{ Returns the return address stored in the stack frame 'framebp', i.e. the
  address the owner of that frame was called from; nil/unchanged on a nil frame.
  'addr' is unused on this target. }
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  { Only dereference frame pointers that lie within the current stack range. }
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp+4)^ { saved return address sits above the saved ebp }
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
    { eax = framebp; no bounds checking on non-win32 targets. }
    orl %eax,%eax
    jz .Lg_a_null
    movl 4(%eax),%eax { saved return address at framebp+4 }
.Lg_a_null:
end;
{$endif defined(win32)}
  2102. {$endif defined(win32)}
  2103. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
{ Returns the previous frame pointer saved in the stack frame 'framebp';
  nil/unchanged on a nil frame. 'addr' is unused on this target. }
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  { Only dereference frame pointers that lie within the current stack range. }
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp)^ { saved caller ebp sits at framebp itself }
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
    { eax = framebp; no bounds checking on non-win32 targets. }
    orl %eax,%eax
    jz .Lgnf_null
    movl (%eax),%eax { saved caller ebp at framebp }
.Lgnf_null:
end;
{$endif defined(win32)}
  2121. {$endif defined(win32)}
  2122. {$define FPC_SYSTEM_HAS_SPTR}
{ Returns the current stack pointer (esp as seen inside this nostackframe
  function, i.e. the address of its return-address slot). }
Function Sptr : Pointer;assembler;nostackframe;
asm
    movl %esp,%eax
end;
  2127. {****************************************************************************
  2128. Str()
  2129. ****************************************************************************}
  2130. {$if defined(disabled) and defined(regcall) }
  2131. {$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
  2132. {$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
  2133. label str_int_shortcut;
{ Unsigned variant of int_str: sets the sign flag (edx) to 0 and jumps into the
  shared body of the signed variant below at str_int_shortcut.
  (Whole section is currently compiled out via the 'disabled' define.) }
procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
asm
    pushl %esi
    pushl %edi
    pushl %ebx
    mov %edx,%edi { edi = destination shortstring }
    xor %edx,%edx { edx = 0: no minus sign }
    jmp str_int_shortcut
end;
{ Signed int_str: converts l to decimal into s, truncating to high(s).
  eax = l, edx = @s, ecx = high(s). The unsigned variant above enters at
  str_int_shortcut with edx (sign) preset to 0.
  (Whole section is currently compiled out via the 'disabled' define.) }
procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
                                      100000,1000000,10000000,
                                      100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    push %esi
    push %edi
    push %ebx
    movl %edx,%edi
    { Calculate absolute value and put sign in edx}
    cltd
    xorl %edx,%eax
    subl %edx,%eax
    negl %edx { edx = 1 if l was negative, else 0 }
str_int_shortcut:
    movl %ecx,%esi
    {Calculate amount of digits in ecx.}
    xorl %ecx,%ecx
    bsrl %eax,%ecx
    incl %ecx
    imul $1233,%ecx { digits ~ bits * log10(2) ~ bits * 1233 / 4096 }
    shr $12,%ecx
{$ifdef FPC_PIC}
    call fpc_geteipasebx
{$ifdef darwin}
    movl digits-.Lpic(%ebx),%ebx
{$else}
    addl $_GLOBAL_OFFSET_TABLE_,%ebx
    movl digits@GOT(%ebx),%ebx
{$endif}
    cmpl (%ebx,%ecx,4),%eax
{$else}
    cmpl digits(,%ecx,4),%eax { correct the estimate against the power-of-10 table }
{$endif}
    cmc
    adcl $0,%ecx {Nr. digits ready in ecx.}
    {Write length & sign.}
    lea (%edx,%ecx),%ebx { bl = digit count + sign byte }
    movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
    movw %bx,(%edi) { store length byte and, speculatively, '-' }
    addl %edx,%edi { skip the '-' slot only if negative }
    subl %edx,%esi
    {Skip digits beyond string length.}
    movl %eax,%edx
    subl %ecx,%esi
    jae .Lloop_write
    .balign 4
.Lloop_skip:
    movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
    mull %edx
    shrl $3,%edx
    decl %ecx
    jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
    incl %esi
    jnz .Lloop_skip
    {Write out digits.}
    .balign 4
.Lloop_write:
    movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
    {Pre-add '0'}
    leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
    mull %edx
    shrl $3,%edx
    leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
    subl %edx,%ebx
    subl %eax,%ebx
    movb %bl,(%edi,%ecx) { digits are produced least-significant first, stored back to front }
    decl %ecx
    jnz .Lloop_write
.Ldone:
    popl %ebx
    popl %edi
    popl %esi
end;
  2227. {$endif}
  2228. {****************************************************************************
  2229. Bounds Check
  2230. ****************************************************************************}
  2231. { do a thread-safe inc/dec }
  2232. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ Atomically decrements l; returns True exactly when the new value is zero.
  eax = @l; result flag in al via setz. }
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
asm
    lock
    decl (%eax)
    setzb %al
end;
  2239. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
{ Atomically increments l. eax = @l. }
procedure cpuinclocked(var l : longint);assembler;nostackframe;
asm
    lock
    incl (%eax)
end;
  2245. // inline SMP check and normal lock.
  2246. // the locked one is so slow, inlining doesn't matter.
  2247. function declocked(var l : longint) : boolean; inline;
  2248. begin
  2249. if not ismultithread then
  2250. begin
  2251. dec(l);
  2252. declocked:=l=0;
  2253. end
  2254. else
  2255. declocked:=cpudeclocked(l);
  2256. end;
  2257. procedure inclocked(var l : longint); inline;
  2258. begin
  2259. if not ismultithread then
  2260. inc(l)
  2261. else
  2262. cpuinclocked(l);
  2263. end;
  2264. {$ifndef VER3_2}
  2265. {$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_8}
{ Atomic 8-bit compare-and-swap: if Target = Comparand then Target := NewValue.
  Returns the previous value of Target in al either way.
  eax = @Target, dl = NewValue, cl = Comparand. }
function fpc_atomic_cmp_xchg_8(var Target: shortint; NewValue: shortint; Comparand: shortint): shortint; assembler; nostackframe;
asm
    xchgl %eax,%ecx { al = Comparand (cmpxchg compares against al), ecx = @Target }
    lock
    cmpxchgb %dl,(%ecx)
end;
  2272. {$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_16}
{ Atomic 16-bit compare-and-swap: if Target = Comparand then Target := NewValue.
  Returns the previous value of Target in ax either way.
  eax = @Target, dx = NewValue, cx = Comparand. }
function fpc_atomic_cmp_xchg_16(var Target: smallint; NewValue: smallint; Comparand: smallint): smallint; assembler; nostackframe;
asm
    xchgl %eax,%ecx { ax = Comparand (cmpxchg compares against ax), ecx = @Target }
    lock
    cmpxchgw %dx,(%ecx)
end;
  2279. {$define FPC_SYSTEM_HAS_ATOMIC_XCHG_64}
{ Atomic 64-bit exchange via a cmpxchg8b retry loop: Target := Source, returning
  the previous value of Target in edx:eax. }
function fpc_atomic_xchg_64(var Target: int64; Source: int64): int64; assembler; nostackframe;
{ eax = Target, [esp + 4] = Source. }
asm
    pushl %ebx
    pushl %edi
    movl %eax,%edi { edi = @Target }
    movl 8+4(%esp),%ebx { ecx:ebx = Source (offsets skip the two pushes) }
    movl 8+8(%esp),%ecx
.LAgain:
    { Load the current value into edx:eax; cmpxchg8b only stores ecx:ebx if
      Target is still edx:eax, so a concurrent change restarts the loop. }
    movl (%edi),%eax
    movl 4(%edi),%edx
    lock cmpxchg8b (%edi)
    jne .LAgain
    pop %edi { on success edx:eax still holds the previous value = result }
    pop %ebx
end;
  2296. {$define FPC_SYSTEM_HAS_ATOMIC_SUB_32}
{ Atomic 32-bit subtract: Target := Target - Value.
  xadd leaves the pre-operation value in edx, which is returned in eax.
  eax = @Target, edx = Value. }
function fpc_atomic_sub_32(var Target: longint; Value: longint): longint; assembler; nostackframe;
asm
    neg %edx { subtract = add of the negated value }
    lock
    xaddl %edx, (%eax)
    movl %edx,%eax { return the value Target held before the subtraction }
end;
  2304. {$define FPC_SYSTEM_HAS_ATOMIC_INC_64}
{ Atomic 64-bit increment via a cmpxchg8b retry loop; returns the incremented
  (new) value in edx:eax. }
function fpc_atomic_inc_64(var Target: int64): int64; assembler; nostackframe;
{ eax = Target. }
asm
    pushl %ebx
    pushl %edi
    movl %eax,%edi { edi = @Target }
.LAgain:
    movl (%edi),%eax
    movl 4(%edi),%edx
    movl %eax,%ebx { ecx:ebx := edx:eax + 1. }
    movl %edx,%ecx
    addl $1,%ebx
    adcl $0,%ecx
    lock cmpxchg8b (%edi) { store ecx:ebx only if Target is still edx:eax }
    jne .LAgain
    movl %ebx,%eax { return the new value }
    movl %ecx,%edx
    pop %edi
    pop %ebx
end;
  2325. {$define FPC_SYSTEM_HAS_ATOMIC_DEC_64}
{ Atomic 64-bit decrement via a cmpxchg8b retry loop; returns the decremented
  (new) value in edx:eax. }
function fpc_atomic_dec_64(var Target: int64): int64; assembler; nostackframe;
{ eax = Target. }
asm
    pushl %ebx
    pushl %edi
    movl %eax,%edi { edi = @Target }
.LAgain:
    movl (%edi),%eax
    movl 4(%edi),%edx
    movl %eax,%ebx { ecx:ebx := edx:eax - 1. }
    movl %edx,%ecx
    subl $1,%ebx
    sbbl $0,%ecx
    lock cmpxchg8b (%edi) { store ecx:ebx only if Target is still edx:eax }
    jne .LAgain
    movl %ebx,%eax { return the new value }
    movl %ecx,%edx
    pop %edi
    pop %ebx
end;
  2346. {$define FPC_SYSTEM_HAS_ATOMIC_ADD_64}
{ Atomic 64-bit add via a cmpxchg8b retry loop: Target := Target + Value.
  On success edx:eax still holds the loaded (pre-addition) value, which is the
  function result. }
function fpc_atomic_add_64(var Target: int64; Value: int64): int64; assembler; nostackframe;
{ eax = Target, [esp + 4] = Value. }
asm
    pushl %ebx
    pushl %edi
    movl %eax,%edi { edi = @Target }
.LAgain:
    movl (%edi),%eax
    movl 4(%edi),%edx
    movl %eax,%ebx { ecx:ebx := edx:eax + Value. }
    movl %edx,%ecx
    addl 8+4(%esp),%ebx { offsets skip the two pushes }
    adcl 8+8(%esp),%ecx
    lock cmpxchg8b (%edi) { store ecx:ebx only if Target is still edx:eax }
    jne .LAgain
    pop %edi
    pop %ebx
end;
  2365. {$define FPC_SYSTEM_HAS_ATOMIC_SUB_64}
{ Atomic 64-bit subtract via a cmpxchg8b retry loop: Target := Target - Value.
  On success edx:eax still holds the loaded (pre-subtraction) value, which is
  the function result. }
function fpc_atomic_sub_64(var Target: int64; Value: int64): int64; assembler; nostackframe;
{ eax = Target, [esp + 4] = Value. }
asm
    pushl %ebx
    pushl %edi
    movl %eax,%edi { edi = @Target }
.LAgain:
    movl (%edi),%eax
    movl 4(%edi),%edx
    movl %eax,%ebx { ecx:ebx := edx:eax - Value. }
    movl %edx,%ecx
    subl 8+4(%esp),%ebx { offsets skip the two pushes }
    sbbl 8+8(%esp),%ecx
    lock cmpxchg8b (%edi) { store ecx:ebx only if Target is still edx:eax }
    jne .LAgain
    pop %edi
    pop %ebx
end;
  2384. {$endif VER3_2}
{$ifdef VER3_2}
{ Atomic 32-bit decrement; returns the new value (xadd yields the old value in
  edx, lea subtracts 1). eax = @Target. }
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_DEC_32}
function fpc_atomic_dec_32 (var Target: longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    movl $-1,%edx
    lock
    xaddl %edx, (%eax)
    lea -1(%edx),%eax { old value - 1 = new value }
end;
{$ifdef VER3_2}
{ Atomic 32-bit increment; returns the new value (xadd yields the old value in
  edx, lea adds 1). eax = @Target. }
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_INC_32}
function fpc_atomic_inc_32 (var Target: longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    movl $1,%edx
    lock
    xaddl %edx, (%eax)
    lea 1(%edx),%eax { old value + 1 = new value }
end;
{$ifdef VER3_2}
{ Atomic 32-bit exchange: Target := Source, previous value returned.
  xchg with a memory operand is implicitly locked. eax = @Target, edx = Source. }
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_XCHG_32}
function fpc_atomic_xchg_32 (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    xchgl (%eax),%edx
    movl %edx,%eax { return the previous value }
end;
{$ifdef VER3_2}
{ Atomic 32-bit add: Target := Target + Source; xadd leaves the pre-operation
  value in edx, which is returned. eax = @Target, edx = addend. }
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_ADD_32}
function fpc_atomic_add_32 (var Target: longint;Value : longint) : longint; assembler; nostackframe;
{$endif VER3_2}
asm
    lock
    xaddl %edx, (%eax)
    movl %edx,%eax { return the value Target held before the addition }
end;
{$ifdef VER3_2}
{ Atomic 32-bit compare-and-swap: if Target = Comparand then Target := NewValue.
  Returns the previous value of Target in eax either way.
  eax = @Target, edx = NewValue, ecx = Comparand. }
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_32}
function fpc_atomic_cmp_xchg_32(var Target: longint; NewValue, Comparand : longint): longint; [public, alias:'FPC_ATOMIC_CMP_XCHG_32']; assembler; nostackframe;
{$endif VER3_2}
asm
    xchgl %eax,%ecx { eax = Comparand (cmpxchg compares against eax), ecx = @Target }
    lock
    cmpxchgl %edx, (%ecx)
end;
{$ifdef VER3_2}
{ Atomic 64-bit compare-and-swap via cmpxchg8b: if Target = Comparand then
  Target := NewValue. Returns the previous value of Target in edx:eax
  (cmpxchg8b reloads edx:eax with the memory value on mismatch). }
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler; nostackframe;
{$else VER3_2}
{$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_64}
function fpc_atomic_cmp_xchg_64 (var Target: int64; NewValue: int64; Comparand: int64) : int64; assembler; nostackframe;
{$endif VER3_2}
{ eax = Target, [esp + 12] = NewValue, [esp + 4] = Comparand. }
asm
    pushl %ebx
    pushl %edi
    movl %eax,%edi { edi = @Target }
    movl 8+4(%esp),%eax { edx:eax = Comparand (offsets skip the two pushes) }
    movl 8+8(%esp),%edx
    movl 8+12(%esp),%ebx { ecx:ebx = NewValue }
    movl 8+16(%esp),%ecx
    lock cmpxchg8b (%edi)
    pop %edi
    pop %ebx
end;
  2460. {****************************************************************************
  2461. FPU
  2462. ****************************************************************************}
const
  { Internal constants for use in system unit }
  { x87 FPU status-word exception flag bits. }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;
  { SSE MXCSR exception flag bits (low six bits). }
  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20; { sic: long-standing misspelling of "Precision", kept for compatibility }
  MM_ExceptionMask = $3f;
  { MXCSR exception mask bits (bits 7..12): a set bit suppresses the exception. }
  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm = %0000000100000000;
  MM_MaskDivZero = %0000001000000000;
  MM_MaskOverflow = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;
  2486. {$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
begin
  { Intentionally empty on i386: FPU state is established by SysResetFPU,
    which fpc_cpucodeinit calls. }
end;
  2490. {$define FPC_SYSTEM_HAS_SYSRESETFPU}
{ Resets the x87 FPU (fninit + Default8087CW control word) and, when SSE is
  available, reloads MXCSR from DefaultMXCSR. }
Procedure SysResetFPU;
var
  { these locals are so we don't have to hack pic code in the assembler }
  localmxcsr: dword;
  localfpucw: word;
begin
  localfpucw:=Default8087CW;
  asm
    fninit { reset the x87 state (registers, status, control word) }
    fwait
    fldcw localfpucw { install the configured control word }
  end;
  if has_sse_support then
    begin
      localmxcsr:=DefaultMXCSR;
      asm
        { setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
        ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
        { assembler too old for ldmxcsr: emit it by hand via a stack temporary }
        mov localmxcsr,%eax
        subl $4,%esp
        mov %eax,(%esp)
        //ldmxcsr (%esp)
        .byte 0x0f,0xae,0x14,0x24
        addl $4,%esp
{$endif OLD_ASSEMBLER}
      end;
    end;
end;
  2521. { because of the brain dead sse detection on x86, this test is post poned }
{ One-time CPU feature detection: probes CPUID leaves 1, 0 and 7 plus XGETBV to
  populate the has_* feature flags, then (re)initializes the FPU state and marks
  detection complete via fpc_cpucodeinit_performed. }
procedure fpc_cpucodeinit;
var
  _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
  if cpuid_support then
    begin
      { CPUID leaf 1: basic feature bits in edx/ecx. }
      asm
        movl $1,%eax
        xorl %ecx,%ecx
        cpuid
        movl %edx,_edx_cpuid1
        movl %ecx,_ecx_cpuid1
      end ['ebx'];
      has_mmx_support:=(_edx_cpuid1 and $800000)<>0; { edx bit 23 = MMX }
      if ((_edx_cpuid1 and $2000000)<>0) then { edx bit 25 = SSE }
        begin
          os_supports_sse:=true;
          sse_check:=true;
          asm
            { force an sse exception if no sse is supported, the exception handler sets
              os_supports_sse to false then }
            { don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
            .byte 0x0f,0x28,0xf7
{$else}
            movaps %xmm7, %xmm6
{$endif not EMX}
          end;
          sse_check:=false;
          has_sse_support:=os_supports_sse;
        end;
      if has_sse_support then
        begin
          has_sse2_support:=((_edx_cpuid1 and $4000000)<>0); { edx bit 26 }
          has_sse3_support:=((_ecx_cpuid1 and $200)<>0); { ecx bit 9 }
          has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1); { ecx bit 19 }
          { now avx }
          { CPUID leaf 0: highest supported standard leaf in eax. }
          asm
            xorl %eax,%eax
            cpuid
            movl %eax,_eax
          end;
          if _eax>=7 then
            begin
              { CPUID leaf 7, subleaf 0: extended feature bits in ebx. }
              asm
                movl $7,%eax
                xorl %ecx,%ecx
                cpuid
                movl %ebx,_ebx_cpuid7
              end;
              fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0; { ebx bit 9 = enhanced rep movsb/stosb }
              if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                begin
                  asm
                    xorl %ecx,%ecx
                    .byte 0x0f,0x01,0xd0 { xgetbv }
                    movl %eax,_eax
                  end;
                  if (_eax and 6)=6 then { OS saves both XMM and YMM state }
                    begin
                      has_avx_support:=(_ecx_cpuid1 and $10000000)<>0; { ecx bit 28 }
                      has_avx2_support:=(_ebx_cpuid7 and $20)<>0; { ebx bit 5 }
                    end;
                end;
            end;
        end;
    end;
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      if has_sse_support then
        DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  fpc_cpucodeinit_performed:=true;
end;
  2599. {$if not defined(darwin) and defined(regcall) }
  2600. { darwin requires that the stack is aligned to 16 bytes when calling another function }
  2601. {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
  2602. {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
{ Decrement the reference count of ansistring S and set S to nil;
  frees the allocation when the count reaches zero.
  Register convention: eax = @S on entry. }
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
        movl    (%eax),%edx         // edx = S
        testl   %edx,%edx
        jz      .Lquit              // nil string: nothing to do
        movl    $0,(%eax)           // s:=nil
        cmpl    $1,-8(%edx)         // refcount lives at offset -8 from the data
        je      .Lfree              // skip the decrement if refcount=1.
        jl      .Lquit              // exit if refcount<1 (presumably a constant string — TODO confirm)
{$ifdef FPC_PIC}
        call    fpc_geteipasecx
        addl    $_GLOBAL_OFFSET_TABLE_,%ecx
        movl    ismultithread@GOT(%ecx),%ecx
        cmpl    $0,(%ecx)
{$else FPC_PIC}
        cmpl    $0,ismultithread
{$endif FPC_PIC}
        je      .Lskiplock
        .byte   0xF0                // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
        decl    -8(%edx)            // (lock) dec refcount
        jz      .Lfree
.Lquit:
        ret
.Lfree:
        leal    -12(%edx),%eax      // points to start of allocation
        jmp     FPC_FREEMEM         // nostackframe + jmp allows to ignore stack alignment.
end;
  2631. function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
  2632. {$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
{ Return a uniquely-referenced copy of ansistring S.
  Fast path handled here: nil and refcount=1 strings are returned
  unchanged; anything else tail-jumps to fpc_truely_ansistr_unique
  (eax still holds @S for it). }
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
        movl    (%eax),%edx             // edx = S
        testl   %edx,%edx
        jz      .Lunchanged             // nil is trivially unique
        cmpl    $1,-8(%edx)             // refcount at offset -8
        jne     fpc_truely_ansistr_unique
.Lunchanged:
        movl    %edx,%eax               // result := S
end;
  2643. {$endif FPC_HAS_FEATURE_ANSISTRINGS}
  2644. {$endif ndef darwin and defined(regcall) }
  2645. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  2646. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
{ Read memory barrier: prevents loads from being reordered across this
  point. Uses lfence when SSE2 is guaranteed; otherwise a locked RMW on
  the stack serves as a full (and therefore sufficient) barrier. }
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
        lfence
{$else CPUX86_HAS_SSE2}
        lock
        addl    $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ Read-dependency barrier: intentionally empty on x86. }
procedure ReadDependencyBarrier;
begin
  { reads imply barrier on earlier reads depended on }
end;
{ Full memory barrier: no load or store may be reordered across this
  point. mfence with SSE2; otherwise a locked RMW on the stack. }
procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
        mfence
{$else CPUX86_HAS_SSE2}
        lock
        addl    $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ Write memory barrier. Empty when no SSE unit is guaranteed —
  presumably relying on x86's ordinary store ordering; sfence otherwise. }
procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
        sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
  2675. {$endif}
  2676. {$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
  2677. {$define FPC_SYSTEM_HAS_BSF_QWORD}
{ Index (0..63) of the lowest set bit of AValue, or 255 if AValue = 0.
  Stack layout: Lo(AValue) at 4(%esp), Hi(AValue) at 8(%esp). }
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
{$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
        { Branchless variant: preload eax with 255-32 so that after the
          unconditional "add $32" the all-zero answer is 255; each bsf
          overwrites eax only when its source is non-zero, and the final
          bsf of the low dword takes precedence. }
        mov     $255-32,%eax    { On AMD, BSF/R are documented to not change the destination on zero input. }
        bsfl    8(%esp),%eax    { On Intel, destination is formally undefined on zero input, but in practice the behavior is the same. }
        add     $32,%eax        { eax = bsf(Hi)+32, or 255 if Hi = 0 }
        bsfl    4(%esp),%eax    { eax = bsf(Lo) if Lo <> 0 }
{$else}
        bsfl    4(%esp),%eax
        jz      .L1             { ZF=1: low dword is zero, scan the high one }
        ret     $8
.L1:
        bsfl    8(%esp),%eax
        jz      .L2
        add     $32,%eax        { bit found in the high dword }
        ret     $8
.L2:
        movl    $255,%eax       { whole qword is zero }
{$endif}
end;
  2698. {$endif FPC_SYSTEM_HAS_BSF_QWORD}
  2699. {$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
  2700. {$define FPC_SYSTEM_HAS_BSR_QWORD}
{ Index (0..63) of the highest set bit of AValue, or 255 if AValue = 0.
  Stack layout: Lo(AValue) at 4(%esp), Hi(AValue) at 8(%esp). }
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
{$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
        { Branchless variant: each bsr leaves eax unchanged on zero input
          (see the note in BsfQWord), so the sub/add pair cancels out
          unless the high dword contributes a bit. }
        mov     $255,%eax
        bsrl    4(%esp),%eax    { eax = bsr(Lo), or 255 if Lo = 0 }
        sub     $32,%eax
        bsrl    8(%esp),%eax    { overwritten only when Hi <> 0 }
        add     $32,%eax        { eax = bsr(Hi)+32, or bsr(Lo), or 255 }
{$else}
        mov     8(%esp),%eax
        test    %eax,%eax
        jnz     .L1 { Speculate Hi(q) = 0. }
        bsrl    4(%esp),%eax
        jz      .L2
        ret     $8
.L1:
        bsrl    %eax,%eax       { bit found in the high dword }
        add     $32,%eax
        ret     $8
.L2:
        movl    $255,%eax       { whole qword is zero }
{$endif}
end;
  2724. {$endif FPC_SYSTEM_HAS_BSR_QWORD}
  2725. {$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
  2726. {$define FPC_SYSTEM_HAS_SAR_QWORD}
{ Arithmetic (sign-preserving) shift right of a 64-bit value.
  Stack: Lo(AValue) at 4(%esp), Hi(AValue) at 8(%esp); the shift count
  arrives in al (only the low byte is read). Result in edx:eax. }
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
        movl    8(%esp),%edx    // edx = Hi(AValue)
        movzbl  %al,%ecx        // cl = shift count
        cmpb    $32,%al
        jnb     .L1             // counts >= 32 need the wide path
        movl    4(%esp),%eax    // eax = Lo(AValue)
        shrdl   %cl,%edx,%eax   // shift bits from Hi into Lo
        sarl    %cl,%edx
        ret     $8
.L1:
        { count >= 32: low result = Hi >> (count mod 32),
          high result = sign extension of Hi }
        movl    %edx,%eax
        sarl    $31,%edx
        sarl    %cl,%eax        // uses 5 lower bits of cl.
end;
  2742. {$endif FPC_SYSTEM_HAS_SAR_QWORD}
  2743. {$ifndef FPC_SYSTEM_HAS_UMUL64X64_128}
  2744. {$define FPC_SYSTEM_HAS_UMUL64X64_128}
{ Unsigned 64x64 -> 128-bit multiply: returns the low 64 bits (edx:eax)
  and stores the high 64 bits through rHi.
  On entry eax = @rHi; b sits at 4..8(%esp), a at 12..16(%esp); the
  "12+" in the offsets below compensates for the three register pushes. }
function UMul64x64_128(a,b: uint64; out rHi: uint64): uint64; assembler; nostackframe;
{ [esp + 12] = a, [esp + 4] = b, eax = rHi }
asm
        { Schoolbook partial products:
                                 Hi(a)              Lo(a)
          x                      Hi(b)              Lo(b)
          ---------------------------------------------------------------
                                 Hi(Lo(a)*Lo(b))    Lo(Lo(a)*Lo(b))
          +     Hi(Hi(a)*Lo(b))  Lo(Hi(a)*Lo(b))
          +     Hi(Lo(a)*Hi(b))  Lo(Lo(a)*Hi(b))
          + Hi(Hi(a)*Hi(b)) Lo(Hi(a)*Hi(b))
            edi              esi ebx, then edx      eax   <- accumulators }
        push    %ebx
        push    %esi
        push    %edi
        mov     %eax, %ecx          { ecx = rHi. }
        mov     12+16(%esp), %eax
        mull    12+8(%esp)          { edx:eax = Hi(a) * Hi(b). }
        mov     %eax, %esi
        mov     %edx, %edi          { edi:esi = Hi(a) * Hi(b). }
        mov     12+16(%esp), %eax
        mull    12+4(%esp)          { edx:eax = Hi(a) * Lo(b). }
        mov     %eax, %ebx
        add     %edx, %esi          { edi:esi += Hi(Hi(a) * Lo(b)). }
        adc     $0, %edi
        mov     12+12(%esp), %eax
        mull    12+8(%esp)          { edx:eax = Lo(a) * Hi(b). }
        add     %eax, %ebx          // edi:esi:ebx += Lo(a) * Hi(b).
        adc     %edx, %esi
        adc     $0, %edi
        mov     12+12(%esp), %eax
        mull    12+4(%esp)          { edx:eax = Lo(a) * Lo(b); eax is Lo(result). }
        add     %ebx, %edx          { edi:esi:edx += Hi(Lo(a) * Lo(b)). }
        adc     $0, %esi
        adc     $0, %edi
        mov     %esi, (%ecx)        { store the high 64 bits through rHi }
        mov     %edi, 4(%ecx)
        pop     %edi
        pop     %esi
        pop     %ebx
end;
  2785. {$endif FPC_SYSTEM_HAS_UMUL64X64_128}