{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team.

    Processor dependent implementation for the system unit for
    intel i386+

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$if defined(linux)}
  {$define FPC_SYSTEM_STACKALIGNMENT16}
{$endif defined(linux)}

{****************************************************************************
                               Primitives
****************************************************************************}

var
  os_supports_sse : boolean;
  { set to true while an SSE check is executing, so that no SIGILL is raised }
  sse_check : boolean;
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  has_sse41_support : boolean;
  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }

{$asmmode ATT}

function cpuid_support : boolean;assembler;nostackframe;
{
  Check if the ID-flag can be changed; if it can, CpuID is supported.
  Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
}
asm
  pushfl
  movl (%esp),%eax
  xorl $0x200000,%eax
  pushl %eax
  popfl
  pushfl
  popl %eax
  xorl (%esp),%eax
  popfl
  testl $0x200000,%eax
  setnz %al
end;
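
{ Bit 21 ($200000) of EFLAGS is the ID flag: if software can toggle it, the
  CPUID instruction is available. A caller would typically guard feature
  detection with it, along these lines (illustrative sketch only):

    if cpuid_support then
      // safe to execute CPUID leaves and fill os_supports_sse,
      // fast_large_repmovstosb, has_sse41_support, ...
}
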
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
begin
  { because of the brain dead sse detection on x86, this test is postponed
    to fpc_cpucodeinit, which must be implemented OS-dependently (FK)
  has_sse_support:=sse_support;
  has_mmx_support:=mmx_support;
  }
end;
{$ifndef darwin}
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
  movl (%esp),%ebx
end;

procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
  movl (%esp),%ecx
end;
{$endif}

{$if not defined(FPC_SYSTEM_HAS_MOVE)
  and not defined(OLD_ASSEMBLER)
  and not defined(darwin)}
{$i fastmove.inc}
{$endif}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
var
  saveesi,saveedi : longint;
asm
  movl %edi,saveedi
  movl %esi,saveesi
  movl %eax,%esi
  movl %edx,%edi
  movl %ecx,%edx
  movl %edi,%eax
  { check for zero or negative count }
  cmpl $0,%edx
  jle .LMoveEnd
  { Check for back or forward }
  sub %esi,%eax
  jz .LMoveEnd { Do nothing when source=dest }
  jc .LFMove { Do forward, dest<source }
  cmp %edx,%eax
  jb .LBMove { Dest is in range of move, do backward }
  { Forward Copy }
.LFMove:
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  cmpl $15,%edx
  jl .LFMove1
  movl %edi,%ecx { Align on 32bits }
  negl %ecx
  andl $3,%ecx
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx
  shrl $2,%ecx
  rep
  movsl
.LFMove1:
  movl %edx,%ecx
  rep
  movsb
  jmp .LMoveEnd
  { Backward Copy }
.LBMove:
  std
  addl %edx,%esi
  addl %edx,%edi
  movl %edi,%ecx
  decl %esi
  decl %edi
  cmpl $15,%edx
  jl .LBMove1
  negl %ecx { Align on 32bits }
  andl $3,%ecx
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx
  shrl $2,%ecx
  subl $3,%esi
  subl $3,%edi
  rep
  movsl
  addl $3,%esi
  addl $3,%edi
.LBMove1:
  movl %edx,%ecx
  rep
  movsb
  cld
.LMoveEnd:
  movl saveedi,%edi
  movl saveesi,%esi
end;
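
{ The overlap test above, in Pascal terms:

    if dest = source then
      exit
    else if SizeUInt(@dest - @source) < SizeUInt(count) then
      CopyBackwards   // dest lies inside the source range
    else
      CopyForwards;

  e.g. source = 100, dest = 104, count = 16: dest - source = 4 < 16, so the
  backward copy starts at the last byte and never overwrites bytes it still
  has to read. CopyForwards/CopyBackwards merely name the two branches of
  the assembler above; they are not real routines. }
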
{$endif FPC_SYSTEM_HAS_MOVE}

{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  or not defined(FPC_SYSTEM_HAS_FILLQWORD)}

{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
const
  FillXxxx_RepStosThreshold_ERMS = 1024;
  FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;

procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  mov %ecx, (%eax) { Write first 4 bytes unaligned. }
  push %ecx { pattern }
  push %edi
  mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
  xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
  shl $3, %ecx { ecx = misalignment of x in bits. }
  rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
  add %edi, %edx { edx = x end }
  lea -1(%edx), %ecx { ecx = x end - 1. }
  add $4, %edi
  and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
  and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
  sub %edi, %ecx { ecx = byte count between them. }
  shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
  rep stosl
  pop %edi
  pop %ecx
  mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
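
{ Example of the pattern rotation above: FillWord with value $ABCD uses the
  uint32 pattern $ABCDABCD, i.e. bytes CD AB CD AB in memory. If x is odd,
  the misalignment is 1 byte = 8 bits, and 'rol 8' yields $CDABCDAB, i.e.
  bytes AB CD AB CD: exactly what the aligned dword stores must write so
  that the byte stream seen from x remains CD AB CD AB ... }
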
{$endif FillChar/Word/DWord required.}

label
  FillXxxx_MoreThanTwoXMMs;

procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
const
  NtThreshold = 4 * 1024 * 1024;
asm
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  cmp $32, %edx
  ja .LMoreThanTwoVectors
  ret
  .byte 144 { Turn .balign 16 before .L64x_Body into a no-op. }

  { x can start and end misaligned on the vector boundary:

    x = ~~][H1][H2][...][T2][T1]~
        [UH]                [UT]

    UH/UT stand for “unaligned head/tail”, both have 1~16 bytes. }
.LMoreThanTwoVectors:
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern }
  movd %esi, %xmm0
  pshufd $0, %xmm0, %xmm0
  pop %esi

  { FillChar (to skip the misaligning above) and FillQWord jump here.
    eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs:
  lea -65(%eax,%edx), %ecx
  and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
  mov %ecx, %edx { Remember T4 to edx. }
  and $-16, %eax { eax = H1 − 16. }
  sub %eax, %ecx { ecx = aligned byte count − 48. }
  movdqa %xmm0, 16(%eax) { Write H1. }
  cmp $32-48, %ecx
  jle .LOneAlignedTailWrite
  movdqa %xmm0, 32(%eax) { Write H2. }
  cmp $64-48, %ecx
  jle .LTwoAlignedTailWrites
  sub $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
  jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
  add $48, %eax { eax = H3. }
  cmp $NtThreshold, %ecx
  jae .L64xNT_Body
.balign 16 { no-op }
.L64x_Body:
  movdqa %xmm0, (%eax)
  movdqa %xmm0, 16(%eax)
  movdqa %xmm0, 32(%eax)
  movdqa %xmm0, 48(%eax)
  add $64, %eax
  sub $64, %ecx
  ja .L64x_Body
.LFourAlignedTailWrites:
  movdqa %xmm0, (%edx) { T4 }
  movdqa %xmm0, 16(%edx) { T3 }
.LTwoAlignedTailWrites:
  movdqa %xmm0, 32(%edx) { T2 }
.LOneAlignedTailWrite:
  movdqa %xmm0, 48(%edx) { T1 }
  ret
.balign 16
.L64xNT_Body:
  movntdq %xmm0, (%eax)
  movntdq %xmm0, 16(%eax)
  movntdq %xmm0, 32(%eax)
  movntdq %xmm0, 48(%eax)
  add $64, %eax
  sub $64, %ecx
  ja .L64xNT_Body
  sfence
  jmp .LFourAlignedTailWrites
end;
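
{ movntdq performs a non-temporal (cache-bypassing) store: fills larger than
  NtThreshold would evict the whole cache for data that is unlikely to be
  re-read immediately, so bypassing the cache wins. NT stores are weakly
  ordered, hence the sfence before rejoining the normal tail writes. }
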
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{$ifndef CPUX86_HAS_SSE2}
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
asm
  mov %ecx, (%eax) { Write first 4 bytes. }
  lea -9(%eax,%edx), %edx
  mov %ecx, 5(%edx) { Write last 4 bytes. }
  and $-4, %edx { edx = loop bound. }
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern }
  add $4, %eax
  and $-4, %eax
.balign 16
.L8xLoop:
  mov %esi, (%eax)
  mov %esi, 4(%eax)
  add $8, %eax
  cmp %edx, %eax
  jb .L8xLoop
  mov %esi, (%edx)
  mov %esi, 4(%edx)
  pop %esi
end;
{$endif ndef CPUX86_HAS_SSE2 (need Fill*_Plain)}

procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16. }
asm
  mov %ecx, (%eax)
  cmp $8, %edx
  jle .LLast4
  mov %ecx, 4(%eax)
  mov %ecx, -8(%eax,%edx)
.LLast4:
  mov %ecx, -4(%eax,%edx)
end;
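
{ The “ladder” covers 4..16 bytes with at most four overlapping dword
  stores and no loop, e.g. edx = 11 hits bytes 0-3, 4-7, 3-6 and 7-10. }
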
{$endif FillChar/Word/DWord required.}
{$endif FillChar/Word/DWord/QWord required.}

{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
procedure FillChar_3OrLess; assembler; nostackframe;
{ eax — x, cl — value, edx — byte count, Low(int32) <= edx <= 3. }
asm
  test %edx, %edx
  jle .LQuit
  mov %cl, (%eax)
  mov %cl, -1(%eax,%edx)
  shr $1, %edx
  mov %cl, (%eax,%edx)
.LQuit:
end;
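
{ The three stores above cover counts 1..3 branchlessly: count = 1 writes
  byte 0 three times; count = 2 writes bytes 0, 1, 1; count = 3 writes
  bytes 0, 2, 1. }
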
{$ifndef CPUX86_HAS_SSE2}
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jae FillXxxx_U32Pattern_RepStos_8OrMore
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  cmp $32, %edx
  ja FillXxxx_MoreThanTwoXMMs
end;

procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jae FillXxxx_U32Pattern_RepStos_8OrMore
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  cmp $32, %edx
  ja FillXxxx_MoreThanTwoXMMs
end;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;

var
  FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillChar_SSE2 {$else} FillChar_Plain {$endif} (x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillChar_Impl := @FillChar_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillChar_Impl := @FillChar_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillChar_Impl := @FillChar_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillChar_Impl(x, count, value);
end;

procedure FillChar(var x;count:SizeInt;value:byte);
begin
  FillChar_Impl(x, count, value);
end;
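
{ Dispatch pattern used by FillChar and the remaining Fill/Index/Compare
  routines: FillChar_Impl initially points at FillChar_Dispatch; once
  fpc_cpucodeinit has run, the first call through it stores the best
  implementation into FillChar_Impl, so every later FillChar is a single
  indirect call. The 'imul $0x01010101' in the implementations broadcasts
  the byte to a uint32 pattern, e.g. value $5A becomes $5A5A5A5A. }
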
{$endif FPC_SYSTEM_HAS_FILLCHAR}

{$ifndef FPC_SYSTEM_HAS_FILLWORD}
{$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord_3OrLess; assembler; nostackframe;
asm
  test %edx, %edx
  jle .LQuit
  mov %cx, (%eax)
  mov %cx, -2(%eax,%edx,2)
  shr $1, %edx
  mov %cx, (%eax,%edx,2)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx
  movzwl %cx, %ecx
  imul $0x00010001, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx
  movzwl %cx, %ecx
  imul $0x00010001, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx
  movzwl %cx, %ecx
  imul $0x00010001, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;

var
  FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillWord_SSE2 {$else} FillWord_Plain {$endif} (x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillWord_Impl := @FillWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillWord_Impl := @FillWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillWord_Impl := @FillWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillWord_Impl(x, count, value);
end;

procedure FillWord(var x;count:SizeInt;value:word);
begin
  FillWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}

{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
{$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord_4OrLess; assembler; nostackframe;
asm
  cmp $1, %edx
  jl .LQuit
  mov %ecx, (%eax)
  je .LQuit
  mov %ecx, 4(%eax)
  mov %ecx, -8(%eax,%edx,4)
  mov %ecx, -4(%eax,%edx,4)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;

var
  FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillDWord_SSE2 {$else} FillDWord_Plain {$endif}(x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillDWord_Impl := @FillDWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillDWord_Impl := @FillDWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillDWord_Impl := @FillDWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillDWord_Impl(x, count, value);
end;

procedure FillDWord(var x;count:SizeInt;value:dword);
begin
  FillDWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}

{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
{$define FPC_SYSTEM_HAS_FILLQWORD}
{$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
  test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
  jle .LQuit
  push %esi
  mov 4+4(%esp), %esi { esi = value[0:31] }
  mov 4+8(%esp), %ecx { ecx = value[32:63] }
.balign 16
.LLoop:
  mov %esi, (%eax)
  mov %ecx, 4(%eax)
  add $8, %eax
  sub $1, %edx
  jnz .LLoop
  pop %esi
.LQuit:
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure {$ifdef CPUX86_HAS_SSE2} FillQWord {$else} FillQWord_SSE2 {$endif}(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
  cmp $4, %edx
  jle .L4OrLess
  movq 4(%esp), %xmm0
  punpcklqdq %xmm0, %xmm0
  { Stack is 12 bytes:
    [esp] = return address, [esp + 4] = value (not required anymore).
    Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
    [esp] = return address. }
  mov (%esp), %ecx
  add $8, %esp
  mov %ecx, (%esp)
  shl $3, %edx
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
  jz FillXxxx_MoreThanTwoXMMs
  mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
  shl $3, %ecx
  and $63, %ecx
  movd %ecx, %xmm2
  movdqa %xmm0, %xmm1
  psllq %xmm2, %xmm1
  neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
  and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
  movd %ecx, %xmm2
  psrlq %xmm2, %xmm0
  por %xmm1, %xmm0
  jmp FillXxxx_MoreThanTwoXMMs
.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
  cmp $1, %edx
  jl .LQuit
  mov 4(%esp), %ecx
  mov %ecx, (%eax)
  je .LSecondHalfOf1
  mov %ecx, 8(%eax)
  mov %ecx, -16(%eax,%edx,8)
  mov %ecx, -8(%eax,%edx,8)
  mov 8(%esp), %ecx
  mov %ecx, 4(%eax)
  mov %ecx, 12(%eax)
  mov %ecx, -12(%eax,%edx,8)
  mov %ecx, -4(%eax,%edx,8)
.LQuit:
  ret $8
.LSecondHalfOf1:
  mov 8(%esp), %ecx
  mov %ecx, 4(%eax)
end;
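
{ The psllq/psrlq/por sequence rotates each 64-bit lane of xmm0 left by
  8 * (x mod 8) bits, e.g. value $1122334455667788 stored at x with
  x mod 8 = 1 becomes $2233445566778811, the qword that aligned stores
  must write for the byte stream seen from x to stay unchanged. }
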
{$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;

var
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;

procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      FillQWord_Plain(x, count, value);
      exit;
    end;
  if has_sse2_support then
    FillQWord_Impl := @FillQWord_SSE2
  else
    FillQWord_Impl := @FillQWord_Plain;
  FillQWord_Impl(x, count, value);
end;

procedure FillQWord(var x;count:SizeInt;value:qword);
begin
  FillQWord_Impl(x, count, value);
end;
{$endif ndef CPUX86_HAS_SSE2 (need FillQWord dispatcher)}
{$endif FPC_SYSTEM_HAS_FILLQWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
{$ifndef CPUX86_HAS_SSE2}
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, cl = b }
asm
  test %edx,%edx
  jz .Lnothing0
  push %eax { save initial value of 'buf' }
  test $3,%al
  jz .Laligned4
.Lalignloop: { align to 4 bytes }
  cmp %cl,(%eax)
  je .Lfoundateax
  inc %eax
  dec %edx
  jz .Lnothing1
  test $3,%al
  jnz .Lalignloop
.Laligned4: { align to 8 bytes }
  push %esi
  push %edi
  mov %cl,%ch { prepare pattern }
  movzwl %cx,%esi
  shl $16,%ecx
  or %esi,%ecx
  test $7,%al
  jz .Lloop
  test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = −1). }
  jl .Ldontfixuplen
  add $4,%edx
.Ldontfixuplen:
  sub $4,%eax
  jmp .Lalignfrom4to8
.balign 16
.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
  mov (%eax),%esi { load dword }
  xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
  lea -0x01010101(%esi),%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080 }
  jnz .Lfound0 { one of the bytes matches }
.Lalignfrom4to8:
  mov 4(%eax),%esi
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound1
  add $8,%eax
  sub $8,%edx
  ja .Lloop
.Lnothing3:
  pop %edi
  pop %esi
.Lnothing1:
  pop %edx
.Lnothing0:
  or $-1,%eax
  ret
.Lfound1:
  sub $4,%edx
  jbe .Lnothing3
  add $4,%eax
.Lfound0:
  bsf %esi,%esi
  shr $3,%esi
  cmp %edx,%esi { Garbage after remaining length? }
  jae .Lnothing3
  add %esi,%eax
  pop %edi
  pop %esi
.Lfoundateax:
  pop %ecx
  sub %ecx,%eax
end;
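
{ The SWAR trick above: after xoring a dword v with the broadcast pattern,
  a byte equal to b becomes zero, and

    (v - $01010101) and (not v) and $80808080

  has bit 8k+7 set exactly when byte k of v is zero. E.g. v = $12003456
  (byte 2 is zero): $12003456 - $01010101 = $10FF3355, not v = $EDFFCBA9,
  and the triple and leaves $00800000, so bsf returns bit 23 and
  23 shr 3 = byte 2. }
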
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
  test %edx, %edx
  jz .Lnotfound { exit if len=0 }
  movd %ecx, %xmm1
  mov %eax, %ecx
  punpcklbw %xmm1, %xmm1
  punpcklbw %xmm1, %xmm1
  and $4095, %ecx
  pshufd $0, %xmm1, %xmm1
  cmp $4080, %ecx
  ja .LCrossPage
  movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. }
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test %ecx, %ecx
  jz .LContinueAligned
  bsf %ecx, %eax
  cmp %edx, %eax
  jae .Lnotfound
  ret
  .byte 144 { Make .balign 16 before .Lloop a no-op. }
.LContinueAligned:
  cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
  jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
  push %ebx
  lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
  and $-0x10, %ecx { first aligned address after buf }
  sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
.balign 16
.Lloop:
  movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
  add $16, %ecx { but their sum is evenly divisible by 16. }
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  test %ebx, %ebx
  jnz .Lmatch
.Lcontinue:
  cmp %ecx, %edx
  ja .Lloop
  pop %ebx
.Lnotfound:
  or $-1, %eax
  ret
.LCrossPage:
  push %ebx
  lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
  and $-0x10, %ecx { first aligned address after buf }
  movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
  sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
  pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
  pmovmskb %xmm0, %ebx
  shl %cl, %ebx { shift valid bits into high word }
  and $0xffff0000, %ebx { clear low word containing invalid bits }
  shr %cl, %ebx { shift back }
  jz .Lcontinue
.Lmatch:
  bsf %ebx, %ebx
  lea -16(%ecx,%ebx), %eax
  pop %ebx
  cmp %eax, %edx { check against the buffer length }
  jbe .Lnotfound
end;
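
{ The $4095/$4080 test above: pages are 4 KB, so an unaligned 16-byte load
  is safe only if the address modulo 4096 is at most 4096 - 16 = 4080;
  beyond that it could touch the next, possibly unmapped, page and fault,
  which is why such buffers take the aligned .LCrossPage path instead. }
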
{$ifndef CPUX86_HAS_SSE2}
function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;

var
  IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;

function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexByte_Plain(buf,len,b));
  if has_sse2_support then
    IndexByte_Impl:=@IndexByte_SSE2
  else
    IndexByte_Impl:=@IndexByte_Plain;
  result:=IndexByte_Impl(buf,len,b);
end;

function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
begin
  result:=IndexByte_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXBYTE}

{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
{$ifndef CPUX86_HAS_SSE2}
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
  test %edx, %edx
  jz .LNotFound
  push %eax
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  cmp %cx, (%eax)
  je .LFound
  add $2, %eax
  dec %edx
  jnz .LWordwise_Body
  pop %edx
.LNotFound:
  or $-1, %eax
  ret
.LFound:
  pop %edx
  sub %edx, %eax
  shr $1, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexWord {$else} IndexWord_SSE2 {$endif} (const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
  test %edx, %edx { exit if len=0 }
  je .Lnotfound
  push %ebx
  movd %ecx, %xmm1
  punpcklwd %xmm1, %xmm1
  pshufd $0, %xmm1, %xmm1
  lea 16(%eax), %ecx
  and $-16, %ecx
  movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
  sub %eax, %ecx
  test $1, %eax { if buffer isn't aligned to word boundary, }
  jnz .Lunaligned { use a different algorithm }
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  shl %cl, %ebx
  and $0xffff0000, %ebx
  shr %cl, %ebx
  shr $1, %ecx { ecx=number of valid bytes }
  test %ebx, %ebx
  jz .Lcontinue
.Lmatch:
  bsf %ebx, %ebx
  shr $1, %ebx { in words }
  lea -8(%ecx,%ebx), %eax
  pop %ebx
  cmp %eax, %edx
  jbe .Lnotfound { if match is after the specified length, ignore it }
  ret
.balign 16
.Lloop:
  movdqa (%eax,%ecx,2), %xmm0
  add $8, %ecx
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  test %ebx, %ebx
  jnz .Lmatch
.Lcontinue:
  cmp %ecx, %edx
  ja .Lloop
  pop %ebx
.Lnotfound:
  or $-1, %eax
  ret
.Lunaligned:
  push %esi
  movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
  psllw $8, %xmm1 { swap bytes of each word of pattern) }
  psrlw $8, %xmm2
  por %xmm2, %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  shl %cl, %ebx
  and $0xffff0000, %ebx
  shr %cl, %ebx
  xor %esi, %esi { nothing to merge yet }
  add %edx, %edx { length words -> bytes }
  jmp .Lcontinue_u
.balign 16
.Lloop_u:
  movdqa (%eax,%ecx), %xmm0
  add $16, %ecx
  pcmpeqb %xmm1, %xmm0 { compare by bytes }
  shr $16, %esi { bit 16 shifts into 0 }
  pmovmskb %xmm0, %ebx
.Lcontinue_u:
  shl $1, %ebx { 15:0 -> 16:1 }
  or %esi, %ebx { merge bit 0 from previous round }
  mov %ebx, %esi
  shr $1, %ebx { now AND together adjacent pairs of bits }
  and %esi, %ebx
  and $0x5555, %ebx { also reset odd bits }
  jnz .Lmatch_u
  cmp %ecx, %edx
  ja .Lloop_u
.Lnotfound_u:
  pop %esi
  pop %ebx
  or $-1, %eax
  ret
.Lmatch_u:
  bsf %ebx, %ebx
  lea -16(%ecx,%ebx), %eax
  cmp %eax, %edx
  jbe .Lnotfound_u { if match is after the specified length, ignore it }
  sar $1, %eax { in words }
  pop %esi
  pop %ebx
end;
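
{ In the unaligned path, the pattern is byte-swapped and compared bytewise:
  a genuine word match at an odd address shows up as two adjacent mask bits
  that straddle a word-pair boundary, possibly even the 16-byte vector
  boundary. That is why bit 16 of each round is carried into bit 0 of the
  next one before adjacent bit pairs are and-ed together. }
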
{$ifndef CPUX86_HAS_SSE2}
function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;

var
  IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;

function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexWord_Impl:=@IndexWord_SSE2
  else
    IndexWord_Impl:=@IndexWord_Plain;
  result:=IndexWord_Impl(buf,len,b);
end;

function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
begin
  result:=IndexWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
{$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
  push %eax
  sub $4, %eax
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  add $4, %eax
  sub $1, %edx
  jb .LNotFound
  cmp %ecx, (%eax)
  jne .LDWordwise_Next
  pop %edx
  sub %edx, %eax
  shr $2, %eax
  ret
.LNotFound:
  pop %edx
  mov $-1, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexDWord {$else} IndexDWord_SSE2 {$endif} (const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
  push %eax
  sub $4, %edx
  jle .LDwordwise_Prepare
  movd %ecx, %xmm1
  pshufd $0, %xmm1, %xmm1
.balign 16 { 1-byte NOP. }
.L4x_Body:
  movdqu (%eax), %xmm0
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test %ecx, %ecx
  jnz .LFoundAtMask
  add $16, %eax
  sub $4, %edx
  jg .L4x_Body
  lea (%eax,%edx,4), %eax
  movdqu (%eax), %xmm0
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test %ecx, %ecx
  jz .LNothing
.LFoundAtMask:
  bsf %ecx, %ecx
  add %ecx, %eax
.LFoundAtEax:
  pop %edx
  sub %edx, %eax
  shr $2, %eax
  ret
  nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
  add $3, %edx
  cmp $-1, %edx
  je .LNothing
.balign 16 { no-op }
.LDwordwise_Body:
  cmp (%eax), %ecx
  je .LFoundAtEax
  add $4, %eax
  sub $1, %edx
  jae .LDwordwise_Body
.LNothing:
  pop %edx
  or $-1, %eax
end;
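
{ The tail after .L4x_Body re-reads the last 16 bytes of the buffer (the
  movdqu ends exactly at buf + 4*len), overlapping the last full vector;
  the overlapped elements are already known not to match, so any set bit
  in the final mask is a genuine find. }
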
{$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;

var
  IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;

function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexDWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexDWord_Impl:=@IndexDWord_SSE2
  else
    IndexDWord_Impl:=@IndexDWord_Plain;
  result:=IndexDWord_Impl(buf,len,b);
end;

function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  result:=IndexDWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXDWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
  push %ebx
  mov 8(%esp), %ecx { ecx = b[0:31] }
  mov 12(%esp), %ebx { ebx = b[32:63] }
  mov %eax, 8(%esp) { remember original buf }
  sub $8, %eax
.balign 16 { no-op }
.LQWordwise_Next:
  add $8, %eax
  sub $1, %edx
  jb .LNotFound
  cmp %ecx, (%eax)
  jne .LQWordwise_Next
  cmp %ebx, 4(%eax)
  jne .LQWordwise_Next
  sub 8(%esp), %eax
  pop %ebx
  shr $3, %eax
  ret $8
.LNotFound:
  pop %ebx
  mov $-1, %eax
end;

function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
  cmp $6, len
  jle IndexQWord_Plain
  movddup 4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
  mov %eax, %ecx { ecx = original buf }
  sub $6, len
.balign 16
.L6x_Loop:
  movdqu (%eax), %xmm1
  pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
  movdqu 16(%eax), %xmm2
  pcmpeqq %xmm0, %xmm2
  por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
  movdqu 32(%eax), %xmm3
  pcmpeqq %xmm0, %xmm3
  por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
  ptest %xmm3, %xmm3
  jnz .LFound
  add $48, %eax
  sub $6, len
  jge .L6x_Loop
  lea (%eax,%edx,8), %eax { Point to last 3 vectors. }
  cmp $-5, len
  jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
  mov $-1, %eax
  ret $8
.LFound:
  sub %ecx, %eax
  ptest %xmm1, %xmm1
  jnz .LFoundAtXmm1
  ptest %xmm2, %xmm2
  jnz .LFoundAtXmm2
  add $16, %eax
  movdqa %xmm3, %xmm2
.LFoundAtXmm2:
  add $16, %eax
  movdqa %xmm2, %xmm1
.LFoundAtXmm1:
  pmovmskb %xmm1, %ecx
  bsf %ecx, %ecx
  add %ecx, %eax
  shr $3, %eax
end;
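
{ pcmpeqq compares whole qwords and ptest sets ZF from 'xmm and xmm', so
  'ptest %xmm3, %xmm3; jnz' asks whether any of the three or-ed vector
  compares matched. Both instructions are SSE4.1 (movddup is SSE3), which
  is why this variant requires SSE4.1 and hands len <= 6 over to
  IndexQWord_Plain. }
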
{$ifndef CPUX86_HAS_SSE4_1}
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;

var
  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;

function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexQWord_Plain(buf,len,b));
  if has_sse41_support then
    IndexQWord_Impl:=@IndexQWord_SSE41
  else
    IndexQWord_Impl:=@IndexQWord_Plain;
  result:=IndexQWord_Impl(buf,len,b);
end;

function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  result:=IndexQWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE4_1}
{$endif FPC_SYSTEM_HAS_INDEXQWORD}

{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
{$ifndef CPUX86_HAS_SSE2}
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
  { eax = buf1, edx = buf2, ecx = len }
  push %ebx
  sub %eax, %edx { edx = buf2 - buf1 }
  cmp $3, %ecx
  jle .LBytewise_Prepare
  { Align buf1 on 4 bytes. }
  mov (%edx,%eax), %ebx
  cmp (%eax), %ebx
  jne .L4xDiffer
  lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
  and $-4, %eax
  sub %eax, %ecx
.balign 16
.L4x_Next:
  add $4, %eax
  sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
  jle .LLast4
  mov (%edx,%eax), %ebx
  cmp (%eax), %ebx
  je .L4x_Next
.L4xDiffer:
  mov (%eax), %edx
{$ifdef CPUX86_HAS_BSWAP}
  bswap %ebx
  bswap %edx
{$else}
  rol $8, %bx
  rol $16, %ebx
  rol $8, %bx
  rol $8, %dx
  rol $16, %edx
  rol $8, %dx
{$endif}
  cmp %ebx, %edx
.LDoSbb:
  sbb %eax, %eax
  or $1, %eax
  pop %ebx
  ret
.LLast4:
  add %ecx, %eax
  mov (%edx,%eax), %ebx
  cmp (%eax), %ebx
  jne .L4xDiffer
  xor %eax, %eax
  pop %ebx
  ret
.LBytewise_Prepare:
  sub $1, %ecx
  jb .LNothing
.balign 16 { no-op }
.LBytewise_Body:
  movzbl (%edx,%eax), %ebx
  cmp %bl, (%eax)
  jne .LDoSbb
  add $1, %eax
  sub $1, %ecx
  jae .LBytewise_Body
.LNothing:
  xor %eax, %eax
  pop %ebx
end;
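
{ bswap turns each dword big-endian, so the plain unsigned compare ranks
  the four bytes in memory order and the first differing byte decides the
  sign, exactly as CompareByte requires. The pre-BSWAP (386) fallback
  reverses the bytes with the same effect using three rotates per dword. }
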
{$endif ndef CPUX86_HAS_SSE2}

label
  CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;

function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
  { eax = buf1, edx = buf2, ecx = len }
  cmp $1, %ecx
  jle CompareByte_1OrLess
  push %ebx
  cmp $16, %ecx
  jae .LVecOrMore
  { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 fewer register and 2 fewer instructions. }
  mov %eax, %ebx
  or %edx, %ebx
  and $4095, %ebx
  cmp $4080, %ebx
  ja .LCantOverReadBoth
  { Over-read both as XMMs. }
  movdqu (%eax), %xmm0
  movdqu (%edx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
  jz .LNothing
  bsf %ebx, %ebx
  cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
  jae .LNothing
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
  ret
.LNothing:
  pop %ebx
  xor %eax, %eax
  ret
.LAligned32xLoop_TwoVectorsDiffer:
  add %eax, %edx { restore edx = buf2 }
  pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
  inc %cx
  jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
  mov %ecx, %ebx
.LVec0Differs:
  bsf %ebx, %ebx
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
  ret
  .byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
CompareByte_CantOverReadBoth_AVX2:
  cmp $16, %ecx
  jb .LCantOverReadBoth
.LVecOrMore:
  { Compare first vectors. }
  movdqu (%eax), %xmm0
  movdqu (%edx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVec0Differs
  sub $32, %ecx { now ecx is len - 32. }
  jbe .LLastVec
  { Compare second vectors. }
  movdqu 16(%eax), %xmm0
  movdqu 16(%edx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVec1Differs
  cmp $32, %ecx
  jbe .LLastTwoVectors
  { More than four vectors: aligned loop. }
  lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
  sub %eax, %edx { edx = buf2 - buf1 }
  and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
  sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned32xLoop_Body:
  add $32, %eax
  { Compare two XMMs, reduce the result with 'and'. }
  movdqu (%edx,%eax), %xmm0
  pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
  movdqu 16(%edx,%eax), %xmm1
  pcmpeqb 16(%eax), %xmm1
  pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
  pmovmskb %xmm1, %ebx
  inc %bx
  jnz .LAligned32xLoop_TwoVectorsDiffer
  sub $32, %ecx
  ja .LAligned32xLoop_Body
  add %eax, %edx { restore edx = buf2 }
  add $32, %ecx
.LLastTwoVectors:
  movdqu (%eax,%ecx), %xmm0
  movdqu (%edx,%ecx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVecEm2Differs
.LLastVec:
  movdqu 16(%eax,%ecx), %xmm0
  movdqu 16(%edx,%ecx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVecEm1Differs
  pop %ebx
  xor %eax, %eax
  ret
.LVec1Differs:
  xor %ecx, %ecx
.LVecEm1Differs:
  add $16, %ecx
.LVecEm2Differs:
  bsf %ebx, %ebx
  add %ecx, %ebx
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
  ret
.LCantOverReadBoth:
  cmp $3, %ecx
  jle .L2to3
  push %esi
  mov (%eax), %ebx
  mov (%edx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
  cmp $8, %ecx
  jbe .LLast4x
  mov 4(%eax), %ebx
  mov 4(%edx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
  mov -8(%eax,%ecx), %ebx
  mov -8(%edx,%ecx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
.LLast4x:
  mov -4(%eax,%ecx), %ebx
  mov -4(%edx,%ecx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
  pop %esi
  pop %ebx
  xor %eax, %eax
  ret
.L4xDiffer:
  bswap %ebx
  bswap %esi
  cmp %esi, %ebx
  pop %esi
  sbb %eax, %eax
  or $1, %eax
  pop %ebx
  ret
.L2to3:
  movzwl (%edx), %ebx
  bswap %ebx
  shr $1, %ebx
  mov -1(%edx,%ecx), %bl
  movzwl (%eax), %edx
  bswap %edx
  shr $1, %edx
  mov -1(%eax,%ecx), %dl
  mov %edx, %eax
  sub %ebx, %eax
  pop %ebx
  ret
CompareByte_1OrLess:
  jl .LUnbounded_Prepare
  movzbl (%eax), %eax
  movzbl (%edx), %edx
  sub %edx, %eax
  ret
.LUnbounded_Prepare:
  sub %eax, %edx { edx = buf2 - buf1 }
  test %ecx, %ecx
  jnz .LUnbounded_Body
  xor %eax, %eax
  ret
.balign 16
.LUnbounded_Next:
  add $1, %eax
.LUnbounded_Body:
  movzbl (%edx,%eax), %ecx
  cmp %cl, (%eax)
  je .LUnbounded_Next
  sbb %eax, %eax
  or $1, %eax
end;
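
{ The 'inc %bx' trick: pmovmskb of an all-equal pcmpeqb result is $FFFF,
  so incrementing the 16-bit register wraps it to zero and sets ZF; 'jz'
  then means “no difference in this vector” without a separate cmp. }
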
function {$ifdef CPUX86_HAS_BMI1} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
  { eax = buf1, edx = buf2, ecx = len }
  cmp $1, %ecx
  jle CompareByte_1OrLess
  push %ebx
  cmp $32, %ecx
  jae .LVecOrMore
  { 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 fewer register and 2 fewer instructions. }
  mov %eax, %ebx
  or %edx, %ebx
  and $4095, %ebx
  cmp $4064, %ebx
  ja CompareByte_CantOverReadBoth_AVX2
  { Over-read both as YMMs. }
  vmovdqu (%eax), %ymm0
  vpcmpeqb (%edx), %ymm0, %ymm0
  vpmovmskb %ymm0, %ebx
  inc %ebx
  { bzhi %ecx, %ebx, %ecx }
  .byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
  jnz .LVec0Differs
  vzeroupper
  pop %ebx
  xor %eax, %eax
  ret
  .byte 144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
.LAligned64xLoop_TwoVectorsDiffer:
  add %eax, %edx { restore edx = buf2 }
  vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
  inc %ecx
  jz .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) from the loop body. }
  mov %ecx, %ebx
.LVec0Differs:
  vzeroupper
  tzcnt %ebx, %ebx
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
  ret
.LVecOrMore:
  { Compare first vectors. }
  vmovdqu (%eax), %ymm0
  vpcmpeqb (%edx), %ymm0, %ymm0
  vpmovmskb %ymm0, %ebx
  inc %ebx
  jnz .LVec0Differs
  sub $64, %ecx { now ecx is len - 64. }
  jbe .LLastVec
  { Compare second vectors. }
  vmovdqu 32(%eax), %ymm0
  vpcmpeqb 32(%edx), %ymm0, %ymm0
  vpmovmskb %ymm0, %ebx
  inc %ebx
  jnz .LVec1Differs
  cmp $64, %ecx
  jbe .LLastTwoVectors
  { More than four vectors: aligned loop. }
  lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
  sub %eax, %edx { edx = buf2 - buf1 }
  and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
  sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned64xLoop_Body:
  add $64, %eax
  { Compare two YMMs, reduce the result with 'and'. }
  vmovdqu (%edx,%eax), %ymm0
  vpcmpeqb (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
  vmovdqu 32(%edx,%eax), %ymm1
  vpcmpeqb 32(%eax), %ymm1, %ymm1
  vpand %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
  vpmovmskb %ymm1, %ebx
  inc %ebx
  jnz .LAligned64xLoop_TwoVectorsDiffer
  sub $64, %ecx
  ja .LAligned64xLoop_Body
  add %eax, %edx { restore edx = buf2 }
  add $64, %ecx
.LLastTwoVectors:
  vmovdqu (%eax,%ecx), %ymm0
  vpcmpeqb (%edx,%ecx), %ymm0, %ymm0
  vpmovmskb %ymm0, %ebx
  inc %ebx
  jnz .LVecEm2Differs
.LLastVec:
  vmovdqu 32(%eax,%ecx), %ymm0
  vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0
  vpmovmskb %ymm0, %ebx
  inc %ebx
  jnz .LVecEm1Differs
  vzeroupper
  pop %ebx
  xor %eax, %eax
  ret
.LVec1Differs:
  xor %ecx, %ecx
.LVecEm1Differs:
  add $32, %ecx
.LVecEm2Differs:
  vzeroupper
  tzcnt %ebx, %ebx
  add %ecx, %ebx
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
end;
{$ifndef CPUX86_HAS_BMI1}
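{ Runtime dispatch: CompareByte_Impl starts out pointing at CompareByte_Dispatch, which picks the
  best implementation once CPU detection has run and repoints the variable, so later calls go
  straight to the chosen routine. Calls made before fpc_cpucodeinit use a safe fallback without
  committing to it. }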
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
var
CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
if not fpc_cpucodeinit_performed then
exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len));
if has_avx2_support then
CompareByte_Impl:=@CompareByte_AVX2
else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
CompareByte_Impl:=@CompareByte_SSE2
{$ifndef CPUX86_HAS_SSE2}
else
CompareByte_Impl:=@CompareByte_Plain
{$endif};
result:=CompareByte_Impl(buf1, buf2, len);
end;
function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
begin
result:=CompareByte_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_BMI1 (need CompareByte dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}
{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
{$define FPC_SYSTEM_HAS_COMPAREWORD}
{$ifndef CPUX86_HAS_SSE2}
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
cmp $1073741819, %ebx
ja .LWordwise_Prepare
test $2, %al
je .LAlignedToPtrUintOrNaturallyMisaligned
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
sub $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
sub $2, %ecx
.balign 16
.LPtrUintWise_Next:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LPtrUintsDiffer
add $4, %eax
sub $2, %ecx
jg .LPtrUintWise_Next
lea (%eax,%ecx,2), %eax
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LPtrUintsDiffer
pop %ebx
xor %eax, %eax
ret
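{ A dword differed: if its low word differs, that word decides; otherwise the high word does.
  Both paths fall into the same sbb/or sequence that converts the carry of the last unsigned
  compare into +1/-1. }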
.LPtrUintsDiffer:
cmp %bx, (%eax)
jne .LDoSbb
shr $16, %ebx
cmp %bx, 2(%eax)
.LDoSbb:
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.balign 16
.LWordwise_Body:
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
.LWordwise_Prepare:
sub $1, %ecx
jnb .LWordwise_Body
pop %ebx
xor %eax, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} CompareWord {$else} CompareWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
cmp $1073741821, %ebx
ja .LWordwise_Prepare
cmp $8, %ecx
jge .LVecOrMore
lea (%edx,%eax), %ebx
or %eax, %ebx
and $4095, %ebx
cmp $4080, %ebx
ja .LWordwise_Prepare
movdqu (%edx,%eax), %xmm0
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
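{ pmovmskb on an XMM register only fills the low 16 mask bits, so the 16-bit 'inc %bx' below turns
  an all-equal mask of $FFFF into 0 without disturbing the upper half of ebx. }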
inc %bx
jz .LNothing
shl $1, %ecx { convert to bytes }
bsf %ebx, %ebx
cmp %ecx, %ebx
jb .LSubtractWords
.LNothing:
pop %ebx
xor %eax, %eax
ret
.balign 16
.LWordwise_Body:
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
.LWordwise_Prepare:
sub $1, %ecx
jae .LWordwise_Body
xor %eax, %eax
pop %ebx
ret
.LDoSbb:
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.LVecOrMore:
movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
shl $1, %ecx { convert to bytes }
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
sub %eax, %ecx
.balign 16
.LAligned8xLoop_Body:
add $16, %eax
movdqu (%edx,%eax), %xmm0
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LAligned8xLoop_VecDiffers
sub $16, %ecx
ja .LAligned8xLoop_Body
pop %ebx { drop original buf1 }
.LLastVec:
lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
movdqu (%edx,%eax), %xmm0
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
pop %ebx
xor %eax, %eax
ret
.LVec0Differs:
bsf %ebx, %ebx
.LSubtractWords:
add %eax, %edx
movzwl (%eax,%ebx), %eax
movzwl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
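{ The aligned loop compares bytes, not words, so the mismatch offset may be odd. Recover the word
  position by rounding the byte offset down to even relative to the ORIGINAL buf1 (saved on the
  stack), which also handles a misaligned buf1 correctly. }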
.LAligned8xLoop_VecDiffers:
bsf %ebx, %ebx
add %ebx, %eax
pop %ecx
sub %ecx, %eax
and $-2, %eax
add %ecx, %eax
movzwl (%edx,%eax), %edx
movzwl (%eax), %eax
sub %edx, %eax
pop %ebx
end;
{$ifndef CPUX86_HAS_SSE2}
function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
var
CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
if not fpc_cpucodeinit_performed then
exit(CompareWord_Plain(buf1, buf2, len));
if has_sse2_support then
CompareWord_Impl:=@CompareWord_SSE2
else
CompareWord_Impl:=@CompareWord_Plain;
result:=CompareWord_Impl(buf1, buf2, len);
end;
function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
result:=CompareWord_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_SSE2 (need CompareWord dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREWORD}
{$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_COMPAREDWORD}
{$ifndef CPUX86_HAS_SSE2}
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
sub $1, %ecx
jb .LNothing
push %ebx
sub %eax, %edx
.balign 16
.LDwordwise_Body:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LDoSbb
add $4, %eax
sub $1, %ecx
jnb .LDwordwise_Body
pop %ebx
.LNothing:
xor %eax, %eax
ret
.LDoSbb:
pop %ebx
sbb %eax, %eax
or $1, %eax
end;
{$endif}
function {$ifdef CPUX86_HAS_SSE2} CompareDWord {$else} CompareDWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
cmp $536870906, %ebx
ja .LDwordwise_Prepare
shl $2, %ecx { convert to bytes }
movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
sub %eax, %ecx
.balign 16
.LAligned4xLoop_Body:
add $16, %eax
movdqu (%eax,%edx), %xmm0
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LAligned4xLoop_VecDiffers
sub $16, %ecx
ja .LAligned4xLoop_Body
pop %ebx { drop original buf1 }
.LLastVec:
lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
movdqu (%edx,%eax), %xmm1
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
pop %ebx
xor %eax, %eax
ret
.LVec0Differs:
bsf %ebx, %ebx
add %eax, %edx { recover edx = buf2 }
mov (%edx,%ebx), %edx
cmp %edx, (%eax,%ebx)
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
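{ Same recovery as in CompareWord above, with the byte offset rounded down to a multiple of 4
  relative to the original buf1 before reloading the differing dwords. }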
.LAligned4xLoop_VecDiffers:
bsf %ebx, %ebx
add %ebx, %eax
pop %ecx
sub %ecx, %eax
and $-4, %eax
add %ecx, %eax
mov (%edx,%eax), %edx
cmp %edx, (%eax)
.LDoSbb:
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.balign 16
.LDwordwise_Body:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LDoSbb
add $4, %eax
.LDwordwise_Prepare:
sub $1, %ecx
jnb .LDwordwise_Body
pop %ebx
xor %eax, %eax
end;
{$ifndef CPUX86_HAS_SSE2}
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
var
CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
if not fpc_cpucodeinit_performed then
exit(CompareDWord_Plain(buf1, buf2, len));
if has_sse2_support then
CompareDWord_Impl:=@CompareDWord_SSE2
else
CompareDWord_Impl:=@CompareDWord_Plain;
result:=CompareDWord_Impl(buf1, buf2, len);
end;
function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
result:=CompareDWord_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_SSE2 (need CompareDWord dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREDWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
{$define FPC_SYSTEM_HAS_INDEXCHAR0}
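{ Returns the index of b within the first len bytes of buf, or -1 when either len bytes were
  scanned or a #0 terminator was reached without finding b. }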
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
saveesi,saveebx : longint;
asm
movl %esi,saveesi
movl %ebx,saveebx
// Can't use scasb, or we would have to do it twice; this is
// likely faster for small "len" anyway.
movl %eax,%esi // Load address
movzbl %cl,%ebx // Load searchpattern
testl %edx,%edx
je .LFound
xorl %ecx,%ecx // zero index in Buf
xorl %eax,%eax // To make DWord compares possible
.balign 4
.LLoop:
movb (%esi),%al // Load byte
cmpb %al,%bl
je .LFound // byte the same?
incl %ecx
incl %esi
cmpl %edx,%ecx // Maximal distance reached?
je .LNotFound
testl %eax,%eax // Nullchar = end of search?
jne .LLoop
.LNotFound:
movl $-1,%ecx // Not found return -1
.LFound:
movl %ecx,%eax
movl saveesi,%esi
movl saveebx,%ebx
end;
{$endif FPC_SYSTEM_HAS_INDEXCHAR0}
{****************************************************************************
String
****************************************************************************}
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
cmp (%ecx), %dl { length(sstr) fits into res? }
jbe .LEdxIsLen { use high(res) if length(sstr) does not fit }
movzbl (%ecx), %edx { use length(sstr) }
.LEdxIsLen:
mov %dl, (%eax) { store length to res[0] }
xchg %ecx, %edx { ecx = length = Move count, edx = sstr }
xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest }
inc %eax
inc %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
lea -8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
lea 8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
jmp Move
{$endif FPC_PROFILE}
end;
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
pushl %eax
pushl %ecx
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
movl dstr,%edi
movl sstr,%esi
xorl %eax,%eax
movl len,%ecx
lodsb
cmpl %ecx,%eax
jbe .LStrCopy1
movl %ecx,%eax
.LStrCopy1:
stosb
cmpl $7,%eax
jl .LStrCopy2
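{ 7+ bytes: align the destination to 4 bytes with a short bytewise copy, move the bulk as dwords
  with 'rep movsl', then finish the remaining 0..3 bytes bytewise. }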
movl %edi,%ecx { Align on 32bits }
negl %ecx
andl $3,%ecx
subl %ecx,%eax
rep
movsb
movl %eax,%ecx
andl $3,%eax
shrl $2,%ecx
rep
movsl
.LStrCopy2:
movl %eax,%ecx
rep
movsb
popl %ecx
popl %eax
end ['ESI','EDI'];
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ eax = left, edx = right }
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
push %ebx
movzbl (%eax), %ecx { ecx = len(left) }
movzbl (%edx), %ebx { ebx = len(right) }
cmp %ebx, %ecx
{$ifdef CPUX86_HAS_CMOV}
cmovg %ebx, %ecx
{$else}
jle .LEcxIsLen
mov %ebx, %ecx
.LEcxIsLen:
{$endif}
push %eax { save left }
inc %eax
inc %edx
{ stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
call CompareByte
{$else}
call CompareByte_Impl { manually inline CompareByte }
{$endif}
pop %edx { restore left }
test %eax, %eax
jnz .LReturn
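{ The min(len(left), len(right)) prefixes are equal: order by length instead. ebx (len(right)) is
  callee-saved and thus survives the CompareByte call. }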
movzbl (%edx), %eax
sub %ebx, %eax
.LReturn:
pop %ebx
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc;
{ eax = left, edx = right }
asm
movzbl (%eax), %ecx
cmp (%edx), %cl
jne .LNotEqual
inc %eax
inc %edx
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
jmp CompareByte
{$else}
jmp CompareByte_Impl { manually inline CompareByte }
{$endif}
.LNotEqual:
or $-1, %eax
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
{$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
test %ecx, %ecx
jz .LEmpty
push %eax { save res }
push %ecx { save p }
push %edx { save high(res) }
mov %ecx, %eax { eax = IndexByte.buf }
{ edx is already high(res) = IndexByte.count.
Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
but assumes that IndexByte is "safe" and won't read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by 'count'.
Generic and x86 versions are "safe". }
xor %ecx, %ecx { ecx = 0 = IndexByte.value }
{ Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
leal -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
call IndexByte
{$else}
call IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
leal 12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
pop %ecx { ecx = high(res) = Move.len }
test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
cmovns %eax, %ecx
{$else}
js .LEcxIsLen
mov %eax, %ecx
.LEcxIsLen:
{$endif}
pop %eax { pop p to eax = Move.src }
pop %edx { pop res to edx }
mov %cl, (%edx) { res[0] := len }
inc %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal 12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
jmp .LReturn
{$else FPC_PROFILE}
jmp Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
movb $0, (%eax)
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
{$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
movl %ebp,%eax
end;
{$ENDIF not INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_PC_ADDR}
Function Get_pc_addr : Pointer;assembler;nostackframe;
asm
movl (%esp),%eax
end;
{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
Result:=PPointer(framebp+4)^
else
Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
orl %eax,%eax
jz .Lg_a_null
movl 4(%eax),%eax
.Lg_a_null:
end;
{$endif defined(win32)}
{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
Result:=PPointer(framebp)^
else
Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
orl %eax,%eax
jz .Lgnf_null
movl (%eax),%eax
.Lgnf_null:
end;
{$endif defined(win32)}
{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;
asm
movl %esp,%eax
end;
{****************************************************************************
Str()
****************************************************************************}
{$if defined(disabled) and defined(regcall) }
{$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
{$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
label str_int_shortcut;
procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
asm
pushl %esi
pushl %edi
pushl %ebx
mov %edx,%edi
xor %edx,%edx
jmp str_int_shortcut
end;
procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
100000,1000000,10000000,
100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
push %esi
push %edi
push %ebx
movl %edx,%edi
{ Calculate absolute value and put sign in edx}
cltd
xorl %edx,%eax
subl %edx,%eax
negl %edx
str_int_shortcut:
movl %ecx,%esi
{Calculate amount of digits in ecx.}
xorl %ecx,%ecx
bsrl %eax,%ecx
incl %ecx
imul $1233,%ecx
shr $12,%ecx
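{ bsr gives the index of the highest set bit, so ecx estimates the decimal digit count via
  bit_length * 1233 / 4096, with 1233/4096 approximating log10(2). The estimate can be off by
  one; the compare against 10^ecx below (cmc; adcl) corrects it. }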
{$ifdef FPC_PIC}
call fpc_geteipasebx
{$ifdef darwin}
movl digits-.Lpic(%ebx),%ebx
{$else}
addl $_GLOBAL_OFFSET_TABLE_,%ebx
movl digits@GOT(%ebx),%ebx
{$endif}
cmpl (%ebx,%ecx,4),%eax
{$else}
cmpl digits(,%ecx,4),%eax
{$endif}
cmc
adcl $0,%ecx {Nr. digits ready in ecx.}
{Write length & sign.}
lea (%edx,%ecx),%ebx
movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
movw %bx,(%edi)
addl %edx,%edi
subl %edx,%esi
{Skip digits beyond string length.}
movl %eax,%edx
subl %ecx,%esi
jae .Lloop_write
.balign 4
.Lloop_skip:
movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
mull %edx
shrl $3,%edx
decl %ecx
jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
incl %esi
jnz .Lloop_skip
{Write out digits.}
.balign 4
.Lloop_write:
movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
{Pre-add '0'}
leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
mull %edx
shrl $3,%edx
leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
subl %edx,%ebx
subl %eax,%ebx
movb %bl,(%edi,%ecx)
decl %ecx
jnz .Lloop_write
.Ldone:
popl %ebx
popl %edi
popl %esi
end;
{$endif}
{****************************************************************************
Bounds Check
****************************************************************************}
{ do a thread-safe inc/dec }
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
asm
lock
decl (%eax)
setzb %al
end;
{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure cpuinclocked(var l : longint);assembler;nostackframe;
asm
lock
incl (%eax)
end;
// Inline the SMP check and the unlocked path;
// the locked one is so slow that inlining doesn't matter.
function declocked(var l : longint) : boolean; inline;
begin
if not ismultithread then
begin
dec(l);
declocked:=l=0;
end
else
declocked:=cpudeclocked(l);
end;
procedure inclocked(var l : longint); inline;
begin
if not ismultithread then
inc(l)
else
cpuinclocked(l);
end;
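{ 'lock xadd' atomically adds and returns the PREVIOUS value in the source register; the lea then
  derives the post-inc/dec result without touching memory again. }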
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
movl $-1,%edx
lock
xaddl %edx, (%eax)
lea -1(%edx),%eax
end;
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
movl $1,%edx
lock
xaddl %edx, (%eax)
lea 1(%edx),%eax
end;
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
xchgl (%eax),%edx
movl %edx,%eax
end;
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
lock
xaddl %edx, (%eax)
movl %edx,%eax
end;
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
xchgl %eax,%ecx
lock
cmpxchgl %edx, (%ecx)
end;
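{ cmpxchg8b expects the comperand in edx:eax and the new value in ecx:ebx, compares against the
  8-byte destination, and leaves the old value in edx:eax, which is exactly the function result. }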
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
asm
pushl %ebx
pushl %edi
movl %eax,%edi
movl Comperand+4,%edx
movl Comperand+0,%eax
movl NewValue+4,%ecx
movl NewValue+0,%ebx
lock cmpxchg8b (%edi)
pop %edi
pop %ebx
end;
{****************************************************************************
FPU
****************************************************************************}
const
{ Internal constants for use in system unit }
FPU_Invalid = 1;
FPU_Denormal = 2;
FPU_DivisionByZero = 4;
FPU_Overflow = 8;
FPU_Underflow = $10;
FPU_StackUnderflow = $20;
FPU_StackOverflow = $40;
FPU_ExceptionMask = $ff;
MM_Invalid = 1;
MM_Denormal = 2;
MM_DivisionByZero = 4;
MM_Overflow = 8;
MM_Underflow = $10;
MM_Precicion = $20;
MM_ExceptionMask = $3f;
MM_MaskInvalidOp = %0000000010000000;
MM_MaskDenorm = %0000000100000000;
MM_MaskDivZero = %0000001000000000;
MM_MaskOverflow = %0000010000000000;
MM_MaskUnderflow = %0000100000000000;
MM_MaskPrecision = %0001000000000000;
{$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
begin
end;
{$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;
var
{ these locals are so we don't have to hack pic code in the assembler }
localmxcsr: dword;
localfpucw: word;
begin
localfpucw:=Default8087CW;
asm
fninit
fwait
fldcw localfpucw
end;
if has_sse_support then
begin
localmxcsr:=DefaultMXCSR;
asm
{ setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
mov localmxcsr,%eax
subl $4,%esp
mov %eax,(%esp)
//ldmxcsr (%esp)
.byte 0x0f,0xae,0x14,0x24
addl $4,%esp
{$endif OLD_ASSEMBLER}
end;
end;
end;
{ because of the brain-dead sse detection on x86, this test is postponed }
procedure fpc_cpucodeinit;
var
_eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
if cpuid_support then
begin
asm
movl $1,%eax
xorl %ecx,%ecx
cpuid
movl %edx,_edx_cpuid1
movl %ecx,_ecx_cpuid1
end ['ebx'];
has_mmx_support:=(_edx_cpuid1 and $800000)<>0;
if ((_edx_cpuid1 and $2000000)<>0) then
begin
os_supports_sse:=true;
sse_check:=true;
asm
{ force an sse exception if no sse is supported, the exception handler sets
os_supports_sse to false then }
{ don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
.byte 0x0f,0x28,0xf7
{$else}
movaps %xmm7, %xmm6
{$endif OLD_ASSEMBLER}
end;
sse_check:=false;
has_sse_support:=os_supports_sse;
end;
if has_sse_support then
begin
has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
has_sse3_support:=((_ecx_cpuid1 and $200)<>0);
has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1);
{ now avx }
asm
xorl %eax,%eax
cpuid
movl %eax,_eax
end;
if _eax>=7 then
begin
asm
movl $7,%eax
xorl %ecx,%ecx
cpuid
movl %ebx,_ebx_cpuid7
end;
fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
begin
asm
xorl %ecx,%ecx
.byte 0x0f,0x01,0xd0 { xgetbv }
movl %eax,_eax
end;
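{ XCR0 bits 1 and 2 signal that the OS saves and restores XMM and YMM state;
  only then may AVX be used. }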
if (_eax and 6)=6 then
begin
has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
end;
end;
end;
end;
end;
{ don't let libraries influence the FPU cw set by the host program }
if IsLibrary then
begin
Default8087CW:=Get8087CW;
if has_sse_support then
DefaultMXCSR:=GetMXCSR;
end;
SysResetFPU;
fpc_cpucodeinit_performed:=true;
end;
{$if not defined(darwin) and defined(regcall) }
{ darwin requires that the stack is aligned to 16 bytes when calling another function }
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
{$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
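{ As the code below relies on: the refcount lives 8 bytes before the string data, the heap block
  starts 12 bytes before it, and a negative refcount marks a constant that is never freed. }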
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
movl (%eax),%edx
testl %edx,%edx
jz .Lquit
movl $0,(%eax) // s:=nil
cmpl $0,-8(%edx) // exit if refcount<0
jl .Lquit
{$ifdef FPC_PIC}
call fpc_geteipasecx
addl $_GLOBAL_OFFSET_TABLE_,%ecx
movl ismultithread@GOT(%ecx),%ecx
cmpl $0,(%ecx)
{$else FPC_PIC}
cmpl $0,ismultithread
{$endif FPC_PIC}
je .Lskiplock
.byte 0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
decl -8(%edx)
jz .Lfree
.Lquit:
ret
.Lfree:
leal -12(%edx),%eax // points to start of allocation
{ freemem is not an assembler leaf function like fpc_geteipasecx, so it
needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal -12(%esp),%esp
call FPC_FREEMEM
leal 12(%esp),%esp
{$else FPC_SYSTEM_STACKALIGNMENT16}
jmp FPC_FREEMEM // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;
function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
{$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
movl (%eax),%edx
testl %edx,%edx
jz .Lunchanged
cmpl $1,-8(%edx)
jne fpc_truely_ansistr_unique
.Lunchanged:
movl %edx,%eax
end;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$endif ndef darwin and defined(regcall) }
{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}
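{ Without SSE2 there is no l/mfence; a locked read-modify-write on the stack top is the classic
  substitute and orders all earlier loads and stores. }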
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
lfence
{$else CPUX86_HAS_SSE2}
lock
addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
procedure ReadDependencyBarrier;
begin
{ reads imply barrier on earlier reads depended on }
end;
procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
mfence
{$else CPUX86_HAS_SSE2}
lock
addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
{$endif}
{$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
{$define FPC_SYSTEM_HAS_BSF_QWORD}
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
{$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
mov $255-32,%eax { On AMD, BSF/R are documented to not change the destination on zero input. }
bsfl 8(%esp),%eax { On Intel, destination is formally undefined on zero input, but in practice the behavior is the same. }
add $32,%eax
bsfl 4(%esp),%eax
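{ If Lo(AValue) is nonzero, its bsf overwrites eax; otherwise eax keeps bsf(Hi)+32, or
  (255-32)+32 = 255 when the whole qword is zero. }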
{$else}
bsfl 4(%esp),%eax
jz .L1
ret $8
.L1:
bsfl 8(%esp),%eax
jz .L2
add $32,%eax
ret $8
.L2:
movl $255,%eax
{$endif}
end;
{$endif FPC_SYSTEM_HAS_BSF_QWORD}
{$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
{$define FPC_SYSTEM_HAS_BSR_QWORD}
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
{$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
mov $255,%eax
bsrl 4(%esp),%eax
sub $32,%eax
bsrl 8(%esp),%eax
add $32,%eax
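{ If Hi(AValue) is nonzero its bsr (+32) wins; otherwise the sub/add cancel out, leaving
  bsr(Lo), or 255 when the whole qword is zero. }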
{$else}
mov 8(%esp),%eax
test %eax,%eax
jnz .L1 { Speculate Hi(q) = 0. }
bsrl 4(%esp),%eax
jz .L2
ret $8
.L1:
bsrl %eax,%eax
add $32,%eax
ret $8
.L2:
movl $255,%eax
{$endif}
end;
{$endif FPC_SYSTEM_HAS_BSR_QWORD}
{$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
{$define FPC_SYSTEM_HAS_SAR_QWORD}
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
movl 8(%esp),%edx
movzbl %al,%ecx
cmpb $32,%al
jnb .L1
movl 4(%esp),%eax
shrdl %cl,%edx,%eax
sarl %cl,%edx
ret $8
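{ Shift >= 32: the high dword, shifted arithmetically by (Shift mod 32), becomes the low dword of
  the result, and the high dword is filled with the sign. }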
.L1:
movl %edx,%eax
sarl $31,%edx
sarl %cl,%eax // uses 5 lower bits of cl.
end;
{$endif FPC_SYSTEM_HAS_SAR_QWORD}