{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team.

    Processor dependent implementation for the system unit for
    intel i386+

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$if not(defined(VER3_0)) and defined(linux)}
  {$define FPC_SYSTEM_STACKALIGNMENT16}
{$endif not(defined(VER3_0)) and defined(linux)}

{****************************************************************************
                               Primitives
****************************************************************************}

var
  os_supports_sse : boolean;
  { Set to true while an SSE check is executing, so that no SIGILL should be generated. }
  sse_check : boolean;
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  has_sse41_support : boolean;
  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }

{$asmmode ATT}

function cpuid_support : boolean;assembler;nostackframe;
{
  Check whether the ID flag (bit 21 of EFLAGS) can be toggled; if so, CPUID is supported.
  Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
}
asm
    pushfl
    movl (%esp),%eax
    xorl $0x200000,%eax
    pushl %eax
    popfl
    pushfl
    popl %eax
    xorl (%esp),%eax
    popfl
    testl $0x200000,%eax
    setnz %al
end;
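
{ How the toggle works: the current EFLAGS is pushed, bit 21 ($200000) is
  flipped in a copy, and the copy is loaded back with popfl. Re-reading
  EFLAGS and XORing with the saved original leaves bit 21 set exactly when
  the CPU accepted the change, i.e. when the CPUID instruction exists.
  Illustrative (hypothetical) caller:

    if cpuid_support then
      ReadCPUFeatures   // hypothetical routine that may execute CPUID
    else
      UseI386Baseline;  // hypothetical fallback for pre-CPUID CPUs
}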
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
begin
  { Because of the brain-dead SSE detection on x86, this test is postponed
    to fpc_cpucodeinit, which must be implemented OS-dependently (FK).
  has_sse_support:=sse_support;
  has_mmx_support:=mmx_support;
  }
end;

{$ifndef darwin}
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
    movl (%esp),%ebx
end;

procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
    movl (%esp),%ecx
end;
{$endif}

{$if not defined(FPC_SYSTEM_HAS_MOVE)
     and not defined(OLD_ASSEMBLER)
     and not defined(darwin)}
{$i fastmove.inc}
{$endif}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
var
  saveesi,saveedi : longint;
asm
    movl %edi,saveedi
    movl %esi,saveesi
    movl %eax,%esi
    movl %edx,%edi
    movl %ecx,%edx
    movl %edi,%eax
    { check for zero or negative count }
    cmpl $0,%edx
    jle .LMoveEnd
    { Check for back or forward }
    sub %esi,%eax
    jz .LMoveEnd { Do nothing when source=dest }
    jc .LFMove { Do forward, dest<source }
    cmp %edx,%eax
    jb .LBMove { Dest is in range of move, do backward }
    { Forward Copy }
.LFMove:
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    cmpl $15,%edx
    jl .LFMove1
    movl %edi,%ecx { Align on 32bits }
    negl %ecx
    andl $3,%ecx
    subl %ecx,%edx
    rep
    movsb
    movl %edx,%ecx
    andl $3,%edx
    shrl $2,%ecx
    rep
    movsl
.LFMove1:
    movl %edx,%ecx
    rep
    movsb
    jmp .LMoveEnd
    { Backward Copy }
.LBMove:
    std
    addl %edx,%esi
    addl %edx,%edi
    movl %edi,%ecx
    decl %esi
    decl %edi
    cmpl $15,%edx
    jl .LBMove1
    negl %ecx { Align on 32bits }
    andl $3,%ecx
    subl %ecx,%edx
    rep
    movsb
    movl %edx,%ecx
    andl $3,%edx
    shrl $2,%ecx
    subl $3,%esi
    subl $3,%edi
    rep
    movsl
    addl $3,%esi
    addl $3,%edi
.LBMove1:
    movl %edx,%ecx
    rep
    movsb
    cld
.LMoveEnd:
    movl saveedi,%edi
    movl saveesi,%esi
end;
{$endif FPC_SYSTEM_HAS_MOVE}
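
{ Direction choice, illustrated: after the register shuffle, eax = dest - source
  (unsigned). A zero difference means nothing to do; a borrow (dest < source)
  means a forward copy is always safe. Otherwise, if dest - source < count, the
  regions overlap with dest inside the source range, e.g. source = $1000,
  dest = $1008, count = 16: eax = 8 < 16, so the copy runs backward to avoid
  overwriting not-yet-copied source bytes. }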
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
     or not defined(FPC_SYSTEM_HAS_FILLWORD)
     or not defined(FPC_SYSTEM_HAS_FILLDWORD)
     or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
     or not defined(FPC_SYSTEM_HAS_FILLWORD)
     or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
const
  FillXxxx_RepStosThreshold_ERMS = 1024;
  FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;

procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    mov %ecx, (%eax) { Write first 4 bytes unaligned. }
    push %ecx { pattern }
    push %edi
    mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
    xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
    shl $3, %ecx { ecx = misalignment of x in bits. }
    rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
    add %edi, %edx { edx = x end }
    lea -1(%edx), %ecx { ecx = x end - 1. }
    add $4, %edi
    and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
    and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
    sub %edi, %ecx { ecx = byte count between them. }
    shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
    rep stosl
    pop %edi
    pop %ecx
    mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
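
{ Pattern rotation, illustrated: for FillWord with value $1122 the dword
  pattern is $11221122, i.e. bytes 22 11 22 11 in memory. If x is misaligned
  by 1 (x mod 4 = 1), the pattern is rotated left by 8 bits to $22112211,
  whose byte sequence 11 22 11 22, stored at the 4-byte aligned addresses,
  continues the 22 11 22 11 sequence that the unaligned head write started
  at x. For FillChar all four bytes are equal, so the rotation changes nothing. }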
{$endif FillChar/Word/DWord required.}

label
  FillXxxx_MoreThanTwoXMMs;

procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
const
  NtThreshold = 4 * 1024 * 1024;
asm
    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    cmp $32, %edx
    ja .LMoreThanTwoVectors
    ret
    .byte 144 { Turn .balign 16 before .L64x_Body into a no-op. }

    { x can start and end misaligned on the vector boundary:

      x = ~~][H1][H2][...][T2][T1]~
            [UH]            [UT]

      UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
.LMoreThanTwoVectors:
    push %esi
    mov %ecx, %esi { esi = pattern }
    mov %eax, %ecx
    shl $3, %ecx { ecx = misalignment of x in bits }
    rol %cl, %esi { misalign the pattern }
    movd %esi, %xmm0
    pshufd $0, %xmm0, %xmm0
    pop %esi

{ FillChar (to skip the misaligning above) and FillQWord jump here.
  eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs:
    lea -65(%eax,%edx), %ecx
    and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
    mov %ecx, %edx { Remember T4 to edx. }
    and $-16, %eax { eax = H1 − 16. }
    sub %eax, %ecx { ecx = aligned byte count − 48. }
    movdqa %xmm0, 16(%eax) { Write H1. }
    cmp $32-48, %ecx
    jle .LOneAlignedTailWrite
    movdqa %xmm0, 32(%eax) { Write H2. }
    cmp $64-48, %ecx
    jle .LTwoAlignedTailWrites
    sub $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
    jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
    add $48, %eax { eax = H3. }
    cmp $NtThreshold, %ecx
    jae .L64xNT_Body

.balign 16 { no-op }
.L64x_Body:
    movdqa %xmm0, (%eax)
    movdqa %xmm0, 16(%eax)
    movdqa %xmm0, 32(%eax)
    movdqa %xmm0, 48(%eax)
    add $64, %eax
    sub $64, %ecx
    ja .L64x_Body
.LFourAlignedTailWrites:
    movdqa %xmm0, (%edx) { T4 }
    movdqa %xmm0, 16(%edx) { T3 }
.LTwoAlignedTailWrites:
    movdqa %xmm0, 32(%edx) { T2 }
.LOneAlignedTailWrite:
    movdqa %xmm0, 48(%edx) { T1 }
    ret

.balign 16
.L64xNT_Body:
    movntdq %xmm0, (%eax)
    movntdq %xmm0, 16(%eax)
    movntdq %xmm0, 32(%eax)
    movntdq %xmm0, 48(%eax)
    add $64, %eax
    sub $64, %ecx
    ja .L64xNT_Body
    sfence
    jmp .LFourAlignedTailWrites
end;
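
{ Head/tail overlap, illustrated: the two movdqu writes at x and x + count − 16
  cover the whole range whenever 16 <= count <= 32 (e.g. count = 20: bytes
  0..15 and 4..19), so the routine returns early. Past 32 bytes, H1..H2 and
  T4..T1 are aligned stores that may overlap the unaligned head/tail writes;
  repeating a store is cheaper than branching to avoid it. }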
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
     or not defined(FPC_SYSTEM_HAS_FILLWORD)
     or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{$ifndef CPUX86_HAS_SSE2}
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
asm
    mov %ecx, (%eax) { Write first 4 bytes. }
    lea -9(%eax,%edx), %edx
    mov %ecx, 5(%edx) { Write last 4 bytes. }
    and $-4, %edx { edx = loop bound. }
    push %esi
    mov %ecx, %esi { esi = pattern }
    mov %eax, %ecx
    shl $3, %ecx { ecx = misalignment of x in bits }
    rol %cl, %esi { misalign the pattern }
    add $4, %eax
    and $-4, %eax
.balign 16
.L8xLoop:
    mov %esi, (%eax)
    mov %esi, 4(%eax)
    add $8, %eax
    cmp %edx, %eax
    jb .L8xLoop
    mov %esi, (%edx)
    mov %esi, 4(%edx)
    pop %esi
end;
{$endif ndef CPUX86_HAS_SSE2 (need Fill*_Plain)}

procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16. }
asm
    mov %ecx, (%eax)
    cmp $8, %edx
    jle .LLast4
    mov %ecx, 4(%eax)
    mov %ecx, -8(%eax,%edx)
.LLast4:
    mov %ecx, -4(%eax,%edx)
end;
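
{ Ladder, illustrated: for edx = 10 the dword stores land at byte offsets 0,
  4, 2 and 6, overlapping but together covering exactly bytes 0..9; for
  edx <= 8 only the offset-0 and offset-(edx − 4) stores execute. Up to four
  unconditionally overlapping writes replace a loop and most branches for
  these tiny sizes. }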
{$endif FillChar/Word/DWord required.}
{$endif FillChar/Word/DWord/QWord required.}

{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
procedure FillChar_3OrLess; assembler; nostackframe;
{ eax — x, cl — value, edx — byte count, Low(int32) <= edx <= 3. }
asm
    test %edx, %edx
    jle .LQuit
    mov %cl, (%eax)
    mov %cl, -1(%eax,%edx)
    shr $1, %edx
    mov %cl, (%eax,%edx)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jae FillXxxx_U32Pattern_RepStos_8OrMore
    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    cmp $32, %edx
    ja FillXxxx_MoreThanTwoXMMs
end;

procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jae FillXxxx_U32Pattern_RepStos_8OrMore
    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    cmp $32, %edx
    ja FillXxxx_MoreThanTwoXMMs
end;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;

var
  FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillChar_SSE2 {$else} FillChar_Plain {$endif} (x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillChar_Impl := @FillChar_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillChar_Impl := @FillChar_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillChar_Impl := @FillChar_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillChar_Impl(x, count, value);
end;

procedure FillChar(var x;count:SizeInt;value:byte);
begin
  FillChar_Impl(x, count, value);
end;
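
{ Dispatch-on-first-call, illustrated: FillChar_Impl starts out pointing at
  FillChar_Dispatch. The first call after fpc_cpucodeinit overwrites the
  pointer with the best implementation for the detected CPU, so every later
  FillChar goes through one indirect call with no feature test. Calls made
  before fpc_cpucodeinit pick a safe default without caching it, since the
  feature flags are not trustworthy yet. The Fill/Index/Compare families
  below all follow this same pattern. }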
{$endif FPC_SYSTEM_HAS_FILLCHAR}

{$ifndef FPC_SYSTEM_HAS_FILLWORD}
{$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord_3OrLess; assembler; nostackframe;
asm
    test %edx, %edx
    jle .LQuit
    mov %cx, (%eax)
    mov %cx, -2(%eax,%edx,2)
    shr $1, %edx
    mov %cx, (%eax,%edx,2)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx
    movzwl %cx, %ecx
    imul $0x00010001, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx
    movzwl %cx, %ecx
    imul $0x00010001, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx
    movzwl %cx, %ecx
    imul $0x00010001, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;

var
  FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillWord_SSE2 {$else} FillWord_Plain {$endif} (x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillWord_Impl := @FillWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillWord_Impl := @FillWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillWord_Impl := @FillWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillWord_Impl(x, count, value);
end;

procedure FillWord(var x;count:SizeInt;value:word);
begin
  FillWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}

{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
{$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord_4OrLess; assembler; nostackframe;
asm
    cmp $1, %edx
    jl .LQuit
    mov %ecx, (%eax)
    je .LQuit
    mov %ecx, 4(%eax)
    mov %ecx, -8(%eax,%edx,4)
    mov %ecx, -4(%eax,%edx,4)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;

var
  FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillDWord_SSE2 {$else} FillDWord_Plain {$endif}(x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillDWord_Impl := @FillDWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillDWord_Impl := @FillDWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillDWord_Impl := @FillDWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillDWord_Impl(x, count, value);
end;

procedure FillDWord(var x;count:SizeInt;value:dword);
begin
  FillDWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}

{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
{$define FPC_SYSTEM_HAS_FILLQWORD}
{$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
    test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
    jle .LQuit
    push %esi
    mov 4+4(%esp), %esi { esi = value[0:31] }
    mov 4+8(%esp), %ecx { ecx = value[32:63] }
.balign 16
.LLoop:
    mov %esi, (%eax)
    mov %ecx, 4(%eax)
    add $8, %eax
    sub $1, %edx
    jnz .LLoop
    pop %esi
.LQuit:
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure {$ifdef CPUX86_HAS_SSE2} FillQWord {$else} FillQWord_SSE2 {$endif}(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
    cmp $4, %edx
    jle .L4OrLess
    movq 4(%esp), %xmm0
    punpcklqdq %xmm0, %xmm0
    { Stack is 12 bytes:
      [esp] = return address, [esp + 4] = value (not required anymore).
      Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
      [esp] = return address. }
    mov (%esp), %ecx
    add $8, %esp
    mov %ecx, (%esp)
    shl $3, %edx
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
    jz FillXxxx_MoreThanTwoXMMs
    mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
    shl $3, %ecx
    and $63, %ecx
    movd %ecx, %xmm2
    movdqa %xmm0, %xmm1
    psllq %xmm2, %xmm1
    neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
    and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
    movd %ecx, %xmm2
    psrlq %xmm2, %xmm0
    por %xmm1, %xmm0
    jmp FillXxxx_MoreThanTwoXMMs
.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
    cmp $1, %edx
    jl .LQuit
    mov 4(%esp), %ecx
    mov %ecx, (%eax)
    je .LSecondHalfOf1
    mov %ecx, 8(%eax)
    mov %ecx, -16(%eax,%edx,8)
    mov %ecx, -8(%eax,%edx,8)
    mov 8(%esp), %ecx
    mov %ecx, 4(%eax)
    mov %ecx, 12(%eax)
    mov %ecx, -12(%eax,%edx,8)
    mov %ecx, -4(%eax,%edx,8)
.LQuit:
    ret $8
.LSecondHalfOf1:
    mov 8(%esp), %ecx
    mov %ecx, 4(%eax)
end;
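
{ QWord pattern rotation, illustrated: for value $1122334455667788 and x
  misaligned by 4 bytes, the shift count is 32, so each uint64 lane becomes
  (v shl 32) or (v shr 32) = $5566778811223344. Stored at 8-byte aligned
  addresses, its byte sequence 44 33 22 11 88 77 66 55 continues the
  88 77 66 55 44 33 22 11 sequence that the unaligned head write started
  at x. }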
{$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;

var
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;

procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      FillQWord_Plain(x, count, value);
      exit;
    end;
  if has_sse2_support then
    FillQWord_Impl := @FillQWord_SSE2
  else
    FillQWord_Impl := @FillQWord_Plain;
  FillQWord_Impl(x, count, value);
end;

procedure FillQWord(var x;count:SizeInt;value:qword);
begin
  FillQWord_Impl(x, count, value);
end;
{$endif ndef CPUX86_HAS_SSE2 (need FillQWord dispatcher)}
{$endif FPC_SYSTEM_HAS_FILLQWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
{$ifndef CPUX86_HAS_SSE2}
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, cl = b }
asm
    test %edx,%edx
    jz .Lnothing0
    push %eax { save initial value of 'buf' }
    test $3,%al
    jz .Laligned4
.Lalignloop: { align to 4 bytes }
    cmp %cl,(%eax)
    je .Lfoundateax
    inc %eax
    dec %edx
    jz .Lnothing1
    test $3,%al
    jnz .Lalignloop
.Laligned4: { align to 8 bytes }
    push %esi
    push %edi
    mov %cl,%ch { prepare pattern }
    movzwl %cx,%esi
    shl $16,%ecx
    or %esi,%ecx
    test $7,%al
    jz .Lloop
    test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = −1). }
    jl .Ldontfixuplen
    add $4,%edx
.Ldontfixuplen:
    sub $4,%eax
    jmp .Lalignfrom4to8
.balign 16
.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
    mov (%eax),%esi { load dword }
    xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
    lea -0x01010101(%esi),%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080 }
    jnz .Lfound0 { one of the bytes matches }
.Lalignfrom4to8:
    mov 4(%eax),%esi
    xor %ecx,%esi
    lea -0x01010101(%esi),%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi
    jnz .Lfound1
    add $8,%eax
    sub $8,%edx
    ja .Lloop
.Lnothing3:
    pop %edi
    pop %esi
.Lnothing1:
    pop %edx
.Lnothing0:
    or $-1,%eax
    ret
.Lfound1:
    sub $4,%edx
    jbe .Lnothing3
    add $4,%eax
.Lfound0:
    bsf %esi,%esi
    shr $3,%esi
    cmp %edx,%esi { Garbage after remaining length? }
    jae .Lnothing3
    add %esi,%eax
    pop %edi
    pop %esi
.Lfoundateax:
    pop %ecx
    sub %ecx,%eax
end;
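
{ Zero-byte trick, illustrated: XORing a dword with the pattern turns bytes
  equal to the target into zero bytes. In
  (x - $01010101) and (not x) and $80808080, a borrow out of a byte can only
  be produced by a zero byte, so the lowest set bit of the result always
  lies in the first zero byte of x: e.g. b = $00 gives ($FF and $FF) with
  bit 7 set, while b = $80 gives ($7F and $7F) with bit 7 clear. bsf plus
  shr $3 then turn that bit position into a byte index. }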
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
    test %edx, %edx
    jz .Lnotfound { exit if len=0 }
    push %ebx
    movd %ecx, %xmm1
    lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
    punpcklbw %xmm1, %xmm1
    and $-0x10, %ecx { first aligned address after buf }
    punpcklbw %xmm1, %xmm1
    pshufd $0, %xmm1, %xmm1
    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
    sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
    pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx { shift valid bits into high word }
    and $0xffff0000, %ebx { clear low word containing invalid bits }
    shr %cl, %ebx { shift back }
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx
    lea -16(%ecx,%ebx), %eax
    pop %ebx
    cmp %eax, %edx { check against the buffer length }
    jbe .Lnotfound
    ret
.balign 16
.Lloop:
    movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
    add $16, %ecx { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
end;
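
{ First-vector trimming, illustrated: the movdqa reads from buf rounded down
  to 16 bytes, so up to 15 bytes before buf are compared too. With
  buf mod 16 = 9, ecx = 7 valid bytes; shifting the 16-bit match mask left
  by cl = 7, masking with $ffff0000 and shifting back clears match bits
  0..8, which belong to bytes before buf, while keeping the bits that belong
  to buf itself. }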
{$ifndef CPUX86_HAS_SSE2}
function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;

var
  IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;

function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexByte_Plain(buf,len,b));
  if has_sse2_support then
    IndexByte_Impl:=@IndexByte_SSE2
  else
    IndexByte_Impl:=@IndexByte_Plain;
  result:=IndexByte_Impl(buf,len,b);
end;

function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
begin
  result:=IndexByte_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXBYTE}

{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
{$ifndef CPUX86_HAS_SSE2}
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
    test %edx, %edx
    jz .LNotFound
    push %eax
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    cmp %cx, (%eax)
    je .LFound
    add $2, %eax
    dec %edx
    jnz .LWordwise_Body
    pop %edx
.LNotFound:
    or $-1, %eax
    ret
.LFound:
    pop %edx
    sub %edx, %eax
    shr $1, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexWord {$else} IndexWord_SSE2 {$endif} (const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
    test %edx, %edx { exit if len=0 }
    je .Lnotfound
    push %ebx
    movd %ecx, %xmm1
    punpcklwd %xmm1, %xmm1
    pshufd $0, %xmm1, %xmm1
    lea 16(%eax), %ecx
    and $-16, %ecx
    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
    sub %eax, %ecx
    test $1, %eax { if buffer isn't aligned to word boundary, }
    jnz .Lunaligned { use a different algorithm }
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx
    and $0xffff0000, %ebx
    shr %cl, %ebx
    shr $1, %ecx { ecx=number of valid bytes }
    test %ebx, %ebx
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx
    shr $1, %ebx { in words }
    lea -8(%ecx,%ebx), %eax
    pop %ebx
    cmp %eax, %edx
    jbe .Lnotfound { if match is after the specified length, ignore it }
    ret
.balign 16
.Lloop:
    movdqa (%eax,%ecx,2), %xmm0
    add $8, %ecx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
    ret
.Lunaligned:
    push %esi
    movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
    psllw $8, %xmm1 { swap bytes of each word of pattern) }
    psrlw $8, %xmm2
    por %xmm2, %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx
    and $0xffff0000, %ebx
    shr %cl, %ebx
    xor %esi, %esi { nothing to merge yet }
    add %edx, %edx { length words -> bytes }
    jmp .Lcontinue_u
.balign 16
.Lloop_u:
    movdqa (%eax,%ecx), %xmm0
    add $16, %ecx
    pcmpeqb %xmm1, %xmm0 { compare by bytes }
    shr $16, %esi { bit 16 shifts into 0 }
    pmovmskb %xmm0, %ebx
.Lcontinue_u:
    shl $1, %ebx { 15:0 -> 16:1 }
    or %esi, %ebx { merge bit 0 from previous round }
    mov %ebx, %esi
    shr $1, %ebx { now AND together adjacent pairs of bits }
    and %esi, %ebx
    and $0x5555, %ebx { also reset odd bits }
    jnz .Lmatch_u
    cmp %ecx, %edx
    ja .Lloop_u
.Lnotfound_u:
    pop %esi
    pop %ebx
    or $-1, %eax
    ret
.Lmatch_u:
    bsf %ebx, %ebx
    lea -16(%ecx,%ebx), %eax
    cmp %eax, %edx
    jbe .Lnotfound_u { if match is after the specified length, ignore it }
    sar $1, %eax { in words }
    pop %esi
    pop %ebx
end;
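
{ Unaligned word search, illustrated: when buf is odd, buffer words straddle
  the even-aligned vector bytes, so the pattern's bytes are swapped and the
  comparison is done bytewise. A genuine word match then shows up as two
  adjacent set mask bits; shifting the mask left by one, ANDing it with the
  unshifted copy and masking with $5555 keeps exactly the pairs that start
  on a buffer word boundary, while bit 16 is carried into bit 0 of the next
  round to catch matches spanning two vectors. }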
{$ifndef CPUX86_HAS_SSE2}
function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;

var
  IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;

function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexWord_Impl:=@IndexWord_SSE2
  else
    IndexWord_Impl:=@IndexWord_Plain;
  result:=IndexWord_Impl(buf,len,b);
end;

function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
begin
  result:=IndexWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
{$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
    push %eax
    sub $4, %eax
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    add $4, %eax
    sub $1, %edx
    jb .LNotFound
    cmp %ecx, (%eax)
    jne .LDWordwise_Next
    pop %edx
    sub %edx, %eax
    shr $2, %eax
    ret
.LNotFound:
    pop %edx
    mov $-1, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexDWord {$else} IndexDWord_SSE2 {$endif} (const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
    push %eax
    sub $4, %edx
    jle .LDwordwise_Prepare
    movd %ecx, %xmm1
    pshufd $0, %xmm1, %xmm1
.balign 16 { 1-byte NOP. }
.L4x_Body:
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jnz .LFoundAtMask
    add $16, %eax
    sub $4, %edx
    jg .L4x_Body
    lea (%eax,%edx,4), %eax
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jz .LNothing
.LFoundAtMask:
    bsf %ecx, %ecx
    add %ecx, %eax
.LFoundAtEax:
    pop %edx
    sub %edx, %eax
    shr $2, %eax
    ret
    nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
    add $3, %edx
    cmp $-1, %edx
    je .LNothing
.balign 16 { no-op }
.LDwordwise_Body:
    cmp (%eax), %ecx
    je .LFoundAtEax
    add $4, %eax
    sub $1, %edx
    jae .LDwordwise_Body
.LNothing:
    pop %edx
    or $-1, %eax
end;

{$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;

var
  IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;

function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexDWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexDWord_Impl:=@IndexDWord_SSE2
  else
    IndexDWord_Impl:=@IndexDWord_Plain;
  result:=IndexDWord_Impl(buf,len,b);
end;

function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  result:=IndexDWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXDWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
    push %ebx
    mov 8(%esp), %ecx { ecx = b[0:31] }
    mov 12(%esp), %ebx { ebx = b[32:63] }
    mov %eax, 8(%esp) { remember original buf }
    sub $8, %eax
.balign 16 { no-op }
.LQWordwise_Next:
    add $8, %eax
    sub $1, %edx
    jb .LNotFound
    cmp %ecx, (%eax)
    jne .LQWordwise_Next
    cmp %ebx, 4(%eax)
    jne .LQWordwise_Next
    sub 8(%esp), %eax
    pop %ebx
    shr $3, %eax
    ret $8
.LNotFound:
    pop %ebx
    mov $-1, %eax
end;

function IndexQWord_SSE41(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
    cmp $6, len
    jle IndexQWord_Plain
    movddup 4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
    mov %eax, %ecx { ecx = original buf }
    sub $6, len
.balign 16
.L6x_Loop:
    movdqu (%eax), %xmm1
    pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
    movdqu 16(%eax), %xmm2
    pcmpeqq %xmm0, %xmm2
    por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
    movdqu 32(%eax), %xmm3
    pcmpeqq %xmm0, %xmm3
    por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
    ptest %xmm3, %xmm3
    jnz .LFound
    add $48, %eax
    sub $6, len
    jge .L6x_Loop
    lea (%eax,%edx,8), %eax { Point to last 3 vectors. }
    cmp $-5, len
    jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
    mov $-1, %eax
    ret $8
.LFound:
    sub %ecx, %eax
    ptest %xmm1, %xmm1
    jnz .LFoundAtXmm1
    ptest %xmm2, %xmm2
    jnz .LFoundAtXmm2
    add $16, %eax
    movdqa %xmm3, %xmm2
.LFoundAtXmm2:
    add $16, %eax
    movdqa %xmm2, %xmm1
.LFoundAtXmm1:
    pmovmskb %xmm1, %ecx
    bsf %ecx, %ecx
    add %ecx, %eax
    shr $3, %eax
end;
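
{ Tail handling, illustrated: with 20 elements, the loop compares elements
  0..5, 6..11 and 12..17, leaving len = -4; the lea then rewinds eax by 4
  elements so one reused iteration compares 14..19, the last three vectors.
  When len ends at exactly -6 every element was already compared, and the
  cmp $-5/jge pair skips the extra pass. }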
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;

var
  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;

function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexQWord_Plain(buf,len,b));
  if has_sse41_support then
    IndexQWord_Impl:=@IndexQWord_SSE41
  else
    IndexQWord_Impl:=@IndexQWord_Plain;
  result:=IndexQWord_Impl(buf,len,b);
end;

function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  result:=IndexQWord_Impl(buf,len,b);
end;
{$endif FPC_SYSTEM_HAS_INDEXQWORD}

{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
{$ifndef CPUX86_HAS_SSE2}
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    { eax = buf1, edx = buf2, ecx = len }
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    cmp $3, %ecx
    jle .LBytewise_Prepare
    { Align buf1 on 4 bytes. }
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
    and $-4, %eax
    sub %eax, %ecx
.balign 16
.L4x_Next:
    add $4, %eax
    sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
    jle .LLast4
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    je .L4x_Next
.L4xDiffer:
    mov (%eax), %edx
{$ifdef CPUX86_HAS_BSWAP}
    bswap %ebx
    bswap %edx
{$else}
    rol $8, %bx
    rol $16, %ebx
    rol $8, %bx
    rol $8, %dx
    rol $16, %edx
    rol $8, %dx
{$endif}
    cmp %ebx, %edx
.LDoSbb:
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
.LLast4:
    add %ecx, %eax
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    xor %eax, %eax
    pop %ebx
    ret
.LBytewise_Prepare:
    sub $1, %ecx
    jb .LNothing
.balign 16 { no-op }
.LBytewise_Body:
    movzbl (%edx,%eax), %ebx
    cmp %bl, (%eax)
    jne .LDoSbb
    add $1, %eax
    sub $1, %ecx
    jae .LBytewise_Body
.LNothing:
    xor %eax, %eax
    pop %ebx
end;
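
{ Byte-order fix-up, illustrated: the ordering result must be decided by the
  first differing byte, which in a little-endian dword is the least
  significant one, so both dwords are byte-reversed before the final
  unsigned compare. On pre-486 targets without BSWAP, the three rotations
  achieve the same: $11223344 -> rol $8,%bx -> $11224433 -> rol $16 ->
  $44331122 -> rol $8,%bx -> $44332211. }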
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} CompareByte {$else} CompareByte_SSE2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
    { eax = buf1, edx = buf2, ecx = len }
    cmp $1, %ecx
    jle .L1OrLess
    push %ebx
    cmp $16, %ecx
    jae .LVecOrMore
    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
    mov %eax, %ebx
    or %edx, %ebx
    and $4095, %ebx
    cmp $4080, %ebx
    ja .LCantOverReadBoth
    { Over-read both as XMMs. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
    jz .LNothing
    bsf %ebx, %ebx
    cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
    jae .LNothing
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
.LNothing:
    pop %ebx
    xor %eax, %eax
    ret
.LVecOrMore:
    { Compare first vectors. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    sub $32, %ecx { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
    jbe .LLastVec
    { Compare second vectors. }
    movdqu 16(%eax), %xmm0
    movdqu 16(%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec1Differs
    { More than four vectors: aligned loop. }
    cmp $32, %ecx
    ja .LAligned32xLoop_Prepare
    { Compare last two vectors. }
    movdqu (%eax,%ecx), %xmm0
    movdqu (%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm2Differs
.LLastVec:
    movdqu 16(%eax,%ecx), %xmm0
    movdqu 16(%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm1Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVecEm2Differs:
    sub $16, %ecx
.LVecEm1Differs:
    bsf %ebx, %ebx
    add %ecx, %ebx
    movzbl 16(%eax,%ebx), %eax
    movzbl 16(%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
    nop { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LAligned32xLoop_Prepare:
    lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
    sub %eax, %edx { edx = buf2 - buf1 }
    and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned32xLoop_Body:
    add $32, %eax
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu 16(%edx,%eax), %xmm1
    pcmpeqb 16(%eax), %xmm1
    pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %ebx
    inc %bx
    jnz .LAligned32xLoop_TwoVectorsDiffer
    sub $32, %ecx
    ja .LAligned32xLoop_Body
    { Compare last two vectors after the loop by doing one more loop iteration, modified. }
    lea 32(%eax,%ecx), %eax
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm2
    pcmpeqb %xmm2, %xmm0
    movdqu 16(%edx,%eax), %xmm1
    movdqu 16(%eax), %xmm2
    pcmpeqb %xmm2, %xmm1
    pand %xmm0, %xmm1
    pmovmskb %xmm1, %ebx
    inc %bx
    jnz .LAligned32xLoop_TwoVectorsDiffer
    pop %ebx
    xor %eax, %eax
    ret
.LAligned32xLoop_TwoVectorsDiffer:
    add %eax, %edx { restore edx = buf2 }
    pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
    inc %cx
    jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    bsf %ecx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
.LVec1Differs:
    add $16, %eax
    add $16, %edx
.LVec0Differs:
    bsf %ebx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
.LCantOverReadBoth:
    cmp $3, %ecx
    jle .L2to3
    push %esi
    mov (%eax), %ebx
    mov (%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    cmp $8, %ecx
    jbe .LLast4x
    mov 4(%eax), %ebx
    mov 4(%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    mov -8(%eax,%ecx), %ebx
    mov -8(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
.LLast4x:
    mov -4(%eax,%ecx), %ebx
    mov -4(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    pop %esi
    pop %ebx
    xor %eax, %eax
    ret
.L4xDiffer:
    bswap %ebx
    bswap %esi
    cmp %esi, %ebx
    pop %esi
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
.L2to3:
    movzwl (%edx), %ebx
    bswap %ebx
    shr $1, %ebx
    mov -1(%edx,%ecx), %bl
    movzwl (%eax), %edx
    bswap %edx
    shr $1, %edx
    mov -1(%eax,%ecx), %dl
    mov %edx, %eax
    sub %ebx, %eax
    pop %ebx
    ret
.L1OrLess:
    jl .LUnbounded_Prepare
    movzbl (%eax), %eax
    movzbl (%edx), %edx
    sub %edx, %eax
    ret
.LUnbounded_Prepare:
    sub %eax, %edx { edx = buf2 - buf1 }
    test %ecx, %ecx
    jnz .LUnbounded_Body
    xor %eax, %eax
    ret
.balign 16
.LUnbounded_Next:
    add $1, %eax
.LUnbounded_Body:
    movzbl (%edx,%eax), %ecx
    cmp %cl, (%eax)
    je .LUnbounded_Next
    sbb %eax, %eax
    or $1, %eax
end;
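
{ Page-cross check, illustrated: a 16-byte movdqu from pointer p is safe when
  (p and 4095) <= 4080, i.e. all 16 bytes stay within p's 4 KB page. Since
  (buf1 or buf2) and 4095 is >= the page offset of each pointer, comparing it
  against 4080 never lets a dangerous over-read through, but it may reject
  safe pairs, e.g. offsets $F00 and $0F1 OR to $FF1 > $FF0; such sizes fall
  back to the scalar .LCantOverReadBoth path. }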
  1350. {$ifndef CPUX86_HAS_SSE2}
  1351. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1352. var
  1353. CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
  1354. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1355. begin
  1356. if not fpc_cpucodeinit_performed then
  1357. exit(CompareByte_Plain(buf1, buf2, len));
  1358. if has_sse2_support then
  1359. CompareByte_Impl:=@CompareByte_SSE2
  1360. else
  1361. CompareByte_Impl:=@CompareByte_Plain;
  1362. result:=CompareByte_Impl(buf1, buf2, len);
  1363. end;
  1364. function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
  1365. begin
  1366. result:=CompareByte_Impl(buf1, buf2, len);
  1367. end;
  1368. {$endif ndef CPUX86_HAS_SSE2 (need CompareByte dispatcher)}
  1369. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  1370. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  1371. {$define FPC_SYSTEM_HAS_COMPAREWORD}
  1372. {$ifndef CPUX86_HAS_SSE2}
  1373. function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  1374. asm
  1375. push %ebx
  1376. sub %eax, %edx { edx = buf2 - buf1 }
  1377. lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
  1378. cmp $1073741819, %ebx
  1379. ja .LWordwise_Prepare
  1380. test $2, %al
  1381. je .LAlignedToPtrUintOrNaturallyMisaligned
  1382. movzwl (%edx,%eax), %ebx
  1383. cmp %bx, (%eax)
  1384. jne .LDoSbb
  1385. add $2, %eax
  1386. sub $1, %ecx
  1387. .LAlignedToPtrUintOrNaturallyMisaligned:
  1388. sub $2, %ecx
  1389. .balign 16
  1390. .LPtrUintWise_Next:
  1391. mov (%edx,%eax), %ebx
  1392. cmp %ebx, (%eax)
  1393. jne .LPtrUintsDiffer
  1394. add $4, %eax
  1395. sub $2, %ecx
  1396. jg .LPtrUintWise_Next
  1397. lea (%eax,%ecx,2), %eax
  1398. mov (%edx,%eax), %ebx
  1399. cmp %ebx, (%eax)
  1400. jne .LPtrUintsDiffer
  1401. pop %ebx
  1402. xor %eax, %eax
  1403. ret
  1404. .LPtrUintsDiffer:
  1405. cmp %bx, (%eax)
  1406. jne .LDoSbb
  1407. shr $16, %ebx
  1408. cmp %bx, 2(%eax)
  1409. .LDoSbb:
  1410. sbb %eax, %eax
  1411. or $1, %eax
  1412. pop %ebx
  1413. ret
  1414. .balign 16
  1415. .LWordwise_Body:
  1416. movzwl (%edx,%eax), %ebx
  1417. cmp %bx, (%eax)
  1418. jne .LDoSbb
  1419. add $2, %eax
  1420. .LWordwise_Prepare:
  1421. sub $1, %ecx
  1422. jnb .LWordwise_Body
  1423. pop %ebx
  1424. xor %eax, %eax
  1425. end;
  1426. {$endif ndef CPUX86_HAS_SSE2}
function {$ifdef CPUX86_HAS_SSE2} CompareWord {$else} CompareWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
    cmp $1073741821, %ebx
    ja .LWordwise_Prepare
    cmp $8, %ecx
    jge .LVecOrMore
    lea (%edx,%eax), %ebx
    or %eax, %ebx
    and $4095, %ebx
    cmp $4080, %ebx
    ja .LWordwise_Prepare
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jz .LNothing
    shl $1, %ecx { convert to bytes }
    bsf %ebx, %ebx
    cmp %ecx, %ebx
    jb .LSubtractWords
.LNothing:
    pop %ebx
    xor %eax, %eax
    ret
    .balign 16
.LWordwise_Body:
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
.LWordwise_Prepare:
    sub $1, %ecx
    jae .LWordwise_Body
    xor %eax, %eax
    pop %ebx
    ret
.LDoSbb:
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
.LVecOrMore:
    movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    shl $1, %ecx { convert to bytes }
    sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec
    push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %eax, %ecx
    and $-16, %eax { align buf1; +16 is performed by the loop. }
    sub %eax, %ecx
    .balign 16
.LAligned8xLoop_Body:
    add $16, %eax
    movdqu (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LAligned8xLoop_VecDiffers
    sub $16, %ecx
    ja .LAligned8xLoop_Body
    pop %ebx { drop original buf1 }
.LLastVec:
    lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVec0Differs:
    bsf %ebx, %ebx
.LSubtractWords:
    add %eax, %edx
    movzwl (%eax,%ebx), %eax
    movzwl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
.LAligned8xLoop_VecDiffers:
    bsf %ebx, %ebx
    add %ebx, %eax
    pop %ecx
    sub %ecx, %eax
    and $-2, %eax
    add %ecx, %eax
    movzwl (%edx,%eax), %edx
    movzwl (%eax), %eax
    sub %edx, %eax
    pop %ebx
end;
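
{ Two idioms used above, spelled out:

  - "pmovmskb %xmm0, %ebx" followed by "inc %bx": after pcmpeqw, equal
    lanes are all ones, so a fully matching vector yields the 16-bit mask
    $FFFF; incrementing just %bx wraps it to 0 and sets ZF, making jz/jnz
    answer "were all 16 bytes equal?" without a separate cmp.

  - "lea (%edx,%eax), %ebx / or %eax, %ebx / and $4095, %ebx / cmp $4080":
    a conservative page-crossing test for the short-length path. It reduces
    both buffer addresses mod 4096 at once; whenever either one sits in the
    last 15 bytes of a page (offset > 4080), an unaligned 16-byte load
    could touch the next, possibly unmapped, page, so the code falls back
    to the wordwise loop instead of over-reading. }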
{$ifndef CPUX86_HAS_SSE2}
function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
  CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;

function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(CompareWord_Plain(buf1, buf2, len));
  if has_sse2_support then
    CompareWord_Impl:=@CompareWord_SSE2
  else
    CompareWord_Impl:=@CompareWord_Plain;
  result:=CompareWord_Impl(buf1, buf2, len);
end;

function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareWord_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_SSE2 (need CompareWord dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREWORD}
{$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_COMPAREDWORD}
{$ifndef CPUX86_HAS_SSE2}
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    sub $1, %ecx
    jb .LNothing
    push %ebx
    sub %eax, %edx
    .balign 16
.LDwordwise_Body:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LDoSbb
    add $4, %eax
    sub $1, %ecx
    jnb .LDwordwise_Body
    pop %ebx
.LNothing:
    xor %eax, %eax
    ret
.LDoSbb:
    pop %ebx
    sbb %eax, %eax
    or $1, %eax
end;
{$endif}
function {$ifdef CPUX86_HAS_SSE2} CompareDWord {$else} CompareDWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
    cmp $536870906, %ebx
    ja .LDwordwise_Prepare
    shl $2, %ecx { convert to bytes }
    movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle .LLastVec
    push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add %eax, %ecx
    and $-16, %eax { align buf1; +16 is performed by the loop. }
    sub %eax, %ecx
    .balign 16
.LAligned4xLoop_Body:
    add $16, %eax
    movdqu (%eax,%edx), %xmm0
    pcmpeqb (%eax), %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LAligned4xLoop_VecDiffers
    sub $16, %ecx
    ja .LAligned4xLoop_Body
    pop %ebx { drop original buf1 }
.LLastVec:
    lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
    movdqu (%edx,%eax), %xmm1
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs
    pop %ebx
    xor %eax, %eax
    ret
.LVec0Differs:
    bsf %ebx, %ebx
    add %eax, %edx { recover edx = buf2 }
    mov (%edx,%ebx), %edx
    cmp %edx, (%eax,%ebx)
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
.LAligned4xLoop_VecDiffers:
    bsf %ebx, %ebx
    add %ebx, %eax
    pop %ecx
    sub %ecx, %eax
    and $-4, %eax
    add %ecx, %eax
    mov (%edx,%eax), %edx
    cmp %edx, (%eax)
.LDoSbb:
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret
    .balign 16
.LDwordwise_Body:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LDoSbb
    add $4, %eax
.LDwordwise_Prepare:
    sub $1, %ecx
    jnb .LDwordwise_Body
    pop %ebx
    xor %eax, %eax
end;
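
{ Mismatch recovery in the aligned loop above: the loop compares bytes
  (pcmpeqb) so that an arbitrarily misaligned buf1 still works, yet the
  result must refer to the first differing dword. With the mismatch byte
  address in %eax and the original buf1 popped into %ecx, the sequence

      sub %ecx, %eax     // byte offset relative to original buf1
      and $-4, %eax      // round the offset down to a 4-byte boundary
      add %ecx, %eax     // back to an address inside buf1

  rounds relative to buf1 itself rather than to absolute addresses, which
  is what keeps misaligned buffers correct; CompareWord does the same with
  "and $-2". }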
{$ifndef CPUX86_HAS_SSE2}
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
  CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;

function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(CompareDWord_Plain(buf1, buf2, len));
  if has_sse2_support then
    CompareDWord_Impl:=@CompareDWord_SSE2
  else
    CompareDWord_Impl:=@CompareDWord_Plain;
  result:=CompareDWord_Impl(buf1, buf2, len);
end;

function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareDWord_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_SSE2 (need CompareDWord dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREDWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
{$define FPC_SYSTEM_HAS_INDEXCHAR0}
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
  saveesi,saveebx : longint;
asm
    movl %esi,saveesi
    movl %ebx,saveebx
    // Can't use scasb, or we would have to do it twice; this is
    // probably faster for small "len".
    movl %eax,%esi       // Load address
    movzbl %cl,%ebx      // Load searchpattern
    testl %edx,%edx
    je .LFound
    xorl %ecx,%ecx       // zero index in Buf
    xorl %eax,%eax       // To make DWord compares possible
    .balign 4
.LLoop:
    movb (%esi),%al      // Load byte
    cmpb %al,%bl
    je .LFound           // byte the same?
    incl %ecx
    incl %esi
    cmpl %edx,%ecx       // Maximal distance reached?
    je .LNotFound
    testl %eax,%eax      // Nullchar = end of search?
    jne .LLoop
.LNotFound:
    movl $-1,%ecx        // Not found, return -1
.LFound:
    movl %ecx,%eax
    movl saveesi,%esi
    movl saveebx,%ebx
end;
{$endif FPC_SYSTEM_HAS_INDEXCHAR0}
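
{ IndexChar0 scans at most len characters of buf for b, but it also stops
  at the first #0, so it is meant for zero-terminated data. Tracing the
  loop above gives, for example:

      IndexChar0(PAnsiChar('abc')^, 100, 'c') = 2
      IndexChar0(PAnsiChar('abc')^, 100, 'x') = -1  // stops at the #0, not after 100 chars
      IndexChar0(PAnsiChar('abc')^, 100, #0)  = 3   // the terminator itself can be found

  Note that the compare against b happens before the terminator test, which
  is what makes the #0 case return its index rather than -1. }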
{****************************************************************************
                                   String
 ****************************************************************************}

{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    cmp (%ecx), %dl { length(sstr) fits into res? }
    jbe .LEdxIsLen { use high(res) if length(sstr) does not fit }
    movzbl (%ecx), %edx { use length(sstr) }
.LEdxIsLen:
    mov %dl, (%eax) { store length to res[0] }
    xchg %ecx, %edx { ecx = length = Move count, edx = sstr }
    xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest }
    inc %eax
    inc %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea -8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea 8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
    jmp Move
{$endif FPC_PROFILE}
end;
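
{ In Pascal terms the routine above does (a sketch, not the compiled code):

      len := length(sstr);
      if len > high(res) then
        len := high(res);
      res[0] := chr(len);
      Move(sstr[1], res[1], len);

  The two xchg instructions merely rotate (res, length, sstr) from
  (eax, edx, ecx) into the (src=eax, dst=edx, count=ecx) registers that
  Move expects, enabling the tail call. }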
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
  asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    pushl %eax
    pushl %ecx
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    movl dstr,%edi
    movl sstr,%esi
    xorl %eax,%eax
    movl len,%ecx
    lodsb
    cmpl %ecx,%eax
    jbe .LStrCopy1
    movl %ecx,%eax
.LStrCopy1:
    stosb
    cmpl $7,%eax
    jl .LStrCopy2
    movl %edi,%ecx { Align on 32bits }
    negl %ecx
    andl $3,%ecx
    subl %ecx,%eax
    rep
    movsb
    movl %eax,%ecx
    andl $3,%eax
    shrl $2,%ecx
    rep
    movsl
.LStrCopy2:
    movl %eax,%ecx
    rep
    movsb
    popl %ecx
    popl %eax
  end ['ESI','EDI'];
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
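
{ Copy strategy above: lodsb/stosb transfer the length byte, clamped to
  len. Copies shorter than 7 bytes go straight to "rep movsb"; longer ones
  first move the 0..3 bytes needed to align edi ("negl %ecx / andl $3"
  computes (-edi) mod 4), then copy whole dwords with "rep movsl", and
  finally move the remaining len mod 4 bytes bytewise. }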
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ eax = left, edx = right }
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    push %ebx
    movzbl (%eax), %ecx { ecx = len(left) }
    movzbl (%edx), %ebx { ebx = len(right) }
    cmp %ebx, %ecx
{$ifdef CPUX86_HAS_CMOV}
    cmovg %ebx, %ecx
{$else}
    jle .LEcxIsLen
    mov %ebx, %ecx
.LEcxIsLen:
{$endif}
    push %eax { save left }
    inc %eax
    inc %edx
    { stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
    call CompareByte
{$else}
    call CompareByte_Impl { manually inline CompareByte }
{$endif}
    pop %edx { restore left }
    test %eax, %eax
    jnz .LReturn
    movzbl (%edx), %eax
    sub %ebx, %eax
.LReturn:
    pop %ebx
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
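
{ Reference semantics of fpc_shortstr_compare, as plain Pascal (a sketch,
  not the compiled implementation): compare the common prefix, then break
  ties by length.

      function ShortstrCompareRef(const a, b: shortstring): longint;
      var
        n: longint;
      begin
        n := length(a);
        if length(b) < n then
          n := length(b);
        result := CompareByte(a[1], b[1], n);
        if result = 0 then
          result := length(a) - length(b);
      end; }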
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc;
{ eax = left, edx = right }
asm
    movzbl (%eax), %ecx
    cmp (%edx), %cl
    jne .LNotEqual
    inc %eax
    inc %edx
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
    jmp CompareByte
{$else}
    jmp CompareByte_Impl { manually inline CompareByte }
{$endif}
.LNotEqual:
    or $-1, %eax
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
{$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    test %ecx, %ecx
    jz .LEmpty
    push %eax { save res }
    push %ecx { save p }
    push %edx { save high(res) }
    mov %ecx, %eax { eax = IndexByte.buf }
    { edx is already high(res) = IndexByte.count.
      Careful: using high(res) instead of -1 limits the scan by high(res), which is a good thing,
      but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’.
      Generic and x86 versions are “safe”. }
    xor %ecx, %ecx { ecx = 0 = IndexByte.value }
    { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
      With a stack frame, there is an additional push ebp, and 12 more bytes are needed to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    leal -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
    call IndexByte
{$else}
    call IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    leal 12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
    pop %ecx { ecx = high(res) = Move.len }
    test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
    cmovns %eax, %ecx
{$else}
    js .LEcxIsLen
    mov %eax, %ecx
.LEcxIsLen:
{$endif}
    pop %eax { pop p to eax = Move.src }
    pop %edx { pop res to edx }
    mov %cl, (%edx) { res[0] := len }
    inc %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal 12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    jmp .LReturn
{$else FPC_PROFILE}
    jmp Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
    movb $0, (%eax)
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
{$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
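
{ In Pascal terms the routine above does (a sketch, not the compiled code):

      if p = nil then
        res := ''
      else
      begin
        len := IndexByte(p^, high(res), 0);  // find #0, scanning at most high(res) bytes
        if len < 0 then
          len := high(res);                  // no terminator in range: truncate
        res[0] := chr(len);
        Move(p^, res[1], len);
      end; }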
{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    movl %ebp,%eax
end;
{$ENDIF not INTERNAL_BACKTRACE}

{$define FPC_SYSTEM_HAS_GET_PC_ADDR}
Function Get_pc_addr : Pointer;assembler;nostackframe;
asm
    movl (%esp),%eax
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp+4)^
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
    orl %eax,%eax
    jz .Lg_a_null
    movl 4(%eax),%eax
.Lg_a_null:
end;
{$endif defined(win32)}

{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp)^
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
    orl %eax,%eax
    jz .Lgnf_null
    movl (%eax),%eax
.Lgnf_null:
end;
{$endif defined(win32)}

{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;
asm
    movl %esp,%eax
end;

{****************************************************************************
                                   Str()
 ****************************************************************************}
{$if defined(disabled) and defined(regcall) }
{$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
{$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
label str_int_shortcut;

procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
asm
    pushl %esi
    pushl %edi
    pushl %ebx
    mov %edx,%edi
    xor %edx,%edx
    jmp str_int_shortcut
end;

procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
const
  digits:array[0..9] of cardinal=(0,10,100,1000,10000,
                                  100000,1000000,10000000,
                                  100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
    push %eax
    push %edx
    push %ecx
    call mcount
    pop %ecx
    pop %edx
    pop %eax
{$endif FPC_PROFILE}
    push %esi
    push %edi
    push %ebx
    movl %edx,%edi
    { Calculate absolute value and put sign in edx }
    cltd
    xorl %edx,%eax
    subl %edx,%eax
    negl %edx
str_int_shortcut:
    movl %ecx,%esi
    {Calculate amount of digits in ecx.}
    xorl %ecx,%ecx
    bsrl %eax,%ecx
    incl %ecx
    imul $1233,%ecx
    shr $12,%ecx
{$ifdef FPC_PIC}
    call fpc_geteipasebx
{$ifdef darwin}
    movl digits-.Lpic(%ebx),%ebx
{$else}
    addl $_GLOBAL_OFFSET_TABLE_,%ebx
    movl digits@GOT(%ebx),%ebx
{$endif}
    cmpl (%ebx,%ecx,4),%eax
{$else}
    cmpl digits(,%ecx,4),%eax
{$endif}
    cmc
    adcl $0,%ecx {Nr. digits ready in ecx.}
    {Write length & sign.}
    lea (%edx,%ecx),%ebx
    movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
    movw %bx,(%edi)
    addl %edx,%edi
    subl %edx,%esi
    {Skip digits beyond string length.}
    movl %eax,%edx
    subl %ecx,%esi
    jae .Lloop_write
    .balign 4
.Lloop_skip:
    movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
    mull %edx
    shrl $3,%edx
    decl %ecx
    jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
    incl %esi
    jnz .Lloop_skip
    {Write out digits.}
    .balign 4
.Lloop_write:
    movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
    {Pre-add '0'}
    leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
    mull %edx
    shrl $3,%edx
    leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
    subl %edx,%ebx
    subl %eax,%ebx
    movb %bl,(%edi,%ecx)
    decl %ecx
    jnz .Lloop_write
.Ldone:
    popl %ebx
    popl %edi
    popl %esi
end;
{$endif}
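
{ Two pieces of arithmetic in the (disabled) int_str above, spelled out:

  - Digit count: for x > 0, bsrl gives floor(log2(x)), and
    (bsrl(x) + 1) * 1233 shr 12 approximates the decimal digit count
    because 1233 / 4096 = 0.30103... = log10(2). The cmp/cmc/adcl against
    the digits[] table repairs the off-by-one: x = 999 estimates 3 and
    stays at 3, while x = 1000 also estimates 3 and gets bumped to 4.

  - Division by 10: with m = $CCCCCCCD = ceil(2^35 / 10), it holds that
    floor(x / 10) = (x * m) shr 35 for every 32-bit x. "mull" leaves the
    high 32 bits of the 64-bit product in %edx, and "shrl $3" supplies the
    remaining 3 of the 35 shift positions. }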
{****************************************************************************
                               Bounds Check
 ****************************************************************************}

{ do a thread-safe inc/dec }
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
asm
    lock
    decl (%eax)
    setzb %al
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure cpuinclocked(var l : longint);assembler;nostackframe;
asm
    lock
    incl (%eax)
end;

// inline SMP check and normal lock.
// the locked one is so slow, inlining doesn't matter.
function declocked(var l : longint) : boolean; inline;
begin
  if not ismultithread then
    begin
      dec(l);
      declocked:=l=0;
    end
  else
    declocked:=cpudeclocked(l);
end;

procedure inclocked(var l : longint); inline;
begin
  if not ismultithread then
    inc(l)
  else
    cpuinclocked(l);
end;
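
{ The unlocked fast path is valid because ismultithread only ever changes
  from false to true, and it is set before a second thread can run: while
  it is false, exactly one thread exists, so the plain inc/dec cannot race.
  A typical user of the pattern, with hypothetical names:

      procedure ReleaseRef(var refcount: longint);
      begin
        if declocked(refcount) then
          FreeTheData;  // hypothetical cleanup, reached only by the last owner
      end; }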
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl $-1,%edx
    lock
    xaddl %edx, (%eax)
    lea -1(%edx),%eax
end;

function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl $1,%edx
    lock
    xaddl %edx, (%eax)
    lea 1(%edx),%eax
end;

function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
    xchgl (%eax),%edx
    movl %edx,%eax
end;

function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
    lock
    xaddl %edx, (%eax)
    movl %edx,%eax
end;

function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
    xchgl %eax,%ecx
    lock
    cmpxchgl %edx, (%ecx)
end;

function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
asm
    pushl %ebx
    pushl %edi
    movl %eax,%edi
    movl Comperand+4,%edx
    movl Comperand+0,%eax
    movl NewValue+4,%ecx
    movl NewValue+0,%ebx
    lock cmpxchg8b (%edi)
    pop %edi
    pop %ebx
end;
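
{ cmpxchg8b compares edx:eax against the 8 bytes at (%edi) and, when equal,
  stores ecx:ebx there; either way edx:eax ends up holding the old value,
  which is exactly the int64 function result. Typical CAS-loop usage
  (a sketch; the initial plain read may tear on i386, but a torn value
  simply fails the compare and the loop retries):

      function AtomicAdd64(var Target: int64; Delta: int64): int64;
      var
        old: int64;
      begin
        repeat
          old := Target;
        until InterlockedCompareExchange64(Target, old + Delta, old) = old;
        result := old + Delta;
      end; }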
{****************************************************************************
                                    FPU
 ****************************************************************************}

const
  { Internal constants for use in system unit }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;

  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20;
  MM_ExceptionMask = $3f;

  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm = %0000000100000000;
  MM_MaskDivZero = %0000001000000000;
  MM_MaskOverflow = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;

{$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
begin
end;

{$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;
var
  { these locals are so we don't have to hack pic code in the assembler }
  localmxcsr: dword;
  localfpucw: word;
begin
  localfpucw:=Default8087CW;
  asm
    fninit
    fwait
    fldcw localfpucw
  end;
  if has_sse_support then
    begin
      localmxcsr:=DefaultMXCSR;
      asm
        { setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
        ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
        mov localmxcsr,%eax
        subl $4,%esp
        mov %eax,(%esp)
        //ldmxcsr (%esp)
        .byte 0x0f,0xae,0x14,0x24
        addl $4,%esp
{$endif OLD_ASSEMBLER}
      end;
    end;
end;
{ because of the brain-dead sse detection on x86, this test is postponed }
procedure fpc_cpucodeinit;
var
  _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
  if cpuid_support then
    begin
      asm
        movl $1,%eax
        xorl %ecx,%ecx
        cpuid
        movl %edx,_edx_cpuid1
        movl %ecx,_ecx_cpuid1
      end ['ebx'];
      has_mmx_support:=(_edx_cpuid1 and $800000)<>0;
      if ((_edx_cpuid1 and $2000000)<>0) then
        begin
          os_supports_sse:=true;
          sse_check:=true;
          asm
            { force an sse exception if no sse is supported; the exception
              handler then sets os_supports_sse to false }
            { don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
            .byte 0x0f,0x28,0xf7
{$else}
            movaps %xmm7, %xmm6
{$endif OLD_ASSEMBLER}
          end;
          sse_check:=false;
          has_sse_support:=os_supports_sse;
        end;
      if has_sse_support then
        begin
          has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
          has_sse3_support:=((_ecx_cpuid1 and $200)<>0);
          has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1);
          { now avx }
          asm
            xorl %eax,%eax
            cpuid
            movl %eax,_eax
          end;
          if _eax>=7 then
            begin
              asm
                movl $7,%eax
                xorl %ecx,%ecx
                cpuid
                movl %ebx,_ebx_cpuid7
              end;
              fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
              if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                begin
                  asm
                    xorl %ecx,%ecx
                    .byte 0x0f,0x01,0xd0 { xgetbv }
                    movl %eax,_eax
                  end;
                  if (_eax and 6)=6 then
                    begin
                      has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
                      has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
                    end;
                end;
            end;
        end;
    end;
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      if has_sse_support then
        DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  fpc_cpucodeinit_performed:=true;
end;
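
{ About the xgetbv step above: CPUID.1:ECX bit 27 (OSXSAVE) says the OS has
  enabled XGETBV; reading XCR0 (xgetbv with ecx = 0) and requiring
  (eax and 6) = 6 checks that the OS actually saves and restores both the
  SSE (bit 1) and AVX (bit 2) register states on context switches. Only
  then do the CPUID feature bits for AVX/AVX2 mean the instructions are
  genuinely usable, which is why has_avx_support is gated on it. }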
{$if not defined(darwin) and defined(regcall) }
{ darwin requires that the stack is aligned to 16 bytes when calling another function }
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
{$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
    movl (%eax),%edx
    testl %edx,%edx
    jz .Lquit
    movl $0,(%eax) // s:=nil
    cmpl $0,-8(%edx) // exit if refcount<0
    jl .Lquit
{$ifdef FPC_PIC}
    call fpc_geteipasecx
    addl $_GLOBAL_OFFSET_TABLE_,%ecx
    movl ismultithread@GOT(%ecx),%ecx
    cmpl $0,(%ecx)
{$else FPC_PIC}
    cmpl $0,ismultithread
{$endif FPC_PIC}
    je .Lskiplock
    .byte 0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
    decl -8(%edx)
    jz .Lfree
.Lquit:
    ret
.Lfree:
    leal -12(%edx),%eax // points to start of allocation
    { freemem is not an assembler leaf function like fpc_geteipasecx, so it
      needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    leal -12(%esp),%esp
    call FPC_FREEMEM
    leal 12(%esp),%esp
{$else FPC_SYSTEM_STACKALIGNMENT16}
    jmp FPC_FREEMEM // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;
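
{ The ".byte 0xF0" trick above: a LOCK prefix is a single byte placed right
  before "decl -8(%edx)", and "je .Lskiplock" either falls through onto the
  prefix (locked decrement for multithreaded programs) or jumps one byte
  past it onto the bare decl (cheap decrement while single-threaded). One
  instruction stream, two entry points; this works because the prefixed and
  unprefixed forms differ only by that single byte. }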
function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;

{$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
    movl (%eax),%edx
    testl %edx,%edx
    jz .Lunchanged
    cmpl $1,-8(%edx)
    jne fpc_truely_ansistr_unique
.Lunchanged:
    movl %edx,%eax
end;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$endif ndef darwin and defined(regcall) }
{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
    lfence
{$else CPUX86_HAS_SSE2}
    lock
    addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;

procedure ReadDependencyBarrier;
begin
  { reads imply barrier on earlier reads depended on }
end;

procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
    mfence
{$else CPUX86_HAS_SSE2}
    lock
    addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;

procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
    sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
{$endif}
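
{ On CPUs without lfence/mfence, "lock addl $0, (%esp)" is the classic
  full-barrier substitute: any locked read-modify-write drains the store
  buffer and is totally ordered on x86, and adding 0 to the stack slot has
  no other effect. WriteBarrier can afford a bare sfence, or nothing at all
  before SSE: ordinary x86 stores are already ordered with respect to each
  other, and sfence matters mainly for non-temporal stores. }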
{$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
{$define FPC_SYSTEM_HAS_BSF_QWORD}
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
    bsfl 4(%esp),%eax
    jz .L1
    ret $8
.L1:
    bsfl 8(%esp),%eax
    jz .L2
    add $32,%eax
    ret $8
.L2:
    movl $255,%eax
end;
{$endif FPC_SYSTEM_HAS_BSF_QWORD}

{$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
{$define FPC_SYSTEM_HAS_BSR_QWORD}
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
    bsrl 8(%esp),%eax
    jz .L1
    add $32,%eax
    ret $8
.L1:
    bsrl 4(%esp),%eax
    jz .L2
    ret $8
.L2:
    movl $255,%eax
end;
{$endif FPC_SYSTEM_HAS_BSR_QWORD}
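
{ Both routines scan one 32-bit half and fall through to the other: bsf/bsr
  set ZF when their source is zero, so BsfQWord tries the low dword at
  4(%esp) first and the high dword (with +32 adjustment) second, while
  BsrQWord starts from the high dword. An all-zero QWord returns 255, the
  RTL's "no bit found" value, and "ret $8" pops the 8-byte QWord argument
  passed on the stack. For example, BsfQWord($100000000) = 32 and
  BsrQWord(1) = 0. }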
{$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
{$define FPC_SYSTEM_HAS_SAR_QWORD}
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
    movl 8(%esp),%edx
    movzbl %al,%ecx
    cmpb $32,%al
    jnb .L1
    movl 4(%esp),%eax
    shrdl %cl,%edx,%eax
    sarl %cl,%edx
    ret $8
.L1:
    movl %edx,%eax
    sarl $31,%edx
    sarl %cl,%eax // uses 5 lower bits of cl.
end;
{$endif FPC_SYSTEM_HAS_SAR_QWORD}
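
{ The split at 32: for Shift < 32, shrdl shifts bits from the high dword
  into the low one and sarl handles the high half. For Shift >= 32, the low
  result is the high input shifted by (Shift and 31), since sarl only uses
  the 5 low bits of cl, and the high result is just the sign replicated by
  "sarl $31". Worked example: fpc_SarInt64(Int64($8000000000000000), 33)
  takes the second path and produces eax = $C0000000, edx = $FFFFFFFF,
  i.e. -$40000000 = -(2^30), which is indeed -(2^63) arithmetically shifted
  right by 33. }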