
{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team.

    Processor dependent implementation for the system unit for
    intel i386+

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$if defined(linux)}
  {$define FPC_SYSTEM_STACKALIGNMENT16}
{$endif defined(linux)}

{****************************************************************************
                               Primitives
****************************************************************************}

var
  os_supports_sse : boolean;
  { This variable is set to true while an SSE check is executing, so that no SIGILL is generated. }
  sse_check : boolean;
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  has_sse41_support : boolean;
  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }

{$asmmode ATT}

function cpuid_support : boolean;assembler;nostackframe;
{
  Check whether the ID flag (bit 21 of EFLAGS) can be toggled; if it can, CPUID is supported.
  Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
}
asm
  pushfl
  movl (%esp),%eax
  xorl $0x200000,%eax
  pushl %eax
  popfl
  pushfl
  popl %eax
  xorl (%esp),%eax
  popfl
  testl $0x200000,%eax
  setnz %al
end;
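{ Note: being able to toggle the ID flag (bit 21 of EFLAGS) is the architecturally
  defined way to detect CPUID support; a 386 keeps the bit fixed. The sequence above
  saves EFLAGS, flips bit 21, writes the result back, re-reads EFLAGS and XORs it with
  the saved copy still on the stack: bit 21 of the XOR is set exactly when the flip
  took effect, which SETNZ converts into the boolean result. The final POPFL restores
  the caller's EFLAGS. }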
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
begin
  { Because of the brain-dead SSE detection on x86, this test is postponed to fpc_cpucodeinit, which
    must be implemented OS-dependently (FK):
    has_sse_support:=sse_support;
    has_mmx_support:=mmx_support;
  }
end;

{$ifndef darwin}
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
  movl (%esp),%ebx
end;

procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
  movl (%esp),%ecx
end;
{$endif}

{$if not defined(FPC_SYSTEM_HAS_MOVE)
  and not defined(OLD_ASSEMBLER)
  and not defined(darwin)}
{$i fastmove.inc}
{$endif}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
var
  saveesi,saveedi : longint;
asm
  movl %edi,saveedi
  movl %esi,saveesi
  movl %eax,%esi
  movl %edx,%edi
  movl %ecx,%edx
  movl %edi,%eax
  { check for zero or negative count }
  cmpl $0,%edx
  jle .LMoveEnd
  { Check for back or forward }
  sub %esi,%eax
  jz .LMoveEnd { Do nothing when source=dest }
  jc .LFMove { Do forward, dest<source }
  cmp %edx,%eax
  jb .LBMove { Dest is in range of move, do backward }
  { Forward Copy }
.LFMove:
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  cmpl $15,%edx
  jl .LFMove1
  movl %edi,%ecx { Align on 32bits }
  negl %ecx
  andl $3,%ecx
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx
  shrl $2,%ecx
  rep
  movsl
.LFMove1:
  movl %edx,%ecx
  rep
  movsb
  jmp .LMoveEnd
  { Backward Copy }
.LBMove:
  std
  addl %edx,%esi
  addl %edx,%edi
  movl %edi,%ecx
  decl %esi
  decl %edi
  cmpl $15,%edx
  jl .LBMove1
  negl %ecx { Align on 32bits }
  andl $3,%ecx
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx
  shrl $2,%ecx
  subl $3,%esi
  subl $3,%edi
  rep
  movsl
  addl $3,%esi
  addl $3,%edi
.LBMove1:
  movl %edx,%ecx
  rep
  movsb
  cld
.LMoveEnd:
  movl saveedi,%edi
  movl saveesi,%esi
end;
{$endif FPC_SYSTEM_HAS_MOVE}
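{ Note on the direction test above: after 'sub %esi,%eax', eax holds dest-source as an
  unsigned value. A carry (dest < source) always allows a forward copy; otherwise the
  'cmp %edx,%eax' checks whether the distance is smaller than count, i.e. whether dest
  lies inside the source range, which is the only case that forces the backward copy.
  For example, source=100, dest=104, count=16: dest-source=4 < 16, so the copy runs
  backward to avoid overwriting source bytes that have not been read yet. }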
{ Darwin uses Clang to assemble. Recent Clang versions (rightly) give an error when you add global labels in
  the middle of .cfi_startproc / .cfi_endproc pairs, since that would let other code jump into the middle of a
  procedure whose CFI state is completely different, without the compiler even having the theoretical ability
  to analyse all code and generate balanced information.
  Since FPC does not attempt that even for local labels, this kind of code is inherently unsafe. }
{$ifndef darwin}
{$define can_jump_into_the_middle_of_a_procedure}
{$endif darwin}
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  or not defined(FPC_SYSTEM_HAS_FILLQWORD)}

{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
const
  FillXxxx_RepStosThreshold_ERMS = 1024;
  FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;

procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  mov %ecx, (%eax) { Write first 4 bytes unaligned. }
  push %ecx { pattern }
  push %edi
  mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
  xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
  shl $3, %ecx { ecx = misalignment of x in bits. }
  rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
  add %edi, %edx { edx = x end }
  lea -1(%edx), %ecx { ecx = x end - 1. }
  add $4, %edi
  and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
  and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
  sub %edi, %ecx { ecx = byte count between them. }
  shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
  rep stosl
  pop %edi
  pop %ecx
  mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
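{ Note on 'rol %cl, %eax' above: memory must carry the 4-byte pattern with period 4
  starting at x, but the aligned 'rep stosl' stores begin at the next 4-byte boundary.
  Rotating the pattern left by 8*(x mod 4) bits makes byte k of the rotated dword equal
  pattern byte (k - x) mod 4, so the aligned stores continue the byte sequence
  seamlessly. E.g. the FillWord pattern $BBAABBAA (bytes AA BB AA BB) at x mod 4 = 1
  becomes $AABBAABB (bytes BB AA BB AA), exactly what belongs at the next boundary. }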
{$endif FillChar/Word/DWord required.}

{$ifdef can_jump_into_the_middle_of_a_procedure}
label
  FillXxxx_MoreThanTwoXMMs;
{$else can_jump_into_the_middle_of_a_procedure}
procedure FillXxxx_MoreThanTwoXMMs; forward;
{$endif can_jump_into_the_middle_of_a_procedure}

procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
asm
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  cmp $32, %edx
  ja .LMoreThanTwoVectors
  ret
  .byte 144 { Turn .balign 16 before .L64x_Body into a no-op. }

  { x can start and end misaligned on the vector boundary:

      x = ~~][H1][H2][...][T2][T1]~
          [UH]                [UT]

    UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
.LMoreThanTwoVectors:
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern }
  movd %esi, %xmm0
  pshufd $0, %xmm0, %xmm0
  pop %esi

{$ifdef can_jump_into_the_middle_of_a_procedure}
  { FillChar (to skip the misaligning above) and FillQWord jump here.
    eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs:
{$else can_jump_into_the_middle_of_a_procedure}
  jmp FillXxxx_MoreThanTwoXMMs
end;

procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe;
asm
{$endif can_jump_into_the_middle_of_a_procedure}
  lea -65(%eax,%edx), %ecx
  and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
  mov %ecx, %edx { Remember T4 to edx. }
  and $-16, %eax { eax = H1 − 16. }
  sub %eax, %ecx { ecx = aligned byte count − 48. }
  movdqa %xmm0, 16(%eax) { Write H1. }
  cmp $32-48, %ecx
  jle .LOneAlignedTailWrite
  movdqa %xmm0, 32(%eax) { Write H2. }
  cmp $64-48, %ecx
  jle .LTwoAlignedTailWrites
  sub $48, %ecx { ecx = aligned byte count − 96 (32 bytes already written + 64 bytes written after loop). }
  jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
  add $48, %eax { eax = H3. }
  cmp $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. }
  jae .L64xNT_Body
.balign 16 { no-op }
.L64x_Body:
  movdqa %xmm0, (%eax)
  movdqa %xmm0, 16(%eax)
  movdqa %xmm0, 32(%eax)
  movdqa %xmm0, 48(%eax)
  add $64, %eax
  sub $64, %ecx
  ja .L64x_Body
.LFourAlignedTailWrites:
  movdqa %xmm0, (%edx) { T4 }
  movdqa %xmm0, 16(%edx) { T3 }
.LTwoAlignedTailWrites:
  movdqa %xmm0, 32(%edx) { T2 }
.LOneAlignedTailWrite:
  movdqa %xmm0, 48(%edx) { T1 }
  ret
.balign 16
.L64xNT_Body:
  movntdq %xmm0, (%eax)
  movntdq %xmm0, 16(%eax)
  movntdq %xmm0, 32(%eax)
  movntdq %xmm0, 48(%eax)
  add $64, %eax
  sub $64, %ecx
  ja .L64xNT_Body
  sfence
  jmp .LFourAlignedTailWrites
end;
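{ Note on the tail scheme: the unaligned first and last 16 bytes are already written on
  entry, so only aligned 16-byte slots remain. T4..T1 denote the last four aligned
  slots, computed up front into edx; the 64-byte loop runs until at most four vectors
  remain and then falls into the tail-write ladder at the matching depth (the writes
  may overlap for smaller counts, which is harmless for a fill). Fills of 4 MB or more
  switch to MOVNTDQ non-temporal stores to avoid flushing the data cache, with an
  SFENCE afterwards because non-temporal stores are weakly ordered. }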
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  or not defined(FPC_SYSTEM_HAS_FILLWORD)
  or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{$ifndef CPUX86_HAS_SSE2}
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
asm
  mov %ecx, (%eax) { Write first 4 bytes. }
  lea -9(%eax,%edx), %edx
  mov %ecx, 5(%edx) { Write last 4 bytes. }
  and $-4, %edx { edx = loop bound. }
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern }
  add $4, %eax
  and $-4, %eax
.balign 16
.L8xLoop:
  mov %esi, (%eax)
  mov %esi, 4(%eax)
  add $8, %eax
  cmp %edx, %eax
  jb .L8xLoop
  mov %esi, (%edx)
  mov %esi, 4(%edx)
  pop %esi
end;
{$endif ndef CPUX86_HAS_SSE2 (need Fill*_Plain)}

procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16. }
asm
  mov %ecx, (%eax)
  cmp $8, %edx
  jle .LLast4
  mov %ecx, 4(%eax)
  mov %ecx, -8(%eax,%edx)
.LLast4:
  mov %ecx, -4(%eax,%edx)
end;
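{ Note: the ladder covers 4 <= count <= 16 with overlapping 4-byte stores at offsets
  0 and count-4, plus 4 and count-8 when count > 8, so no length-specific loop or
  branch chain is needed. E.g. count = 10 stores at offsets 0, 4, 2 and 6, covering
  bytes 0..9. All offsets are multiples of the element size, so the overlapping
  stores stay in phase for FillWord and FillDWord as well. }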
{$endif FillChar/Word/DWord required.}
{$endif FillChar/Word/DWord/QWord required.}

{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)}
{$define FPC_SYSTEM_HAS_FILLCHAR}
procedure FillChar_3OrLess; assembler; nostackframe;
{ eax — x, cl — value, edx — byte count, Low(int32) <= edx <= 3. }
asm
  test %edx, %edx
  jle .LQuit
  mov %cl, (%eax)
  mov %cl, -1(%eax,%edx)
  shr $1, %edx
  mov %cl, (%eax,%edx)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jae FillXxxx_U32Pattern_RepStos_8OrMore
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  cmp $32, %edx
  ja FillXxxx_MoreThanTwoXMMs
end;

procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jae FillXxxx_U32Pattern_RepStos_8OrMore
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  cmp $32, %edx
  ja FillXxxx_MoreThanTwoXMMs
end;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;

var
  FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillChar_SSE2 {$else} FillChar_Plain {$endif} (x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillChar_Impl := @FillChar_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillChar_Impl := @FillChar_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillChar_Impl := @FillChar_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillChar_Impl(x, count, value);
end;

procedure FillChar(var x;count:SizeInt;value:byte);
begin
  FillChar_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}
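{ Note on the dispatching scheme: FillChar_Impl initially points at FillChar_Dispatch.
  The first call made after fpc_cpucodeinit rebinds the pointer to the variant matching
  the detected CPU (ERMS > SSE2 > plain) and forwards the call, so every later FillChar
  is a single indirect call with no feature test. Calls made before fpc_cpucodeinit use
  a safe default without rebinding, because the feature flags are not initialized yet.
  FillWord, FillDWord, FillQWord and the Index* functions below follow the same
  pattern. }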
{$if not defined(FPC_SYSTEM_HAS_FILLWORD)}
{$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord_3OrLess; assembler; nostackframe;
asm
  test %edx, %edx
  jle .LQuit
  mov %cx, (%eax)
  mov %cx, -2(%eax,%edx,2)
  shr $1, %edx
  mov %cx, (%eax,%edx,2)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx
  movzwl %cx, %ecx
  imul $0x00010001, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx
  movzwl %cx, %ecx
  imul $0x00010001, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx
  movzwl %cx, %ecx
  imul $0x00010001, %ecx
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;

var
  FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillWord_SSE2 {$else} FillWord_Plain {$endif} (x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillWord_Impl := @FillWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillWord_Impl := @FillWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillWord_Impl := @FillWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillWord_Impl(x, count, value);
end;

procedure FillWord(var x;count:SizeInt;value:word);
begin
  FillWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}

{$if not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord_4OrLess; assembler; nostackframe;
asm
  cmp $1, %edx
  jl .LQuit
  mov %ecx, (%eax)
  je .LQuit
  mov %ecx, 4(%eax)
  mov %ecx, -8(%eax,%edx,4)
  mov %ecx, -4(%eax,%edx,4)
.LQuit:
end;

{$ifndef CPUX86_HAS_SSE2}
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;

var
  FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      {$ifdef CPUX86_HAS_SSE2} FillDWord_SSE2 {$else} FillDWord_Plain {$endif}(x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillDWord_Impl := @FillDWord_SSE2_ERMS
  else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
    FillDWord_Impl := @FillDWord_SSE2
  {$ifndef CPUX86_HAS_SSE2}
  else
    FillDWord_Impl := @FillDWord_Plain
  {$endif ndef CPUX86_HAS_SSE2};
  FillDWord_Impl(x, count, value);
end;

procedure FillDWord(var x;count:SizeInt;value:dword);
begin
  FillDWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}

{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
{$define FPC_SYSTEM_HAS_FILLQWORD}
{$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
  test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
  jle .LQuit
  push %esi
  mov 4+4(%esp), %esi { esi = value[0:31] }
  mov 4+8(%esp), %ecx { ecx = value[32:63] }
.balign 16
.LLoop:
  mov %esi, (%eax)
  mov %ecx, 4(%eax)
  add $8, %eax
  sub $1, %edx
  jnz .LLoop
  pop %esi
.LQuit:
end;
{$endif ndef CPUX86_HAS_SSE2}

procedure {$ifdef CPUX86_HAS_SSE2} FillQWord {$else} FillQWord_SSE2 {$endif}(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
  cmp $4, %edx
  jle .L4OrLess
  movq 4(%esp), %xmm0
  punpcklqdq %xmm0, %xmm0
  { Stack is 12 bytes:
    [esp] = return address, [esp + 4] = value (not required anymore).
    Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
    [esp] = return address. }
  mov (%esp), %ecx
  add $8, %esp
  mov %ecx, (%esp)
  shl $3, %edx
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
  jz FillXxxx_MoreThanTwoXMMs
  mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
  shl $3, %ecx
  and $63, %ecx
  movd %ecx, %xmm2
  movdqa %xmm0, %xmm1
  psllq %xmm2, %xmm1
  neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
  and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
  movd %ecx, %xmm2
  psrlq %xmm2, %xmm0
  por %xmm1, %xmm0
  jmp FillXxxx_MoreThanTwoXMMs
.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
  cmp $1, %edx
  jl .LQuit
  mov 4(%esp), %ecx
  mov %ecx, (%eax)
  je .LSecondHalfOf1
  mov %ecx, 8(%eax)
  mov %ecx, -16(%eax,%edx,8)
  mov %ecx, -8(%eax,%edx,8)
  mov 8(%esp), %ecx
  mov %ecx, 4(%eax)
  mov %ecx, 12(%eax)
  mov %ecx, -12(%eax,%edx,8)
  mov %ecx, -4(%eax,%edx,8)
.LQuit:
  ret $8
.LSecondHalfOf1:
  mov 8(%esp), %ecx
  mov %ecx, 4(%eax)
end;
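{ Note: SSE2 has no 64-bit rotate, so the qword pattern is rotated by 8*(x mod 8) bits
  with two shifts: xmm1 = pattern shl n and xmm0 = pattern shr (64-n), OR-ed together.
  'neg %ecx; and $63, %ecx' computes (64-n) mod 64; n = 0 cannot occur here because the
  aligned case was already short-circuited by 'test $7, %eax'. The stack shuffling
  above removes the 8-byte value argument so that the tail jump to
  FillXxxx_MoreThanTwoXMMs returns straight to the caller. }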
{$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;

var
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;

procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      FillQWord_Plain(x, count, value);
      exit;
    end;
  if has_sse2_support then
    FillQWord_Impl := @FillQWord_SSE2
  else
    FillQWord_Impl := @FillQWord_Plain;
  FillQWord_Impl(x, count, value);
end;

procedure FillQWord(var x;count:SizeInt;value:qword);
begin
  FillQWord_Impl(x, count, value);
end;
{$endif ndef CPUX86_HAS_SSE2 (need FillQWord dispatcher)}
{$endif FPC_SYSTEM_HAS_FILLQWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
{$ifndef CPUX86_HAS_SSE2}
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, cl = b }
asm
  test %edx,%edx
  jz .Lnothing0
  push %eax { save initial value of 'buf' }
  test $3,%al
  jz .Laligned4
.Lalignloop: { align to 4 bytes }
  cmp %cl,(%eax)
  je .Lfoundateax
  inc %eax
  dec %edx
  jz .Lnothing1
  test $3,%al
  jnz .Lalignloop
.Laligned4: { align to 8 bytes }
  push %esi
  push %edi
  mov %cl,%ch { prepare pattern }
  movzwl %cx,%esi
  shl $16,%ecx
  or %esi,%ecx
  test $7,%al
  jz .Lloop
  test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = −1). }
  jl .Ldontfixuplen
  add $4,%edx
.Ldontfixuplen:
  sub $4,%eax
  jmp .Lalignfrom4to8
.balign 16
.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
  mov (%eax),%esi { load dword }
  xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
  lea -0x01010101(%esi),%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080 }
  jnz .Lfound0 { one of the bytes matches }
.Lalignfrom4to8:
  mov 4(%eax),%esi
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound1
  add $8,%eax
  sub $8,%edx
  ja .Lloop
.Lnothing3:
  pop %edi
  pop %esi
.Lnothing1:
  pop %edx
.Lnothing0:
  or $-1,%eax
  ret
.Lfound1:
  sub $4,%edx
  jbe .Lnothing3
  add $4,%eax
.Lfound0:
  bsf %esi,%esi
  shr $3,%esi
  cmp %edx,%esi { Garbage after remaining length? }
  jae .Lnothing3
  add %esi,%eax
  pop %edi
  pop %esi
.Lfoundateax:
  pop %ecx
  sub %ecx,%eax
end;
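{ Note: after 'xor %ecx,%esi', bytes equal to the target are zero, and the expression
  (v - $01010101) and (not v) and $80808080 is nonzero exactly when some byte of v is
  zero: subtracting 1 borrows into bit 7 of a zero byte, while 'not v' filters out
  bytes whose bit 7 was already set. E.g. v = $FF00FFFF (byte 2 zero) gives
  ($FDFFFEFE) and ($00FF0000) and ($80808080) = $00800000, so BSF returns 23 and
  'shr $3' yields byte index 2. }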
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
  test %edx, %edx
  jz .Lnotfound { exit if len=0 }
  movd %ecx, %xmm1
  mov %eax, %ecx
  punpcklbw %xmm1, %xmm1
  punpcklbw %xmm1, %xmm1
  and $4095, %ecx
  pshufd $0, %xmm1, %xmm1
  cmp $4080, %ecx
  ja .LCrossPage
  movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. }
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test %ecx, %ecx
  jz .LContinueAligned
  bsf %ecx, %eax
  cmp %edx, %eax
  jae .Lnotfound
  ret
  .byte 144 { Make .balign 16 before .Lloop a no-op. }
.LContinueAligned:
  cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
  jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
  push %ebx
  lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
  and $-0x10, %ecx { first aligned address after buf }
  sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
.balign 16
.Lloop:
  movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
  add $16, %ecx { but their sum is evenly divisible by 16. }
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  test %ebx, %ebx
  jnz .Lmatch
.Lcontinue:
  cmp %ecx, %edx
  ja .Lloop
  pop %ebx
.Lnotfound:
  or $-1, %eax
  ret
.LCrossPage:
  push %ebx
  lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
  and $-0x10, %ecx { first aligned address after buf }
  movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
  sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
  pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
  pmovmskb %xmm0, %ebx
  shl %cl, %ebx { shift valid bits into high word }
  and $0xffff0000, %ebx { clear low word containing invalid bits }
  shr %cl, %ebx { shift back }
  jz .Lcontinue
.Lmatch:
  bsf %ebx, %ebx
  lea -16(%ecx,%ebx), %eax
  pop %ebx
  cmp %eax, %edx { check against the buffer length }
  jbe .Lnotfound
end;
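{ Note on the page-cross check: reading 16 unaligned bytes at buf is only safe when it
  cannot touch the next page, i.e. when (buf and 4095) <= 4080 so that buf+15 stays
  within the same 4 KB page. Otherwise the code loads the aligned 16 bytes containing
  buf (aligned loads never cross a page) and uses the shl/and/shr sequence to discard
  mask bits belonging to the up-to-15 garbage bytes before buf. In both paths, matches
  beyond 'len' are rejected by the final comparison against edx. }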
{$ifndef CPUX86_HAS_SSE2}
function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;

var
  IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;

function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexByte_Plain(buf,len,b));
  if has_sse2_support then
    IndexByte_Impl:=@IndexByte_SSE2
  else
    IndexByte_Impl:=@IndexByte_Plain;
  result:=IndexByte_Impl(buf,len,b);
end;

function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
begin
  result:=IndexByte_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXBYTE}

{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
{$ifndef CPUX86_HAS_SSE2}
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
  test %edx, %edx
  jz .LNotFound
  push %eax
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  cmp %cx, (%eax)
  je .LFound
  add $2, %eax
  dec %edx
  jnz .LWordwise_Body
  pop %edx
.LNotFound:
  or $-1, %eax
  ret
.LFound:
  pop %edx
  sub %edx, %eax
  shr $1, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexWord {$else} IndexWord_SSE2 {$endif} (const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
  test %edx, %edx { exit if len=0 }
  je .Lnotfound
  push %ebx
  movd %ecx, %xmm1
  punpcklwd %xmm1, %xmm1
  pshufd $0, %xmm1, %xmm1
  lea 16(%eax), %ecx
  and $-16, %ecx
  movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
  sub %eax, %ecx
  test $1, %eax { if buffer isn't aligned to word boundary, }
  jnz .Lunaligned { use a different algorithm }
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  shl %cl, %ebx
  and $0xffff0000, %ebx
  shr %cl, %ebx
  shr $1, %ecx { ecx=number of valid bytes }
  test %ebx, %ebx
  jz .Lcontinue
.Lmatch:
  bsf %ebx, %ebx
  shr $1, %ebx { in words }
  lea -8(%ecx,%ebx), %eax
  pop %ebx
  cmp %eax, %edx
  jbe .Lnotfound { if match is after the specified length, ignore it }
  ret
.balign 16
.Lloop:
  movdqa (%eax,%ecx,2), %xmm0
  add $8, %ecx
  pcmpeqw %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  test %ebx, %ebx
  jnz .Lmatch
.Lcontinue:
  cmp %ecx, %edx
  ja .Lloop
  pop %ebx
.Lnotfound:
  or $-1, %eax
  ret
.Lunaligned:
  push %esi
  movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
  psllw $8, %xmm1 { swap bytes of each word of pattern) }
  psrlw $8, %xmm2
  por %xmm2, %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  shl %cl, %ebx
  and $0xffff0000, %ebx
  shr %cl, %ebx
  xor %esi, %esi { nothing to merge yet }
  add %edx, %edx { length words -> bytes }
  jmp .Lcontinue_u
.balign 16
.Lloop_u:
  movdqa (%eax,%ecx), %xmm0
  add $16, %ecx
  pcmpeqb %xmm1, %xmm0 { compare by bytes }
  shr $16, %esi { bit 16 shifts into 0 }
  pmovmskb %xmm0, %ebx
.Lcontinue_u:
  shl $1, %ebx { 15:0 -> 16:1 }
  or %esi, %ebx { merge bit 0 from previous round }
  mov %ebx, %esi
  shr $1, %ebx { now AND together adjacent pairs of bits }
  and %esi, %ebx
  and $0x5555, %ebx { also reset odd bits }
  jnz .Lmatch_u
  cmp %ecx, %edx
  ja .Lloop_u
.Lnotfound_u:
  pop %esi
  pop %ebx
  or $-1, %eax
  ret
.Lmatch_u:
  bsf %ebx, %ebx
  lea -16(%ecx,%ebx), %eax
  cmp %eax, %edx
  jbe .Lnotfound_u { if match is after the specified length, ignore it }
  sar $1, %eax { in words }
  pop %esi
  pop %ebx
end;
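{ Note on the unaligned path: when buf is odd, a matching word straddles the even/odd
  byte grid, so the pattern's bytes are swapped within each word (psllw/psrlw/por) and
  the comparison is done bytewise. A true match then appears as two adjacent set bits
  in the pmovmskb mask; shifting the mask by one and AND-ing it with itself exposes
  such pairs, and esi carries the top bit over to the next 16-byte iteration so that
  matches spanning two vectors are not lost. }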
{$ifndef CPUX86_HAS_SSE2}
function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;

var
  IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;

function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexWord_Impl:=@IndexWord_SSE2
  else
    IndexWord_Impl:=@IndexWord_Plain;
  result:=IndexWord_Impl(buf,len,b);
end;

function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
begin
  result:=IndexWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
{$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
  push %eax
  sub $4, %eax
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  add $4, %eax
  sub $1, %edx
  jb .LNotFound
  cmp %ecx, (%eax)
  jne .LDWordwise_Next
  pop %edx
  sub %edx, %eax
  shr $2, %eax
  ret
.LNotFound:
  pop %edx
  mov $-1, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}

function {$ifdef CPUX86_HAS_SSE2} IndexDWord {$else} IndexDWord_SSE2 {$endif} (const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
  push %eax
  sub $4, %edx
  jle .LDwordwise_Prepare
  movd %ecx, %xmm1
  pshufd $0, %xmm1, %xmm1
.balign 16 { 1-byte NOP. }
.L4x_Body:
  movdqu (%eax), %xmm0
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test %ecx, %ecx
  jnz .LFoundAtMask
  add $16, %eax
  sub $4, %edx
  jg .L4x_Body
  lea (%eax,%edx,4), %eax
  movdqu (%eax), %xmm0
  pcmpeqd %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test %ecx, %ecx
  jz .LNothing
.LFoundAtMask:
  bsf %ecx, %ecx
  add %ecx, %eax
.LFoundAtEax:
  pop %edx
  sub %edx, %eax
  shr $2, %eax
  ret
  nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
  add $3, %edx
  cmp $-1, %edx
  je .LNothing
.balign 16 { no-op }
.LDwordwise_Body:
  cmp (%eax), %ecx
  je .LFoundAtEax
  add $4, %eax
  sub $1, %edx
  jae .LDwordwise_Body
.LNothing:
  pop %edx
  or $-1, %eax
end;

{$ifndef CPUX86_HAS_SSE2}
function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;

var
  IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;

function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexDWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexDWord_Impl:=@IndexDWord_SSE2
  else
    IndexDWord_Impl:=@IndexDWord_Plain;
  result:=IndexDWord_Impl(buf,len,b);
end;

function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  result:=IndexDWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXDWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
  push %ebx
  mov 8(%esp), %ecx { ecx = b[0:31] }
  mov 12(%esp), %ebx { ebx = b[32:63] }
  mov %eax, 8(%esp) { remember original buf }
  sub $8, %eax
.balign 16 { no-op }
.LQWordwise_Next:
  add $8, %eax
  sub $1, %edx
  jb .LNotFound
  cmp %ecx, (%eax)
  jne .LQWordwise_Next
  cmp %ebx, 4(%eax)
  jne .LQWordwise_Next
  sub 8(%esp), %eax
  pop %ebx
  shr $3, %eax
  ret $8
.LNotFound:
  pop %ebx
  mov $-1, %eax
end;

function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
  cmp $6, len
  jle IndexQWord_Plain
  {$ifndef OLD_ASSEMBLER} movddup 4(%esp), %xmm0 {$else} .byte 0xF2,0x0F,0x12,0x44,0x24,0x04 {$endif} { xmm0 = pattern of 'b's. }
  mov %eax, %ecx { ecx = original buf }
  sub $6, len
.balign 16
.L6x_Loop:
  movdqu (%eax), %xmm1
  {$ifndef OLD_ASSEMBLER} pcmpeqq %xmm0, %xmm1 {$else} .byte 0x66,0x0F,0x38,0x29,0xC8 {$endif} { xmm1 = cmpeq(vec 0, pattern) }
  movdqu 16(%eax), %xmm2
  {$ifndef OLD_ASSEMBLER} pcmpeqq %xmm0, %xmm2 {$else} .byte 0x66,0x0F,0x38,0x29,0xD0 {$endif}
  por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
  movdqu 32(%eax), %xmm3
  {$ifndef OLD_ASSEMBLER} pcmpeqq %xmm0, %xmm3 {$else} .byte 0x66,0x0F,0x38,0x29,0xD8 {$endif}
  por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
  {$ifndef OLD_ASSEMBLER} ptest %xmm3, %xmm3 {$else} .byte 0x66,0x0F,0x38,0x17,0xDB {$endif}
  jnz .LFound
  add $48, %eax
  sub $6, len
  jge .L6x_Loop
  lea (%eax,%edx,8), %eax { Point to last 3 vectors. }
  cmp $-5, len
  jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
  mov $-1, %eax
  ret $8
.LFound:
  sub %ecx, %eax
  {$ifndef OLD_ASSEMBLER} ptest %xmm1, %xmm1 {$else} .byte 0x66,0x0F,0x38,0x17,0xC9 {$endif}
  jnz .LFoundAtXmm1
  {$ifndef OLD_ASSEMBLER} ptest %xmm2, %xmm2 {$else} .byte 0x66,0x0F,0x38,0x17,0xD2 {$endif}
  jnz .LFoundAtXmm2
  add $16, %eax
  movdqa %xmm3, %xmm2
.LFoundAtXmm2:
  add $16, %eax
  movdqa %xmm2, %xmm1
.LFoundAtXmm1:
  pmovmskb %xmm1, %ecx
  bsf %ecx, %ecx
  add %ecx, %eax
  shr $3, %eax
end;
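{ Note: PCMPEQQ compares packed 64-bit elements and PTEST sets ZF iff its operand is
  all zeroes, saving a PMOVMSKB+TEST per vector; both are SSE4.1 instructions. The
  loop ORs the comparison results of three vectors (6 qwords) and only locates the
  exact position once PTEST reports a hit. The '.byte' sequences are the hand-encoded
  forms of these instructions for bootstrap assemblers that predate the SSE4.1
  mnemonics. }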
{$ifndef CPUX86_HAS_SSE4_1}
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;

var
  IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;

function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexQWord_Plain(buf,len,b));
  if has_sse41_support then
    IndexQWord_Impl:=@IndexQWord_SSE41
  else
    IndexQWord_Impl:=@IndexQWord_Plain;
  result:=IndexQWord_Impl(buf,len,b);
end;

function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
begin
  result:=IndexQWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE4_1}
{$endif FPC_SYSTEM_HAS_INDEXQWORD}

{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
{$ifndef CPUX86_HAS_SSE2}
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
  { eax = buf1, edx = buf2, ecx = len }
  push %ebx
  sub %eax, %edx { edx = buf2 - buf1 }
  cmp $3, %ecx
  jle .LBytewise_Prepare
  { Align buf1 on 4 bytes. }
  mov (%edx,%eax), %ebx
  cmp (%eax), %ebx
  jne .L4xDiffer
  lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
  and $-4, %eax
  sub %eax, %ecx
.balign 16
.L4x_Next:
  add $4, %eax
  sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
  jle .LLast4
  mov (%edx,%eax), %ebx
  cmp (%eax), %ebx
  je .L4x_Next
.L4xDiffer:
  mov (%eax), %edx
{$ifdef CPUX86_HAS_BSWAP}
  bswap %ebx
  bswap %edx
{$else}
  rol $8, %bx
  rol $16, %ebx
  rol $8, %bx
  rol $8, %dx
  rol $16, %edx
  rol $8, %dx
{$endif}
  cmp %ebx, %edx
.LDoSbb:
  sbb %eax, %eax
  or $1, %eax
  pop %ebx
  ret
.LLast4:
  add %ecx, %eax
  mov (%edx,%eax), %ebx
  cmp (%eax), %ebx
  jne .L4xDiffer
  xor %eax, %eax
  pop %ebx
  ret
.LBytewise_Prepare:
  sub $1, %ecx
  jb .LNothing
.balign 16 { no-op }
.LBytewise_Body:
  movzbl (%edx,%eax), %ebx
  cmp %bl, (%eax)
  jne .LDoSbb
  add $1, %eax
  sub $1, %ecx
  jae .LBytewise_Body
.LNothing:
  xor %eax, %eax
  pop %ebx
end;
{$endif ndef CPUX86_HAS_SSE2}
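{ Note: CompareByte must order buffers by their first differing byte, but x86 is
  little-endian, so comparing the dwords directly would weight the wrong end. BSWAP
  (or the ROL sequence on a plain 386, which lacks BSWAP) byte-reverses both values
  first; the unsigned compare followed by 'sbb %eax,%eax; or $1,%eax' then maps
  below/above to -1/+1 with memory-order semantics. }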
label
  CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;

function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
  { eax = buf1, edx = buf2, ecx = len }
  cmp $1, %ecx
  jle CompareByte_1OrLess
  push %ebx
  cmp $16, %ecx
  jae .LVecOrMore
  { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 fewer instructions. }
  mov %eax, %ebx
  or %edx, %ebx
  and $4095, %ebx
  cmp $4080, %ebx
  ja .LCantOverReadBoth
  { Over-read both as XMMs. }
  movdqu (%eax), %xmm0
  movdqu (%edx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
  jz .LNothing
  bsf %ebx, %ebx
  cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
  jae .LNothing
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
  ret
  .byte 102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
CompareByte_CantOverReadBoth_AVX2:
  cmp $16, %ecx
  jb .LCantOverReadBoth
.LVecOrMore:
  { Compare first vectors. }
  movdqu (%eax), %xmm0
  movdqu (%edx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVec0Differs
  sub $32, %ecx { now ecx is len - 32. }
  jbe .LLastVec
  { Compare second vectors. }
  movdqu 16(%eax), %xmm0
  movdqu 16(%edx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVec1Differs
  cmp $32, %ecx
  jbe .LLastTwoVectors
  { More than four vectors: aligned loop. }
  sub %eax, %edx { edx = buf2 - buf1 }
  jz .LNothing { Exit if buf1 = buf2. }
  lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
  and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
  sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned32xLoop_Body:
  add $32, %eax
  { Compare two XMMs, reduce the result with 'and'. }
  movdqu (%edx,%eax), %xmm0
  pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
  movdqu 16(%edx,%eax), %xmm1
  pcmpeqb 16(%eax), %xmm1
  pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
  pmovmskb %xmm1, %ebx
  inc %bx
  jnz .LAligned32xLoop_TwoVectorsDiffer
  sub $32, %ecx
  ja .LAligned32xLoop_Body
  add %eax, %edx { restore edx = buf2 }
  add $32, %ecx
.LLastTwoVectors:
  movdqu (%eax,%ecx), %xmm0
  movdqu (%edx,%ecx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVecEm2Differs
.LLastVec:
  movdqu 16(%eax,%ecx), %xmm0
  movdqu 16(%edx,%ecx), %xmm1
  pcmpeqb %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc %bx
  jnz .LVecEm1Differs
.LNothing:
  pop %ebx
  xor %eax, %eax
  ret
.LAligned32xLoop_TwoVectorsDiffer:
  add %eax, %edx { restore edx = buf2 }
  pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
  inc %cx
  jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
  mov %ecx, %ebx
.LVec0Differs:
  bsf %ebx, %ebx
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
  ret
.LVec1Differs:
  xor %ecx, %ecx
.LVecEm1Differs:
  add $16, %ecx
.LVecEm2Differs:
  bsf %ebx, %ebx
  add %ecx, %ebx
  movzbl (%eax,%ebx), %eax
  movzbl (%edx,%ebx), %edx
  sub %edx, %eax
  pop %ebx
  ret
.LCantOverReadBoth:
  cmp $3, %ecx
  jle .L2to3
  push %esi
  mov (%eax), %ebx
  mov (%edx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
  cmp $8, %ecx
  jbe .LLast4x
  mov 4(%eax), %ebx
  mov 4(%edx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
  mov -8(%eax,%ecx), %ebx
  mov -8(%edx,%ecx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
.LLast4x:
  mov -4(%eax,%ecx), %ebx
  mov -4(%edx,%ecx), %esi
  cmp %esi, %ebx
  jne .L4xDiffer
  pop %esi
  pop %ebx
  xor %eax, %eax
  ret
.L4xDiffer:
  bswap %ebx
  bswap %esi
  cmp %esi, %ebx
  pop %esi
  sbb %eax, %eax
  or $1, %eax
  pop %ebx
  ret
.L2to3:
  movzwl (%edx), %ebx
  bswap %ebx
  shr $1, %ebx
  mov -1(%edx,%ecx), %bl
  movzwl (%eax), %edx
  bswap %edx
  shr $1, %edx
  mov -1(%eax,%ecx), %dl
  mov %edx, %eax
  sub %ebx, %eax
  pop %ebx
  ret
CompareByte_1OrLess:
  jl .LUnbounded_Prepare
  movzbl (%eax), %eax
  movzbl (%edx), %edx
  sub %edx, %eax
  ret
.LUnbounded_Prepare:
  sub %eax, %edx { edx = buf2 - buf1 }
  test %ecx, %ecx
  jnz .LUnbounded_Body
  xor %eax, %eax
  ret
.balign 16
.LUnbounded_Next:
  add $1, %eax
.LUnbounded_Body:
  movzbl (%edx,%eax), %ecx
  cmp %cl, (%eax)
  je .LUnbounded_Next
  sbb %eax, %eax
  or $1, %eax
end;
  1375. function {$ifdef CPUX86_HAS_BMI2} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
  1376. asm
  1377. { eax = buf1, edx = buf2, ecx = len }
  1378. cmp $1, %ecx
  1379. jle CompareByte_1OrLess
  1380. push %ebx
  1381. cmp $32, %ecx
  1382. jae .LVecOrMore
1383. { 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 fewer register and 2 fewer instructions. }
  1384. mov %eax, %ebx
  1385. or %edx, %ebx
  1386. and $4095, %ebx
  1387. cmp $4064, %ebx
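{ Taken if the page offset of either buffer exceeds 4096-32 = 4064, i.e. a 32-byte load could
cross into the next, possibly unmapped page. Or-ing the two addresses can only overestimate
either offset, so the check has false positives but no false negatives. }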
  1388. ja CompareByte_CantOverReadBoth_AVX2
  1389. { Over-read both as YMMs. }
  1390. {$ifndef OLD_ASSEMBLER} vmovdqu (%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x00 {$endif}
  1391. {$ifndef OLD_ASSEMBLER} vpcmpeqb (%edx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x02 {$endif}
  1392. {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
  1393. inc %ebx
  1394. {$if not defined(OLD_ASSEMBLER) and not defined(VER3_2)} bzhi %ecx, %ebx, %ecx {$else} .byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi } {$endif}
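{ inc turned the all-equal mask $FFFFFFFF into 0; bzhi clears mask bits at positions >= len,
so differences found in the over-read tail are ignored. The lowest set bit, consumed by
tzcnt at .LVec0Differs, is unaffected by the masking. }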
  1395. jnz .LVec0Differs
  1396. .LNothing:
  1397. {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
  1398. pop %ebx
  1399. xor %eax, %eax
  1400. ret
  1401. .byte 144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
  1402. .LAligned64xLoop_TwoVectorsDiffer:
  1403. add %eax, %edx { restore edx = buf2 }
  1404. {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ecx {$else} .byte 0xC5,0xFD,0xD7,0xC8 {$endif} { Is there a difference in the first vector? }
  1405. inc %ecx
  1406. jz .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) from the loop body. }
  1407. mov %ecx, %ebx
  1408. .LVec0Differs:
  1409. {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
  1410. {$ifndef OLD_ASSEMBLER} tzcnt %ebx, %ebx {$else} .byte 0xF3,0x0F,0xBC,0xDB {$endif}
  1411. movzbl (%eax,%ebx), %eax
  1412. movzbl (%edx,%ebx), %edx
  1413. sub %edx, %eax
  1414. pop %ebx
  1415. ret
  1416. .LVec1Differs:
  1417. xor %ecx, %ecx
  1418. .LVecEm1Differs:
  1419. add $32, %ecx
  1420. .LVecEm2Differs:
  1421. {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
  1422. {$ifndef OLD_ASSEMBLER} tzcnt %ebx, %ebx {$else} .byte 0xF3,0x0F,0xBC,0xDB {$endif}
  1423. add %ecx, %ebx
  1424. movzbl (%eax,%ebx), %eax
  1425. movzbl (%edx,%ebx), %edx
  1426. sub %edx, %eax
  1427. pop %ebx
  1428. ret
  1429. .LVecOrMore:
  1430. { Compare first vectors. }
  1431. {$ifndef OLD_ASSEMBLER} vmovdqu (%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x00 {$endif}
  1432. {$ifndef OLD_ASSEMBLER} vpcmpeqb (%edx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x02 {$endif}
  1433. {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
  1434. inc %ebx
  1435. jnz .LVec0Differs
  1436. sub $64, %ecx { now ecx is len - 64. }
  1437. jbe .LLastVec
  1438. { Compare second vectors. }
  1439. {$ifndef OLD_ASSEMBLER} vmovdqu 32(%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x40,0x20 {$endif}
  1440. {$ifndef OLD_ASSEMBLER} vpcmpeqb 32(%edx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x42,0x20 {$endif}
  1441. {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
  1442. inc %ebx
  1443. jnz .LVec1Differs
  1444. cmp $64, %ecx
  1445. jbe .LLastTwoVectors
  1446. { More than four vectors: aligned loop. }
  1447. sub %eax, %edx { edx = buf2 - buf1 }
  1448. jz .LNothing { Exit if buf1 = buf2. }
  1449. lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
  1450. and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
  1451. sub %eax, %ecx { ecx = count to be handled with loop }
  1452. .balign 16 { No-op. }
  1453. .LAligned64xLoop_Body:
  1454. add $64, %eax
  1455. { Compare two YMMs, reduce the result with 'and'. }
  1456. {$ifndef OLD_ASSEMBLER} vmovdqu (%edx,%eax), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x04,0x02 {$endif}
  1457. {$ifndef OLD_ASSEMBLER} vpcmpeqb (%eax), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x00 {$endif} { ymm0 = vpcmpeqb(buf1, buf2) }
  1458. {$ifndef OLD_ASSEMBLER} vmovdqu 32(%edx,%eax), %ymm1 {$else} .byte 0xC5,0xFE,0x6F,0x4C,0x02,0x20 {$endif}
  1459. {$ifndef OLD_ASSEMBLER} vpcmpeqb 32(%eax), %ymm1, %ymm1 {$else} .byte 0xC5,0xF5,0x74,0x48,0x20 {$endif}
  1460. {$ifndef OLD_ASSEMBLER} vpand %ymm0, %ymm1, %ymm1 {$else} .byte 0xC5,0xF5,0xDB,0xC8 {$endif} { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
  1461. {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm1, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD9 {$endif}
  1462. inc %ebx
  1463. jnz .LAligned64xLoop_TwoVectorsDiffer
  1464. sub $64, %ecx
  1465. ja .LAligned64xLoop_Body
  1466. add %eax, %edx { restore edx = buf2 }
  1467. add $64, %ecx
  1468. .LLastTwoVectors:
  1469. {$ifndef OLD_ASSEMBLER} vmovdqu (%eax,%ecx), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x04,0x08 {$endif}
  1470. {$ifndef OLD_ASSEMBLER} vpcmpeqb (%edx,%ecx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x04,0x0A {$endif}
  1471. {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
  1472. inc %ebx
  1473. jnz .LVecEm2Differs
  1474. .LLastVec:
  1475. {$ifndef OLD_ASSEMBLER} vmovdqu 32(%eax,%ecx), %ymm0 {$else} .byte 0xC5,0xFE,0x6F,0x44,0x08,0x20 {$endif}
  1476. {$ifndef OLD_ASSEMBLER} vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0 {$else} .byte 0xC5,0xFD,0x74,0x44,0x0A,0x20 {$endif}
  1477. {$ifndef OLD_ASSEMBLER} vpmovmskb %ymm0, %ebx {$else} .byte 0xC5,0xFD,0xD7,0xD8 {$endif}
  1478. inc %ebx
  1479. jnz .LVecEm1Differs
  1480. {$ifndef OLD_ASSEMBLER} vzeroupper {$else} .byte 0xC5,0xF8,0x77 {$endif}
  1481. pop %ebx
  1482. xor %eax, %eax
  1483. end;
  1484. {$ifndef CPUX86_HAS_BMI2}
  1485. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1486. var
  1487. CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
  1488. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1489. begin
  1490. if not fpc_cpucodeinit_performed then
  1491. exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len));
  1492. if has_avx2_support then
  1493. CompareByte_Impl:=@CompareByte_AVX2
  1494. else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
  1495. CompareByte_Impl:=@CompareByte_SSE2
  1496. {$ifndef CPUX86_HAS_SSE2}
  1497. else
  1498. CompareByte_Impl:=@CompareByte_Plain
  1499. {$endif};
  1500. result:=CompareByte_Impl(buf1, buf2, len);
  1501. end;
  1502. function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
  1503. begin
  1504. result:=CompareByte_Impl(buf1, buf2, len);
  1505. end;
  1506. {$endif ndef CPUX86_HAS_BMI2 (need CompareByte dispatcher)}
  1507. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
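{ The CompareByte dispatcher above is self-patching: CompareByte_Impl starts out pointing at
CompareByte_Dispatch, whose first run after fpc_cpucodeinit has completed stores the best
implementation back into the pointer, so every later call pays only one indirect call.
A minimal sketch of the same idea, with hypothetical names (illustrative only, not part of
the RTL):

    var
      MyProc_Impl: function(x: longint): longint = @MyProc_Dispatch;

    function MyProc_Dispatch(x: longint): longint;
    begin
      if has_avx2_support then       // cpu flags are valid once cpuid has run
        MyProc_Impl := @MyProc_AVX2
      else
        MyProc_Impl := @MyProc_Generic;
      result := MyProc_Impl(x);      // also serve the call that got us here
    end;
}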
  1508. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  1509. {$define FPC_SYSTEM_HAS_COMPAREWORD}
  1510. {$ifndef CPUX86_HAS_SSE2}
  1511. function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  1512. asm
  1513. push %ebx
  1514. sub %eax, %edx { edx = buf2 - buf1 }
  1515. lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
  1516. cmp $1073741819, %ebx
  1517. ja .LWordwise_Prepare
  1518. test $2, %al
  1519. je .LAlignedToPtrUintOrNaturallyMisaligned
  1520. movzwl (%edx,%eax), %ebx
  1521. cmp %bx, (%eax)
  1522. jne .LDoSbb
  1523. add $2, %eax
  1524. sub $1, %ecx
  1525. .LAlignedToPtrUintOrNaturallyMisaligned:
  1526. sub $2, %ecx
  1527. .balign 16
  1528. .LPtrUintWise_Next:
  1529. mov (%edx,%eax), %ebx
  1530. cmp %ebx, (%eax)
  1531. jne .LPtrUintsDiffer
  1532. add $4, %eax
  1533. sub $2, %ecx
  1534. jg .LPtrUintWise_Next
  1535. lea (%eax,%ecx,2), %eax
  1536. mov (%edx,%eax), %ebx
  1537. cmp %ebx, (%eax)
  1538. jne .LPtrUintsDiffer
  1539. pop %ebx
  1540. xor %eax, %eax
  1541. ret
  1542. .LPtrUintsDiffer:
  1543. cmp %bx, (%eax)
  1544. jne .LDoSbb
  1545. shr $16, %ebx
  1546. cmp %bx, 2(%eax)
  1547. .LDoSbb:
  1548. sbb %eax, %eax
  1549. or $1, %eax
  1550. pop %ebx
  1551. ret
  1552. .balign 16
  1553. .LWordwise_Body:
  1554. movzwl (%edx,%eax), %ebx
  1555. cmp %bx, (%eax)
  1556. jne .LDoSbb
  1557. add $2, %eax
  1558. .LWordwise_Prepare:
  1559. sub $1, %ecx
  1560. jnb .LWordwise_Body
  1561. pop %ebx
  1562. xor %eax, %eax
  1563. end;
  1564. {$endif ndef CPUX86_HAS_SSE2}
  1565. function {$ifdef CPUX86_HAS_SSE2} CompareWord {$else} CompareWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  1566. asm
  1567. push %ebx
  1568. sub %eax, %edx { edx = buf2 - buf1 }
  1569. lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
  1570. cmp $1073741821, %ebx
  1571. ja .LWordwise_Prepare
  1572. cmp $8, %ecx
  1573. jge .LVecOrMore
  1574. lea (%edx,%eax), %ebx
  1575. or %eax, %ebx
  1576. and $4095, %ebx
  1577. cmp $4080, %ebx
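{ 4096-16 = 4080: go wordwise if a 16-byte load from either buffer could cross a page boundary. }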
  1578. ja .LWordwise_Prepare
  1579. movdqu (%edx,%eax), %xmm0
  1580. movdqu (%eax), %xmm1
  1581. pcmpeqw %xmm1, %xmm0
  1582. pmovmskb %xmm0, %ebx
  1583. inc %bx
  1584. jz .LNothing
  1585. shl $1, %ecx { convert to bytes }
  1586. bsf %ebx, %ebx
  1587. cmp %ecx, %ebx
  1588. jb .LSubtractWords
  1589. .LNothing:
  1590. pop %ebx
  1591. xor %eax, %eax
  1592. ret
  1593. .balign 16
  1594. .LWordwise_Body:
  1595. movzwl (%edx,%eax), %ebx
  1596. cmp %bx, (%eax)
  1597. jne .LDoSbb
  1598. add $2, %eax
  1599. .LWordwise_Prepare:
  1600. sub $1, %ecx
  1601. jae .LWordwise_Body
  1602. xor %eax, %eax
  1603. pop %ebx
  1604. ret
  1605. .LDoSbb:
  1606. sbb %eax, %eax
  1607. or $1, %eax
  1608. pop %ebx
  1609. ret
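{ Never executed (preceded by ret): 11 operand-size prefixes plus a nop, 12 bytes of padding
so that .LVecOrMore below starts on a 16-byte boundary. }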
  1610. .byte 102,102,102,102,102,102,102,102,102,102,102,144
  1611. .LVecOrMore:
  1612. movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
  1613. movdqu (%eax), %xmm1
  1614. pcmpeqw %xmm1, %xmm0
  1615. pmovmskb %xmm0, %ebx
  1616. inc %bx
  1617. jnz .LVec0Differs
  1618. shl $1, %ecx { convert to bytes }
  1619. sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  1620. jle .LLastVec
  1621. test %edx, %edx
  1622. jz .LNothing { Exit if buf1 = buf2. }
  1623. push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
  1624. add %eax, %ecx
  1625. and $-16, %eax { align buf1; +16 is performed by the loop. }
  1626. sub %eax, %ecx
  1627. .balign 16
  1628. .LAligned8xLoop_Body:
  1629. add $16, %eax
  1630. movdqu (%edx,%eax), %xmm0
  1631. pcmpeqb (%eax), %xmm0
  1632. pmovmskb %xmm0, %ebx
  1633. inc %bx
  1634. jnz .LAligned8xLoop_VecDiffers
  1635. sub $16, %ecx
  1636. ja .LAligned8xLoop_Body
  1637. pop %ebx { drop original buf1 }
  1638. .LLastVec:
  1639. lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
  1640. movdqu (%edx,%eax), %xmm0
  1641. movdqu (%eax), %xmm1
  1642. pcmpeqw %xmm1, %xmm0
  1643. pmovmskb %xmm0, %ebx
  1644. inc %bx
  1645. jnz .LVec0Differs
  1646. pop %ebx
  1647. xor %eax, %eax
  1648. ret
  1649. .LVec0Differs:
  1650. bsf %ebx, %ebx
  1651. .LSubtractWords:
  1652. add %eax, %edx
  1653. movzwl (%eax,%ebx), %eax
  1654. movzwl (%edx,%ebx), %edx
  1655. sub %edx, %eax
  1656. pop %ebx
  1657. ret
  1658. .LAligned8xLoop_VecDiffers:
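{ Locate the first differing byte, then round its offset from the original buf1 (saved on the
stack) down to a word boundary, so that the containing word is compared even when buf1 is
misaligned by an odd number of bytes. }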
  1659. bsf %ebx, %ebx
  1660. add %ebx, %eax
  1661. pop %ecx
  1662. sub %ecx, %eax
  1663. and $-2, %eax
  1664. add %ecx, %eax
  1665. movzwl (%edx,%eax), %edx
  1666. movzwl (%eax), %eax
  1667. sub %edx, %eax
  1668. pop %ebx
  1669. end;
  1670. {$ifndef CPUX86_HAS_SSE2}
  1671. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1672. var
  1673. CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
  1674. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1675. begin
  1676. if not fpc_cpucodeinit_performed then
  1677. exit(CompareWord_Plain(buf1, buf2, len));
  1678. if has_sse2_support then
  1679. CompareWord_Impl:=@CompareWord_SSE2
  1680. else
  1681. CompareWord_Impl:=@CompareWord_Plain;
  1682. result:=CompareWord_Impl(buf1, buf2, len);
  1683. end;
  1684. function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
  1685. begin
  1686. result:=CompareWord_Impl(buf1, buf2, len);
  1687. end;
  1688. {$endif ndef CPUX86_HAS_SSE2 (need CompareWord dispatcher)}
  1689. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1690. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1691. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
  1692. {$ifndef CPUX86_HAS_SSE2}
  1693. function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  1694. asm
  1695. sub $1, %ecx
  1696. jb .LNothing
  1697. push %ebx
  1698. sub %eax, %edx
  1699. .balign 16
  1700. .LDwordwise_Body:
  1701. mov (%edx,%eax), %ebx
  1702. cmp %ebx, (%eax)
  1703. jne .LDoSbb
  1704. add $4, %eax
  1705. sub $1, %ecx
  1706. jnb .LDwordwise_Body
  1707. pop %ebx
  1708. .LNothing:
  1709. xor %eax, %eax
  1710. ret
  1711. .LDoSbb:
  1712. pop %ebx
  1713. sbb %eax, %eax
  1714. or $1, %eax
  1715. end;
  1716. {$endif}
  1717. function {$ifdef CPUX86_HAS_SSE2} CompareDWord {$else} CompareDWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
  1718. asm
  1719. push %ebx
  1720. sub %eax, %edx { edx = buf2 - buf1 }
  1721. lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
  1722. cmp $536870906, %ebx
  1723. ja .LDwordwise_Prepare
  1724. shl $2, %ecx { convert to bytes }
  1725. movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
  1726. movdqu (%eax), %xmm0
  1727. pcmpeqd %xmm1, %xmm0
  1728. pmovmskb %xmm0, %ebx
  1729. inc %bx
  1730. jnz .LVec0Differs
  1731. sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  1732. jle .LLastVec
  1733. test %edx, %edx
  1734. jz .LNothing { Exit if buf1 = buf2. }
  1735. push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
  1736. add %eax, %ecx
  1737. and $-16, %eax { align buf1; +16 is performed by the loop. }
  1738. sub %eax, %ecx
  1739. .balign 16
  1740. .LAligned4xLoop_Body:
  1741. add $16, %eax
  1742. movdqu (%eax,%edx), %xmm0
  1743. pcmpeqb (%eax), %xmm0
  1744. pmovmskb %xmm0, %ebx
  1745. inc %bx
  1746. jnz .LAligned4xLoop_VecDiffers
  1747. sub $16, %ecx
  1748. ja .LAligned4xLoop_Body
  1749. pop %ebx { drop original buf1 }
  1750. .LLastVec:
  1751. lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
  1752. movdqu (%edx,%eax), %xmm1
  1753. movdqu (%eax), %xmm0
  1754. pcmpeqd %xmm1, %xmm0
  1755. pmovmskb %xmm0, %ebx
  1756. inc %bx
  1757. jnz .LVec0Differs
  1758. .LNothing:
  1759. pop %ebx
  1760. xor %eax, %eax
  1761. ret
  1762. .LVec0Differs:
  1763. bsf %ebx, %ebx
  1764. add %eax, %edx { recover edx = buf2 }
  1765. mov (%edx,%ebx), %edx
  1766. cmp %edx, (%eax,%ebx)
  1767. sbb %eax, %eax
  1768. or $1, %eax
  1769. pop %ebx
  1770. ret
  1771. .LAligned4xLoop_VecDiffers:
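{ Same recovery as in CompareWord: round the byte offset from the original buf1 down to a
dword boundary before comparing. }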
  1772. bsf %ebx, %ebx
  1773. add %ebx, %eax
  1774. pop %ecx
  1775. sub %ecx, %eax
  1776. and $-4, %eax
  1777. add %ecx, %eax
  1778. mov (%edx,%eax), %edx
  1779. cmp %edx, (%eax)
  1780. .LDoSbb:
  1781. sbb %eax, %eax
  1782. or $1, %eax
  1783. pop %ebx
  1784. ret
  1785. .balign 16
  1786. .LDwordwise_Body:
  1787. mov (%edx,%eax), %ebx
  1788. cmp %ebx, (%eax)
  1789. jne .LDoSbb
  1790. add $4, %eax
  1791. .LDwordwise_Prepare:
  1792. sub $1, %ecx
  1793. jnb .LDwordwise_Body
  1794. pop %ebx
  1795. xor %eax, %eax
  1796. end;
  1797. {$ifndef CPUX86_HAS_SSE2}
  1798. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1799. var
  1800. CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
  1801. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1802. begin
  1803. if not fpc_cpucodeinit_performed then
  1804. exit(CompareDWord_Plain(buf1, buf2, len));
  1805. if has_sse2_support then
  1806. CompareDWord_Impl:=@CompareDWord_SSE2
  1807. else
  1808. CompareDWord_Impl:=@CompareDWord_Plain;
  1809. result:=CompareDWord_Impl(buf1, buf2, len);
  1810. end;
  1811. function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
  1812. begin
  1813. result:=CompareDWord_Impl(buf1, buf2, len);
  1814. end;
  1815. {$endif ndef CPUX86_HAS_SSE2 (need CompareDWord dispatcher)}
  1816. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1817. {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
  1818. {$define FPC_SYSTEM_HAS_INDEXCHAR0}
  1819. function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
  1820. var
  1821. saveesi,saveebx : longint;
  1822. asm
  1823. movl %esi,saveesi
  1824. movl %ebx,saveebx
1825. // Can't use scasb, or we would have to do it twice; this is
1826. // likely faster for small "len"
  1827. movl %eax,%esi // Load address
  1828. movzbl %cl,%ebx // Load searchpattern
  1829. testl %edx,%edx
  1830. je .LFound
  1831. xorl %ecx,%ecx // zero index in Buf
  1832. xorl %eax,%eax // To make DWord compares possible
  1833. .balign 4
  1834. .LLoop:
  1835. movb (%esi),%al // Load byte
  1836. cmpb %al,%bl
  1837. je .LFound // byte the same?
  1838. incl %ecx
  1839. incl %esi
  1840. cmpl %edx,%ecx // Maximal distance reached?
  1841. je .LNotFound
  1842. testl %eax,%eax // Nullchar = end of search?
  1843. jne .LLoop
  1844. .LNotFound:
  1845. movl $-1,%ecx // Not found return -1
  1846. .LFound:
  1847. movl %ecx,%eax
  1848. movl saveesi,%esi
  1849. movl saveebx,%ebx
  1850. end;
  1851. {$endif FPC_SYSTEM_HAS_INDEXCHAR0}
  1852. {****************************************************************************
  1853. String
  1854. ****************************************************************************}
  1855. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1856. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1857. procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
  1858. {$ifndef FPC_PROFILE}
  1859. nostackframe;
  1860. {$endif}
  1861. { eax = res, edx = high(res), ecx = sstr }
  1862. asm
  1863. {$ifdef FPC_PROFILE}
  1864. push %eax
  1865. push %edx
  1866. push %ecx
  1867. call mcount
  1868. pop %ecx
  1869. pop %edx
  1870. pop %eax
  1871. {$endif FPC_PROFILE}
  1872. cmp (%ecx), %dl { length(sstr) fits into res? }
  1873. jbe .LEdxIsLen { use high(res) if length(sstr) does not fit }
  1874. movzbl (%ecx), %edx { use length(sstr) }
  1875. .LEdxIsLen:
  1876. mov %dl, (%eax) { store length to res[0] }
  1877. xchg %ecx, %edx { ecx = length = Move count, edx = sstr }
  1878. xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest }
  1879. inc %eax
  1880. inc %edx
  1881. {$ifdef FPC_PROFILE}
  1882. {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  1883. lea -8(%esp), %esp
  1884. {$endif FPC_SYSTEM_STACKALIGNMENT16}
  1885. call Move
  1886. {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  1887. lea 8(%esp), %esp
  1888. {$endif FPC_SYSTEM_STACKALIGNMENT16}
  1889. {$else FPC_PROFILE}
  1890. jmp Move
  1891. {$endif FPC_PROFILE}
  1892. end;
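{ Illustrative example of the semantics above (hypothetical snippet): with
    var small: string[3]; s: shortstring;
  an assignment small := s where s = 'hello' reaches this helper with high(res) = 3,
  so small[0] becomes #3 and only 'hel' is copied. }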
  1893. procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
  1894. begin
  1895. asm
  1896. {$ifdef FPC_PROFILE}
  1897. push %eax
  1898. push %edx
  1899. push %ecx
  1900. call mcount
  1901. pop %ecx
  1902. pop %edx
  1903. pop %eax
  1904. {$endif FPC_PROFILE}
  1905. pushl %eax
  1906. pushl %ecx
  1907. {$ifdef FPC_ENABLED_CLD}
  1908. cld
  1909. {$endif FPC_ENABLED_CLD}
  1910. movl dstr,%edi
  1911. movl sstr,%esi
  1912. xorl %eax,%eax
  1913. movl len,%ecx
  1914. lodsb
  1915. cmpl %ecx,%eax
  1916. jbe .LStrCopy1
  1917. movl %ecx,%eax
  1918. .LStrCopy1:
  1919. stosb
  1920. cmpl $7,%eax
  1921. jl .LStrCopy2
  1922. movl %edi,%ecx { Align on 32bits }
  1923. negl %ecx
  1924. andl $3,%ecx
  1925. subl %ecx,%eax
  1926. rep
  1927. movsb
  1928. movl %eax,%ecx
  1929. andl $3,%eax
  1930. shrl $2,%ecx
  1931. rep
  1932. movsl
  1933. .LStrCopy2:
  1934. movl %eax,%ecx
  1935. rep
  1936. movsb
  1937. popl %ecx
  1938. popl %eax
  1939. end ['ESI','EDI'];
  1940. end;
  1941. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1942. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1943. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1944. function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
  1945. { eax = left, edx = right }
  1946. asm
  1947. {$ifdef FPC_PROFILE}
  1948. push %eax
  1949. push %edx
  1950. push %ecx
  1951. call mcount
  1952. pop %ecx
  1953. pop %edx
  1954. pop %eax
  1955. {$endif FPC_PROFILE}
  1956. push %ebx
  1957. movzbl (%eax), %ecx { ecx = len(left) }
  1958. movzbl (%edx), %ebx { ebx = len(right) }
  1959. cmp %ebx, %ecx
  1960. {$ifdef CPUX86_HAS_CMOV}
  1961. cmovg %ebx, %ecx
  1962. {$else}
  1963. jle .LEcxIsLen
  1964. mov %ebx, %ecx
  1965. .LEcxIsLen:
  1966. {$endif}
  1967. push %eax { save left }
  1968. inc %eax
  1969. inc %edx
  1970. { stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
  1971. {$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  1972. call CompareByte
  1973. {$else}
  1974. call CompareByte_Impl { manually inline CompareByte }
  1975. {$endif}
  1976. pop %edx { restore left }
  1977. test %eax, %eax
  1978. jnz .LReturn
  1979. movzbl (%edx), %eax
  1980. sub %ebx, %eax
  1981. .LReturn:
  1982. pop %ebx
  1983. end;
  1984. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
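{ Result convention of fpc_shortstr_compare: the first differing character pair decides the
sign; if the shorter string is a prefix of the longer one, the length difference decides
(e.g. 'ab' vs 'abc' gives a negative result). }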
  1985. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1986. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
1987. function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc; nostackframe;
  1988. { eax = left, edx = right }
  1989. asm
  1990. movzbl (%eax), %ecx
  1991. cmp (%edx), %cl
  1992. jne .LNotEqual
  1993. inc %eax
  1994. inc %edx
  1995. {$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  1996. jmp CompareByte
  1997. {$else}
  1998. jmp CompareByte_Impl { manually inline CompareByte }
  1999. {$endif}
  2000. .LNotEqual:
  2001. or $-1, %eax
  2002. end;
  2003. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  2004. {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  2005. {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  2006. procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
  2007. {$ifndef FPC_PROFILE}
  2008. nostackframe;
  2009. {$endif}
  2010. // eax = res, edx = high(res), ecx = p
  2011. asm
  2012. {$ifdef FPC_PROFILE}
  2013. push %eax
  2014. push %edx
  2015. push %ecx
  2016. call mcount
  2017. pop %ecx
  2018. pop %edx
  2019. pop %eax
  2020. {$endif FPC_PROFILE}
  2021. test %ecx, %ecx
  2022. jz .LEmpty
  2023. push %eax { save res }
  2024. push %ecx { save p }
  2025. push %edx { save high(res) }
  2026. mov %ecx, %eax { eax = IndexByte.buf }
  2027. { edx is already high(res) = IndexByte.count.
  2028. Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
  2029. but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’.
  2030. Generic and x86 versions are “safe”. }
  2031. xor %ecx, %ecx { ecx = 0 = IndexByte.value }
  2032. { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
  2033. With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
  2034. {$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  2035. leal -12(%esp), %esp
  2036. {$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  2037. {$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
  2038. call IndexByte
  2039. {$else}
  2040. call IndexByte_Impl { manually inline IndexByte }
  2041. {$endif}
  2042. {$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  2043. leal 12(%esp), %esp
  2044. {$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  2045. pop %ecx { ecx = high(res) = Move.len }
  2046. test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
  2047. {$ifdef CPUX86_HAS_CMOV}
  2048. cmovns %eax, %ecx
  2049. {$else}
  2050. js .LEcxIsLen
  2051. mov %eax, %ecx
  2052. .LEcxIsLen:
  2053. {$endif}
  2054. pop %eax { pop p to eax = Move.src }
  2055. pop %edx { pop res to edx }
  2056. mov %cl, (%edx) { res[0] := len }
  2057. inc %edx { res[1] = Move.dst }
  2058. {$ifdef FPC_PROFILE}
  2059. {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  2060. leal -12(%esp), %esp
  2061. {$endif FPC_SYSTEM_STACKALIGNMENT16}
  2062. call Move
  2063. {$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  2064. leal 12(%esp), %esp
  2065. {$endif FPC_SYSTEM_STACKALIGNMENT16}
  2066. jmp .LReturn
  2067. {$else FPC_PROFILE}
  2068. jmp Move { can perform a tail call }
  2069. {$endif FPC_PROFILE}
  2070. .LEmpty:
  2071. movb $0, (%eax)
  2072. {$ifdef FPC_PROFILE}
  2073. .LReturn:
  2074. {$endif}
  2075. end;
  2076. {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  2077. {$IFNDEF INTERNAL_BACKTRACE}
  2078. {$define FPC_SYSTEM_HAS_GET_FRAME}
  2079. function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
  2080. asm
  2081. movl %ebp,%eax
  2082. end;
  2083. {$ENDIF not INTERNAL_BACKTRACE}
  2084. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
  2085. Function Get_pc_addr : Pointer;assembler;nostackframe;
  2086. asm
  2087. movl (%esp),%eax
  2088. end;
  2089. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
  2090. function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
  2091. {$if defined(win32)}
  2092. { Windows has StackTop always properly set }
  2093. begin
  2094. if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
  2095. Result:=PPointer(framebp+4)^
  2096. else
  2097. Result:=nil;
  2098. end;
  2099. {$else defined(win32)}
  2100. nostackframe;assembler;
  2101. asm
  2102. orl %eax,%eax
  2103. jz .Lg_a_null
  2104. movl 4(%eax),%eax
  2105. .Lg_a_null:
  2106. end;
  2107. {$endif defined(win32)}
  2108. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
  2109. function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
  2110. {$if defined(win32)}
  2111. { Windows has StackTop always properly set }
  2112. begin
  2113. if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
  2114. Result:=PPointer(framebp)^
  2115. else
  2116. Result:=nil;
  2117. end;
  2118. {$else defined(win32)}
  2119. nostackframe;assembler;
  2120. asm
  2121. orl %eax,%eax
  2122. jz .Lgnf_null
  2123. movl (%eax),%eax
  2124. .Lgnf_null:
  2125. end;
  2126. {$endif defined(win32)}
  2127. {$define FPC_SYSTEM_HAS_SPTR}
  2128. Function Sptr : Pointer;assembler;nostackframe;
  2129. asm
  2130. movl %esp,%eax
  2131. end;
  2132. {****************************************************************************
  2133. Str()
  2134. ****************************************************************************}
  2135. {$if defined(disabled) and defined(regcall) }
  2136. {$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
  2137. {$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
  2138. label str_int_shortcut;
  2139. procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
  2140. asm
  2141. pushl %esi
  2142. pushl %edi
  2143. pushl %ebx
  2144. mov %edx,%edi
  2145. xor %edx,%edx
  2146. jmp str_int_shortcut
  2147. end;
  2148. procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
  2149. {Optimized for speed, but balanced with size.}
  2150. const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
  2151. 100000,1000000,10000000,
  2152. 100000000,1000000000);
  2153. asm
  2154. {$ifdef FPC_PROFILE}
  2155. push %eax
  2156. push %edx
  2157. push %ecx
  2158. call mcount
  2159. pop %ecx
  2160. pop %edx
  2161. pop %eax
  2162. {$endif FPC_PROFILE}
  2163. push %esi
  2164. push %edi
  2165. push %ebx
  2166. movl %edx,%edi
  2167. { Calculate absolute value and put sign in edx}
  2168. cltd
  2169. xorl %edx,%eax
  2170. subl %edx,%eax
  2171. negl %edx
  2172. str_int_shortcut:
  2173. movl %ecx,%esi
  2174. {Calculate amount of digits in ecx.}
  2175. xorl %ecx,%ecx
  2176. bsrl %eax,%ecx
  2177. incl %ecx
  2178. imul $1233,%ecx
  2179. shr $12,%ecx
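{ bit_length*1233 shr 12 approximates bit_length*log10(2) (1233/4096 ~ 0.30103);
the cmp/cmc/adc below corrects the off-by-one. }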
  2180. {$ifdef FPC_PIC}
  2181. call fpc_geteipasebx
  2182. {$ifdef darwin}
  2183. movl digits-.Lpic(%ebx),%ebx
  2184. {$else}
  2185. addl $_GLOBAL_OFFSET_TABLE_,%ebx
  2186. movl digits@GOT(%ebx),%ebx
  2187. {$endif}
  2188. cmpl (%ebx,%ecx,4),%eax
  2189. {$else}
  2190. cmpl digits(,%ecx,4),%eax
  2191. {$endif}
  2192. cmc
  2193. adcl $0,%ecx {Nr. digits ready in ecx.}
  2194. {Write length & sign.}
  2195. lea (%edx,%ecx),%ebx
  2196. movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
  2197. movw %bx,(%edi)
  2198. addl %edx,%edi
  2199. subl %edx,%esi
  2200. {Skip digits beyond string length.}
  2201. movl %eax,%edx
  2202. subl %ecx,%esi
  2203. jae .Lloop_write
  2204. .balign 4
  2205. .Lloop_skip:
  2206. movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
  2207. mull %edx
  2208. shrl $3,%edx
  2209. decl %ecx
  2210. jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
  2211. incl %esi
  2212. jnz .Lloop_skip
  2213. {Write out digits.}
  2214. .balign 4
  2215. .Lloop_write:
  2216. movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
  2217. {Pre-add '0'}
  2218. leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
  2219. mull %edx
  2220. shrl $3,%edx
  2221. leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
  2222. subl %edx,%ebx
  2223. subl %eax,%ebx
  2224. movb %bl,(%edi,%ecx)
  2225. decl %ecx
  2226. jnz .Lloop_write
  2227. .Ldone:
  2228. popl %ebx
  2229. popl %edi
  2230. popl %esi
  2231. end;
  2232. {$endif}
  2233. {****************************************************************************
  2234. Bounds Check
  2235. ****************************************************************************}
  2236. { do a thread-safe inc/dec }
  2237. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
  2238. function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
  2239. asm
  2240. lock
  2241. decl (%eax)
  2242. setzb %al
  2243. end;
  2244. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
  2245. procedure cpuinclocked(var l : longint);assembler;nostackframe;
  2246. asm
  2247. lock
  2248. incl (%eax)
  2249. end;
2250. // Inline the SMP check and the unlocked path;
2251. // the locked one is so slow that inlining doesn't matter.
  2252. function declocked(var l : longint) : boolean; inline;
  2253. begin
  2254. if not ismultithread then
  2255. begin
  2256. dec(l);
  2257. declocked:=l=0;
  2258. end
  2259. else
  2260. declocked:=cpudeclocked(l);
  2261. end;
  2262. procedure inclocked(var l : longint); inline;
  2263. begin
  2264. if not ismultithread then
  2265. inc(l)
  2266. else
  2267. cpuinclocked(l);
  2268. end;
  2269. {$ifndef VER3_2}
  2270. {$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_8}
  2271. function fpc_atomic_cmp_xchg_8(var Target: shortint; NewValue: shortint; Comparand: shortint): shortint; assembler; nostackframe;
  2272. asm
  2273. xchgl %eax,%ecx
  2274. lock
  2275. cmpxchgb %dl,(%ecx)
  2276. end;
  2277. {$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_16}
  2278. function fpc_atomic_cmp_xchg_16(var Target: smallint; NewValue: smallint; Comparand: smallint): smallint; assembler; nostackframe;
  2279. asm
  2280. xchgl %eax,%ecx
  2281. lock
  2282. cmpxchgw %dx,(%ecx)
  2283. end;
  2284. {$define FPC_SYSTEM_HAS_ATOMIC_XCHG_64}
  2285. function fpc_atomic_xchg_64(var Target: int64; Source: int64): int64; assembler; nostackframe;
  2286. { eax = Target, [esp + 4] = Source. }
  2287. asm
  2288. pushl %ebx
  2289. pushl %edi
  2290. movl %eax,%edi
  2291. movl 8+4(%esp),%ebx
  2292. movl 8+8(%esp),%ecx
  2293. .LAgain:
  2294. movl (%edi),%eax
  2295. movl 4(%edi),%edx
  2296. lock cmpxchg8b (%edi)
  2297. jne .LAgain
  2298. pop %edi
  2299. pop %ebx
  2300. end;
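{ The .LAgain sequence above is a CAS loop: reread the old 64-bit value until lock cmpxchg8b
succeeds in replacing it. Equivalent pseudo-Pascal sketch (illustrative only):

    repeat
      old := Target;   // reread on each failed attempt
    until fpc_atomic_cmp_xchg_64(Target, Source, old) = old;
    result := old;
}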
  2301. {$define FPC_SYSTEM_HAS_ATOMIC_SUB_32}
  2302. function fpc_atomic_sub_32(var Target: longint; Value: longint): longint; assembler; nostackframe;
  2303. asm
  2304. neg %edx
  2305. lock
  2306. xaddl %edx, (%eax)
  2307. movl %edx,%eax
  2308. end;
  2309. {$define FPC_SYSTEM_HAS_ATOMIC_INC_64}
  2310. function fpc_atomic_inc_64(var Target: int64): int64; assembler; nostackframe;
  2311. { eax = Target. }
  2312. asm
  2313. pushl %ebx
  2314. pushl %edi
  2315. movl %eax,%edi
  2316. .LAgain:
  2317. movl (%edi),%eax
  2318. movl 4(%edi),%edx
  2319. movl %eax,%ebx { ecx:ebx := edx:eax + 1. }
  2320. movl %edx,%ecx
  2321. addl $1,%ebx
  2322. adcl $0,%ecx
  2323. lock cmpxchg8b (%edi)
  2324. jne .LAgain
  2325. movl %ebx,%eax
  2326. movl %ecx,%edx
  2327. pop %edi
  2328. pop %ebx
  2329. end;
  2330. {$define FPC_SYSTEM_HAS_ATOMIC_DEC_64}
  2331. function fpc_atomic_dec_64(var Target: int64): int64; assembler; nostackframe;
  2332. { eax = Target. }
  2333. asm
  2334. pushl %ebx
  2335. pushl %edi
  2336. movl %eax,%edi
  2337. .LAgain:
  2338. movl (%edi),%eax
  2339. movl 4(%edi),%edx
  2340. movl %eax,%ebx { ecx:ebx := edx:eax - 1. }
  2341. movl %edx,%ecx
  2342. subl $1,%ebx
  2343. sbbl $0,%ecx
  2344. lock cmpxchg8b (%edi)
  2345. jne .LAgain
  2346. movl %ebx,%eax
  2347. movl %ecx,%edx
  2348. pop %edi
  2349. pop %ebx
  2350. end;
  2351. {$define FPC_SYSTEM_HAS_ATOMIC_ADD_64}
  2352. function fpc_atomic_add_64(var Target: int64; Value: int64): int64; assembler; nostackframe;
  2353. { eax = Target, [esp + 4] = Value. }
  2354. asm
  2355. pushl %ebx
  2356. pushl %edi
  2357. movl %eax,%edi
  2358. .LAgain:
  2359. movl (%edi),%eax
  2360. movl 4(%edi),%edx
  2361. movl %eax,%ebx { ecx:ebx := edx:eax + Value. }
  2362. movl %edx,%ecx
  2363. addl 8+4(%esp),%ebx
  2364. adcl 8+8(%esp),%ecx
  2365. lock cmpxchg8b (%edi)
  2366. jne .LAgain
  2367. pop %edi
  2368. pop %ebx
  2369. end;
  2370. {$define FPC_SYSTEM_HAS_ATOMIC_SUB_64}
  2371. function fpc_atomic_sub_64(var Target: int64; Value: int64): int64; assembler; nostackframe;
  2372. { eax = Target, [esp + 4] = Value. }
  2373. asm
  2374. pushl %ebx
  2375. pushl %edi
  2376. movl %eax,%edi
  2377. .LAgain:
  2378. movl (%edi),%eax
  2379. movl 4(%edi),%edx
  2380. movl %eax,%ebx { ecx:ebx := edx:eax - Value. }
  2381. movl %edx,%ecx
  2382. subl 8+4(%esp),%ebx
  2383. sbbl 8+8(%esp),%ecx
  2384. lock cmpxchg8b (%edi)
  2385. jne .LAgain
  2386. pop %edi
  2387. pop %ebx
  2388. end;
  2389. {$endif VER3_2}
  2390. {$ifdef VER3_2}
  2391. function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
  2392. {$else VER3_2}
  2393. {$define FPC_SYSTEM_HAS_ATOMIC_DEC_32}
  2394. function fpc_atomic_dec_32 (var Target: longint) : longint; assembler; nostackframe;
  2395. {$endif VER3_2}
  2396. asm
  2397. movl $-1,%edx
  2398. lock
  2399. xaddl %edx, (%eax)
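{ xaddl leaves the pre-decrement value in edx; the lea below derives the new value, which is the function result. }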
  2400. lea -1(%edx),%eax
  2401. end;
  2402. {$ifdef VER3_2}
  2403. function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
  2404. {$else VER3_2}
  2405. {$define FPC_SYSTEM_HAS_ATOMIC_INC_32}
  2406. function fpc_atomic_inc_32 (var Target: longint) : longint; assembler; nostackframe;
  2407. {$endif VER3_2}
  2408. asm
  2409. movl $1,%edx
  2410. lock
  2411. xaddl %edx, (%eax)
  2412. lea 1(%edx),%eax
  2413. end;
  2414. {$ifdef VER3_2}
  2415. function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  2416. {$else VER3_2}
  2417. {$define FPC_SYSTEM_HAS_ATOMIC_XCHG_32}
  2418. function fpc_atomic_xchg_32 (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  2419. {$endif VER3_2}
  2420. asm
  2421. xchgl (%eax),%edx
  2422. movl %edx,%eax
  2423. end;
  2424. {$ifdef VER3_2}
  2425. function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  2426. {$else VER3_2}
  2427. {$define FPC_SYSTEM_HAS_ATOMIC_ADD_32}
  2428. function fpc_atomic_add_32 (var Target: longint;Value : longint) : longint; assembler; nostackframe;
  2429. {$endif VER3_2}
  2430. asm
  2431. lock
  2432. xaddl %edx, (%eax)
  2433. movl %edx,%eax
  2434. end;
  2435. {$ifdef VER3_2}
  2436. function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
  2437. {$else VER3_2}
  2438. {$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_32}
  2439. function fpc_atomic_cmp_xchg_32(var Target: longint; NewValue, Comparand : longint): longint; [public, alias:'FPC_ATOMIC_CMP_XCHG_32']; assembler; nostackframe;
  2440. {$endif VER3_2}
  2441. asm
  2442. xchgl %eax,%ecx
  2443. lock
  2444. cmpxchgl %edx, (%ecx)
  2445. end;
  2446. {$ifdef VER3_2}
  2447. function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler; nostackframe;
  2448. {$else VER3_2}
  2449. {$define FPC_SYSTEM_HAS_ATOMIC_CMP_XCHG_64}
  2450. function fpc_atomic_cmp_xchg_64 (var Target: int64; NewValue: int64; Comparand: int64) : int64; assembler; nostackframe;
  2451. {$endif VER3_2}
  2452. { eax = Target, [esp + 12] = NewValue, [esp + 4] = Comparand. }
  2453. asm
  2454. pushl %ebx
  2455. pushl %edi
  2456. movl %eax,%edi
  2457. movl 8+4(%esp),%eax
  2458. movl 8+8(%esp),%edx
  2459. movl 8+12(%esp),%ebx
  2460. movl 8+16(%esp),%ecx
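{ After the two pushes, 8+N(%esp) addresses the caller's N(%esp): edx:eax received Comparand
(what cmpxchg8b compares against), ecx:ebx received NewValue. }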
  2461. lock cmpxchg8b (%edi)
  2462. pop %edi
  2463. pop %ebx
  2464. end;
  2465. {****************************************************************************
  2466. FPU
  2467. ****************************************************************************}
  2468. const
  2469. { Internal constants for use in system unit }
  2470. FPU_Invalid = 1;
  2471. FPU_Denormal = 2;
  2472. FPU_DivisionByZero = 4;
  2473. FPU_Overflow = 8;
  2474. FPU_Underflow = $10;
  2475. FPU_StackUnderflow = $20;
  2476. FPU_StackOverflow = $40;
  2477. FPU_ExceptionMask = $ff;
  2478. MM_Invalid = 1;
  2479. MM_Denormal = 2;
  2480. MM_DivisionByZero = 4;
  2481. MM_Overflow = 8;
  2482. MM_Underflow = $10;
  2483. MM_Precicion = $20;
  2484. MM_ExceptionMask = $3f;
  2485. MM_MaskInvalidOp = %0000000010000000;
  2486. MM_MaskDenorm = %0000000100000000;
  2487. MM_MaskDivZero = %0000001000000000;
  2488. MM_MaskOverflow = %0000010000000000;
  2489. MM_MaskUnderflow = %0000100000000000;
  2490. MM_MaskPrecision = %0001000000000000;
  2491. {$define FPC_SYSTEM_HAS_SYSINITFPU}
  2492. Procedure SysInitFPU;
  2493. begin
  2494. end;
  2495. {$define FPC_SYSTEM_HAS_SYSRESETFPU}
  2496. Procedure SysResetFPU;
  2497. var
  2498. { these locals are so we don't have to hack pic code in the assembler }
  2499. localmxcsr: dword;
  2500. localfpucw: word;
  2501. begin
  2502. localfpucw:=Default8087CW;
  2503. asm
  2504. fninit
  2505. fwait
  2506. fldcw localfpucw
  2507. end;
  2508. if has_sse_support then
  2509. begin
  2510. localmxcsr:=DefaultMXCSR;
  2511. asm
  2512. { setup sse exceptions }
  2513. {$ifndef OLD_ASSEMBLER}
  2514. ldmxcsr localmxcsr
  2515. {$else OLD_ASSEMBLER}
  2516. mov localmxcsr,%eax
  2517. subl $4,%esp
  2518. mov %eax,(%esp)
  2519. //ldmxcsr (%esp)
  2520. .byte 0x0f,0xae,0x14,0x24
  2521. addl $4,%esp
  2522. {$endif OLD_ASSEMBLER}
  2523. end;
  2524. end;
  2525. end;
2526. { because of the brain-dead SSE detection on x86, this test is postponed }
  2527. procedure fpc_cpucodeinit;
  2528. var
  2529. _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
  2530. begin
  2531. if cpuid_support then
  2532. begin
  2533. asm
  2534. movl $1,%eax
  2535. xorl %ecx,%ecx
  2536. cpuid
  2537. movl %edx,_edx_cpuid1
  2538. movl %ecx,_ecx_cpuid1
  2539. end ['ebx'];
  2540. has_mmx_support:=(_edx_cpuid1 and $800000)<>0;
  2541. if ((_edx_cpuid1 and $2000000)<>0) then
  2542. begin
  2543. os_supports_sse:=true;
  2544. sse_check:=true;
  2545. asm
2546. { force an SSE exception if SSE is not supported; the exception handler
2547. then sets os_supports_sse to false }
  2548. { don't change this instruction, the code above depends on its size }
  2549. {$ifdef OLD_ASSEMBLER}
  2550. .byte 0x0f,0x28,0xf7
  2551. {$else}
  2552. movaps %xmm7, %xmm6
2553. {$endif OLD_ASSEMBLER}
  2554. end;
  2555. sse_check:=false;
  2556. has_sse_support:=os_supports_sse;
  2557. end;
  2558. if has_sse_support then
  2559. begin
  2560. has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
  2561. has_sse3_support:=((_ecx_cpuid1 and $200)<>0);
  2562. has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1);
  2563. { now avx }
  2564. asm
  2565. xorl %eax,%eax
  2566. cpuid
  2567. movl %eax,_eax
  2568. end;
  2569. if _eax>=7 then
  2570. begin
  2571. asm
  2572. movl $7,%eax
  2573. xorl %ecx,%ecx
  2574. cpuid
  2575. movl %ebx,_ebx_cpuid7
  2576. end;
  2577. fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
  2578. if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
  2579. begin
  2580. asm
  2581. xorl %ecx,%ecx
  2582. .byte 0x0f,0x01,0xd0 { xgetbv }
  2583. movl %eax,_eax
  2584. end;
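{ XCR0 bit 1 = XMM state, bit 2 = YMM state: both must be enabled by the OS before the AVX cpuid bits can be trusted. }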
  2585. if (_eax and 6)=6 then
  2586. begin
  2587. has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
  2588. has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
  2589. end;
  2590. end;
  2591. end;
  2592. end;
  2593. end;
  2594. { don't let libraries influence the FPU cw set by the host program }
  2595. if IsLibrary then
  2596. begin
  2597. Default8087CW:=Get8087CW;
  2598. if has_sse_support then
  2599. DefaultMXCSR:=GetMXCSR;
  2600. end;
  2601. SysResetFPU;
  2602. fpc_cpucodeinit_performed:=true;
  2603. end;
  2604. {$if not defined(darwin) and defined(regcall) }
  2605. { darwin requires that the stack is aligned to 16 bytes when calling another function }
  2606. {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
  2607. {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
  2608. Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
  2609. asm
  2610. movl (%eax),%edx
  2611. testl %edx,%edx
  2612. jz .Lquit
  2613. movl $0,(%eax) // s:=nil
  2614. cmpl $1,-8(%edx) // exit if refcount<1
  2615. je .Lfree // skip the decrement if refcount=1.
  2616. jl .Lquit
  2617. {$ifdef FPC_PIC}
  2618. call fpc_geteipasecx
  2619. addl $_GLOBAL_OFFSET_TABLE_,%ecx
  2620. movl ismultithread@GOT(%ecx),%ecx
  2621. cmpl $0,(%ecx)
  2622. {$else FPC_PIC}
  2623. cmpl $0,ismultithread
  2624. {$endif FPC_PIC}
  2625. je .Lskiplock
  2626. .byte 0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
  2627. .Lskiplock:
  2628. decl -8(%edx)
  2629. jz .Lfree
  2630. .Lquit:
  2631. ret
  2632. .Lfree:
  2633. leal -12(%edx),%eax // points to start of allocation
2634. jmp FPC_FREEMEM // nostackframe + jmp allows ignoring stack alignment.
  2635. end;
  2636. function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
  2637. {$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
  2638. Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
  2639. asm
  2640. movl (%eax),%edx
  2641. testl %edx,%edx
  2642. jz .Lunchanged
  2643. cmpl $1,-8(%edx)
  2644. jne fpc_truely_ansistr_unique
  2645. .Lunchanged:
  2646. movl %edx,%eax
  2647. end;
  2648. {$endif FPC_HAS_FEATURE_ANSISTRINGS}
  2649. {$endif ndef darwin and defined(regcall) }
  2650. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  2651. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
  2652. procedure ReadBarrier;assembler;nostackframe;
  2653. asm
  2654. {$ifdef CPUX86_HAS_SSE2}
  2655. lfence
  2656. {$else CPUX86_HAS_SSE2}
  2657. lock
  2658. addl $0,0(%esp)
  2659. {$endif CPUX86_HAS_SSE2}
  2660. end;
  2661. procedure ReadDependencyBarrier;
  2662. begin
  2663. { reads imply barrier on earlier reads depended on }
  2664. end;
  2665. procedure ReadWriteBarrier;assembler;nostackframe;
  2666. asm
  2667. {$ifdef CPUX86_HAS_SSE2}
  2668. mfence
  2669. {$else CPUX86_HAS_SSE2}
  2670. lock
  2671. addl $0,0(%esp)
  2672. {$endif CPUX86_HAS_SSE2}
  2673. end;
  2674. procedure WriteBarrier;assembler;nostackframe;
  2675. asm
  2676. {$ifdef CPUX86_HAS_SSEUNIT}
  2677. sfence
  2678. {$endif CPUX86_HAS_SSEUNIT}
  2679. end;
  2680. {$endif}
  2681. {$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
  2682. {$define FPC_SYSTEM_HAS_BSF_QWORD}
  2683. function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
  2684. asm
  2685. {$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
  2686. mov $255-32,%eax { On AMD, BSF/R are documented to not change the destination on zero input. }
  2687. bsfl 8(%esp),%eax { On Intel, destination is formally undefined on zero input, but in practice the behavior is the same. }
  2688. add $32,%eax
  2689. bsfl 4(%esp),%eax
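{ e.g. AValue = 0: both bsfl's leave eax untouched, giving (255-32)+32 = 255 ('not found');
AValue = 1 shl 32: eax = bsf(hi)+32 = 32, and the zero low dword leaves it unchanged. }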
  2690. {$else}
  2691. bsfl 4(%esp),%eax
  2692. jz .L1
  2693. ret $8
  2694. .L1:
  2695. bsfl 8(%esp),%eax
  2696. jz .L2
  2697. add $32,%eax
  2698. ret $8
  2699. .L2:
  2700. movl $255,%eax
  2701. {$endif}
  2702. end;
  2703. {$endif FPC_SYSTEM_HAS_BSF_QWORD}
  2704. {$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
  2705. {$define FPC_SYSTEM_HAS_BSR_QWORD}
  2706. function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
  2707. asm
  2708. {$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
  2709. mov $255,%eax
  2710. bsrl 4(%esp),%eax
  2711. sub $32,%eax
  2712. bsrl 8(%esp),%eax
  2713. add $32,%eax
  2714. {$else}
  2715. mov 8(%esp),%eax
  2716. test %eax,%eax
  2717. jnz .L1 { Speculate Hi(q) = 0. }
  2718. bsrl 4(%esp),%eax
  2719. jz .L2
  2720. ret $8
  2721. .L1:
  2722. bsrl %eax,%eax
  2723. add $32,%eax
  2724. ret $8
  2725. .L2:
  2726. movl $255,%eax
  2727. {$endif}
  2728. end;
  2729. {$endif FPC_SYSTEM_HAS_BSR_QWORD}
  2730. {$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
  2731. {$define FPC_SYSTEM_HAS_SAR_QWORD}
  2732. function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
  2733. asm
  2734. movl 8(%esp),%edx
  2735. movzbl %al,%ecx
  2736. cmpb $32,%al
  2737. jnb .L1
  2738. movl 4(%esp),%eax
  2739. shrdl %cl,%edx,%eax
  2740. sarl %cl,%edx
  2741. ret $8
  2742. .L1:
  2743. movl %edx,%eax
  2744. sarl $31,%edx
  2745. sarl %cl,%eax // uses 5 lower bits of cl.
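{ e.g. Shift = 36 takes this path: eax := Hi(AValue) sar (36 and 31) = sar 4, edx := sign,
so fpc_SarInt64(-(int64(1) shl 40), 36) = -16. }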
  2746. end;
  2747. {$endif FPC_SYSTEM_HAS_SAR_QWORD}
  2748. {$ifndef FPC_SYSTEM_HAS_UMUL64X64_128}
  2749. {$define FPC_SYSTEM_HAS_UMUL64X64_128}
  2750. function UMul64x64_128(a,b: uint64; out rHi: uint64): uint64; assembler; nostackframe;
  2751. { [esp + 12] = a, [esp + 4] = b, eax = rHi }
  2752. asm
2753. {                                            Hi(a)              Lo(a)
2754.   x                                          Hi(b)              Lo(b)
2755.   ---------------------------------------------------------------------------------
2756.                                              Hi(Lo(a) * Lo(b))  Lo(Lo(a) * Lo(b))
2757.   +                      Hi(Hi(a) * Lo(b))   Lo(Hi(a) * Lo(b))
2758.   +                      Hi(Lo(a) * Hi(b))   Lo(Lo(a) * Hi(b))
2759.   +  Hi(Hi(a) * Hi(b))   Lo(Hi(a) * Hi(b))
2760.      edi                 esi                 ebx, then edx      eax }
  2761. push %ebx
  2762. push %esi
  2763. push %edi
  2764. mov %eax, %ecx { ecx = rHi. }
  2765. mov 12+16(%esp), %eax
  2766. mull 12+8(%esp) { edx:eax = Hi(a) * Hi(b). }
  2767. mov %eax, %esi
  2768. mov %edx, %edi { edi:esi = Hi(a) * Hi(b). }
  2769. mov 12+16(%esp), %eax
  2770. mull 12+4(%esp) { edx:eax = Hi(a) * Lo(b). }
  2771. mov %eax, %ebx
  2772. add %edx, %esi { edi:esi += Hi(Hi(a) * Lo(b)). }
  2773. adc $0, %edi
  2774. mov 12+12(%esp), %eax
  2775. mull 12+8(%esp) { edx:eax = Lo(a) * Hi(b). }
  2776. add %eax, %ebx // edi:esi:ebx += Lo(a) * Hi(b).
  2777. adc %edx, %esi
  2778. adc $0, %edi
  2779. mov 12+12(%esp), %eax
  2780. mull 12+4(%esp) { edx:eax = Lo(a) * Lo(b). }
  2781. add %ebx, %edx { edi:esi:edx += Hi(Lo(a) * Lo(b)). }
  2782. adc $0, %esi
  2783. adc $0, %edi
  2784. mov %esi, (%ecx)
  2785. mov %edi, 4(%ecx)
  2786. pop %edi
  2787. pop %esi
  2788. pop %ebx
  2789. end;
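{ Plain-Pascal cross-check of the partial-product sum above (illustrative sketch,
  not compiled into the RTL):

    function UMul64x64_128_Ref(a, b: qword; out rHi: qword): qword;
    var
      ll, lh, hl, hh, mid: qword;
    begin
      ll := (a and $FFFFFFFF) * (b and $FFFFFFFF);  // Lo(a)*Lo(b)
      lh := (a and $FFFFFFFF) * (b shr 32);         // Lo(a)*Hi(b)
      hl := (a shr 32) * (b and $FFFFFFFF);         // Hi(a)*Lo(b)
      hh := (a shr 32) * (b shr 32);                // Hi(a)*Hi(b)
      // middle 32-bit column plus carries out of the low column
      mid := (ll shr 32) + (lh and $FFFFFFFF) + (hl and $FFFFFFFF);
      rHi := hh + (lh shr 32) + (hl shr 32) + (mid shr 32);
      result := (mid shl 32) or (ll and $FFFFFFFF);
    end;
}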
  2790. {$endif FPC_SYSTEM_HAS_UMUL64X64_128}