i386.inc 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2000 by the Free Pascal development team.
  4. Processor dependent implementation for the system unit for
  5. intel i386+
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
{$if not(defined(VER3_0)) and defined(linux)}
{$define FPC_SYSTEM_STACKALIGNMENT16}
{$endif not(defined(VER3_0)) and defined(linux)}

{****************************************************************************
                                Primitives
****************************************************************************}

var
  { Set by OS-specific startup once it has established that the OS preserves
    SSE state across context switches — TODO confirm against platform code. }
  os_supports_sse : boolean;
  { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
  sse_check : boolean;
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }

{ All inline assembler below is AT&T syntax: op src,dst. }
{$asmmode ATT}
function cpuid_support : boolean;assembler;nostackframe;
{
  Check if the ID-flag can be changed, if changed then CpuID is supported.
  Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
}
asm
  { EFLAGS bit 21 (ID, mask $200000) is toggleable iff the CPU implements CPUID. }
  pushfl
  movl (%esp),%eax        { eax = current EFLAGS }
  xorl $0x200000,%eax     { toggle the ID bit }
  pushl %eax
  popfl                   { try to write the toggled value to EFLAGS }
  pushfl
  popl %eax               { eax = EFLAGS after the write attempt }
  xorl (%esp),%eax        { diff against original EFLAGS still on the stack }
  popfl                   { restore the caller's EFLAGS }
  testl $0x200000,%eax    { nonzero iff the ID bit actually changed }
  setnz %al               { Result := ID bit was toggleable }
end;
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
{ Intentionally empty on i386: feature detection is deferred, see comment inside. }
procedure fpc_cpuinit;
begin
  { because of the brain dead sse detection on x86, this test is postponed to fpc_cpucodeinit which
    must be implemented OS dependent (FK)
    has_sse_support:=sse_support;
    has_mmx_support:=mmx_support;
  }
end;
{$ifndef darwin}
{ Loads the caller's return address (the EIP at the call site) into ebx.
  Presumably used by position-independent code to establish a GOT base —
  TODO confirm against the code generator. }
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
  movl (%esp),%ebx      { ebx := return address }
end;

{ Same as fpc_geteipasebx, but delivers the address in ecx. }
procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
  movl (%esp),%ecx      { ecx := return address }
end;
{$endif}
  62. {$if not defined(FPC_SYSTEM_HAS_MOVE)
  63. and not defined(OLD_ASSEMBLER)
  64. and not defined(darwin)}
  65. {$i fastmove.inc}
  66. {$endif}
  67. {$ifndef FPC_SYSTEM_HAS_MOVE}
  68. {$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
{ Overlap-safe memory copy of count bytes from source to dest.
  Register convention on entry: eax = @source, edx = @dest, ecx = count.
  Copies forward when dest < source or the ranges do not overlap,
  backward (with DF set) when dest lies inside [source, source+count). }
var
  saveesi,saveedi : longint;
asm
  movl %edi,saveedi       { esi/edi must be preserved for the caller }
  movl %esi,saveesi
  movl %eax,%esi          { esi = source }
  movl %edx,%edi          { edi = dest }
  movl %ecx,%edx          { edx = count }
  movl %edi,%eax
  { check for zero or negative count }
  cmpl $0,%edx
  jle .LMoveEnd
  { Check for back or forward }
  sub %esi,%eax           { eax = dest - source (unsigned) }
  jz .LMoveEnd            { Do nothing when source=dest }
  jc .LFMove              { Do forward, dest<source }
  cmp %edx,%eax
  jb .LBMove              { Dest is in range of move, do backward }
  { Forward Copy }
.LFMove:
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  cmpl $15,%edx           { small copies: plain byte loop }
  jl .LFMove1
  movl %edi,%ecx          { Align on 32bits }
  negl %ecx
  andl $3,%ecx            { ecx = bytes up to 4-alignment of dest }
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx            { edx = leftover bytes after the dword copy }
  shrl $2,%ecx            { ecx = dword count }
  rep
  movsl
.LFMove1:
  movl %edx,%ecx          { copy remaining 0..3 (or all, if count<15) bytes }
  rep
  movsb
  jmp .LMoveEnd
  { Backward Copy }
.LBMove:
  std                     { DF=1: string ops run downward }
  addl %edx,%esi
  addl %edx,%edi
  movl %edi,%ecx
  decl %esi               { point at the last byte of each block }
  decl %edi
  cmpl $15,%edx
  jl .LBMove1
  negl %ecx               { Align on 32bits }
  andl $3,%ecx
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx
  shrl $2,%ecx
  subl $3,%esi            { step back to the last full dword }
  subl $3,%edi
  rep
  movsl
  addl $3,%esi            { back to byte granularity for the tail }
  addl $3,%edi
.LBMove1:
  movl %edx,%ecx
  rep
  movsb
  cld                     { ABI expects DF cleared on return }
.LMoveEnd:
  movl saveedi,%edi
  movl saveesi,%esi
end;
  144. {$endif FPC_SYSTEM_HAS_MOVE}
  145. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  146. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  147. or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  148. or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  149. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  150. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  151. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
const
  { Byte-count thresholds above which 'rep stos' beats the SIMD/dword loops;
    much lower when the CPU advertises ERMSB. }
  FillXxxx_RepStosThreshold_ERMS = 1024;
  FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;

{ Shared large-fill backend using 'rep stosl' on the aligned middle,
  with unaligned uint32 writes covering both edges. }
procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  mov %ecx, (%eax) { Write first 4 bytes unaligned. }
  push %ecx { pattern }
  push %edi
  mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
  xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
  shl $3, %ecx { ecx = misalignment of x in bits. }
  rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
  add %edi, %edx { edx = x end }
  lea -1(%edx), %ecx { ecx = x end - 1. }
  add $4, %edi
  and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
  and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
  sub %edi, %ecx { ecx = byte count between them. }
  shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
  rep stosl
  pop %edi
  pop %ecx
  mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
  180. {$endif FillChar/Word/DWord required.}
label
  { Entry point inside the procedure below, shared with FillChar_SSE2* and
    FillQWord_SSE2, which prepare xmm0/xmm1 themselves before jumping here. }
  FillXxxx_MoreThanTwoXMMs;

{ SSE2 fill backend: unaligned 16-byte writes at both edges, aligned 16-byte
  writes (non-temporal above NtThreshold) for the middle. }
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
const
  NtThreshold = 4 * 1024 * 1024;
asm
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  cmp $32, %edx
  ja .LMoreThanTwoVectors
  ret
  .byte 102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }

  { x can start and end misaligned on the vector boundary:
    x = ~~][H1][H2][...][T2][T1]~
        [UH]              [UT]
    UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
.LMoreThanTwoVectors:
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern }
  movd %esi, %xmm1
  pshufd $0, %xmm1, %xmm1
  pop %esi

{ FillChar (to skip the misaligning above) and FillQWord jump here.
  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs:
  lea -65(%eax,%edx), %ecx
  and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
  and $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
  movdqa %xmm1, 16(%eax) { Write H1. }
  cmp $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
  jle .LOneAlignedTailWrite
  movdqa %xmm1, 32(%eax) { Write H2. }
  cmp $81, %edx { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
  jle .LTwoAlignedTailWrites
  cmp $113, %edx { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
  jle .LFourAlignedTailWrites
  add $48, %eax
  cmp $NtThreshold, %edx
  jae .L64xNT_Body
.balign 16 { no-op }
.L64x_Body:
  movdqa %xmm1, (%eax)
  movdqa %xmm1, 16(%eax)
  movdqa %xmm1, 32(%eax)
  movdqa %xmm1, 48(%eax)
  add $64, %eax
  cmp %ecx, %eax
  jb .L64x_Body
.LFourAlignedTailWrites:
  movdqa %xmm1, (%ecx) { T4 }
  movdqa %xmm1, 16(%ecx) { T3 }
.LTwoAlignedTailWrites:
  movdqa %xmm1, 32(%ecx) { T2 }
.LOneAlignedTailWrite:
  movdqa %xmm1, 48(%ecx) { T1 }
  ret

.balign 16
.L64xNT_Body:
  { Non-temporal stores bypass the cache for very large fills. }
  movntdq %xmm1, (%eax)
  movntdq %xmm1, 16(%eax)
  movntdq %xmm1, 32(%eax)
  movntdq %xmm1, 48(%eax)
  add $64, %eax
  cmp %ecx, %eax
  jb .L64xNT_Body
  sfence { order NT stores before returning }
  jmp .LFourAlignedTailWrites
end;
  255. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  256. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  257. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{ Non-SSE fill backend: unaligned dword writes at both edges, aligned
  2-dword-per-iteration loop for the middle. }
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
asm
  mov %ecx, (%eax) { Write first 4 bytes. }
  lea -9(%eax,%edx), %edx
  mov %ecx, 5(%edx) { Write last 4 bytes. }
  and $-4, %edx { edx = loop bound. }
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern }
  add $4, %eax
  and $-4, %eax { eax = first 4-aligned pointer strictly inside the block }
.balign 16
.L8xLoop:
  mov %esi, (%eax)
  mov %esi, 4(%eax)
  add $8, %eax
  cmp %edx, %eax
  jb .L8xLoop
  mov %esi, (%edx) { final (possibly overlapping) 8 aligned bytes }
  mov %esi, 4(%edx)
  pop %esi
end;
{ Small-fill backend: 2 or 4 possibly-overlapping dword writes cover
  any count from 4 to 16 bytes without a loop. }
procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16. }
asm
  mov %ecx, (%eax)          { bytes 0..3 }
  cmp $8, %edx
  jle .LLast4
  mov %ecx, 4(%eax)         { bytes 4..7 }
  mov %ecx, -8(%eax,%edx)   { 8 bytes before the end }
.LLast4:
  mov %ecx, -4(%eax,%edx)   { last 4 bytes (overlaps previous writes as needed) }
end;
  294. {$endif FillChar/Word/DWord required.}
  295. {$endif FillChar/Word/DWord/QWord required.}
  296. {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  297. {$define FPC_SYSTEM_HAS_FILLCHAR}
procedure FillChar_3OrLess; assembler; nostackframe;
{ eax — x, cl — fill value, edx — byte count, Low(int32) <= edx <= 3.
  Branchless for 1..3: writes first, last and middle byte (they overlap
  for counts 1 and 2). Counts <= 0 write nothing. }
asm
  test %edx, %edx
  jle .LQuit
  mov %cl, (%eax)           { first byte }
  mov %cl, -1(%eax,%edx)    { last byte }
  shr $1, %edx
  mov %cl, (%eax,%edx)      { middle byte — only distinct when count = 3 }
.LQuit:
end;
{ FillChar for CPUs without SSE2. eax = @x, edx = count, cl = value. }
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx { broadcast the byte into all 4 bytes of ecx }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{ FillChar for SSE2 CPUs without ERMSB; large fills go to 'rep stos'
  only above the high NoERMS threshold. }
procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx { broadcast the byte into all 4 bytes of ecx }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jae FillXxxx_U32Pattern_RepStos_8OrMore
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  movdqa %xmm0, %xmm1 { a byte pattern needs no misaligning: aligned pattern = unaligned pattern }
  cmp $32, %edx
  ja FillXxxx_MoreThanTwoXMMs { > 32 bytes: continue in the shared SSE2 body; otherwise done }
end;
{ FillChar for SSE2 CPUs with ERMSB: same as FillChar_SSE2, but hands off
  to 'rep stos' at the much lower ERMS threshold. }
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx { broadcast the byte into all 4 bytes of ecx }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jae FillXxxx_U32Pattern_RepStos_8OrMore
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  movdqa %xmm0, %xmm1 { a byte pattern needs no misaligning: aligned pattern = unaligned pattern }
  cmp $32, %edx
  ja FillXxxx_MoreThanTwoXMMs { > 32 bytes: continue in the shared SSE2 body; otherwise done }
end;
  355. procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
  356. var
  357. FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;
  358. procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
  359. begin
  360. if not fpc_cpucodeinit_performed then
  361. begin
  362. FillChar_Plain(x, count, value);
  363. exit;
  364. end;
  365. if fast_large_repmovstosb then
  366. FillChar_Impl := @FillChar_SSE2_ERMS
  367. else if has_sse2_support then
  368. FillChar_Impl := @FillChar_SSE2
  369. else
  370. FillChar_Impl := @FillChar_Plain;
  371. FillChar_Impl(x, count, value);
  372. end;
{ Public FillChar: forwards through FillChar_Impl, which is rebound to the
  best implementation by FillChar_Dispatch on the first call. }
procedure FillChar(var x;count:SizeInt;value:byte);
begin
  FillChar_Impl(x, count, value);
end;
  377. {$endif FPC_SYSTEM_HAS_FILLCHAR}
  378. {$ifndef FPC_SYSTEM_HAS_FILLWORD}
  379. {$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord_3OrLess; assembler; nostackframe;
{ eax — x, cx — fill value, edx — word count, Low(int32) <= edx <= 3.
  Same first/last/middle trick as FillChar_3OrLess, scaled by 2. }
asm
  test %edx, %edx
  jle .LQuit
  mov %cx, (%eax)           { first word }
  mov %cx, -2(%eax,%edx,2)  { last word }
  shr $1, %edx
  mov %cx, (%eax,%edx,2)    { middle word — only distinct when count = 3 }
.LQuit:
end;
{ FillWord for CPUs without SSE2. eax = @x, edx = count (in words), cx = value. }
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx { edx = byte count }
  movzwl %cx, %ecx
  imul $0x00010001, %ecx { broadcast the word into both halves of ecx }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{ FillWord for SSE2 CPUs without ERMSB. }
procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx { edx = byte count }
  movzwl %cx, %ecx
  imul $0x00010001, %ecx { broadcast the word into both halves of ecx }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
{ FillWord for SSE2 CPUs with ERMSB: lower 'rep stos' threshold. }
procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx { edx = byte count }
  movzwl %cx, %ecx
  imul $0x00010001, %ecx { broadcast the word into both halves of ecx }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
  427. procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;
  428. var
  429. FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;
  430. procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
  431. begin
  432. if not fpc_cpucodeinit_performed then
  433. begin
  434. FillWord_Plain(x, count, value);
  435. exit;
  436. end;
  437. if fast_large_repmovstosb then
  438. FillWord_Impl := @FillWord_SSE2_ERMS
  439. else if has_sse2_support then
  440. FillWord_Impl := @FillWord_SSE2
  441. else
  442. FillWord_Impl := @FillWord_Plain;
  443. FillWord_Impl(x, count, value);
  444. end;
{ Public FillWord: forwards through FillWord_Impl, rebound on first call. }
procedure FillWord(var x;count:SizeInt;value:word);
begin
  FillWord_Impl(x, count, value);
end;
  449. {$endif FPC_SYSTEM_HAS_FILLWORD}
  450. {$ifndef FPC_SYSTEM_HAS_FILLDWORD}
  451. {$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord_4OrLess; assembler; nostackframe;
{ eax — x, ecx — fill value, edx — dword count, Low(int32) <= edx <= 4.
  Counts 2..4 are covered by four possibly-overlapping dword writes. }
asm
  cmp $1, %edx
  jl .LQuit                 { count <= 0: nothing to do }
  mov %ecx, (%eax)          { first dword }
  je .LQuit                 { count = 1: done }
  mov %ecx, 4(%eax)         { second dword }
  mov %ecx, -8(%eax,%edx,4) { second-to-last dword }
  mov %ecx, -4(%eax,%edx,4) { last dword }
.LQuit:
end;
{ FillDWord for CPUs without SSE2; the value is already a full uint32 pattern. }
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx { edx = byte count }
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{ FillDWord for SSE2 CPUs without ERMSB. }
procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx { edx = byte count }
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
{ FillDWord for SSE2 CPUs with ERMSB: lower 'rep stos' threshold. }
procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx { edx = byte count }
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;

var
  FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;

{ First FillDWord call lands here: selects the best implementation for this
  CPU, caches it in FillDWord_Impl, and forwards the call to it. }
procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      { CPU features not detected yet: use the safe baseline without caching. }
      FillDWord_Plain(x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillDWord_Impl := @FillDWord_SSE2_ERMS
  else if has_sse2_support then
    FillDWord_Impl := @FillDWord_SSE2
  else
    FillDWord_Impl := @FillDWord_Plain;
  FillDWord_Impl(x, count, value);
end;
{ Public FillDWord: forwards through FillDWord_Impl, rebound on first call. }
procedure FillDWord(var x;count:SizeInt;value:dword);
begin
  FillDWord_Impl(x, count, value);
end;
  510. {$endif FPC_SYSTEM_HAS_FILLDWORD}
  511. {$ifndef FPC_SYSTEM_HAS_FILLQWORD}
  512. {$define FPC_SYSTEM_HAS_FILLQWORD}
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value.
  Simple 2-dword-per-qword store loop; the 64-bit value arrives on the stack. }
asm
  test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
  jle .LQuit
  push %esi
  mov 4+4(%esp), %esi { esi = value[0:31] }
  mov 4+8(%esp), %ecx { ecx = value[32:63] }
.balign 16
.LLoop:
  mov %esi, (%eax)    { low half }
  mov %ecx, 4(%eax)   { high half }
  add $8, %eax
  sub $1, %edx
  jnz .LLoop
  pop %esi
.LQuit:
end;
procedure FillQWord_SSE2(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value.
  For > 4 elements, builds the 16-byte pattern in xmm0/xmm1 and tail-jumps
  into the shared SSE2 body FillXxxx_MoreThanTwoXMMs; to do so it must first
  pop the 8-byte value argument off the stack (the shared body returns with
  a plain 'ret'). }
asm
  cmp $4, %edx
  jle .L4OrLess
  movq 4(%esp), %xmm0
  punpcklqdq %xmm0, %xmm0 { xmm0 = value repeated twice }
  { Stack is 12 bytes:
    [esp] = return address, [esp + 4] = value (not required anymore).
    Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
    [esp] = return address. }
  mov (%esp), %ecx
  add $8, %esp
  mov %ecx, (%esp)
  shl $3, %edx { edx = byte count }
  movdqu %xmm0, (%eax)
  movdqu %xmm0, -16(%eax,%edx)
  movdqa %xmm0, %xmm1
  test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
  jz FillXxxx_MoreThanTwoXMMs
  mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x into xmm1. }
  shl $3, %ecx
  and $63, %ecx
  movd %ecx, %xmm3
  psllq %xmm3, %xmm1
  neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
  and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
  movd %ecx, %xmm3
  movdqa %xmm0, %xmm2
  psrlq %xmm3, %xmm2
  por %xmm2, %xmm1 { xmm1 = pattern rotated by the byte misalignment of x }
  jmp FillXxxx_MoreThanTwoXMMs
.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
  cmp $1, %edx
  jl .LQuit
  mov 4(%esp), %ecx { low half of value: first, second, and last two qwords (overlapping) }
  mov %ecx, (%eax)
  je .LSecondHalfOf1
  mov %ecx, 8(%eax)
  mov %ecx, -16(%eax,%edx,8)
  mov %ecx, -8(%eax,%edx,8)
  mov 8(%esp), %ecx { high half of value: same four positions }
  mov %ecx, 4(%eax)
  mov %ecx, 12(%eax)
  mov %ecx, -12(%eax,%edx,8)
  mov %ecx, -4(%eax,%edx,8)
.LQuit:
  ret $8 { pop the 8-byte value argument }
.LSecondHalfOf1:
  mov 8(%esp), %ecx
  mov %ecx, 4(%eax)
end;
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;

var
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;

{ First FillQWord call lands here: picks SSE2 or plain (unlike the smaller
  fills there is no ERMS variant), caches it, and forwards the call. }
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      { CPU features not detected yet: use the safe baseline without caching. }
      FillQWord_Plain(x, count, value);
      exit;
    end;
  if has_sse2_support then
    FillQWord_Impl := @FillQWord_SSE2
  else
    FillQWord_Impl := @FillQWord_Plain;
  FillQWord_Impl(x, count, value);
end;
{ Public FillQWord: forwards through FillQWord_Impl, rebound on first call. }
procedure FillQWord(var x;count:SizeInt;value:qword);
begin
  FillQWord_Impl(x, count, value);
end;
  603. {$endif FPC_SYSTEM_HAS_FILLQWORD}
  604. {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  605. {$define FPC_SYSTEM_HAS_INDEXBYTE}
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ Returns the index of the first occurrence of b in buf[0..len-1], or -1.
  eax = @buf, edx = len, cl = b.
  Scans a dword at a time using the SWAR zero-byte test:
  after x := dword xor pattern, a zero byte in x (i.e. a match) makes
  ((x-$01010101) xor x) and (not x) and $80808080 nonzero. }
asm
  push %esi
  push %edi
  push %eax { save initial value of 'buf' }
  cmp $4,%edx { less than 4 bytes, just test byte by byte. }
  jb .Ltail
  mov %cl,%ch { prepare pattern }
  movzwl %cx,%esi
  shl $16,%ecx
  or %esi,%ecx { ecx = b replicated into all 4 bytes }
.Lalignloop:
  test $3,%al { align to 4 bytes if necessary }
  je .Laligned
  cmp %cl,(%eax)
  je .Lexit
  inc %eax
  dec %edx
  jmp .Lalignloop
.balign 16 { Main loop, unrolled 4 times for speed }
.Lloop:
  mov (%eax),%esi { load dword }
  xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
  lea -0x01010101(%esi),%edi
  xor %esi,%edi { (x-0x01010101) xor x }
  not %esi
  and $0x80808080,%esi
  and %edi,%esi { ((x-0x01010101) xor x) and (not x) and 0x80808080 }
  jnz .Lfound { one of the bytes matches }
  mov 4(%eax),%esi
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound4
  mov 8(%eax),%esi
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound8
  mov 12(%eax),%esi
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound12
  add $16,%eax
.Laligned:
  sub $16,%edx
  jae .Lloop { Still more than 16 bytes remaining }
  { Process remaining bytes (<16 left at this point) }
  { length is offset by -16 at this point }
.Lloop2:
  cmp $4-16,%edx { < 4 bytes left? }
  jb .Ltail
  mov (%eax),%esi
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jne .Lfound
  add $4,%eax
  sub $4,%edx
  jmp .Lloop2
.Ltail: { Less than 4 bytes remaining, check one by one }
  and $3, %edx
  jz .Lnotfound
.Lloop3:
  cmp %cl,(%eax)
  je .Lexit
  inc %eax
  dec %edx
  jnz .Lloop3
.Lnotfound:
  or $-1,%eax { Result := -1 }
  jmp .Lexit1
  { add missing source pointer increments }
.Lfound12:
  add $4,%eax
.Lfound8:
  add $4,%eax
.Lfound4:
  add $4,%eax
.Lfound:
  { esi holds the match mask; locate the lowest matching byte. }
  test $0xff,%esi
  jnz .Lexit
  inc %eax
  test $0xff00,%esi
  jnz .Lexit
  inc %eax
  test $0xff0000,%esi
  jnz .Lexit
  inc %eax
.Lexit:
  sub (%esp),%eax { Result := current pointer - original buf }
.Lexit1:
  pop %ecx { removes initial 'buf' value }
  pop %edi
  pop %esi
end;
{ IndexByte_SSE2: vectorized IndexByte, 16 bytes per iteration.
  In:  eax = @buf, edx = len, cl = b.
  Out: eax = index of first matching byte, or -1.
  The first (possibly misaligned) 16-byte chunk is read from the aligned
  address below buf; bits from before buf are shifted out of the match mask.
  A match past 'len' is rejected by the final length compare. }
function IndexByte_SSE2(const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
  test     %edx, %edx
  jz       .Lnotfound          { exit if len=0 }
  push     %ebx
  movd     %ecx, %xmm1
  lea      16(%eax), %ecx      { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
  punpcklbw %xmm1, %xmm1       { replicate b into all 16 bytes of xmm1... }
  and      $-0x10, %ecx        { first aligned address after buf }
  punpcklbw %xmm1, %xmm1
  pshufd   $0, %xmm1, %xmm1    { ...replication complete }
  movdqa   -16(%ecx), %xmm0    { Fetch first 16 bytes (up to 15 bytes before target) }
  sub      %eax, %ecx          { ecx=number of valid bytes, eax=original ptr }
  pcmpeqb  %xmm1, %xmm0        { compare with pattern and get bitmask }
  pmovmskb %xmm0, %ebx
  shl      %cl, %ebx           { shift valid bits into high word }
  and      $0xffff0000, %ebx   { clear low word containing invalid bits }
  shr      %cl, %ebx           { shift back }
  jz       .Lcontinue
.Lmatch:
  bsf      %ebx, %ebx          { position of lowest set bit = match offset in chunk }
  lea      -16(%ecx,%ebx), %eax
  pop      %ebx
  cmp      %eax, %edx          { check against the buffer length }
  jbe      .Lnotfound
  ret
  .balign 16
.Lloop:
  movdqa   (%eax,%ecx), %xmm0  { eax and ecx may have any values, }
  add      $16, %ecx           { but their sum is evenly divisible by 16. }
  pcmpeqb  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  test     %ebx, %ebx
  jnz      .Lmatch
.Lcontinue:
  cmp      %ecx, %edx
  ja       .Lloop
  pop      %ebx
.Lnotfound:
  or       $-1, %eax
end;
  756. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;
  757. var
  758. IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;
  759. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
  760. begin
  761. if not fpc_cpucodeinit_performed then
  762. exit(IndexByte_Plain(buf,len,b));
  763. if has_sse2_support then
  764. IndexByte_Impl:=@IndexByte_SSE2
  765. else
  766. IndexByte_Impl:=@IndexByte_Plain;
  767. result:=IndexByte_Impl(buf,len,b);
  768. end;
  769. function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
  770. begin
  771. result:=IndexByte_Impl(buf,len,b);
  772. end;
  773. {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  774. {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  775. {$define FPC_SYSTEM_HAS_INDEXWORD}
{ IndexWord_Plain: scalar IndexWord.
  In:  eax = @buf, edx = len (in words), cx = b.
  Out: eax = zero-based word index of first match, or -1.
  Simple wordwise scan; the pointer difference is halved to yield an index. }
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
  test  %edx, %edx
  jz    .LNotFound
  push  %eax                  { save original buf for index computation }
.LWordwise_Body:              { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  cmp   %cx, (%eax)
  je    .LFound
  add   $2, %eax
  dec   %edx
  jnz   .LWordwise_Body
  pop   %edx
.LNotFound:
  or    $-1, %eax
  ret
.LFound:
  pop   %edx
  sub   %edx, %eax            { byte offset from start... }
  shr   $1, %eax              { ...converted to word index }
end;
{ IndexWord_SSE2: vectorized IndexWord, 8 words per iteration.
  In:  eax = @buf, edx = len (in words), cx = b.
  Out: eax = word index of first match, or -1.
  Word-aligned buffers use pcmpeqw directly. For odd buffer addresses the
  pattern is byte-swapped and matching is done bytewise: adjacent mask bits
  are ANDed pairwise (carrying one bit between iterations) so only a full
  2-byte match at an even in-buffer offset counts. }
function IndexWord_SSE2(const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
  test     %edx, %edx          { exit if len=0 }
  je       .Lnotfound
  push     %ebx
  movd     %ecx, %xmm1
  punpcklwd %xmm1, %xmm1       { replicate b into all 8 words of xmm1 }
  pshufd   $0, %xmm1, %xmm1
  lea      16(%eax), %ecx
  and      $-16, %ecx          { first aligned address after buf }
  movdqa   -16(%ecx), %xmm0    { Fetch first 16 bytes (up to 14 bytes before target) }
  sub      %eax, %ecx
  test     $1, %eax            { if buffer isn't aligned to word boundary, }
  jnz      .Lunaligned         { use a different algorithm }
  pcmpeqw  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  shl      %cl, %ebx           { shift out mask bits from before buf }
  and      $0xffff0000, %ebx
  shr      %cl, %ebx
  shr      $1, %ecx            { ecx=number of valid bytes }
  test     %ebx, %ebx
  jz       .Lcontinue
.Lmatch:
  bsf      %ebx, %ebx
  shr      $1, %ebx            { in words }
  lea      -8(%ecx,%ebx), %eax
  pop      %ebx
  cmp      %eax, %edx
  jbe      .Lnotfound          { if match is after the specified length, ignore it }
  ret
  .balign 16
.Lloop:
  movdqa   (%eax,%ecx,2), %xmm0 { ecx counts words here, hence scale 2 }
  add      $8, %ecx
  pcmpeqw  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  test     %ebx, %ebx
  jnz      .Lmatch
.Lcontinue:
  cmp      %ecx, %edx
  ja       .Lloop
  pop      %ebx
.Lnotfound:
  or       $-1, %eax
  ret
.Lunaligned:
  push     %esi
  movdqa   %xmm1, %xmm2        { (mis)align the pattern (in this particular case: }
  psllw    $8, %xmm1           { swap bytes of each word of pattern) }
  psrlw    $8, %xmm2
  por      %xmm2, %xmm1
  pcmpeqb  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  shl      %cl, %ebx
  and      $0xffff0000, %ebx
  shr      %cl, %ebx
  xor      %esi, %esi          { nothing to merge yet }
  add      %edx, %edx          { length words -> bytes }
  jmp      .Lcontinue_u
  .balign 16
.Lloop_u:
  movdqa   (%eax,%ecx), %xmm0
  add      $16, %ecx
  pcmpeqb  %xmm1, %xmm0        { compare by bytes }
  shr      $16, %esi           { bit 16 shifts into 0 }
  pmovmskb %xmm0, %ebx
.Lcontinue_u:
  shl      $1, %ebx            { 15:0 -> 16:1 }
  or       %esi, %ebx          { merge bit 0 from previous round }
  mov      %ebx, %esi
  shr      $1, %ebx            { now AND together adjacent pairs of bits }
  and      %esi, %ebx
  and      $0x5555, %ebx       { also reset odd bits }
  jnz      .Lmatch_u
  cmp      %ecx, %edx
  ja       .Lloop_u
.Lnotfound_u:
  pop      %esi
  pop      %ebx
  or       $-1, %eax
  ret
.Lmatch_u:
  bsf      %ebx, %ebx
  lea      -16(%ecx,%ebx), %eax
  cmp      %eax, %edx
  jbe      .Lnotfound_u        { if match is after the specified length, ignore it }
  sar      $1, %eax            { in words }
  pop      %esi
  pop      %ebx
end;
  886. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;
  887. var
  888. IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;
  889. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
  890. begin
  891. if not fpc_cpucodeinit_performed then
  892. exit(IndexWord_Plain(buf,len,b));
  893. if has_sse2_support then
  894. IndexWord_Impl:=@IndexWord_SSE2
  895. else
  896. IndexWord_Impl:=@IndexWord_Plain;
  897. result:=IndexWord_Impl(buf,len,b);
  898. end;
  899. function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
  900. begin
  901. result:=IndexWord_Impl(buf,len,b);
  902. end;
  903. {$endif FPC_SYSTEM_HAS_INDEXWORD}
  904. {$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
  905. {$define FPC_SYSTEM_HAS_INDEXDWORD}
{ IndexDWord_Plain: scalar IndexDWord.
  In:  eax = @buf, edx = len (in dwords), ecx = b.
  Out: eax = dword index of first match, or -1.
  Plain dwordwise scan; pointer difference shr 2 yields the index. }
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
  push  %eax                  { save original buf }
  sub   $4, %eax              { compensate for the add at loop top }
.LDWordwise_Next:             { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
  add   $4, %eax
  sub   $1, %edx
  jb    .LNotFound
  cmp   %ecx, (%eax)
  jne   .LDWordwise_Next
  pop   %edx
  sub   %edx, %eax            { byte offset... }
  shr   $2, %eax              { ...to dword index }
  ret
.LNotFound:
  pop   %edx
  mov   $-1, %eax
end;
{ IndexDWord_SSE2: vectorized IndexDWord, 4 dwords per unaligned load.
  In:  eax = @buf, edx = len (in dwords), ecx = b.
  Out: eax = dword index of first match, or -1.
  Lengths <= 4 use the dwordwise loop. The final partial vector is handled
  by re-reading the last 16 bytes of the buffer (overlapping the previous
  iteration), which cannot produce a false match since overlapped dwords
  were already checked. }
function IndexDWord_SSE2(const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
  push     %eax               { save original buf }
  sub      $4, %edx
  jle      .LDwordwise_Prepare
  movd     %ecx, %xmm1
  pshufd   $0, %xmm1, %xmm1   { replicate b into all 4 dwords }
  .balign 16                  { 1-byte NOP. }
.L4x_Body:
  movdqu   (%eax), %xmm0
  pcmpeqd  %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test     %ecx, %ecx
  jnz      .LFoundAtMask
  add      $16, %eax
  sub      $4, %edx
  jg       .L4x_Body
  lea      (%eax,%edx,4), %eax { step back to the last full 16-byte window }
  movdqu   (%eax), %xmm0
  pcmpeqd  %xmm1, %xmm0
  pmovmskb %xmm0, %ecx
  test     %ecx, %ecx
  jz       .LNothing
.LFoundAtMask:
  bsf      %ecx, %ecx         { byte offset of match within the vector }
  add      %ecx, %eax
.LFoundAtEax:
  pop      %edx
  sub      %edx, %eax         { byte offset from buf... }
  shr      $2, %eax           { ...to dword index }
  ret
  nop                         { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
  add      $3, %edx           { undo bias: edx = len - 1 }
  cmp      $-1, %edx
  je       .LNothing
  .balign 16                  { no-op }
.LDwordwise_Body:
  cmp      (%eax), %ecx
  je       .LFoundAtEax
  add      $4, %eax
  sub      $1, %edx
  jae      .LDwordwise_Body
.LNothing:
  pop      %edx
  or       $-1, %eax
end;
  971. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;
  972. var
  973. IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;
  974. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
  975. begin
  976. if not fpc_cpucodeinit_performed then
  977. exit(IndexDWord_Plain(buf,len,b));
  978. if has_sse2_support then
  979. IndexDWord_Impl:=@IndexDWord_SSE2
  980. else
  981. IndexDWord_Impl:=@IndexDWord_Plain;
  982. result:=IndexDWord_Impl(buf,len,b);
  983. end;
  984. function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
  985. begin
  986. result:=IndexDWord_Impl(buf,len,b);
  987. end;
  988. {$endif FPC_SYSTEM_HAS_INDEXDWORD}
  989. {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
  990. {$define FPC_SYSTEM_HAS_INDEXQWORD}
{ IndexQWord: scalar search for a 64-bit value.
  In:  eax = @buf, edx = len (in qwords), b passed on the stack
       ([esp+4] low dword, [esp+8] high dword before the ebx push).
  Out: eax = qword index of first match, or -1.
  Each candidate is compared as two 32-bit halves; 'ret $8' pops the
  stack-passed qword argument on the found path. }
function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
  push  %ebx
  mov   8(%esp), %ecx         { ecx = b[0:31] }
  mov   12(%esp), %ebx        { ebx = b[32:63] }
  mov   %eax, 8(%esp)         { remember original buf }
  sub   $8, %eax              { compensate for the add at loop top }
  .balign 16                  { no-op }
.LQWordwise_Next:
  add   $8, %eax
  sub   $1, %edx
  jb    .LNotFound
  cmp   %ecx, (%eax)          { low halves equal? }
  jne   .LQWordwise_Next
  cmp   %ebx, 4(%eax)         { high halves equal? }
  jne   .LQWordwise_Next
  sub   8(%esp), %eax         { byte offset from original buf }
  pop   %ebx
  shr   $3, %eax              { to qword index }
  ret   $8                    { also discard the stack-passed qword }
.LNotFound:
  pop   %ebx
  mov   $-1, %eax
end;
  1016. {$endif FPC_SYSTEM_HAS_INDEXQWORD}
  1017. {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  1018. {$define FPC_SYSTEM_HAS_COMPAREBYTE}
{ CompareByte_Plain: scalar memcmp-style comparison.
  In:  eax = @buf1, edx = @buf2, ecx = len.
  Out: eax = 0 if equal, -1/+1 by the first differing byte (unsigned order).
  Works on buf1-relative addressing (edx holds buf2-buf1) and compares a
  dword at a time; differing dwords are byte-swapped so an unsigned dword
  compare orders by the first differing byte. }
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
  { eax = buf1, edx = buf2, ecx = len }
  push  %ebx
  sub   %eax, %edx            { edx = buf2 - buf1 }
  cmp   $3, %ecx
  jle   .LBytewise_Prepare
  { Align buf1 on 4 bytes. }
  mov   (%edx,%eax), %ebx     { first (possibly misaligned) dword of buf2 }
  cmp   (%eax), %ebx
  jne   .L4xDiffer
  lea   -4(%eax,%ecx), %ecx   { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
  and   $-4, %eax
  sub   %eax, %ecx
  .balign 16
.L4x_Next:
  add   $4, %eax
  sub   $4, %ecx              { at .LLast4, ecx is 4 less than remaining bytes }
  jle   .LLast4
  mov   (%edx,%eax), %ebx
  cmp   (%eax), %ebx
  je    .L4x_Next
.L4xDiffer:
  mov   (%eax), %edx          { reload buf1 dword; ebx still holds buf2 dword }
{$ifdef CPUX86_HAS_BSWAP}
  bswap %ebx
  bswap %edx
{$else}
  { bswap emulation for i386s without it: rotate halves and whole }
  rol   $8, %bx
  rol   $16, %ebx
  rol   $8, %bx
  rol   $8, %dx
  rol   $16, %edx
  rol   $8, %dx
{$endif}
  cmp   %ebx, %edx            { unsigned compare of big-endian dwords }
.LDoSbb:
  sbb   %eax, %eax            { CF -> eax = -1 or 0 }
  or    $1, %eax              { -> -1 or +1 }
  pop   %ebx
  ret
.LLast4:
  add   %ecx, %eax            { re-read last dword, overlapping already-checked bytes }
  mov   (%edx,%eax), %ebx
  cmp   (%eax), %ebx
  jne   .L4xDiffer
  xor   %eax, %eax
  pop   %ebx
  ret
.LBytewise_Prepare:
  sub   $1, %ecx
  jb    .LNothing
  .balign 16                  { no-op }
.LBytewise_Body:
  movzbl (%edx,%eax), %ebx
  cmp   %bl, (%eax)
  jne   .LDoSbb
  add   $1, %eax
  sub   $1, %ecx
  jae   .LBytewise_Body
.LNothing:
  xor   %eax, %eax
  pop   %ebx
end;
{ CompareByte_SSE2: vectorized memcmp-style comparison.
  In:  eax = @buf1, edx = @buf2, ecx = len.
  Out: eax = 0 if equal, else sign of (buf1[i] - buf2[i]) at the first
       differing byte (unsigned byte order).
  Paths: len <= 1 special-cased (len < 1 scans unbounded until a difference);
  2..15 bytes over-read as one XMM when neither buffer is within 16 bytes of
  a page end, else compared via scalar dword/word loads; >= 16 bytes use
  unaligned vectors, switching to a buf1-aligned 32-bytes-per-iteration loop
  for long inputs, with the trailing bytes re-read as overlapping vectors. }
function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
  { eax = buf1, edx = buf2, ecx = len }
  cmp      $1, %ecx
  jle      .L1OrLess
  push     %ebx
  cmp      $16, %ecx
  jae      .LVecOrMore
  { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
  mov      %eax, %ebx
  or       %edx, %ebx
  and      $4095, %ebx
  cmp      $4080, %ebx
  ja       .LCantOverReadBoth
  { Over-read both as XMMs. }
  movdqu   (%eax), %xmm0
  movdqu   (%edx), %xmm1
  pcmpeqb  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx                { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
  jz       .LNothing
  bsf      %ebx, %ebx
  cmp      %ecx, %ebx         { Ignore garbage beyond 'len'. }
  jae      .LNothing
  movzbl   (%eax,%ebx), %eax
  movzbl   (%edx,%ebx), %edx
  sub      %edx, %eax
  pop      %ebx
  ret
.LNothing:
  pop      %ebx
  xor      %eax, %eax
  ret
.LVecOrMore:
  { Compare first vectors. }
  movdqu   (%eax), %xmm0
  movdqu   (%edx), %xmm1
  pcmpeqb  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVec0Differs
  sub      $32, %ecx          { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
  jbe      .LLastVec
  { Compare second vectors. }
  movdqu   16(%eax), %xmm0
  movdqu   16(%edx), %xmm1
  pcmpeqb  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVec1Differs
  { More than four vectors: aligned loop. }
  cmp      $32, %ecx
  ja       .LAligned32xLoop_Prepare
  { Compare last two vectors. }
  movdqu   (%eax,%ecx), %xmm0
  movdqu   (%edx,%ecx), %xmm1
  pcmpeqb  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVecEm2Differs
.LLastVec:
  movdqu   16(%eax,%ecx), %xmm0
  movdqu   16(%edx,%ecx), %xmm1
  pcmpeqb  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVecEm1Differs
  pop      %ebx
  xor      %eax, %eax
  ret
.LVecEm2Differs:              { difference in vector at len-32 ('End minus 2') }
  sub      $16, %ecx
.LVecEm1Differs:              { difference in vector at len-16 ('End minus 1') }
  bsf      %ebx, %ebx
  add      %ecx, %ebx
  movzbl   16(%eax,%ebx), %eax
  movzbl   16(%edx,%ebx), %edx
  sub      %edx, %eax
  pop      %ebx
  ret
  nop                         { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LAligned32xLoop_Prepare:
  lea      -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
  sub      %eax, %edx         { edx = buf2 - buf1 }
  and      $-16, %eax         { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
  sub      %eax, %ecx         { ecx = count to be handled with loop }
  .balign 16                  { No-op. }
.LAligned32xLoop_Body:
  add      $32, %eax
  { Compare two XMMs, reduce the result with 'and'. }
  movdqu   (%edx,%eax), %xmm0
  pcmpeqb  (%eax), %xmm0      { xmm0 = pcmpeqb(buf1, buf2) }
  movdqu   16(%edx,%eax), %xmm1
  pcmpeqb  16(%eax), %xmm1
  pand     %xmm0, %xmm1       { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
  pmovmskb %xmm1, %ebx
  inc      %bx
  jnz      .LAligned32xLoop_TwoVectorsDiffer
  sub      $32, %ecx
  ja       .LAligned32xLoop_Body
  { Compare last two vectors after the loop by doing one more loop iteration, modified. }
  lea      32(%eax,%ecx), %eax
  movdqu   (%edx,%eax), %xmm0
  movdqu   (%eax), %xmm2
  pcmpeqb  %xmm2, %xmm0
  movdqu   16(%edx,%eax), %xmm1
  movdqu   16(%eax), %xmm2
  pcmpeqb  %xmm2, %xmm1
  pand     %xmm0, %xmm1
  pmovmskb %xmm1, %ebx
  inc      %bx
  jnz      .LAligned32xLoop_TwoVectorsDiffer
  pop      %ebx
  xor      %eax, %eax
  ret
.LAligned32xLoop_TwoVectorsDiffer:
  add      %eax, %edx         { restore edx = buf2 }
  pmovmskb %xmm0, %ecx        { Is there a difference in the first vector? }
  inc      %cx
  jz       .LVec1Differs      { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
  bsf      %ecx, %ebx
  movzbl   (%eax,%ebx), %eax
  movzbl   (%edx,%ebx), %edx
  sub      %edx, %eax
  pop      %ebx
  ret
.LVec1Differs:
  add      $16, %eax
  add      $16, %edx
.LVec0Differs:
  bsf      %ebx, %ebx
  movzbl   (%eax,%ebx), %eax
  movzbl   (%edx,%ebx), %edx
  sub      %edx, %eax
  pop      %ebx
  ret
.LCantOverReadBoth:           { 2..15 bytes near a page end: scalar loads only }
  cmp      $3, %ecx
  jle      .L2to3
  push     %esi
  mov      (%eax), %ebx
  mov      (%edx), %esi
  cmp      %esi, %ebx
  jne      .L4xDiffer
  cmp      $8, %ecx
  jbe      .LLast4x
  mov      4(%eax), %ebx
  mov      4(%edx), %esi
  cmp      %esi, %ebx
  jne      .L4xDiffer
  mov      -8(%eax,%ecx), %ebx
  mov      -8(%edx,%ecx), %esi
  cmp      %esi, %ebx
  jne      .L4xDiffer
.LLast4x:
  mov      -4(%eax,%ecx), %ebx
  mov      -4(%edx,%ecx), %esi
  cmp      %esi, %ebx
  jne      .L4xDiffer
  pop      %esi
  pop      %ebx
  xor      %eax, %eax
  ret
.L4xDiffer:
  bswap    %ebx               { big-endian order so dword compare = byte order }
  bswap    %esi
  cmp      %esi, %ebx
  pop      %esi
  sbb      %eax, %eax
  or       $1, %eax
  pop      %ebx
  ret
.L2to3:                       { 2 or 3 bytes: build 24-bit big-endian keys }
  movzwl   (%edx), %ebx
  bswap    %ebx
  shr      $1, %ebx
  mov      -1(%edx,%ecx), %bl
  movzwl   (%eax), %edx
  bswap    %edx
  shr      $1, %edx
  mov      -1(%eax,%ecx), %dl
  mov      %edx, %eax
  sub      %ebx, %eax
  pop      %ebx
  ret
.L1OrLess:
  jl       .LUnbounded_Prepare
  movzbl   (%eax), %eax       { exactly 1 byte }
  movzbl   (%edx), %edx
  sub      %edx, %eax
  ret
.LUnbounded_Prepare:          { len < 1: scan until first difference }
  sub      %eax, %edx         { edx = buf2 - buf1 }
  test     %ecx, %ecx
  jnz      .LUnbounded_Body
  xor      %eax, %eax
  ret
  .balign 16
.LUnbounded_Next:
  add      $1, %eax
.LUnbounded_Body:
  movzbl   (%edx,%eax), %ecx
  cmp      %cl, (%eax)
  je       .LUnbounded_Next
  sbb      %eax, %eax
  or       $1, %eax
end;
  1290. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1291. var
  1292. CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
  1293. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1294. begin
  1295. if not fpc_cpucodeinit_performed then
  1296. exit(CompareByte_Plain(buf1, buf2, len));
  1297. if has_sse2_support then
  1298. CompareByte_Impl:=@CompareByte_SSE2
  1299. else
  1300. CompareByte_Impl:=@CompareByte_Plain;
  1301. result:=CompareByte_Impl(buf1, buf2, len);
  1302. end;
  1303. function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
  1304. begin
  1305. result:=CompareByte_Impl(buf1, buf2, len);
  1306. end;
  1307. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  1308. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  1309. {$define FPC_SYSTEM_HAS_COMPAREWORD}
{ CompareWord_Plain: scalar word-wise comparison.
  In:  eax = @buf1, edx = @buf2, ecx = len (in words).
  Out: eax = 0 if equal, -1/+1 by the first differing word (unsigned order).
  Small or out-of-range lengths use the wordwise loop; otherwise buf1 is
  aligned by one optional word compare and the bulk runs a dword at a time,
  with a differing dword resolved word-by-word. }
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
  push  %ebx
  sub   %eax, %edx            { edx = buf2 - buf1 }
  lea   -4(%ecx), %ebx        { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
  cmp   $1073741819, %ebx
  ja    .LWordwise_Prepare
  test  $2, %al
  je    .LAlignedToPtrUintOrNaturallyMisaligned
  movzwl (%edx,%eax), %ebx    { compare one word to align buf1 to 4 bytes }
  cmp   %bx, (%eax)
  jne   .LDoSbb
  add   $2, %eax
  sub   $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
  sub   $2, %ecx              { bias: loop checks 2 words (1 dword) per pass }
  .balign 16
.LPtrUintWise_Next:
  mov   (%edx,%eax), %ebx
  cmp   %ebx, (%eax)
  jne   .LPtrUintsDiffer
  add   $4, %eax
  sub   $2, %ecx
  jg    .LPtrUintWise_Next
  lea   (%eax,%ecx,2), %eax   { re-read the last dword, overlapping checked words }
  mov   (%edx,%eax), %ebx
  cmp   %ebx, (%eax)
  jne   .LPtrUintsDiffer
  pop   %ebx
  xor   %eax, %eax
  ret
.LPtrUintsDiffer:
  cmp   %bx, (%eax)           { which of the two words differs? }
  jne   .LDoSbb
  shr   $16, %ebx
  cmp   %bx, 2(%eax)
.LDoSbb:
  sbb   %eax, %eax            { CF -> -1/0, then force odd -> -1/+1 }
  or    $1, %eax
  pop   %ebx
  ret
  .balign 16
.LWordwise_Body:
  movzwl (%edx,%eax), %ebx
  cmp   %bx, (%eax)
  jne   .LDoSbb
  add   $2, %eax
.LWordwise_Prepare:
  sub   $1, %ecx
  jnb   .LWordwise_Body
  pop   %ebx
  xor   %eax, %eax
end;
{ CompareWord_SSE2: vectorized word-wise comparison.
  In:  eax = @buf1, edx = @buf2, ecx = len (in words).
  Out: eax = 0 if equal, -1/+1 by the first differing word (unsigned order).
  Short lengths fall back to wordwise (also on possible page-cross of the
  over-read). Long inputs use a buf1-aligned bytewise vector loop; a byte
  mismatch there is rounded down to its word offset relative to the
  original buf1 before the words are subtracted. }
function CompareWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
  push     %ebx
  sub      %eax, %edx         { edx = buf2 - buf1 }
  lea      -2(%ecx), %ebx     { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
  cmp      $1073741821, %ebx
  ja       .LWordwise_Prepare
  cmp      $8, %ecx
  jge      .LVecOrMore
  { 2..7 words: over-read as one XMM unless close to a page end. }
  lea      (%edx,%eax), %ebx
  or       %eax, %ebx
  and      $4095, %ebx
  cmp      $4080, %ebx
  ja       .LWordwise_Prepare
  movdqu   (%edx,%eax), %xmm0
  movdqu   (%eax), %xmm1
  pcmpeqw  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jz       .LNothing
  shl      $1, %ecx           { convert to bytes }
  bsf      %ebx, %ebx
  cmp      %ecx, %ebx         { ignore differences beyond 'len' }
  jb       .LSubtractWords
.LNothing:
  pop      %ebx
  xor      %eax, %eax
  ret
  .balign 16
.LWordwise_Body:
  movzwl   (%edx,%eax), %ebx
  cmp      %bx, (%eax)
  jne      .LDoSbb
  add      $2, %eax
.LWordwise_Prepare:
  sub      $1, %ecx
  jae      .LWordwise_Body
  xor      %eax, %eax
  pop      %ebx
  ret
.LDoSbb:
  sbb      %eax, %eax         { CF -> -1/0, then force odd -> -1/+1 }
  or       $1, %eax
  pop      %ebx
  ret
.LVecOrMore:
  movdqu   (%edx,%eax), %xmm0 { Compare first vectors. }
  movdqu   (%eax), %xmm1
  pcmpeqw  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVec0Differs
  shl      $1, %ecx           { convert to bytes }
  sub      $32, %ecx          { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  jle      .LLastVec
  push     %eax               { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
  add      %eax, %ecx
  and      $-16, %eax         { align buf1; +16 is performed by the loop. }
  sub      %eax, %ecx
  .balign 16
.LAligned8xLoop_Body:
  add      $16, %eax
  movdqu   (%edx,%eax), %xmm0
  pcmpeqb  (%eax), %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LAligned8xLoop_VecDiffers
  sub      $16, %ecx
  ja       .LAligned8xLoop_Body
  pop      %ebx               { drop original buf1 }
.LLastVec:
  lea      16(%eax,%ecx), %eax { point to the last 16 bytes }
  movdqu   (%edx,%eax), %xmm0
  movdqu   (%eax), %xmm1
  pcmpeqw  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVec0Differs
  pop      %ebx
  xor      %eax, %eax
  ret
.LVec0Differs:
  bsf      %ebx, %ebx
.LSubtractWords:              { ebx = even byte offset of the differing word }
  add      %eax, %edx         { restore edx = buf2 }
  movzwl   (%eax,%ebx), %eax
  movzwl   (%edx,%ebx), %edx
  sub      %edx, %eax
  pop      %ebx
  ret
.LAligned8xLoop_VecDiffers:
  bsf      %ebx, %ebx
  add      %ebx, %eax         { eax = address of differing byte }
  pop      %ecx               { ecx = original buf1 }
  sub      %ecx, %eax
  and      $-2, %eax          { round offset down to word boundary... }
  add      %ecx, %eax         { ...relative to original buf1 }
  movzwl   (%edx,%eax), %edx
  movzwl   (%eax), %eax
  sub      %edx, %eax
  pop      %ebx
end;
  1465. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1466. var
  1467. CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
  1468. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1469. begin
  1470. if not fpc_cpucodeinit_performed then
  1471. exit(CompareWord_Plain(buf1, buf2, len));
  1472. if has_sse2_support then
  1473. CompareWord_Impl:=@CompareWord_SSE2
  1474. else
  1475. CompareWord_Impl:=@CompareWord_Plain;
  1476. result:=CompareWord_Impl(buf1, buf2, len);
  1477. end;
  1478. function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
  1479. begin
  1480. result:=CompareWord_Impl(buf1, buf2, len);
  1481. end;
  1482. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1483. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1484. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
{ CompareDWord_Plain: scalar dword-wise comparison.
  In:  eax = @buf1, edx = @buf2, ecx = len (in dwords).
  Out: eax = 0 if equal, -1/+1 by the first differing dword (unsigned order). }
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
  sub   $1, %ecx
  jb    .LNothing             { len <= 0 -> equal }
  push  %ebx
  sub   %eax, %edx            { edx = buf2 - buf1 }
  .balign 16
.LDwordwise_Body:
  mov   (%edx,%eax), %ebx
  cmp   %ebx, (%eax)
  jne   .LDoSbb
  add   $4, %eax
  sub   $1, %ecx
  jnb   .LDwordwise_Body
  pop   %ebx
.LNothing:
  xor   %eax, %eax
  ret
.LDoSbb:
  pop   %ebx
  sbb   %eax, %eax            { CF -> -1/0, then force odd -> -1/+1 }
  or    $1, %eax
end;
{ CompareDWord_SSE2: vectorized dword-wise comparison.
  In:  eax = @buf1, edx = @buf2, ecx = len (in dwords).
  Out: eax = 0 if equal, -1/+1 by the first differing dword (unsigned order).
  Small or out-of-range lengths go dwordwise. Long inputs use a buf1-aligned
  bytewise vector loop; a byte mismatch is rounded down to its dword offset
  relative to the original buf1 before the dwords are compared. }
function CompareDWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
  push     %ebx
  sub      %eax, %edx         { edx = buf2 - buf1 }
  lea      -5(%ecx), %ebx     { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
  cmp      $536870906, %ebx
  ja       .LDwordwise_Prepare
  shl      $2, %ecx           { convert to bytes }
  movdqu   (%edx,%eax), %xmm1 { Compare first vectors. }
  movdqu   (%eax), %xmm0
  pcmpeqd  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVec0Differs
  sub      $32, %ecx          { first 16 bytes already analyzed + last 16 bytes analyzed separately }
  jle      .LLastVec
  push     %eax               { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
  add      %eax, %ecx
  and      $-16, %eax         { align buf1; +16 is performed by the loop. }
  sub      %eax, %ecx
  .balign 16
.LAligned4xLoop_Body:
  add      $16, %eax
  movdqu   (%eax,%edx), %xmm0
  pcmpeqb  (%eax), %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LAligned4xLoop_VecDiffers
  sub      $16, %ecx
  ja       .LAligned4xLoop_Body
  pop      %ebx               { drop original buf1 }
.LLastVec:
  lea      16(%eax,%ecx), %eax { point to the last 16 bytes }
  movdqu   (%edx,%eax), %xmm1
  movdqu   (%eax), %xmm0
  pcmpeqd  %xmm1, %xmm0
  pmovmskb %xmm0, %ebx
  inc      %bx
  jnz      .LVec0Differs
  pop      %ebx
  xor      %eax, %eax
  ret
.LVec0Differs:
  bsf      %ebx, %ebx         { ebx = byte offset of first differing dword }
  add      %eax, %edx         { recover edx = buf2 }
  mov      (%edx,%ebx), %edx
  cmp      %edx, (%eax,%ebx)
  sbb      %eax, %eax
  or       $1, %eax
  pop      %ebx
  ret
.LAligned4xLoop_VecDiffers:
  bsf      %ebx, %ebx
  add      %ebx, %eax         { eax = address of differing byte }
  pop      %ecx               { ecx = original buf1 }
  sub      %ecx, %eax
  and      $-4, %eax          { round offset down to dword boundary... }
  add      %ecx, %eax         { ...relative to original buf1 }
  mov      (%edx,%eax), %edx
  cmp      %edx, (%eax)
.LDoSbb:
  sbb      %eax, %eax         { CF -> -1/0, then force odd -> -1/+1 }
  or       $1, %eax
  pop      %ebx
  ret
  .balign 16
.LDwordwise_Body:
  mov      (%edx,%eax), %ebx
  cmp      %ebx, (%eax)
  jne      .LDoSbb
  add      $4, %eax
.LDwordwise_Prepare:
  sub      $1, %ecx
  jnb      .LDwordwise_Body
  pop      %ebx
  xor      %eax, %eax
end;
  1585. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1586. var
  1587. CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
  1588. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1589. begin
  1590. if not fpc_cpucodeinit_performed then
  1591. exit(CompareDWord_Plain(buf1, buf2, len));
  1592. if has_sse2_support then
  1593. CompareDWord_Impl:=@CompareDWord_SSE2
  1594. else
  1595. CompareDWord_Impl:=@CompareDWord_Plain;
  1596. result:=CompareDWord_Impl(buf1, buf2, len);
  1597. end;
  1598. function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
  1599. begin
  1600. result:=CompareDWord_Impl(buf1, buf2, len);
  1601. end;
  1602. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1603. {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
  1604. {$define FPC_SYSTEM_HAS_INDEXCHAR0}
{ IndexChar0: find b in buf, stopping early at a #0 terminator.
  In:  eax = @buf, edx = len, cl = b.
  Out: eax = index of first byte equal to b, or -1 if not found before
       either a #0 byte or 'len' bytes have been scanned.
  Note: returns 0 immediately when len = 0 (ecx is still 0 at .LFound). }
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
  saveesi,saveebx : longint;  { manual spill slots for callee-saved regs }
asm
  movl  %esi,saveesi
  movl  %ebx,saveebx
  // Can't use scasb, or will have to do it twice, think this
  // is faster for small "len"
  movl  %eax,%esi             // Load address
  movzbl %cl,%ebx             // Load searchpattern
  testl %edx,%edx
  je    .LFound
  xorl  %ecx,%ecx             // zero index in Buf
  xorl  %eax,%eax             // To make DWord compares possible
  .balign 4
.LLoop:
  movb  (%esi),%al            // Load byte
  cmpb  %al,%bl
  je    .LFound               // byte the same?
  incl  %ecx
  incl  %esi
  cmpl  %edx,%ecx             // Maximal distance reached?
  je    .LNotFound
  testl %eax,%eax             // Nullchar = end of search?
  jne   .LLoop
.LNotFound:
  movl  $-1,%ecx              // Not found return -1
.LFound:
  movl  %ecx,%eax
  movl  saveesi,%esi
  movl  saveebx,%ebx
end;
  1637. {$endif FPC_SYSTEM_HAS_INDEXCHAR0}
  1638. {****************************************************************************
  1639. String
  1640. ****************************************************************************}
  1641. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1642. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{ fpc_shortstr_to_shortstr: copy shortstring sstr into res, truncating to
  high(res) if necessary.
  In (register convention): eax = @res, edx = high(res), ecx = @sstr.
  Stores the clamped length into res[0], then tail-calls (or, when
  profiling, plain-calls) Move to copy the payload bytes. }
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
  push  %eax
  push  %edx
  push  %ecx
  call  mcount
  pop   %ecx
  pop   %edx
  pop   %eax
{$endif FPC_PROFILE}
  cmp   (%ecx), %dl           { length(sstr) fits into res? }
  jbe   .LEdxIsLen            { use high(res) if length(sstr) does not fit }
  movzbl (%ecx), %edx         { use length(sstr) }
.LEdxIsLen:
  mov   %dl, (%eax)           { store length to res[0] }
  xchg  %ecx, %edx            { ecx = length = Move count, edx = sstr }
  xchg  %eax, %edx            { eax = sstr = Move src, edx = res = Move dest }
  inc   %eax                  { skip length bytes of both strings }
  inc   %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  lea   -8(%esp), %esp        { keep the 16-byte stack alignment across the call }
{$endif FPC_SYSTEM_STACKALIGNMENT16}
  call  Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  lea   8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
  jmp   Move                  { tail call: Move returns directly to our caller }
{$endif FPC_PROFILE}
end;
{ fpc_shortstr_assign: copy shortstring sstr to dstr, truncating to len.
  Copies the length byte (clamped to len), then the payload with rep movs:
  bytewise up to a 4-byte-aligned destination, then dwords, then the tail. }
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
  asm
{$ifdef FPC_PROFILE}
    push  %eax
    push  %edx
    push  %ecx
    call  mcount
    pop   %ecx
    pop   %edx
    pop   %eax
{$endif FPC_PROFILE}
    pushl %eax
    pushl %ecx
{$ifdef FPC_ENABLED_CLD}
    cld                       { rep movs must go forward }
{$endif FPC_ENABLED_CLD}
    movl  dstr,%edi
    movl  sstr,%esi
    xorl  %eax,%eax
    movl  len,%ecx
    lodsb                     { al = length(sstr), esi -> payload }
    cmpl  %ecx,%eax
    jbe   .LStrCopy1
    movl  %ecx,%eax           { clamp copied length to len }
.LStrCopy1:
    stosb                     { store length byte to dstr[0] }
    cmpl  $7,%eax             { short payloads: plain bytewise copy }
    jl    .LStrCopy2
    movl  %edi,%ecx           { Align on 32bits }
    negl  %ecx
    andl  $3,%ecx
    subl  %ecx,%eax           { eax = bytes left after alignment prefix }
    rep
    movsb
    movl  %eax,%ecx
    andl  $3,%eax             { eax = tail bytes after the dword copy }
    shrl  $2,%ecx
    rep
    movsl
.LStrCopy2:
    movl  %eax,%ecx
    rep
    movsb
    popl  %ecx
    popl  %eax
  end ['ESI','EDI'];
end;
  1727. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1728. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1729. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{ Three-way shortstring comparison: result < 0, = 0 or > 0 for
  left < right, left = right, left > right respectively.
  Register contract (regcall): eax = @left, edx = @right.
  Compares min(len(left),len(right)) payload bytes via CompareByte; if that
  common prefix is equal, the result is len(left) - len(right). }
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ eax = left, edx = right }
asm
{$ifdef FPC_PROFILE}
        push    %eax
        push    %edx
        push    %ecx
        call    mcount
        pop     %ecx
        pop     %edx
        pop     %eax
{$endif FPC_PROFILE}
        push    %ebx
        movzbl  (%eax), %ecx      { ecx = len(left) }
        movzbl  (%edx), %ebx      { ebx = len(right) }
        cmp     %ebx, %ecx        { ecx = min(len(left), len(right)) }
{$ifdef CPUX86_HAS_CMOV}
        cmovg   %ebx, %ecx
{$else}
        jle     .LEcxIsLen
        mov     %ebx, %ecx
.LEcxIsLen:
{$endif}
        push    %eax { save left }
        inc     %eax              { eax/edx -> first payload characters }
        inc     %edx
        { stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
        call    CompareByte
{$else}
        call    CompareByte_Impl { manually inline CompareByte }
{$endif}
        pop     %edx { restore left }
        test    %eax, %eax
        jnz     .LReturn          { prefixes differ: CompareByte result decides }
        movzbl  (%edx), %eax      { prefixes equal: compare lengths instead }
        sub     %ebx, %eax        { ebx (len(right)) survived the call: callee-saved }
.LReturn:
        pop     %ebx
end;
  1770. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1771. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1772. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1773. function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc; nostackframe;
  1774. { eax = left, edx = right }
  1775. asm
  1776. movzbl (%eax), %ecx
  1777. cmp (%edx), %cl
  1778. jne .LNotEqual
  1779. inc %eax
  1780. inc %edx
  1781. {$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  1782. jmp CompareByte
  1783. {$else}
  1784. jmp CompareByte_Impl { manually inline CompareByte }
  1785. {$endif}
  1786. .LNotEqual:
  1787. or $-1, %eax
  1788. end;
  1789. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1790. {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1791. {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{ Convert a #0-terminated PAnsiChar into a shortstring, truncating to
  high(res). A nil pointer yields the empty string.
  Register contract (regcall): eax = @res, edx = high(res), ecx = p.
  Strategy: IndexByte(p^, high(res), 0) finds the terminator within the
  first high(res) bytes (or returns -1), then Move copies the payload. }
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
        push    %eax
        push    %edx
        push    %ecx
        call    mcount
        pop     %ecx
        pop     %edx
        pop     %eax
{$endif FPC_PROFILE}
        test    %ecx, %ecx
        jz      .LEmpty           { nil source -> empty result }
        push    %eax { save res }
        push    %ecx { save p }
        push    %edx { save high(res) }
        mov     %ecx, %eax { eax = IndexByte.buf }
        { edx is already high(res) = IndexByte.count.
          Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
          but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’.
          Generic and x86 versions are “safe”. }
        xor     %ecx, %ecx { ecx = 0 = IndexByte.value }
        { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
          With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
        leal    -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
        call    IndexByte
{$else}
        call    IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
        leal    12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
        pop     %ecx { ecx = high(res) = Move.len }
        test    %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
        cmovns  %eax, %ecx
{$else}
        js      .LEcxIsLen
        mov     %eax, %ecx
.LEcxIsLen:
{$endif}
        pop     %eax { pop p to eax = Move.src }
        pop     %edx { pop res to edx }
        mov     %cl, (%edx) { res[0] := len }
        inc     %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
        call    Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
        jmp     .LReturn
{$else FPC_PROFILE}
        jmp     Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
        movb    $0, (%eax)        { res := '' }
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
  1862. {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1863. {$IFNDEF INTERNAL_BACKTRACE}
  1864. {$define FPC_SYSTEM_HAS_GET_FRAME}
{ Return the current frame pointer (ebp). Must stay nostackframe so that
  ebp still holds the caller's frame when read. }
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
        movl    %ebp,%eax
end;
  1869. {$ENDIF not INTERNAL_BACKTRACE}
  1870. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
{ Return the address of the instruction following the call to this function,
  i.e. the caller's program counter: nostackframe, so (%esp) holds the
  return address on entry. }
Function Get_pc_addr : Pointer;assembler;nostackframe;
asm
        movl    (%esp),%eax
end;
  1875. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
{ Return the caller's return address stored in the stack frame framebp.
  The addr parameter exists for cross-platform API compatibility and is
  unused on i386. Returns nil for a nil/invalid frame pointer. }
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  { only dereference framebp when it lies within the current stack bounds }
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp+4)^   { return address sits just above the saved ebp }
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
        { eax = framebp; no bounds info available here, only a nil check }
        orl     %eax,%eax
        jz      .Lg_a_null
        movl    4(%eax),%eax
.Lg_a_null:
end;
{$endif defined(win32)}
  1894. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
{ Return the caller's frame pointer (the saved ebp at the base of frame
  framebp). The addr parameter exists for API compatibility and is unused
  on i386. Returns nil for a nil/invalid frame pointer. }
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  { only dereference framebp when it lies within the current stack bounds }
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp)^     { saved ebp is the first dword of the frame }
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
        { eax = framebp; no bounds info available here, only a nil check }
        orl     %eax,%eax
        jz      .Lgnf_null
        movl    (%eax),%eax
.Lgnf_null:
end;
{$endif defined(win32)}
  1913. {$define FPC_SYSTEM_HAS_SPTR}
{ Return the current stack pointer. nostackframe, so esp is unmodified
  (apart from the return address pushed by the caller). }
Function Sptr : Pointer;assembler;nostackframe;
asm
        movl    %esp,%eax
end;
  1918. {****************************************************************************
  1919. Str()
  1920. ****************************************************************************}
{ Disabled (see the `defined(disabled)` guard) integer-to-string helpers.
  The longword overload zeroes the sign in edx and falls through, via the
  shared unit-level label str_int_shortcut, into the body of the longint
  overload, which therefore must stay layout-compatible with it. }
{$if defined(disabled) and defined(regcall) }
{$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
{$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
label str_int_shortcut;
procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
asm
        { same callee-saved registers as the longint overload, since we jump
          into its body and return through its epilogue }
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        mov     %edx,%edi
        xor     %edx,%edx         { unsigned: no sign to prepend }
        jmp     str_int_shortcut
end;
procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
                                      100000,1000000,10000000,
                                      100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
        push    %eax
        push    %edx
        push    %ecx
        call    mcount
        pop     %ecx
        pop     %edx
        pop     %eax
{$endif FPC_PROFILE}
        push    %esi
        push    %edi
        push    %ebx
        movl    %edx,%edi
        { Calculate absolute value and put sign in edx}
        cltd
        xorl    %edx,%eax
        subl    %edx,%eax
        negl    %edx              { edx = 1 for negative input, 0 otherwise }
str_int_shortcut:
        movl    %ecx,%esi         { esi = high(s) }
        {Calculate amount of digits in ecx.}
        xorl    %ecx,%ecx
        bsrl    %eax,%ecx
        incl    %ecx
        imul    $1233,%ecx        { digits ~ bits * log10(2) = bits * 1233 / 4096 }
        shr     $12,%ecx
{$ifdef FPC_PIC}
        call    fpc_geteipasebx
{$ifdef darwin}
        { NOTE(review): relies on a .Lpic anchor provided by the
          fpc_geteipasebx convention — not visible here, confirm if re-enabling }
        movl    digits-.Lpic(%ebx),%ebx
{$else}
        addl    $_GLOBAL_OFFSET_TABLE_,%ebx
        movl    digits@GOT(%ebx),%ebx
{$endif}
        cmpl    (%ebx,%ecx,4),%eax
{$else}
        cmpl    digits(,%ecx,4),%eax
{$endif}
        cmc
        adcl    $0,%ecx {Nr. digits ready in ecx.}
        {Write length & sign.}
        lea     (%edx,%ecx),%ebx  { bl = total length incl. sign }
        movb    $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
        movw    %bx,(%edi)        { store length byte and (maybe overwritten) '-' }
        addl    %edx,%edi
        subl    %edx,%esi
        {Skip digits beyond string length.}
        movl    %eax,%edx
        subl    %ecx,%esi
        jae     .Lloop_write
        .balign 4
.Lloop_skip:
        movl    $0xcccccccd,%eax {Divide by 10 using mul+shr}
        mull    %edx
        shrl    $3,%edx
        decl    %ecx
        jz      .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
        incl    %esi
        jnz     .Lloop_skip
        {Write out digits.}
        .balign 4
.Lloop_write:
        movl    $0xcccccccd,%eax {Divide by 10 using mul+shr}
        {Pre-add '0'}
        leal    48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
        mull    %edx
        shrl    $3,%edx
        leal    (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
        subl    %edx,%ebx
        subl    %eax,%ebx         { bl = '0' + digit }
        movb    %bl,(%edi,%ecx)   { digits are emitted least-significant first }
        decl    %ecx
        jnz     .Lloop_write
.Ldone:
        popl    %ebx
        popl    %edi
        popl    %esi
end;
{$endif}
  2019. {****************************************************************************
  2020. Bounds Check
  2021. ****************************************************************************}
  2022. { do a thread-safe inc/dec }
  2023. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ Atomically decrement l (lock-prefixed); returns True when the result is 0.
  eax = @l (regcall). }
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
asm
        lock
        decl    (%eax)
        setzb   %al               { al := (new value = 0) }
end;
  2030. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
{ Atomically increment l (lock-prefixed). eax = @l (regcall). }
procedure cpuinclocked(var l : longint);assembler;nostackframe;
asm
        lock
        incl    (%eax)
end;
  2036. // inline SMP check and normal lock.
  2037. // the locked one is so slow, inlining doesn't matter.
  2038. function declocked(var l : longint) : boolean; inline;
  2039. begin
  2040. if not ismultithread then
  2041. begin
  2042. dec(l);
  2043. declocked:=l=0;
  2044. end
  2045. else
  2046. declocked:=cpudeclocked(l);
  2047. end;
  2048. procedure inclocked(var l : longint); inline;
  2049. begin
  2050. if not ismultithread then
  2051. inc(l)
  2052. else
  2053. cpuinclocked(l);
  2054. end;
{ Atomically decrement Target and return the NEW value.
  eax = @Target (regcall); xadd leaves the old value in edx, so new = edx-1. }
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
        movl    $-1,%edx
        lock
        xaddl   %edx, (%eax)      { edx := old value; Target := old - 1 }
        lea     -1(%edx),%eax     { return old - 1 = new value }
end;
{ Atomically increment Target and return the NEW value.
  eax = @Target (regcall); xadd leaves the old value in edx, so new = edx+1. }
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
        movl    $1,%edx
        lock
        xaddl   %edx, (%eax)      { edx := old value; Target := old + 1 }
        lea     1(%edx),%eax      { return old + 1 = new value }
end;
{ Atomically store Source into Target and return Target's previous value.
  eax = @Target, edx = Source (regcall). xchg with a memory operand is
  implicitly locked on x86. }
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
        xchgl   (%eax),%edx
        movl    %edx,%eax         { return the old value }
end;
{ Atomically add Source to Target and return Target's previous value.
  eax = @Target, edx = Source (regcall). }
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
        lock
        xaddl   %edx, (%eax)      { edx := old value; Target := old + Source }
        movl    %edx,%eax
end;
{ Atomic compare-and-swap: if Target = Comperand then Target := NewValue.
  Always returns Target's previous value.
  eax = @Target, edx = NewValue, ecx = Comperand (regcall); cmpxchg wants
  the comparand in eax, hence the initial xchg. }
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
        xchgl   %eax,%ecx         { eax := Comperand, ecx := @Target }
        lock
        cmpxchgl %edx, (%ecx)     { old value ends up in eax either way }
end;
{ 64-bit atomic compare-and-swap via cmpxchg8b: if Target = Comperand then
  Target := NewValue; always returns Target's previous value (edx:eax).
  cmpxchg8b requires edx:eax = comparand and ecx:ebx = new value, so the
  callee-saved ebx/edi must be preserved manually. }
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
asm
        pushl   %ebx
        pushl   %edi
        movl    %eax,%edi         { edi = @Target (eax is needed for the comparand) }
        movl    Comperand+4,%edx
        movl    Comperand+0,%eax
        movl    NewValue+4,%ecx
        movl    NewValue+0,%ebx
        lock cmpxchg8b (%edi)     { old value left in edx:eax }
        pop     %edi
        pop     %ebx
end;
  2099. {****************************************************************************
  2100. FPU
  2101. ****************************************************************************}
const
  { Internal constants for use in system unit }
  { x87 FPU status word exception flag bits }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;
  { SSE MXCSR exception flag bits (low 6 bits) }
  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20;             { sic: historical spelling of "Precision" kept }
  MM_ExceptionMask = $3f;
  { MXCSR exception MASK bits (a set bit suppresses the exception) }
  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm = %0000000100000000;
  MM_MaskDivZero = %0000001000000000;
  MM_MaskOverflow = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;
{$define FPC_SYSTEM_HAS_SYSINITFPU}
{ Intentionally empty on i386: all FPU/SSE state setup happens in
  SysResetFPU / fpc_cpucodeinit. }
Procedure SysInitFPU;
begin
end;
{$define FPC_SYSTEM_HAS_SYSRESETFPU}
{ Reinitialize the x87 FPU (and MXCSR when SSE is available) from the
  Default8087CW / DefaultMXCSR globals, clearing any pending exceptions. }
Procedure SysResetFPU;
var
  { these locals are so we don't have to hack pic code in the assembler }
  localmxcsr: dword;
  localfpucw: word;
begin
  localfpucw:=Default8087CW;
  asm
        fninit                    { reset x87 state, discard pending exceptions }
        fwait
        fldcw   localfpucw        { restore the configured control word }
  end;
  if has_sse_support then
    begin
      localmxcsr:=DefaultMXCSR;
      asm
        { setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
        ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
        { old assemblers lack ldmxcsr: emit it by hand via a stack slot }
        mov     localmxcsr,%eax
        subl    $4,%esp
        mov     %eax,(%esp)
        //ldmxcsr (%esp)
        .byte 0x0f,0xae,0x14,0x24
        addl    $4,%esp
{$endif OLD_ASSEMBLER}
      end;
    end;
end;
  2160. { because of the brain dead sse detection on x86, this test is post poned }
{ because of the brain dead sse detection on x86, this test is post poned }
{ Probe CPU features via CPUID (leaves 1 and 7) and XGETBV, and set the
  RTL feature flags (has_mmx_support, has_sse*_support, has_avx*_support,
  fast_large_repmovstosb). SSE presence is additionally verified by
  executing an SSE instruction under sse_check: on CPUs/OSes without SSE
  state support the resulting fault handler clears os_supports_sse. }
procedure fpc_cpucodeinit;
var
  _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
  if cpuid_support then
    begin
      asm
        movl    $1,%eax           { CPUID leaf 1: feature bits in ecx/edx }
        xorl    %ecx,%ecx
        cpuid
        movl    %edx,_edx_cpuid1
        movl    %ecx,_ecx_cpuid1
      end ['ebx'];
      has_mmx_support:=(_edx_cpuid1 and $800000)<>0;    { edx bit 23 = MMX }
      if ((_edx_cpuid1 and $2000000)<>0) then           { edx bit 25 = SSE }
        begin
          os_supports_sse:=true;
          sse_check:=true;
          asm
            { force an sse exception if no sse is supported, the exception handler sets
              os_supports_sse to false then }
            { don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
            .byte 0x0f,0x28,0xf7
{$else}
            movaps %xmm7, %xmm6
{$endif OLD_ASSEMBLER}
          end;
          sse_check:=false;
          has_sse_support:=os_supports_sse;
        end;
      if has_sse_support then
        begin
          has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);  { edx bit 26 }
          has_sse3_support:=((_ecx_cpuid1 and $200)<>0);      { ecx bit 9 }
          { now avx }
          asm
            xorl    %eax,%eax     { CPUID leaf 0: eax = highest supported leaf }
            cpuid
            movl    %eax,_eax
          end;
          if _eax>=7 then
            begin
              asm
                movl    $7,%eax   { CPUID leaf 7 subleaf 0: extended features in ebx }
                xorl    %ecx,%ecx
                cpuid
                movl    %ebx,_ebx_cpuid7
              end;
              fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;  { ERMSB }
              if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                begin
                  asm
                    xorl    %ecx,%ecx   { XCR0 }
                    .byte 0x0f,0x01,0xd0 { xgetbv }
                    movl    %eax,_eax
                  end;
                  { XCR0 bits 1..2: OS saves/restores SSE and AVX state }
                  if (_eax and 6)=6 then
                    begin
                      has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;  { ecx bit 28 }
                      has_avx2_support:=(_ebx_cpuid7 and $20)<>0;       { ebx bit 5 }
                    end;
                end;
            end;
        end;
    end;
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      if has_sse_support then
        DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  fpc_cpucodeinit_performed:=true;
end;
  2237. {$if not defined(darwin) and defined(regcall) }
  2238. { darwin requires that the stack is aligned to 16 bytes when calling another function }
  2239. {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
  2240. {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
{ Decrement the reference count of ansistring S and set S to nil; free the
  string data when the count reaches zero. Negative refcounts (constant
  strings) are left untouched. eax = @S (regcall).
  Ansistring header layout relied on here: refcount at -8, allocation
  start at -12 from the data pointer. }
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
        movl    (%eax),%edx
        testl   %edx,%edx
        jz      .Lquit            // S already nil: nothing to do
        movl    $0,(%eax)         // s:=nil
        cmpl    $0,-8(%edx)       // exit if refcount<0
        jl      .Lquit
{$ifdef FPC_PIC}
        call    fpc_geteipasecx
        addl    $_GLOBAL_OFFSET_TABLE_,%ecx
        movl    ismultithread@GOT(%ecx),%ecx
        cmpl    $0,(%ecx)
{$else FPC_PIC}
        cmpl    $0,ismultithread
{$endif FPC_PIC}
        je      .Lskiplock
        .byte   0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
        decl    -8(%edx)
        jz      .Lfree            // last reference gone: release the block
.Lquit:
        ret
.Lfree:
        leal    -12(%edx),%eax    // points to start of allocation
        { freemem is not an assembler leaf function like fpc_geteipasecx, so it
          needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    -12(%esp),%esp
        call    FPC_FREEMEM
        leal    12(%esp),%esp
{$else FPC_SYSTEM_STACKALIGNMENT16}
        jmp     FPC_FREEMEM       // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;
{ Slow path for fpc_ansistr_Unique: copies a shared string; defined later. }
function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
  2277. {$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
{ Ensure ansistring S is uniquely referenced, returning its (possibly new)
  data pointer. Fast path: nil strings and refcount = 1 return as-is;
  otherwise fall back to fpc_truely_ansistr_unique, which copies.
  eax = @S (regcall); refcount lives at offset -8 from the data pointer. }
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
        // Var S located in register
        // Var $result located in register
        movl    %eax,%edx
        // [437] pointer(result) := pointer(s);
        movl    (%eax),%eax
        // [438] If Pointer(S)=Nil then
        testl   %eax,%eax
        je      .Lj4031
.Lj4036:
        // [440] if PAnsiRec(Pointer(S)-Firstoff)^.Ref<>1 then
        movl    -8(%eax),%ecx
        cmpl    $1,%ecx
        je      .Lj4038
        // [441] result:=fpc_truely_ansistr_unique(s);
        movl    %edx,%eax
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    -12(%esp),%esp    { realign: nostackframe entry is 16-aligned minus 4 }
{$endif FPC_SYSTEM_STACKALIGNMENT16}
        call    fpc_truely_ansistr_unique
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    12(%esp),%esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
.Lj4038:
.Lj4031:
        // [442] end;
end;
  2306. {$endif FPC_HAS_FEATURE_ANSISTRINGS}
  2307. {$endif ndef darwin and defined(regcall) }
  2308. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  2309. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
{ Load memory barrier: lfence when SSE2 is guaranteed, otherwise a locked
  no-op RMW on the stack, which is a full barrier on x86. }
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
        lfence
{$else CPUX86_HAS_SSE2}
        lock
        addl    $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ No-op on x86: data-dependent loads are already ordered by the hardware. }
procedure ReadDependencyBarrier;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  { reads imply barrier on earlier reads depended on }
end;
{ Full memory barrier: mfence when SSE2 is guaranteed, otherwise a locked
  no-op RMW on the stack, which is a full barrier on x86. }
procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
        mfence
{$else CPUX86_HAS_SSE2}
        lock
        addl    $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ Store memory barrier. Plain x86 stores are not reordered with each other,
  so without an SSE unit (no non-temporal stores possible) nothing is needed. }
procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
        sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
  2338. {$endif}
  2339. {$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
  2340. {$define FPC_SYSTEM_HAS_BSF_QWORD}
{ Index of the lowest set bit of AValue, or 255 when AValue = 0.
  AValue is passed on the stack: low dword at 4(%esp), high at 8(%esp);
  bsf sets ZF when its source operand is zero. ret $8 pops the argument. }
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
        bsfl    4(%esp),%eax      { scan the low dword first }
        jz      .L1
        ret     $8
.L1:
        bsfl    8(%esp),%eax      { low dword was 0: scan the high dword }
        jz      .L2
        add     $32,%eax
        ret     $8
.L2:
        movl    $255,%eax         { whole qword is zero }
end;
  2354. {$endif FPC_SYSTEM_HAS_BSF_QWORD}
  2355. {$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
  2356. {$define FPC_SYSTEM_HAS_BSR_QWORD}
{ Index of the highest set bit of AValue, or 255 when AValue = 0.
  AValue is passed on the stack: low dword at 4(%esp), high at 8(%esp);
  bsr sets ZF when its source operand is zero. ret $8 pops the argument. }
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
        bsrl    8(%esp),%eax      { scan the high dword first }
        jz      .L1
        add     $32,%eax
        ret     $8
.L1:
        bsrl    4(%esp),%eax      { high dword was 0: scan the low dword }
        jz      .L2
        ret     $8
.L2:
        movl    $255,%eax         { whole qword is zero }
end;
  2370. {$endif FPC_SYSTEM_HAS_BSR_QWORD}
  2371. {$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
  2372. {$define FPC_SYSTEM_HAS_SAR_QWORD}
{ 64-bit arithmetic shift right: result = AValue sar Shift.
  AValue on the stack (low dword at 4(%esp), high at 8(%esp)), Shift in al;
  result in edx:eax. Shifts >= 32 take the high dword shifted by (Shift mod 32)
  as the low result and replicate the sign bit into the high result. }
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
        movl    8(%esp),%edx      { edx = high dword of AValue }
        movzbl  %al,%ecx          { cl = shift count }
        cmpb    $32,%al
        jnb     .L1
        movl    4(%esp),%eax      { shift < 32: 64-bit funnel shift }
        shrdl   %cl,%edx,%eax     { low result gets bits shifted in from edx }
        sarl    %cl,%edx
        ret     $8
.L1:
        movl    %edx,%eax         { shift >= 32: low result = high dword sar cl }
        sarl    $31,%edx          { high result = sign replication }
        sarl    %cl,%eax // uses 5 lower bits of cl.
end;
  2388. {$endif FPC_SYSTEM_HAS_SAR_QWORD}