i386.inc 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2000 by the Free Pascal development team.
  4. Processor dependent implementation for the system unit for
  5. intel i386+
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$if not(defined(VER3_0)) and defined(linux)}
  13. {$define FPC_SYSTEM_STACKALIGNMENT16}
  14. {$endif not(defined(VER3_0)) and defined(linux)}
  15. {****************************************************************************
  16. Primitives
  17. ****************************************************************************}
var
  { NOTE(review): presumably true once the OS is known to preserve SSE state
    across task switches — set during OS-dependent startup; confirm against
    fpc_cpucodeinit implementations. }
  os_supports_sse : boolean;
  { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
  sse_check : boolean;
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }

{ All inline assembler below uses AT&T syntax (op src,dst; % registers; $ immediates). }
{$asmmode ATT}
function cpuid_support : boolean;assembler;nostackframe;
{
  Check if the ID-flag can be changed, if changed then CpuID is supported.
  Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)

  Result (AL): True if the CPUID instruction is available.
  Clobbers: EAX, EFLAGS.
}
asm
  pushfl                      { save original EFLAGS on the stack }
  movl (%esp),%eax            { eax = original EFLAGS }
  xorl $0x200000,%eax         { toggle the ID bit (bit 21) }
  pushl %eax
  popfl                       { try to write the toggled value back to EFLAGS }
  pushfl                      { re-read EFLAGS... }
  popl %eax
  xorl (%esp),%eax            { ...and diff against the original copy still on the stack }
  popfl                       { restore the caller's EFLAGS }
  testl $0x200000,%eax        { did the ID bit actually stick? }
  setnz %al                   { boolean result: nonzero diff => CPUID supported }
end;
  43. {$define FPC_SYSTEM_HAS_FPC_CPUINIT}
{ Per-thread CPU initialization hook. Intentionally empty on i386: feature
  detection is deferred to the OS-dependent fpc_cpucodeinit (see comment). }
procedure fpc_cpuinit;
begin
  { because of the brain dead sse detection on x86, this test is postponed to fpc_cpucodeinit which
    must be implemented OS dependent (FK)
    has_sse_support:=sse_support;
    has_mmx_support:=mmx_support;
  }
end;
  52. {$ifndef darwin}
{ Returns the caller's return address (EIP at the call site) in EBX.
  NOTE(review): presumably emitted by the compiler to establish the PIC/GOT
  base register — confirm against the code generator. }
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
  movl (%esp),%ebx            { ebx = return address pushed by the call }
end;
{ Same as fpc_geteipasebx, but delivers the call-site EIP in ECX. }
procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
  movl (%esp),%ecx            { ecx = return address pushed by the call }
end;
  61. {$endif}
  62. {$if not defined(FPC_SYSTEM_HAS_MOVE)
  63. and not defined(OLD_ASSEMBLER)
  64. and not defined(darwin)}
  65. {$i fastmove.inc}
  66. {$endif}
  67. {$ifndef FPC_SYSTEM_HAS_MOVE}
  68. {$define FPC_SYSTEM_HAS_MOVE}
{ Overlap-safe memory copy (fallback used when fastmove.inc is not included).
  Register convention on entry: EAX = @source, EDX = @dest, ECX = count.
  Copies forward when it is safe, backward (with DF set) when dest overlaps
  the source from above. Counts <= 0 and source = dest are no-ops. }
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
var
  saveesi,saveedi : longint;  { ESI/EDI are callee-saved in the FPC i386 ABI }
asm
  movl %edi,saveedi
  movl %esi,saveesi
  movl %eax,%esi              { esi = source }
  movl %edx,%edi              { edi = dest }
  movl %ecx,%edx              { edx = count }
  movl %edi,%eax
  { check for zero or negative count }
  cmpl $0,%edx
  jle .LMoveEnd
  { Check for back or forward }
  sub %esi,%eax               { eax = dest - source (unsigned distance) }
  jz .LMoveEnd { Do nothing when source=dest }
  jc .LFMove { Do forward, dest<source }
  cmp %edx,%eax
  jb .LBMove { Dest is in range of move, do backward }
  { Forward Copy }
.LFMove:
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  cmpl $15,%edx               { tiny copies: skip the alignment dance }
  jl .LFMove1
  movl %edi,%ecx { Align on 32bits }
  negl %ecx
  andl $3,%ecx                { ecx = bytes until dest is 4-aligned }
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx                { edx = leftover bytes after the dword loop }
  shrl $2,%ecx                { ecx = dword count }
  rep
  movsl
.LFMove1:
  movl %edx,%ecx              { copy the remaining 0..3 (or <15) bytes }
  rep
  movsb
  jmp .LMoveEnd
  { Backward Copy }
.LBMove:
  std                         { direction flag set: movs* walk downward }
  addl %edx,%esi
  addl %edx,%edi
  movl %edi,%ecx
  decl %esi                   { point at the LAST byte of each buffer }
  decl %edi
  cmpl $15,%edx
  jl .LBMove1
  negl %ecx { Align on 32bits }
  andl $3,%ecx
  subl %ecx,%edx
  rep
  movsb
  movl %edx,%ecx
  andl $3,%edx
  shrl $2,%ecx
  subl $3,%esi                { rewind to the start of the last whole dword }
  subl $3,%edi
  rep
  movsl
  addl $3,%esi                { back to byte-pointer convention for the tail }
  addl $3,%edi
.LBMove1:
  movl %edx,%ecx
  rep
  movsb
  cld                         { ABI expects DF clear on return }
.LMoveEnd:
  movl saveedi,%edi
  movl saveesi,%esi
end;
  144. {$endif FPC_SYSTEM_HAS_MOVE}
  145. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  146. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  147. or not defined(FPC_SYSTEM_HAS_FILLDWORD)
  148. or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  149. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  150. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  151. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
const
  { Byte counts above which ‘rep stos’ beats the SSE2 loop: much lower when
    the CPU advertises ERMSB (fast rep movsb/stosb), high otherwise. }
  FillXxxx_RepStosThreshold_ERMS = 1024;
  FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;
{ Shared large-fill backend built on ‘rep stosl’: writes the first and last
  4 bytes unaligned, fills the aligned interior with the pattern rotated by
  the pointer's misalignment. Preserves EDI; clobbers EAX, ECX, EDX, flags. }
procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  mov %ecx, (%eax) { Write first 4 bytes unaligned. }
  push %ecx { pattern }
  push %edi
  mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
  xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
  shl $3, %ecx { ecx = misalignment of x in bits. }
  rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
  add %edi, %edx { edx = x end }
  lea -1(%edx), %ecx { ecx = x end - 1. }
  add $4, %edi
  and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
  and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
  sub %edi, %ecx { ecx = byte count between them. }
  shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
  rep stosl
  pop %edi
  pop %ecx
  mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
  180. {$endif FillChar/Word/DWord required.}
  181. {$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
label
  { Mid-procedure entry into FillXxxx_U32Pattern_SSE2_16OrMore, jumped to by
    FillQWord_SSE2; see the contract documented at the label site. }
  FillXxxx_MoreThanTwoXMMs;
  184. {$endif FillQWord required.}
{ SSE2 fill backend: unaligned head/tail vectors plus an aligned 64-byte
  interior loop; switches to non-temporal stores above NtThreshold to avoid
  trashing the cache. Clobbers EAX, ECX, EDX, XMM0-XMM1, flags (ESI saved). }
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
const
  NtThreshold = 4 * 1024 * 1024;  { beyond this, bypass the cache with movntdq }
asm
  movd %ecx, %xmm0
  pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
  movdqu %xmm0, (%eax)
  cmp $32, %edx
  ja .LMoreThanTwoVectors
  movdqu %xmm0, -16(%eax,%edx)    { 16..32 bytes: two (possibly overlapping) vectors suffice }
  ret
  .byte 102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }

  { x can start and end misaligned on the vector boundary:
    x = ~~][H1][H2][...][T2][T1]~
        [UH]                [UT]
    UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
.LMoreThanTwoVectors:
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
  movd %esi, %xmm1
  pshufd $0, %xmm1, %xmm1         { xmm1 = pattern rotated for aligned writes }
{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
  { FillQWord jumps here.
    eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes.
    Expects first 16 bytes written...
    ...and ESI pushed! }
FillXxxx_MoreThanTwoXMMs:
{$endif FillQWord required.}
  lea -65(%eax,%edx), %ecx { ecx = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
  and $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
  movdqa %xmm1, 16(%eax) { Write H1. }
  mov %ecx, %esi
  and $-16, %esi { esi = “T4” (possibly fictive) = aligned ecx = loop bound. }
  cmp $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
  jle .LOneAlignedTailWrite
  movdqa %xmm1, 32(%eax) { Write H2. }
  cmp $81, %edx { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
  jle .LTwoAlignedTailWrites
  cmp $113, %edx { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
  jle .LFourAlignedTailWrites
  add $48, %eax                   { skip past H1..H2 and bias for the 64-byte loop }
  cmp $NtThreshold, %edx
  jae .L64xNT_Body
.balign 16
.L64x_Body:                       { 64 bytes per iteration, all aligned stores }
  movdqa %xmm1, (%eax)
  movdqa %xmm1, 16(%eax)
  movdqa %xmm1, 32(%eax)
  movdqa %xmm1, 48(%eax)
  add $64, %eax
  cmp %esi, %eax
  jb .L64x_Body
.LFourAlignedTailWrites:
  movdqa %xmm1, (%esi) { T4 }
  movdqa %xmm1, 16(%esi) { T3 }
.LTwoAlignedTailWrites:
  movdqa %xmm1, 32(%esi) { T2 }
.LOneAlignedTailWrite:
  movdqa %xmm1, 48(%esi) { T1 }
  movdqu %xmm0, 49(%ecx) { UT }
  pop %esi
  ret
.balign 16
.L64xNT_Body:                     { same loop with non-temporal (cache-bypassing) stores }
  movntdq %xmm1, (%eax)
  movntdq %xmm1, 16(%eax)
  movntdq %xmm1, 32(%eax)
  movntdq %xmm1, 48(%eax)
  add $64, %eax
  cmp %esi, %eax
  jb .L64xNT_Body
  sfence                          { order NT stores before any subsequent reads/writes }
  jmp .LFourAlignedTailWrites
end;
  263. {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
  264. or not defined(FPC_SYSTEM_HAS_FILLWORD)
  265. or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{ Pre-SSE2 fill backend: unaligned first/last dwords plus an aligned 8-byte
  interior loop using the misalignment-rotated pattern. Clobbers EAX, ECX,
  EDX, flags (ESI saved). }
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
asm
  mov %ecx, (%eax) { Write first 4 bytes. }
  lea -9(%eax,%edx), %edx
  mov %ecx, 5(%edx) { Write last 4 bytes. }
  and $-4, %edx { edx = loop bound. }
  push %esi
  mov %ecx, %esi { esi = pattern }
  mov %eax, %ecx
  shl $3, %ecx { ecx = misalignment of x in bits }
  rol %cl, %esi { misalign the pattern }
  add $4, %eax
  and $-4, %eax                   { eax = first aligned dword strictly inside x }
.balign 16
.L8xLoop:
  mov %esi, (%eax)
  mov %esi, 4(%eax)
  add $8, %eax
  cmp %edx, %eax
  jb .L8xLoop
  mov %esi, (%edx)                { final (possibly overlapping) 8 aligned bytes }
  mov %esi, 4(%edx)
  pop %esi
end;
{ Small-fill backend: covers 4..16 bytes with 2 or 4 possibly-overlapping
  unaligned dword stores. Clobbers flags only. }
procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16. }
asm
  mov %ecx, (%eax)                { first 4 bytes }
  cmp $8, %edx
  jle .LLast4                     { <= 8 bytes: first + last dword cover everything }
  mov %ecx, 4(%eax)
  mov %ecx, -8(%eax,%edx)
.LLast4:
  mov %ecx, -4(%eax,%edx)         { last 4 bytes }
end;
  302. {$endif FillChar/Word/DWord required.}
  303. {$endif FillChar/Word/DWord/QWord required.}
  304. {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  305. {$define FPC_SYSTEM_HAS_FILLCHAR}
{ Fills 0..3 bytes (negative counts are a no-op) with at most three
  possibly-overlapping byte stores: first, last, and middle. }
procedure FillChar_3OrLess; assembler; nostackframe;
{ cl — x, edx — byte count, Low(int32) <= edx <= 3. }
asm
  test %edx, %edx
  jle .LQuit                      { count <= 0: nothing to do }
  mov %cl, (%eax)                 { first byte }
  mov %cl, -1(%eax,%edx)          { last byte }
  shr $1, %edx
  mov %cl, (%eax,%edx)            { middle byte (duplicates one of the above for count < 3) }
.LQuit:
end;
{ FillChar for CPUs without SSE2: expands the byte to a dword pattern and
  tail-jumps to the size-appropriate shared backend. }
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx          { replicate the byte into all 4 lanes }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{ FillChar for SSE2 CPUs without ERMSB: SSE2 backend up to the NoERMS
  threshold, ‘rep stos’ beyond it. }
procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx          { replicate the byte into all 4 lanes }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
{ FillChar for SSE2 CPUs with ERMSB: identical to FillChar_SSE2 except the
  much lower threshold for switching to ‘rep stos’. }
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillChar_3OrLess
  movzbl %cl, %ecx
  imul $0x01010101, %ecx          { replicate the byte into all 4 lanes }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;

var
  { Indirect FillChar entry point. Starts at the dispatcher, which rebinds it
    to the best implementation once CPU detection has run. }
  FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;
  354. procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
  355. begin
  356. if not fpc_cpucodeinit_performed then
  357. begin
  358. FillChar_Plain(x, count, value);
  359. exit;
  360. end;
  361. if fast_large_repmovstosb then
  362. FillChar_Impl := @FillChar_SSE2_ERMS
  363. else if has_sse2_support then
  364. FillChar_Impl := @FillChar_SSE2
  365. else
  366. FillChar_Impl := @FillChar_Plain;
  367. FillChar_Impl(x, count, value);
  368. end;
{ Public FillChar: forwards through the (possibly still dispatching)
  implementation pointer. }
procedure FillChar(var x;count:SizeInt;value:byte);
begin
  FillChar_Impl(x, count, value);
end;
  373. {$endif FPC_SYSTEM_HAS_FILLCHAR}
  374. {$ifndef FPC_SYSTEM_HAS_FILLWORD}
  375. {$define FPC_SYSTEM_HAS_FILLWORD}
{ Fills 0..3 words (negative counts are a no-op) with at most three
  possibly-overlapping word stores: first, last, and middle.
  cx — value, eax — x, edx — word count. }
procedure FillWord_3OrLess; assembler; nostackframe;
asm
  test %edx, %edx
  jle .LQuit                      { count <= 0: nothing to do }
  mov %cx, (%eax)                 { first word }
  mov %cx, -2(%eax,%edx,2)        { last word }
  shr $1, %edx
  mov %cx, (%eax,%edx,2)          { middle word }
.LQuit:
end;
{ FillWord for CPUs without SSE2: converts the word count to bytes, expands
  the word to a dword pattern, and tail-jumps to the shared backends. }
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx                    { edx = byte count }
  movzwl %cx, %ecx
  imul $0x00010001, %ecx          { replicate the word into both halves }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{ FillWord for SSE2 CPUs without ERMSB. }
procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx                    { edx = byte count }
  movzwl %cx, %ecx
  imul $0x00010001, %ecx          { replicate the word into both halves }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
{ FillWord for SSE2 CPUs with ERMSB: as FillWord_SSE2 but with the lower
  ‘rep stos’ threshold. }
procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
  cmp $3, %edx
  jle FillWord_3OrLess
  shl $1, %edx                    { edx = byte count }
  movzwl %cx, %ecx
  imul $0x00010001, %ecx          { replicate the word into both halves }
  cmp $16, %edx
  jbe FillXxxx_U32Pattern_Ladder_4to16
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;

var
  { Indirect FillWord entry point; rebound by the dispatcher after CPU
    detection. }
  FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;
  426. procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
  427. begin
  428. if not fpc_cpucodeinit_performed then
  429. begin
  430. FillWord_Plain(x, count, value);
  431. exit;
  432. end;
  433. if fast_large_repmovstosb then
  434. FillWord_Impl := @FillWord_SSE2_ERMS
  435. else if has_sse2_support then
  436. FillWord_Impl := @FillWord_SSE2
  437. else
  438. FillWord_Impl := @FillWord_Plain;
  439. FillWord_Impl(x, count, value);
  440. end;
{ Public FillWord: forwards through the implementation pointer. }
procedure FillWord(var x;count:SizeInt;value:word);
begin
  FillWord_Impl(x, count, value);
end;
  445. {$endif FPC_SYSTEM_HAS_FILLWORD}
  446. {$ifndef FPC_SYSTEM_HAS_FILLDWORD}
  447. {$define FPC_SYSTEM_HAS_FILLDWORD}
{ Fills 0..4 dwords (counts < 1 are a no-op) with up to four
  possibly-overlapping dword stores.
  ecx — value, eax — x, edx — dword count. }
procedure FillDWord_4OrLess; assembler; nostackframe;
asm
  cmp $1, %edx
  jl .LQuit                       { count < 1: nothing to do }
  mov %ecx, (%eax)                { first dword }
  je .LQuit                       { count = 1: done }
  mov %ecx, 4(%eax)               { second dword }
  mov %ecx, -8(%eax,%edx,4)       { second-to-last dword }
  mov %ecx, -4(%eax,%edx,4)       { last dword }
.LQuit:
end;
{ FillDWord for CPUs without SSE2: the value is already a full dword pattern,
  so only the count needs converting to bytes before the shared backend. }
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx                    { edx = byte count (>= 20 here) }
  jmp FillXxxx_U32Pattern_Plain_16OrMore
end;
{ FillDWord for SSE2 CPUs without ERMSB. }
procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx                    { edx = byte count }
  cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
{ FillDWord for SSE2 CPUs with ERMSB: lower ‘rep stos’ threshold. }
procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
  cmp $4, %edx
  jle FillDWord_4OrLess
  shl $2, %edx                    { edx = byte count }
  cmp $FillXxxx_RepStosThreshold_ERMS, %edx
  jb FillXxxx_U32Pattern_SSE2_16OrMore
  jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;

var
  { Indirect FillDWord entry point; rebound by the dispatcher after CPU
    detection. }
  FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;
  487. procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
  488. begin
  489. if not fpc_cpucodeinit_performed then
  490. begin
  491. FillDWord_Plain(x, count, value);
  492. exit;
  493. end;
  494. if fast_large_repmovstosb then
  495. FillDWord_Impl := @FillDWord_SSE2_ERMS
  496. else if has_sse2_support then
  497. FillDWord_Impl := @FillDWord_SSE2
  498. else
  499. FillDWord_Impl := @FillDWord_Plain;
  500. FillDWord_Impl(x, count, value);
  501. end;
{ Public FillDWord: forwards through the implementation pointer. }
procedure FillDWord(var x;count:SizeInt;value:dword);
begin
  FillDWord_Impl(x, count, value);
end;
  506. {$endif FPC_SYSTEM_HAS_FILLDWORD}
  507. {$ifndef FPC_SYSTEM_HAS_FILLQWORD}
  508. {$define FPC_SYSTEM_HAS_FILLQWORD}
{ Portable FillQWord: straight 8-bytes-per-iteration loop, two 32-bit stores
  per qword. Deliberately unoptimized — see the comment below. }
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
  test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
  jle .LQuit
  push %esi
  mov 4+4(%esp), %esi { esi = value[0:31] }
  mov 4+8(%esp), %ecx { ecx = value[32:63] }
.balign 16
.LLoop:
  mov %esi, (%eax)                { low dword }
  mov %ecx, 4(%eax)               { high dword }
  add $8, %eax
  sub $1, %edx
  jnz .LLoop
  pop %esi
.LQuit:
end;
{ SSE2 FillQWord. Counts > 4 rebuild the stack frame to match what
  FillXxxx_MoreThanTwoXMMs expects, pre-rotate the 64-bit pattern by the
  pointer misalignment (via a pair of shifts + OR), and jump into the shared
  SSE2 fill loop. Counts 1..4 are handled with overlapping 32-bit stores. }
procedure FillQWord_SSE2(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
  cmp $4, %edx
  jle .L4OrLess
  movq 4(%esp), %xmm0
  punpcklqdq %xmm0, %xmm0         { xmm0 = value replicated into both qword lanes }
  { Stack is 12 bytes:
    [esp] = return address, [esp + 4] = value (not required anymore).
    Convert to 8 bytes expected by FillXxxx_MoreThanTwoXMMs:
    [esp] = esi, [esp + 4] = return address. }
  mov (%esp), %ecx
  add $4, %esp
  mov %esi, (%esp)
  mov %ecx, 4(%esp)
  shl $3, %edx                    { edx = byte count }
  movdqu %xmm0, (%eax)            { first 16 bytes, as the shared code expects }
  movdqa %xmm0, %xmm1
  test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
  jz FillXxxx_MoreThanTwoXMMs
  mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x into xmm1. }
  shl $3, %ecx
  and $63, %ecx                   { ecx = misalignment within a qword, in bits }
  movd %ecx, %xmm3
  psllq %xmm3, %xmm1              { xmm1 = pattern << misalignment }
  neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
  and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
  movd %ecx, %xmm3
  movdqa %xmm0, %xmm2
  psrlq %xmm3, %xmm2              { xmm2 = pattern >> (64 - misalignment) }
  por %xmm2, %xmm1                { xmm1 = rotated pattern for aligned stores }
  jmp FillXxxx_MoreThanTwoXMMs
.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
  cmp $1, %edx
  jl .LQuit                       { count < 1: nothing to do }
  mov 4(%esp), %ecx               { low half of value }
  mov %ecx, (%eax)
  je .LSecondHalfOf1              { count = 1: only the high half remains }
  mov %ecx, 8(%eax)               { low halves of qwords 0,1 and last two (overlap-safe) }
  mov %ecx, -16(%eax,%edx,8)
  mov %ecx, -8(%eax,%edx,8)
  mov 8(%esp), %ecx               { high half of value }
  mov %ecx, 4(%eax)
  mov %ecx, 12(%eax)
  mov %ecx, -12(%eax,%edx,8)
  mov %ecx, -4(%eax,%edx,8)
.LQuit:
  ret $8                          { pop the 8-byte value argument }
.LSecondHalfOf1:
  mov 8(%esp), %ecx
  mov %ecx, 4(%eax)
end;
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;

var
  { Indirect FillQWord entry point; rebound by the dispatcher after CPU
    detection. }
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;
  582. procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
  583. begin
  584. if not fpc_cpucodeinit_performed then
  585. begin
  586. FillQWord_Plain(x, count, value);
  587. exit;
  588. end;
  589. if has_sse2_support then
  590. FillQWord_Impl := @FillQWord_SSE2
  591. else
  592. FillQWord_Impl := @FillQWord_Plain;
  593. FillQWord_Impl(x, count, value);
  594. end;
{ Public FillQWord: forwards through the implementation pointer. }
procedure FillQWord(var x;count:SizeInt;value:qword);
begin
  FillQWord_Impl(x, count, value);
end;
  599. {$endif FPC_SYSTEM_HAS_FILLQWORD}
  600. {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
  601. {$define FPC_SYSTEM_HAS_INDEXBYTE}
{ Scalar IndexByte: returns the index of the first byte equal to b within
  buf[0..len-1], or -1 if absent. Uses the SWAR "zero byte in a dword" trick:
  after XORing a dword with the replicated pattern, matching bytes become 0,
  and ((x - $01010101) xor x) and (not x) and $80808080 has a bit set in each
  lane whose byte was 0. Register convention: eax = @buf, edx = len, cl = b. }
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
  push %esi
  push %edi
  push %eax { save initial value of 'buf' }
  cmp $4,%edx { less than 4 bytes, just test byte by byte. }
  jb .Ltail
  mov %cl,%ch { prepare pattern }
  movzwl %cx,%esi
  shl $16,%ecx
  or %esi,%ecx                    { ecx = b replicated into all 4 bytes }
.Lalignloop:
  test $3,%al { align to 4 bytes if necessary }
  je .Laligned
  cmp %cl,(%eax)
  je .Lexit
  inc %eax
  dec %edx
  jmp .Lalignloop
.balign 16 { Main loop, unrolled 4 times for speed }
.Lloop:
  mov (%eax),%esi { load dword }
  xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
  lea -0x01010101(%esi),%edi
  xor %esi,%edi { (x-0x01010101) xor x }
  not %esi
  and $0x80808080,%esi
  and %edi,%esi { ((x-0x01010101) xor x) and (not x) and 0x80808080 }
  jnz .Lfound { one of the bytes matches }
  mov 4(%eax),%esi                { second dword, same detection }
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound4
  mov 8(%eax),%esi                { third dword }
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound8
  mov 12(%eax),%esi               { fourth dword }
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jnz .Lfound12
  add $16,%eax
.Laligned:
  sub $16,%edx
  jae .Lloop { Still more than 16 bytes remaining }
  { Process remaining bytes (<16 left at this point) }
  { length is offset by -16 at this point }
.Lloop2:
  cmp $4-16,%edx { < 4 bytes left? }
  jb .Ltail
  mov (%eax),%esi                 { one dword at a time for the 4..15 byte remainder }
  xor %ecx,%esi
  lea -0x01010101(%esi),%edi
  xor %esi,%edi
  not %esi
  and $0x80808080,%esi
  and %edi,%esi
  jne .Lfound
  add $4,%eax
  sub $4,%edx
  jmp .Lloop2
.Ltail: { Less than 4 bytes remaining, check one by one }
  and $3, %edx                    { edx may be biased by -16 here; keep only count mod 4 }
  jz .Lnotfound
.Lloop3:
  cmp %cl,(%eax)
  je .Lexit
  inc %eax
  dec %edx
  jnz .Lloop3
.Lnotfound:
  or $-1,%eax                     { result = -1 }
  jmp .Lexit1
  { add missing source pointer increments }
.Lfound12:
  add $4,%eax
.Lfound8:
  add $4,%eax
.Lfound4:
  add $4,%eax
.Lfound:
  test $0xff,%esi                 { locate the first matching byte within the dword }
  jnz .Lexit
  inc %eax
  test $0xff00,%esi
  jnz .Lexit
  inc %eax
  test $0xff0000,%esi
  jnz .Lexit
  inc %eax
.Lexit:
  sub (%esp),%eax                 { result = match pointer - original buf }
.Lexit1:
  pop %ecx { removes initial 'buf' value }
  pop %edi
  pop %esi
end;
{ SSE2 IndexByte: index of first byte equal to b in buf[0..len-1], or -1.
  In: eax = @buf, edx = len, ecx = b.  Out: eax = index or -1.
  The first, possibly misaligned, vector is handled by loading the aligned
  16 bytes containing 'buf' and masking off the leading invalid bytes with a
  shl/and/shr sequence on the pcmpeqb bitmask; afterwards the loop runs on
  aligned 16-byte chunks.  Matches beyond 'len' are rejected at the end. }
function IndexByte_SSE2(const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
    test    %edx, %edx
    jz      .Lnotfound              { exit if len=0 }
    push    %ebx
    movd    %ecx, %xmm1
    lea     16(%eax), %ecx          { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
    punpcklbw %xmm1, %xmm1
    and     $-0x10, %ecx            { first aligned address after buf }
    punpcklbw %xmm1, %xmm1
    pshufd  $0, %xmm1, %xmm1        { xmm1 = b broadcast to all 16 bytes }
    movdqa  -16(%ecx), %xmm0        { Fetch first 16 bytes (up to 15 bytes before target) }
    sub     %eax, %ecx              { ecx=number of valid bytes, eax=original ptr }
    pcmpeqb %xmm1, %xmm0            { compare with pattern and get bitmask }
    pmovmskb %xmm0, %ebx
    shl     %cl, %ebx               { shift valid bits into high word }
    and     $0xffff0000, %ebx       { clear low word containing invalid bits }
    shr     %cl, %ebx               { shift back }
    jz      .Lcontinue
.Lmatch:
    bsf     %ebx, %ebx
    lea     -16(%ecx,%ebx), %eax    { index = bytes consumed - 16 + bit position }
    pop     %ebx
    cmp     %eax, %edx              { check against the buffer length }
    jbe     .Lnotfound
    ret
    .balign 16
.Lloop:
    movdqa  (%eax,%ecx), %xmm0      { eax and ecx may have any values, }
    add     $16, %ecx               { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test    %ebx, %ebx
    jnz     .Lmatch
.Lcontinue:
    cmp     %ecx, %edx
    ja      .Lloop
    pop     %ebx
.Lnotfound:
    or      $-1, %eax
end;
  752. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;
  753. var
  754. IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;
  755. function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
  756. begin
  757. if not fpc_cpucodeinit_performed then
  758. exit(IndexByte_Plain(buf,len,b));
  759. if has_sse2_support then
  760. IndexByte_Impl:=@IndexByte_SSE2
  761. else
  762. IndexByte_Impl:=@IndexByte_Plain;
  763. result:=IndexByte_Impl(buf,len,b);
  764. end;
{ Public IndexByte entry point: index of first occurrence of byte b in
  buf[0..len-1], or -1.  Forwards through the IndexByte_Impl function pointer,
  which is patched to the CPU-appropriate implementation on first use. }
function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
begin
  result:=IndexByte_Impl(buf,len,b);
end;
  769. {$endif FPC_SYSTEM_HAS_INDEXBYTE}
  770. {$ifndef FPC_SYSTEM_HAS_INDEXWORD}
  771. {$define FPC_SYSTEM_HAS_INDEXWORD}
{ Generic (pre-SSE2) IndexWord: index of first word equal to b in
  buf[0..len-1] (len counted in words), or -1.
  In: eax = @buf, edx = len, cx = b.  Out: eax = index or -1.
  Simple wordwise scan; the original buf is kept on the stack so the index
  can be recovered as (end pointer - start pointer) / 2. }
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
    test    %edx, %edx
    jz      .LNotFound
    push    %eax                    { remember original buf }
.LWordwise_Body:                    { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    cmp     %cx, (%eax)
    je      .LFound
    add     $2, %eax
    dec     %edx
    jnz     .LWordwise_Body
    pop     %edx
.LNotFound:
    or      $-1, %eax
    ret
.LFound:
    pop     %edx
    sub     %edx, %eax              { byte offset of the match }
    shr     $1, %eax                { convert to word index }
end;
{ SSE2 IndexWord: index of first word equal to b in buf[0..len-1] (len in
  words), or -1.
  In: eax = @buf, edx = len, ecx = b.  Out: eax = index or -1.
  Word-aligned buffers use pcmpeqw on aligned vectors (first vector masked via
  the shl/and/shr trick, as in IndexByte_SSE2).  Buffers starting on an odd
  address take the .Lunaligned path: the pattern's bytes are swapped, bytes
  are compared with pcmpeqb, and adjacent mask bits are ANDed across vector
  boundaries (carrying one bit in esi) to detect a full 2-byte match. }
function IndexWord_SSE2(const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
    test    %edx, %edx              { exit if len=0 }
    je      .Lnotfound
    push    %ebx
    movd    %ecx, %xmm1
    punpcklwd %xmm1, %xmm1
    pshufd  $0, %xmm1, %xmm1        { xmm1 = b broadcast to all 8 words }
    lea     16(%eax), %ecx
    and     $-16, %ecx
    movdqa  -16(%ecx), %xmm0        { Fetch first 16 bytes (up to 14 bytes before target) }
    sub     %eax, %ecx
    test    $1, %eax                { if buffer isn't aligned to word boundary, }
    jnz     .Lunaligned             { use a different algorithm }
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl     %cl, %ebx               { mask off bits belonging to bytes before buf }
    and     $0xffff0000, %ebx
    shr     %cl, %ebx
    shr     $1, %ecx                { ecx=number of valid bytes }
    test    %ebx, %ebx
    jz      .Lcontinue
.Lmatch:
    bsf     %ebx, %ebx
    shr     $1, %ebx                { in words }
    lea     -8(%ecx,%ebx), %eax
    pop     %ebx
    cmp     %eax, %edx
    jbe     .Lnotfound              { if match is after the specified length, ignore it }
    ret
    .balign 16
.Lloop:
    movdqa  (%eax,%ecx,2), %xmm0
    add     $8, %ecx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test    %ebx, %ebx
    jnz     .Lmatch
.Lcontinue:
    cmp     %ecx, %edx
    ja      .Lloop
    pop     %ebx
.Lnotfound:
    or      $-1, %eax
    ret
.Lunaligned:
    push    %esi
    movdqa  %xmm1, %xmm2            { (mis)align the pattern (in this particular case: }
    psllw   $8, %xmm1               { swap bytes of each word of pattern) }
    psrlw   $8, %xmm2
    por     %xmm2, %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl     %cl, %ebx
    and     $0xffff0000, %ebx
    shr     %cl, %ebx
    xor     %esi, %esi              { nothing to merge yet }
    add     %edx, %edx              { length words -> bytes }
    jmp     .Lcontinue_u
    .balign 16
.Lloop_u:
    movdqa  (%eax,%ecx), %xmm0
    add     $16, %ecx
    pcmpeqb %xmm1, %xmm0            { compare by bytes }
    shr     $16, %esi               { bit 16 shifts into 0 }
    pmovmskb %xmm0, %ebx
.Lcontinue_u:
    shl     $1, %ebx                { 15:0 -> 16:1 }
    or      %esi, %ebx              { merge bit 0 from previous round }
    mov     %ebx, %esi
    shr     $1, %ebx                { now AND together adjacent pairs of bits }
    and     %esi, %ebx
    and     $0x5555, %ebx           { also reset odd bits }
    jnz     .Lmatch_u
    cmp     %ecx, %edx
    ja      .Lloop_u
.Lnotfound_u:
    pop     %esi
    pop     %ebx
    or      $-1, %eax
    ret
.Lmatch_u:
    bsf     %ebx, %ebx
    lea     -16(%ecx,%ebx), %eax
    cmp     %eax, %edx
    jbe     .Lnotfound_u            { if match is after the specified length, ignore it }
    sar     $1, %eax                { in words }
    pop     %esi
    pop     %ebx
end;
  882. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;
  883. var
  884. IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;
  885. function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
  886. begin
  887. if not fpc_cpucodeinit_performed then
  888. exit(IndexWord_Plain(buf,len,b));
  889. if has_sse2_support then
  890. IndexWord_Impl:=@IndexWord_SSE2
  891. else
  892. IndexWord_Impl:=@IndexWord_Plain;
  893. result:=IndexWord_Impl(buf,len,b);
  894. end;
{ Public IndexWord entry point: index of first word equal to b in
  buf[0..len-1] (len in words), or -1.  Forwards through IndexWord_Impl,
  which is patched to the CPU-appropriate implementation on first use. }
function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
begin
  result:=IndexWord_Impl(buf,len,b);
end;
  899. {$endif FPC_SYSTEM_HAS_INDEXWORD}
  900. {$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
  901. {$define FPC_SYSTEM_HAS_INDEXDWORD}
{ Generic (pre-SSE2) IndexDWord: index of first dword equal to b in
  buf[0..len-1] (len in dwords), or -1.
  In: eax = @buf, edx = len, ecx = b.  Out: eax = index or -1.
  Plain dwordwise scan; original buf is kept on the stack to turn the final
  pointer back into an index. }
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
    push    %eax                    { remember original buf }
    sub     $4, %eax                { pre-bias: loop adds 4 before each compare }
.LDWordwise_Next:                   { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    add     $4, %eax
    sub     $1, %edx
    jb      .LNotFound
    cmp     %ecx, (%eax)
    jne     .LDWordwise_Next
    pop     %edx
    sub     %edx, %eax              { byte offset of the match }
    shr     $2, %eax                { convert to dword index }
    ret
.LNotFound:
    pop     %edx
    mov     $-1, %eax
end;
{ SSE2 IndexDWord: index of first dword equal to b in buf[0..len-1] (len in
  dwords), or -1.
  In: eax = @buf, edx = len, ecx = b.  Out: eax = index or -1.
  Processes 4 dwords per iteration with unaligned loads; the final partial
  group is handled by re-reading the last 16 bytes (overlapping the previous
  group), which is safe because a match found twice resolves to the same
  index.  Lengths of 4 or less go through the scalar .LDwordwise path. }
function IndexDWord_SSE2(const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
    push    %eax                    { remember original buf }
    sub     $4, %edx
    jle     .LDwordwise_Prepare     { len <= 4: scalar loop }
    movd    %ecx, %xmm1
    pshufd  $0, %xmm1, %xmm1        { xmm1 = b broadcast to all 4 dwords }
    .balign 16                      { 1-byte NOP. }
.L4x_Body:
    movdqu  (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test    %ecx, %ecx
    jnz     .LFoundAtMask
    add     $16, %eax
    sub     $4, %edx
    jg      .L4x_Body
    lea     (%eax,%edx,4), %eax     { step back to the last full 16-byte window }
    movdqu  (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test    %ecx, %ecx
    jz      .LNothing
.LFoundAtMask:
    bsf     %ecx, %ecx              { lowest set bit = byte offset of match }
    add     %ecx, %eax
.LFoundAtEax:
    pop     %edx
    sub     %edx, %eax              { byte offset from original buf }
    shr     $2, %eax                { convert to dword index }
    ret
    nop                             { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
    add     $3, %edx                { undo the earlier -4, minus 1 for the jb-style loop }
    cmp     $-1, %edx
    je      .LNothing
    .balign 16                      { no-op }
.LDwordwise_Body:
    cmp     (%eax), %ecx
    je      .LFoundAtEax
    add     $4, %eax
    sub     $1, %edx
    jae     .LDwordwise_Body
.LNothing:
    pop     %edx
    or      $-1, %eax
end;
  967. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;
  968. var
  969. IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;
  970. function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
  971. begin
  972. if not fpc_cpucodeinit_performed then
  973. exit(IndexDWord_Plain(buf,len,b));
  974. if has_sse2_support then
  975. IndexDWord_Impl:=@IndexDWord_SSE2
  976. else
  977. IndexDWord_Impl:=@IndexDWord_Plain;
  978. result:=IndexDWord_Impl(buf,len,b);
  979. end;
{ Public IndexDWord entry point: index of first dword equal to b in
  buf[0..len-1] (len in dwords), or -1.  Forwards through IndexDWord_Impl,
  which is patched to the CPU-appropriate implementation on first use. }
function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  result:=IndexDWord_Impl(buf,len,b);
end;
  984. {$endif FPC_SYSTEM_HAS_INDEXDWORD}
  985. {$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
  986. {$define FPC_SYSTEM_HAS_INDEXQWORD}
{ IndexQWord: index of first qword equal to b in buf[0..len-1] (len in
  qwords), or -1.  No SSE2 variant exists; a scalar loop comparing the two
  dword halves is used unconditionally.
  In: eax = buf, edx = len, b passed on the stack at [esp+4] (both halves).
  'ret $8' pops the stack-passed qword on return; the fall-through 'end;'
  generates the equivalent epilogue for the not-found path. }
function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
    push    %ebx
    mov     8(%esp), %ecx           { ecx = b[0:31] }
    mov     12(%esp), %ebx          { ebx = b[32:63] }
    mov     %eax, 8(%esp)           { remember original buf }
    sub     $8, %eax                { pre-bias: loop adds 8 before each compare }
    .balign 16                      { no-op }
.LQWordwise_Next:
    add     $8, %eax
    sub     $1, %edx
    jb      .LNotFound
    cmp     %ecx, (%eax)            { compare low dword first, }
    jne     .LQWordwise_Next
    cmp     %ebx, 4(%eax)           { then the high dword }
    jne     .LQWordwise_Next
    sub     8(%esp), %eax           { byte offset from original buf }
    pop     %ebx
    shr     $3, %eax                { convert to qword index }
    ret     $8
.LNotFound:
    pop     %ebx
    mov     $-1, %eax
end;
  1012. {$endif FPC_SYSTEM_HAS_INDEXQWORD}
  1013. {$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
  1014. {$define FPC_SYSTEM_HAS_COMPAREBYTE}
{ Generic (pre-SSE2) CompareByte: lexicographic comparison of buf1 and buf2
  over len bytes.  Returns 0 if equal, negative if buf1 < buf2, positive if
  buf1 > buf2 (sign produced via the sbb/or trick: -1 or +1).
  In: eax = buf1, edx = buf2, ecx = len.
  Compares a dword at a time (buf2 addressed as buf1 + delta); on a mismatch
  both dwords are byte-swapped to big-endian so a single unsigned compare
  yields the correct bytewise ordering. }
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    { eax = buf1, edx = buf2, ecx = len }
    push    %ebx
    sub     %eax, %edx              { edx = buf2 - buf1 }
    cmp     $3, %ecx
    jle     .LBytewise_Prepare
    { Align buf1 on 4 bytes. }
    mov     (%edx,%eax), %ebx
    cmp     (%eax), %ebx
    jne     .L4xDiffer
    lea     -4(%eax,%ecx), %ecx     { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
    and     $-4, %eax
    sub     %eax, %ecx
    .balign 16
.L4x_Next:
    add     $4, %eax
    sub     $4, %ecx                { at .LLast4, ecx is 4 less than remaining bytes }
    jle     .LLast4
    mov     (%edx,%eax), %ebx
    cmp     (%eax), %ebx
    je      .L4x_Next
.L4xDiffer:
    mov     (%eax), %edx
{$ifdef CPUX86_HAS_BSWAP}
    bswap   %ebx
    bswap   %edx
{$else}
    { byte-swap without bswap: rotate halves, then the whole register }
    rol     $8, %bx
    rol     $16, %ebx
    rol     $8, %bx
    rol     $8, %dx
    rol     $16, %edx
    rol     $8, %dx
{$endif}
    cmp     %ebx, %edx
.LDoSbb:
    sbb     %eax, %eax              { -1 if borrow (buf1 < buf2), else 0 }
    or      $1, %eax                { map to -1 / +1 }
    pop     %ebx
    ret
.LLast4:
    add     %ecx, %eax              { re-read the final (overlapping) dword }
    mov     (%edx,%eax), %ebx
    cmp     (%eax), %ebx
    jne     .L4xDiffer
    xor     %eax, %eax
    pop     %ebx
    ret
.LBytewise_Prepare:
    sub     $1, %ecx
    jb      .LNothing
    .balign 16                      { no-op }
.LBytewise_Body:
    movzbl  (%edx,%eax), %ebx
    cmp     %bl, (%eax)
    jne     .LDoSbb
    add     $1, %eax
    sub     $1, %ecx
    jae     .LBytewise_Body
.LNothing:
    xor     %eax, %eax
    pop     %ebx
end;
{ SSE2 CompareByte: lexicographic comparison of buf1 and buf2 over len bytes.
  Returns 0 if equal, negative if buf1 < buf2, positive if buf1 > buf2.
  In: eax = buf1, edx = buf2, ecx = len.
  Layout: short inputs (2..15 bytes) are over-read as whole XMM vectors when
  neither buffer is within 16 bytes of a page end (pessimistic page-cross
  check); otherwise they fall back to dword/byte code at .LCantOverReadBoth.
  16+ bytes: compare first one or two vectors unaligned, then run an aligned
  32-bytes-per-iteration loop, re-reading the (overlapping) last two vectors
  at the end.  'inc %bx' turns the all-ones pcmpeqb mask ($ffff) into zero,
  so ZF set means "vectors equal". }
function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
    { eax = buf1, edx = buf2, ecx = len }
    cmp     $1, %ecx
    jle     .L1OrLess
    push    %ebx
    cmp     $16, %ecx
    jae     .LVecOrMore
    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
    mov     %eax, %ebx
    or      %edx, %ebx
    and     $4095, %ebx
    cmp     $4080, %ebx
    ja      .LCantOverReadBoth
    { Over-read both as XMMs. }
    movdqu  (%eax), %xmm0
    movdqu  (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx                     { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
    jz      .LNothing
    bsf     %ebx, %ebx
    cmp     %ecx, %ebx              { Ignore garbage beyond 'len'. }
    jae     .LNothing
    movzbl  (%eax,%ebx), %eax
    movzbl  (%edx,%ebx), %edx
    sub     %edx, %eax
    pop     %ebx
    ret
.LNothing:
    pop     %ebx
    xor     %eax, %eax
    ret
.LVecOrMore:
    { Compare first vectors. }
    movdqu  (%eax), %xmm0
    movdqu  (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVec0Differs
    sub     $32, %ecx               { now ecx is len - 32... mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
    jbe     .LLastVec
    { Compare second vectors. }
    movdqu  16(%eax), %xmm0
    movdqu  16(%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVec1Differs
    { More than four vectors: aligned loop. }
    cmp     $32, %ecx
    ja      .LAligned32xLoop_Prepare
    { Compare last two vectors. }
    movdqu  (%eax,%ecx), %xmm0
    movdqu  (%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVecEm2Differs
.LLastVec:
    movdqu  16(%eax,%ecx), %xmm0
    movdqu  16(%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVecEm1Differs
    pop     %ebx
    xor     %eax, %eax
    ret
.LVecEm2Differs:
    sub     $16, %ecx               { difference was one vector earlier }
.LVecEm1Differs:
    bsf     %ebx, %ebx
    add     %ecx, %ebx
    movzbl  16(%eax,%ebx), %eax
    movzbl  16(%edx,%ebx), %edx
    sub     %edx, %eax
    pop     %ebx
    ret
    nop                             { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LAligned32xLoop_Prepare:
    lea     -32(%eax,%ecx), %ecx    { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
    sub     %eax, %edx              { edx = buf2 - buf1 }
    and     $-16, %eax              { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub     %eax, %ecx              { ecx = count to be handled with loop }
    .balign 16                      { No-op. }
.LAligned32xLoop_Body:
    add     $32, %eax
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu  (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0           { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu  16(%edx,%eax), %xmm1
    pcmpeqb 16(%eax), %xmm1
    pand    %xmm0, %xmm1            { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %ebx
    inc     %bx
    jnz     .LAligned32xLoop_TwoVectorsDiffer
    sub     $32, %ecx
    ja      .LAligned32xLoop_Body
    { Compare last two vectors after the loop by doing one more loop iteration, modified. }
    lea     32(%eax,%ecx), %eax
    movdqu  (%edx,%eax), %xmm0
    movdqu  (%eax), %xmm2
    pcmpeqb %xmm2, %xmm0
    movdqu  16(%edx,%eax), %xmm1
    movdqu  16(%eax), %xmm2
    pcmpeqb %xmm2, %xmm1
    pand    %xmm0, %xmm1
    pmovmskb %xmm1, %ebx
    inc     %bx
    jnz     .LAligned32xLoop_TwoVectorsDiffer
    pop     %ebx
    xor     %eax, %eax
    ret
.LAligned32xLoop_TwoVectorsDiffer:
    add     %eax, %edx              { restore edx = buf2 }
    pmovmskb %xmm0, %ecx            { Is there a difference in the first vector? }
    inc     %cx
    jz      .LVec1Differs           { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    bsf     %ecx, %ebx
    movzbl  (%eax,%ebx), %eax
    movzbl  (%edx,%ebx), %edx
    sub     %edx, %eax
    pop     %ebx
    ret
.LVec1Differs:
    add     $16, %eax
    add     $16, %edx
.LVec0Differs:
    bsf     %ebx, %ebx
    movzbl  (%eax,%ebx), %eax
    movzbl  (%edx,%ebx), %edx
    sub     %edx, %eax
    pop     %ebx
    ret
.LCantOverReadBoth:
    cmp     $3, %ecx
    jle     .L2to3
    push    %esi
    { 4..15 bytes: compare (overlapping) dword pairs from both ends }
    mov     (%eax), %ebx
    mov     (%edx), %esi
    cmp     %esi, %ebx
    jne     .L4xDiffer
    cmp     $8, %ecx
    jbe     .LLast4x
    mov     4(%eax), %ebx
    mov     4(%edx), %esi
    cmp     %esi, %ebx
    jne     .L4xDiffer
    mov     -8(%eax,%ecx), %ebx
    mov     -8(%edx,%ecx), %esi
    cmp     %esi, %ebx
    jne     .L4xDiffer
.LLast4x:
    mov     -4(%eax,%ecx), %ebx
    mov     -4(%edx,%ecx), %esi
    cmp     %esi, %ebx
    jne     .L4xDiffer
    pop     %esi
    pop     %ebx
    xor     %eax, %eax
    ret
.L4xDiffer:
    { big-endian compare of the differing dwords gives bytewise order }
    bswap   %ebx
    bswap   %esi
    cmp     %esi, %ebx
    pop     %esi
    sbb     %eax, %eax
    or      $1, %eax
    pop     %ebx
    ret
.L2to3:
    { 2..3 bytes, no over-read possible: build 3-byte big-endian keys }
    movzwl  (%edx), %ebx
    bswap   %ebx
    shr     $1, %ebx
    mov     -1(%edx,%ecx), %bl
    movzwl  (%eax), %edx
    bswap   %edx
    shr     $1, %edx
    mov     -1(%eax,%ecx), %dl
    mov     %edx, %eax
    sub     %ebx, %eax
    pop     %ebx
    ret
.L1OrLess:
    jl      .LUnbounded_Prepare
    { exactly 1 byte }
    movzbl  (%eax), %eax
    movzbl  (%edx), %edx
    sub     %edx, %eax
    ret
.LUnbounded_Prepare:
    { len < 1: scan until a difference is found (no length bound) }
    sub     %eax, %edx              { edx = buf2 - buf1 }
    test    %ecx, %ecx
    jnz     .LUnbounded_Body
    xor     %eax, %eax
    ret
    .balign 16
.LUnbounded_Next:
    add     $1, %eax
.LUnbounded_Body:
    movzbl  (%edx,%eax), %ecx
    cmp     %cl, (%eax)
    je      .LUnbounded_Next
    sbb     %eax, %eax
    or      $1, %eax
end;
  1286. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1287. var
  1288. CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
  1289. function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1290. begin
  1291. if not fpc_cpucodeinit_performed then
  1292. exit(CompareByte_Plain(buf1, buf2, len));
  1293. if has_sse2_support then
  1294. CompareByte_Impl:=@CompareByte_SSE2
  1295. else
  1296. CompareByte_Impl:=@CompareByte_Plain;
  1297. result:=CompareByte_Impl(buf1, buf2, len);
  1298. end;
{ Public CompareByte entry point: compares len bytes of buf1 and buf2;
  returns 0 / negative / positive.  Forwards through CompareByte_Impl,
  which is patched to the CPU-appropriate implementation on first use. }
function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareByte_Impl(buf1, buf2, len);
end;
  1303. {$endif FPC_SYSTEM_HAS_COMPAREBYTE}
  1304. {$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
  1305. {$define FPC_SYSTEM_HAS_COMPAREWORD}
{ Generic (pre-SSE2) CompareWord: compares len words of buf1 and buf2;
  returns 0 if equal, -1/+1 according to the first differing word (unsigned
  word comparison, via the sbb/or trick).
  In: eax = buf1, edx = buf2, ecx = len (words).
  Word-aligns buf1 and then compares a dword (two words) per iteration; on a
  dword mismatch, .LPtrUintsDiffer determines which of the two words differs. }
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push    %ebx
    sub     %eax, %edx              { edx = buf2 - buf1 }
    lea     -4(%ecx), %ebx          { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
    cmp     $1073741819, %ebx
    ja      .LWordwise_Prepare
    test    $2, %al
    je      .LAlignedToPtrUintOrNaturallyMisaligned
    { buf1 is on an odd word boundary: compare one word to align it }
    movzwl  (%edx,%eax), %ebx
    cmp     %bx, (%eax)
    jne     .LDoSbb
    add     $2, %eax
    sub     $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
    sub     $2, %ecx
    .balign 16
.LPtrUintWise_Next:
    mov     (%edx,%eax), %ebx
    cmp     %ebx, (%eax)
    jne     .LPtrUintsDiffer
    add     $4, %eax
    sub     $2, %ecx
    jg      .LPtrUintWise_Next
    lea     (%eax,%ecx,2), %eax     { step back for the final (overlapping) dword }
    mov     (%edx,%eax), %ebx
    cmp     %ebx, (%eax)
    jne     .LPtrUintsDiffer
    pop     %ebx
    xor     %eax, %eax
    ret
.LPtrUintsDiffer:
    { find whether the low or the high word of the dword differs }
    cmp     %bx, (%eax)
    jne     .LDoSbb
    shr     $16, %ebx
    cmp     %bx, 2(%eax)
.LDoSbb:
    sbb     %eax, %eax              { -1 if buf1 word < buf2 word, else 0 }
    or      $1, %eax                { map to -1 / +1 }
    pop     %ebx
    ret
    .balign 16
.LWordwise_Body:
    movzwl  (%edx,%eax), %ebx
    cmp     %bx, (%eax)
    jne     .LDoSbb
    add     $2, %eax
.LWordwise_Prepare:
    sub     $1, %ecx
    jnb     .LWordwise_Body
    pop     %ebx
    xor     %eax, %eax
end;
{ SSE2 CompareWord: compares len words of buf1 and buf2; returns 0 if equal,
  -1/+1 according to the first differing word.
  In: eax = buf1, edx = buf2, ecx = len (words).
  Short inputs (< 8 words) are over-read as one XMM when safely away from a
  page end, else handled wordwise.  Longer inputs compare the first vector
  unaligned, then run a 16-bytes-per-iteration loop aligned on buf1 (byte
  compares; word position is recovered from the saved original buf1 in
  .LAligned8xLoop_VecDiffers by rounding the offset down to even), with the
  last 16 bytes re-read at the end. }
function CompareWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push    %ebx
    sub     %eax, %edx              { edx = buf2 - buf1 }
    lea     -2(%ecx), %ebx          { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
    cmp     $1073741821, %ebx
    ja      .LWordwise_Prepare
    cmp     $8, %ecx
    jge     .LVecOrMore
    { 2..7 words: pessimistic page-cross check before over-reading 16 bytes }
    lea     (%edx,%eax), %ebx
    or      %eax, %ebx
    and     $4095, %ebx
    cmp     $4080, %ebx
    ja      .LWordwise_Prepare
    movdqu  (%edx,%eax), %xmm0
    movdqu  (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jz      .LNothing
    shl     $1, %ecx                { convert to bytes }
    bsf     %ebx, %ebx
    cmp     %ecx, %ebx              { ignore differences beyond 'len' }
    jb      .LSubtractWords
.LNothing:
    pop     %ebx
    xor     %eax, %eax
    ret
    .balign 16
.LWordwise_Body:
    movzwl  (%edx,%eax), %ebx
    cmp     %bx, (%eax)
    jne     .LDoSbb
    add     $2, %eax
.LWordwise_Prepare:
    sub     $1, %ecx
    jae     .LWordwise_Body
    xor     %eax, %eax
    pop     %ebx
    ret
.LDoSbb:
    sbb     %eax, %eax
    or      $1, %eax
    pop     %ebx
    ret
.LVecOrMore:
    movdqu  (%edx,%eax), %xmm0      { Compare first vectors. }
    movdqu  (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVec0Differs
    shl     $1, %ecx                { convert to bytes }
    sub     $32, %ecx               { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle     .LLastVec
    push    %eax                    { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add     %eax, %ecx
    and     $-16, %eax              { align buf1; +16 is performed by the loop. }
    sub     %eax, %ecx
    .balign 16
.LAligned8xLoop_Body:
    add     $16, %eax
    movdqu  (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LAligned8xLoop_VecDiffers
    sub     $16, %ecx
    ja      .LAligned8xLoop_Body
    pop     %ebx                    { drop original buf1 }
.LLastVec:
    lea     16(%eax,%ecx), %eax     { point to the last 16 bytes }
    movdqu  (%edx,%eax), %xmm0
    movdqu  (%eax), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVec0Differs
    pop     %ebx
    xor     %eax, %eax
    ret
.LVec0Differs:
    bsf     %ebx, %ebx
.LSubtractWords:
    add     %eax, %edx              { recover edx = buf2 }
    movzwl  (%eax,%ebx), %eax
    movzwl  (%edx,%ebx), %edx
    sub     %edx, %eax
    pop     %ebx
    ret
.LAligned8xLoop_VecDiffers:
    bsf     %ebx, %ebx
    add     %ebx, %eax              { absolute address of differing byte }
    pop     %ecx                    { original buf1 }
    sub     %ecx, %eax
    and     $-2, %eax               { round down to the start of the word }
    add     %ecx, %eax
    movzwl  (%edx,%eax), %edx
    movzwl  (%eax), %eax
    sub     %edx, %eax
    pop     %ebx
end;
  1461. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1462. var
  1463. CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
  1464. function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1465. begin
  1466. if not fpc_cpucodeinit_performed then
  1467. exit(CompareWord_Plain(buf1, buf2, len));
  1468. if has_sse2_support then
  1469. CompareWord_Impl:=@CompareWord_SSE2
  1470. else
  1471. CompareWord_Impl:=@CompareWord_Plain;
  1472. result:=CompareWord_Impl(buf1, buf2, len);
  1473. end;
{ Public CompareWord entry point: compares len words of buf1 and buf2;
  returns 0 / negative / positive.  Forwards through CompareWord_Impl,
  which is patched to the CPU-appropriate implementation on first use. }
function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareWord_Impl(buf1, buf2, len);
end;
  1478. {$endif FPC_SYSTEM_HAS_COMPAREWORD}
  1479. {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
  1480. {$define FPC_SYSTEM_HAS_COMPAREDWORD}
{ Generic (pre-SSE2) CompareDWord: compares len dwords of buf1 and buf2;
  returns 0 if equal, -1/+1 according to the first differing dword (unsigned
  dword comparison via the sbb/or trick).
  In: eax = buf1, edx = buf2, ecx = len (dwords). }
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    sub     $1, %ecx
    jb      .LNothing               { len <= 0: buffers are "equal" }
    push    %ebx
    sub     %eax, %edx              { edx = buf2 - buf1 }
    .balign 16
.LDwordwise_Body:
    mov     (%edx,%eax), %ebx
    cmp     %ebx, (%eax)
    jne     .LDoSbb
    add     $4, %eax
    sub     $1, %ecx
    jnb     .LDwordwise_Body
    pop     %ebx
.LNothing:
    xor     %eax, %eax
    ret
.LDoSbb:
    pop     %ebx
    sbb     %eax, %eax              { -1 if buf1 dword < buf2 dword, else 0 }
    or      $1, %eax                { map to -1 / +1 }
end;
{ SSE2 CompareDWord: compares len dwords of buf1 and buf2; returns 0 if
  equal, -1/+1 according to the first differing dword.
  In: eax = buf1, edx = buf2, ecx = len (dwords).
  Same structure as CompareWord_SSE2: first vector compared unaligned, then a
  16-bytes-per-iteration loop aligned on buf1 (byte compares; the dword
  position is recovered from the saved original buf1 by rounding the byte
  offset down to a multiple of 4), last 16 bytes re-read at the end.
  Lengths <= 4 dwords go through the scalar .LDwordwise path. }
function CompareDWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push    %ebx
    sub     %eax, %edx              { edx = buf2 - buf1 }
    lea     -5(%ecx), %ebx          { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
    cmp     $536870906, %ebx
    ja      .LDwordwise_Prepare
    shl     $2, %ecx                { convert to bytes }
    movdqu  (%edx,%eax), %xmm1      { Compare first vectors. }
    movdqu  (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVec0Differs
    sub     $32, %ecx               { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle     .LLastVec
    push    %eax                    { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add     %eax, %ecx
    and     $-16, %eax              { align buf1; +16 is performed by the loop. }
    sub     %eax, %ecx
    .balign 16
.LAligned4xLoop_Body:
    add     $16, %eax
    movdqu  (%eax,%edx), %xmm0
    pcmpeqb (%eax), %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LAligned4xLoop_VecDiffers
    sub     $16, %ecx
    ja      .LAligned4xLoop_Body
    pop     %ebx                    { drop original buf1 }
.LLastVec:
    lea     16(%eax,%ecx), %eax     { point to the last 16 bytes }
    movdqu  (%edx,%eax), %xmm1
    movdqu  (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc     %bx
    jnz     .LVec0Differs
    pop     %ebx
    xor     %eax, %eax
    ret
.LVec0Differs:
    bsf     %ebx, %ebx
    add     %eax, %edx              { recover edx = buf2 }
    mov     (%edx,%ebx), %edx
    cmp     %edx, (%eax,%ebx)
    sbb     %eax, %eax
    or      $1, %eax
    pop     %ebx
    ret
.LAligned4xLoop_VecDiffers:
    bsf     %ebx, %ebx
    add     %ebx, %eax              { absolute address of differing byte }
    pop     %ecx                    { original buf1 }
    sub     %ecx, %eax
    and     $-4, %eax               { round down to the start of the dword }
    add     %ecx, %eax
    mov     (%edx,%eax), %edx
    cmp     %edx, (%eax)
.LDoSbb:
    sbb     %eax, %eax
    or      $1, %eax
    pop     %ebx
    ret
    .balign 16
.LDwordwise_Body:
    mov     (%edx,%eax), %ebx
    cmp     %ebx, (%eax)
    jne     .LDoSbb
    add     $4, %eax
.LDwordwise_Prepare:
    sub     $1, %ecx
    jnb     .LDwordwise_Body
    pop     %ebx
    xor     %eax, %eax
end;
  1581. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
  1582. var
  1583. CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
  1584. function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
  1585. begin
  1586. if not fpc_cpucodeinit_performed then
  1587. exit(CompareDWord_Plain(buf1, buf2, len));
  1588. if has_sse2_support then
  1589. CompareDWord_Impl:=@CompareDWord_SSE2
  1590. else
  1591. CompareDWord_Impl:=@CompareDWord_Plain;
  1592. result:=CompareDWord_Impl(buf1, buf2, len);
  1593. end;
{ Public CompareDWord entry point: compares len dwords of buf1 and buf2;
  returns 0 / negative / positive.  Forwards through CompareDWord_Impl,
  which is patched to the CPU-appropriate implementation on first use. }
function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareDWord_Impl(buf1, buf2, len);
end;
  1598. {$endif FPC_SYSTEM_HAS_COMPAREDWORD}
  1599. {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
  1600. {$define FPC_SYSTEM_HAS_INDEXCHAR0}
{ IndexChar0: scan buf for character b, stopping at a #0 terminator or after
  len bytes.  Returns the index of b, or -1 if not found before the bound or
  the terminator; returns 0 immediately when len = 0.
  In: eax = @buf, edx = len, cl = b.  esi/ebx are preserved via locals. }
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
  saveesi,saveebx : longint;
asm
    movl    %esi,saveesi
    movl    %ebx,saveebx
    // Can't use scasb, or will have to do it twice, think this
    // is faster for small "len"
    movl    %eax,%esi               // Load address
    movzbl  %cl,%ebx                // Load searchpattern
    testl   %edx,%edx
    je      .LFound                 // len=0: result is ecx, still holding 0 in cl path? (returns current ecx)
    xorl    %ecx,%ecx               // zero index in Buf
    xorl    %eax,%eax               // To make DWord compares possible
    .balign 4
.LLoop:
    movb    (%esi),%al              // Load byte
    cmpb    %al,%bl
    je      .LFound                 // byte the same?
    incl    %ecx
    incl    %esi
    cmpl    %edx,%ecx               // Maximal distance reached?
    je      .LNotFound
    testl   %eax,%eax               // Nullchar = end of search?
    jne     .LLoop
.LNotFound:
    movl    $-1,%ecx                // Not found return -1
.LFound:
    movl    %ecx,%eax
    movl    saveesi,%esi
    movl    saveebx,%ebx
end;
  1633. {$endif FPC_SYSTEM_HAS_INDEXCHAR0}
  1634. {****************************************************************************
  1635. String
  1636. ****************************************************************************}
  1637. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1638. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{ Compiler helper: copy shortstring 'sstr' into 'res', truncating to
  high(res) if necessary.  Stores the (possibly clamped) length byte first,
  then tail-calls Move for the character data.
  In: eax = @res, edx = high(res), ecx = @sstr.
  NOTE(review): with FPC_PROFILE a stack frame exists and Move is called
  normally (with optional 16-byte stack realignment); otherwise the routine
  is nostackframe and ends in 'jmp Move'. }
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
    push    %eax
    push    %edx
    push    %ecx
    call    mcount
    pop     %ecx
    pop     %edx
    pop     %eax
{$endif FPC_PROFILE}
    cmp     (%ecx), %dl             { length(sstr) fits into res? }
    jbe     .LEdxIsLen              { use high(res) if length(sstr) does not fit }
    movzbl  (%ecx), %edx            { use length(sstr) }
.LEdxIsLen:
    mov     %dl, (%eax)             { store length to res[0] }
    xchg    %ecx, %edx              { ecx = length = Move count, edx = sstr }
    xchg    %eax, %edx              { eax = sstr = Move src, edx = res = Move dest }
    inc     %eax                    { skip length bytes of both strings }
    inc     %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea     -8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
    call    Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
    lea     8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
    jmp     Move                    { tail call: Move(sstr[1], res[1], len) }
{$endif FPC_PROFILE}
end;
{ Compiler helper: copy the shortstring at 'sstr' to 'dstr', truncating its
  length to 'len' (the destination's capacity).  Copies the clamped length
  byte with stosb, then the payload bytewise until 32-bit aligned, then
  dword-wise with rep movsl, with a bytewise loop for short strings (< 7
  bytes) and for the trailing remainder.  esi/edi are declared as modified
  via the ['ESI','EDI'] clobber list. }
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
  asm
{$ifdef FPC_PROFILE}
    push    %eax
    push    %edx
    push    %ecx
    call    mcount
    pop     %ecx
    pop     %edx
    pop     %eax
{$endif FPC_PROFILE}
    pushl   %eax
    pushl   %ecx
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    movl    dstr,%edi
    movl    sstr,%esi
    xorl    %eax,%eax
    movl    len,%ecx
    lodsb                           { al = length(sstr); esi now points at the data }
    cmpl    %ecx,%eax
    jbe     .LStrCopy1
    movl    %ecx,%eax               { clamp length to the destination capacity }
.LStrCopy1:
    stosb                           { store the length byte }
    cmpl    $7,%eax
    jl      .LStrCopy2              { short string: plain bytewise copy }
    movl    %edi,%ecx               { Align on 32bits }
    negl    %ecx
    andl    $3,%ecx
    subl    %ecx,%eax
    rep
    movsb
    movl    %eax,%ecx
    andl    $3,%eax                 { eax = remainder after the dword copy }
    shrl    $2,%ecx
    rep
    movsl
.LStrCopy2:
    movl    %eax,%ecx
    rep
    movsb
    popl    %ecx
    popl    %eax
  end ['ESI','EDI'];
end;
  1723. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  1724. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1725. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ Lexicographic shortstring comparison; result <0 / =0 / >0.
  Register contract on entry: eax = @left, edx = @right.
  Compares min(length(left),length(right)) payload bytes via CompareByte;
  if that prefix is equal, the length difference decides (shorter sorts first).
  NOTE(review): relies on CompareByte preserving ebx (len(right)) across the
  call, i.e. ebx being callee-saved — TODO confirm against CompareByte. }
asm
{$ifdef FPC_PROFILE}
  { mcount may clobber eax/edx/ecx; preserve the arguments. }
  push %eax
  push %edx
  push %ecx
  call mcount
  pop %ecx
  pop %edx
  pop %eax
{$endif FPC_PROFILE}
  push %ebx
  movzbl (%eax), %ecx { ecx = len(left) }
  movzbl (%edx), %ebx { ebx = len(right) }
  cmp %ebx, %ecx
{$ifdef CPUX86_HAS_CMOV}
  cmovg %ebx, %ecx { ecx = min(len(left), len(right)) }
{$else}
  jle .LEcxIsLen
  mov %ebx, %ecx
.LEcxIsLen:
{$endif}
  push %eax { save left }
  inc %eax { skip length bytes: compare payloads }
  inc %edx
  { stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  call CompareByte
{$else}
  call CompareByte_Impl { manually inline CompareByte }
{$endif}
  pop %edx { restore left }
  test %eax, %eax
  jnz .LReturn { common prefix differs: CompareByte result is the answer }
  movzbl (%edx), %eax { prefix equal: compare lengths instead }
  sub %ebx, %eax { eax = len(left) - len(right) }
.LReturn:
  pop %ebx
end;
  1766. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
  1767. {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1768. {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1769. function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc; nostackframe;
  1770. { eax = left, edx = right }
  1771. asm
  1772. movzbl (%eax), %ecx
  1773. cmp (%edx), %cl
  1774. jne .LNotEqual
  1775. inc %eax
  1776. inc %edx
  1777. {$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
  1778. jmp CompareByte
  1779. {$else}
  1780. jmp CompareByte_Impl { manually inline CompareByte }
  1781. {$endif}
  1782. .LNotEqual:
  1783. or $-1, %eax
  1784. end;
  1785. {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
  1786. {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1787. {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// Converts a #0-terminated PAnsiChar to a shortstring, truncating to high(res).
// A nil p yields an empty result. Uses IndexByte to find the terminator
// (scan limited to high(res) bytes) and Move to copy the payload.
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
  { mcount may clobber eax/edx/ecx; preserve the arguments. }
  push %eax
  push %edx
  push %ecx
  call mcount
  pop %ecx
  pop %edx
  pop %eax
{$endif FPC_PROFILE}
  test %ecx, %ecx
  jz .LEmpty { nil source -> length 0 }
  push %eax { save res }
  push %ecx { save p }
  push %edx { save high(res) }
  mov %ecx, %eax { eax = IndexByte.buf }
  { edx is already high(res) = IndexByte.count.
    Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
    but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’.
    Generic and x86 versions are “safe”. }
  xor %ecx, %ecx { ecx = 0 = IndexByte.value }
  { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
    With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  leal -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
  call IndexByte
{$else}
  call IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  leal 12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
  pop %ecx { ecx = high(res) = Move.len }
  test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
  cmovns %eax, %ecx
{$else}
  js .LEcxIsLen
  mov %eax, %ecx
.LEcxIsLen:
{$endif}
  pop %eax { pop p to eax = Move.src }
  pop %edx { pop res to edx }
  mov %cl, (%edx) { res[0] := len }
  inc %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  leal -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
  call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  leal 12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
  jmp .LReturn
{$else FPC_PROFILE}
  jmp Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
  movb $0, (%eax) { res := '' }
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
  1858. {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
  1859. {$IFNDEF INTERNAL_BACKTRACE}
  1860. {$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
{ Returns the current frame pointer (ebp) of the caller's frame. }
asm
  movl %ebp,%eax
end;
  1865. {$ENDIF not INTERNAL_BACKTRACE}
  1866. {$define FPC_SYSTEM_HAS_GET_PC_ADDR}
Function Get_pc_addr : Pointer;assembler;nostackframe;
{ Returns the caller's program counter, i.e. the return address on the stack. }
asm
  movl (%esp),%eax
end;
  1871. {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{ Returns the return address stored in the stack frame framebp
  (framebp+4 in the standard i386 frame layout), or nil for a nil/invalid frame. }
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  { only dereference frame pointers that lie within the current stack }
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp+4)^
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
  { eax = framebp; nil in -> nil out }
  orl %eax,%eax
  jz .Lg_a_null
  movl 4(%eax),%eax { saved return address sits just above the saved ebp }
.Lg_a_null:
end;
{$endif defined(win32)}
  1890. {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{ Returns the parent frame pointer saved at the start of stack frame framebp,
  or nil for a nil/invalid frame. }
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
  { only dereference frame pointers that lie within the current stack }
  if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
    Result:=PPointer(framebp)^
  else
    Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
  { eax = framebp; nil in -> nil out }
  orl %eax,%eax
  jz .Lgnf_null
  movl (%eax),%eax { frame starts with the saved parent ebp }
.Lgnf_null:
end;
{$endif defined(win32)}
  1909. {$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;
{ Returns the current stack pointer (esp). }
asm
  movl %esp,%eax
end;
  1914. {****************************************************************************
  1915. Str()
  1916. ****************************************************************************}
  1917. {$if defined(disabled) and defined(regcall) }
  1918. {$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
  1919. {$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
label str_int_shortcut;
procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
{ Unsigned variant: clears the sign (edx := 0) and falls into the shared
  digit-emitting code of the signed variant below via str_int_shortcut.
  NOTE: this whole section is under {$if defined(disabled) ...} and is
  currently not compiled. }
asm
  pushl %esi
  pushl %edi
  pushl %ebx
  mov %edx,%edi { edi = @s }
  xor %edx,%edx { no sign }
  jmp str_int_shortcut
end;
procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
{ Converts l to decimal text in shortstring s, truncating digits that do not
  fit into high(s). eax = l, edx = @s, ecx = high(s). }
const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
  100000,1000000,10000000,
  100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
  push %eax
  push %edx
  push %ecx
  call mcount
  pop %ecx
  pop %edx
  pop %eax
{$endif FPC_PROFILE}
  push %esi
  push %edi
  push %ebx
  movl %edx,%edi
  { Calculate absolute value and put sign in edx}
  cltd
  xorl %edx,%eax
  subl %edx,%eax { eax = abs(l) }
  negl %edx { edx = 1 when l was negative, else 0 }
str_int_shortcut:
  movl %ecx,%esi { esi = high(s) }
  {Calculate amount of digits in ecx.}
  xorl %ecx,%ecx
  bsrl %eax,%ecx
  incl %ecx
  imul $1233,%ecx { digits ~= bits * 1233 / 4096 ~= bits * log10(2) }
  shr $12,%ecx
{$ifdef FPC_PIC}
  call fpc_geteipasebx
{$ifdef darwin}
  movl digits-.Lpic(%ebx),%ebx
{$else}
  addl $_GLOBAL_OFFSET_TABLE_,%ebx
  movl digits@GOT(%ebx),%ebx
{$endif}
  cmpl (%ebx,%ecx,4),%eax
{$else}
  cmpl digits(,%ecx,4),%eax
{$endif}
  cmc
  adcl $0,%ecx {Nr. digits ready in ecx.}
  {Write length & sign.}
  lea (%edx,%ecx),%ebx { bl = length including optional '-' }
  movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
  movw %bx,(%edi) { store length byte and (possibly overwritten) '-' at once }
  addl %edx,%edi { advance dest past the sign when present }
  subl %edx,%esi { the sign also consumes one character of capacity }
  {Skip digits beyond string length.}
  movl %eax,%edx
  subl %ecx,%esi
  jae .Lloop_write
  .balign 4
.Lloop_skip:
  movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
  mull %edx
  shrl $3,%edx
  decl %ecx
  jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
  incl %esi
  jnz .Lloop_skip
  {Write out digits.}
  .balign 4
.Lloop_write:
  movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
  {Pre-add '0'}
  leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
  mull %edx
  shrl $3,%edx
  leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
  subl %edx,%ebx
  subl %eax,%ebx { bl = '0' + digit }
  movb %bl,(%edi,%ecx) { digits are emitted least-significant first, back to front }
  decl %ecx
  jnz .Lloop_write
.Ldone:
  popl %ebx
  popl %edi
  popl %esi
end;
  2014. {$endif}
  2015. {****************************************************************************
  2016. Bounds Check
  2017. ****************************************************************************}
  2018. { do a thread-safe inc/dec }
  2019. {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
{ Atomically decrements l; returns True when the new value is zero. }
asm
  lock
  decl (%eax)
  setzb %al { al := (result = 0) }
end;
  2026. {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure cpuinclocked(var l : longint);assembler;nostackframe;
{ Atomically increments l. }
asm
  lock
  incl (%eax)
end;
  2032. // inline SMP check and normal lock.
  2033. // the locked one is so slow, inlining doesn't matter.
  2034. function declocked(var l : longint) : boolean; inline;
  2035. begin
  2036. if not ismultithread then
  2037. begin
  2038. dec(l);
  2039. declocked:=l=0;
  2040. end
  2041. else
  2042. declocked:=cpudeclocked(l);
  2043. end;
  2044. procedure inclocked(var l : longint); inline;
  2045. begin
  2046. if not ismultithread then
  2047. inc(l)
  2048. else
  2049. cpuinclocked(l);
  2050. end;
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
{ Atomically decrements Target and returns the new (decremented) value. }
asm
  movl $-1,%edx
  lock
  xaddl %edx, (%eax) { edx := old value, Target := old - 1 }
  lea -1(%edx),%eax { return old - 1 = new value }
end;
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
{ Atomically increments Target and returns the new (incremented) value. }
asm
  movl $1,%edx
  lock
  xaddl %edx, (%eax) { edx := old value, Target := old + 1 }
  lea 1(%edx),%eax { return old + 1 = new value }
end;
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{ Atomically stores Source into Target and returns Target's previous value.
  xchg with a memory operand is implicitly locked on x86. }
asm
  xchgl (%eax),%edx
  movl %edx,%eax { return the old value }
end;
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
{ Atomically adds Source to Target and returns Target's previous value. }
asm
  lock
  xaddl %edx, (%eax) { edx := old value, Target := old + Source }
  movl %edx,%eax
end;
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
{ Atomic compare-and-swap: if Target = Comperand then Target := NewValue.
  Always returns Target's previous value.
  On entry eax = @Target, edx = NewValue, ecx = Comperand. }
asm
  xchgl %eax,%ecx { cmpxchg needs the comperand in eax; ecx := @Target }
  lock
  cmpxchgl %edx, (%ecx) { eax ends up holding the old value either way }
end;
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
{ 64-bit atomic compare-and-swap via cmpxchg8b: if Target = Comperand then
  Target := NewValue. Returns Target's previous value in edx:eax.
  cmpxchg8b requires edx:eax = comperand and ecx:ebx = new value. }
asm
  pushl %ebx { ebx and edi are callee-saved }
  pushl %edi
  movl %eax,%edi { edi = @Target }
  movl Comperand+4,%edx
  movl Comperand+0,%eax
  movl NewValue+4,%ecx
  movl NewValue+0,%ebx
  lock cmpxchg8b (%edi)
  pop %edi
  pop %ebx
end;
  2095. {****************************************************************************
  2096. FPU
  2097. ****************************************************************************}
const
  { Internal constants for use in system unit }
  { x87 FPU exception bits (status/control word layout) }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;
  { SSE (MXCSR) exception flag bits }
  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20; { sic: historical misspelling of "Precision", kept for compatibility }
  MM_ExceptionMask = $3f;
  { MXCSR exception mask bits (1 = exception masked) }
  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm = %0000000100000000;
  MM_MaskDivZero = %0000001000000000;
  MM_MaskOverflow = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;
  2121. {$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
{ Intentionally empty on i386: FPU state is (re)established by SysResetFPU. }
begin
end;
  2125. {$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;
{ Reinitialises the x87 FPU and loads Default8087CW; when SSE is available,
  also loads DefaultMXCSR into MXCSR. }
var
  { these locals are so we don't have to hack pic code in the assembler }
  localmxcsr: dword;
  localfpucw: word;
begin
  localfpucw:=Default8087CW;
  asm
    fninit { reset the FPU without waiting for pending exceptions }
    fwait
    fldcw localfpucw { restore the configured control word }
  end;
  if has_sse_support then
    begin
      localmxcsr:=DefaultMXCSR;
      asm
        { setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
        ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
        { old assemblers cannot encode ldmxcsr; emit it as raw bytes via a
          temporary copy of localmxcsr on the stack }
        mov localmxcsr,%eax
        subl $4,%esp
        mov %eax,(%esp)
        //ldmxcsr (%esp)
        .byte 0x0f,0xae,0x14,0x24
        addl $4,%esp
{$endif OLD_ASSEMBLER}
      end;
    end;
end;
  2156. { because of the brain dead sse detection on x86, this test is post poned }
{ because of the brain dead sse detection on x86, this test is post poned }
procedure fpc_cpucodeinit;
{ Probes the CPU via cpuid/xgetbv and fills the RTL capability flags
  (has_mmx_support, has_sse*_support, has_avx*_support,
  fast_large_repmovstosb), then resets the FPU state. }
var
  _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
  if cpuid_support then
    begin
      asm
        { cpuid leaf 1: feature bits in edx/ecx }
        movl $1,%eax
        xorl %ecx,%ecx
        cpuid
        movl %edx,_edx_cpuid1
        movl %ecx,_ecx_cpuid1
      end ['ebx'];
      has_mmx_support:=(_edx_cpuid1 and $800000)<>0; { edx bit 23 = MMX }
      if ((_edx_cpuid1 and $2000000)<>0) then { edx bit 25 = SSE }
        begin
          os_supports_sse:=true;
          sse_check:=true;
          asm
            { force an sse exception if no sse is supported, the exception handler sets
              os_supports_sse to false then }
            { don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
            .byte 0x0f,0x28,0xf7
{$else}
            movaps %xmm7, %xmm6
{$endif not EMX} { NOTE(review): stale $endif tag — this closes $ifdef OLD_ASSEMBLER }
          end;
          sse_check:=false;
          has_sse_support:=os_supports_sse;
        end;
      if has_sse_support then
        begin
          has_sse2_support:=((_edx_cpuid1 and $4000000)<>0); { edx bit 26 }
          has_sse3_support:=((_ecx_cpuid1 and $200)<>0); { ecx bit 9 }
          { now avx }
          asm
            { cpuid leaf 0: highest supported standard leaf in eax }
            xorl %eax,%eax
            cpuid
            movl %eax,_eax
          end;
          if _eax>=7 then
            begin
              asm
                { cpuid leaf 7, subleaf 0: extended feature bits in ebx }
                movl $7,%eax
                xorl %ecx,%ecx
                cpuid
                movl %ebx,_ebx_cpuid7
              end;
              fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0; { ebx bit 9 = ERMSB }
              if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                begin
                  asm
                    xorl %ecx,%ecx
                    .byte 0x0f,0x01,0xd0 { xgetbv }
                    movl %eax,_eax
                  end;
                  if (_eax and 6)=6 then { OS saves xmm and ymm state }
                    begin
                      has_avx_support:=(_ecx_cpuid1 and $10000000)<>0; { ecx bit 28 = AVX }
                      has_avx2_support:=(_ebx_cpuid7 and $20)<>0; { ebx bit 5 = AVX2 }
                    end;
                end;
            end;
        end;
    end;
  { don't let libraries influence the FPU cw set by the host program }
  if IsLibrary then
    begin
      Default8087CW:=Get8087CW;
      if has_sse_support then
        DefaultMXCSR:=GetMXCSR;
    end;
  SysResetFPU;
  fpc_cpucodeinit_performed:=true;
end;
  2233. {$if not defined(darwin) and defined(regcall) }
  2234. { darwin requires that the stack is aligned to 16 bytes when calling another function }
  2235. {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
  2236. {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
{ Decrements the reference count of ansistring S and frees its allocation
  when the count reaches zero. S itself is set to nil first.
  Strings with a negative refcount (e.g. constants) are never freed.
  The refcount lives at offset -8 from the data pointer; the allocation
  starts at offset -12. }
asm
  movl (%eax),%edx
  testl %edx,%edx
  jz .Lquit { S was already nil }
  movl $0,(%eax) // s:=nil
  cmpl $0,-8(%edx) // exit if refcount<0
  jl .Lquit
{$ifdef FPC_PIC}
  call fpc_geteipasecx
  addl $_GLOBAL_OFFSET_TABLE_,%ecx
  movl ismultithread@GOT(%ecx),%ecx
  cmpl $0,(%ecx)
{$else FPC_PIC}
  cmpl $0,ismultithread
{$endif FPC_PIC}
  je .Lskiplock
  .byte 0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
  decl -8(%edx) { single-threaded: plain decl; multithreaded: lock decl }
  jz .Lfree
.Lquit:
  ret
.Lfree:
  leal -12(%edx),%eax // points to start of allocation
  { freemem is not an assembler leaf function like fpc_geteipasecx, so it
    needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  leal -12(%esp),%esp
  call FPC_FREEMEM
  leal 12(%esp),%esp
{$else FPC_SYSTEM_STACKALIGNMENT16}
  jmp FPC_FREEMEM // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;
  2272. function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
  2273. {$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
{ Ensures ansistring S is uniquely referenced before mutation: when the
  refcount (at data pointer - 8) is not exactly 1, delegates to
  fpc_truely_ansistr_unique to make a private copy. Returns the (possibly
  new) data pointer. The line references in the comments below are from the
  original compiler-generated code. }
asm
  // Var S located in register
  // Var $result located in register
  movl %eax,%edx { edx = @S, kept for the slow path }
  // [437] pointer(result) := pointer(s);
  movl (%eax),%eax
  // [438] If Pointer(S)=Nil then
  testl %eax,%eax
  je .Lj4031 { nil string: nothing to do }
.Lj4036:
  // [440] if PAnsiRec(Pointer(S)-Firstoff)^.Ref<>1 then
  movl -8(%eax),%ecx
  cmpl $1,%ecx
  je .Lj4038 { already unique: return as-is }
  // [441] result:=fpc_truely_ansistr_unique(s);
  movl %edx,%eax
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  leal -12(%esp),%esp { realign for the non-leaf call }
{$endif FPC_SYSTEM_STACKALIGNMENT16}
  call fpc_truely_ansistr_unique
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
  leal 12(%esp),%esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
.Lj4038:
.Lj4031:
  // [442] end;
end;
  2302. {$endif FPC_HAS_FEATURE_ANSISTRINGS}
  2303. {$endif ndef darwin and defined(regcall) }
  2304. {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  2305. {$define FPC_SYSTEM_HAS_MEM_BARRIER}
procedure ReadBarrier;assembler;nostackframe;
{ Load memory barrier: lfence when SSE2 is guaranteed, otherwise a locked
  no-op RMW on the stack, which is a full barrier on x86. }
asm
{$ifdef CPUX86_HAS_SSE2}
  lfence
{$else CPUX86_HAS_SSE2}
  lock
  addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
procedure ReadDependencyBarrier;{$ifdef SYSTEMINLINE}inline;{$endif}
{ Intentionally empty on x86. }
begin
  { reads imply barrier on earlier reads depended on }
end;
procedure ReadWriteBarrier;assembler;nostackframe;
{ Full memory barrier: mfence when SSE2 is guaranteed, otherwise a locked
  no-op RMW on the stack, which is a full barrier on x86. }
asm
{$ifdef CPUX86_HAS_SSE2}
  mfence
{$else CPUX86_HAS_SSE2}
  lock
  addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
procedure WriteBarrier;assembler;nostackframe;
{ Store memory barrier: sfence when the SSE unit is guaranteed; otherwise
  empty (plain x86 stores are not reordered with other stores). }
asm
{$ifdef CPUX86_HAS_SSEUNIT}
  sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
  2334. {$endif}
  2335. {$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
  2336. {$define FPC_SYSTEM_HAS_BSF_QWORD}
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
{ Bit-scan-forward over 64 bits: index (0..63) of the lowest set bit of
  AValue, or 255 when AValue is 0. AValue is passed on the stack:
  low dword at 4(%esp), high dword at 8(%esp); ret $8 pops it. }
asm
  bsfl 4(%esp),%eax { bsf sets ZF when the scanned dword is 0 }
  jz .L1
  ret $8
.L1:
  bsfl 8(%esp),%eax { low dword was 0: scan the high dword }
  jz .L2
  add $32,%eax { bit found in the high half }
  ret $8
.L2:
  movl $255,%eax { whole qword is 0 }
end;
  2350. {$endif FPC_SYSTEM_HAS_BSF_QWORD}
  2351. {$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
  2352. {$define FPC_SYSTEM_HAS_BSR_QWORD}
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
{ Bit-scan-reverse over 64 bits: index (0..63) of the highest set bit of
  AValue, or 255 when AValue is 0. AValue is passed on the stack:
  low dword at 4(%esp), high dword at 8(%esp); ret $8 pops it. }
asm
  bsrl 8(%esp),%eax { bsr sets ZF when the scanned dword is 0 }
  jz .L1
  add $32,%eax { bit found in the high half }
  ret $8
.L1:
  bsrl 4(%esp),%eax { high dword was 0: scan the low dword }
  jz .L2
  ret $8
.L2:
  movl $255,%eax { whole qword is 0 }
end;
  2366. {$endif FPC_SYSTEM_HAS_BSR_QWORD}
  2367. {$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
  2368. {$define FPC_SYSTEM_HAS_SAR_QWORD}
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
{ 64-bit arithmetic shift right: result := AValue sar Shift.
  AValue is on the stack (low dword at 4(%esp), high at 8(%esp)),
  Shift in al; result in edx:eax; ret $8 pops AValue. }
asm
  movl 8(%esp),%edx { edx = high dword }
  movzbl %al,%ecx { cl = shift count }
  cmpb $32,%al
  jnb .L1 { shifts >= 32 handled separately }
  movl 4(%esp),%eax
  shrdl %cl,%edx,%eax { low := bits shifted in from high }
  sarl %cl,%edx
  ret $8
.L1:
  movl %edx,%eax { low := high sar (shift-32) }
  sarl $31,%edx { high := sign extension }
  sarl %cl,%eax // uses 5 lower bits of cl.
end;
  2384. {$endif FPC_SYSTEM_HAS_SAR_QWORD}