config.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. __package__ = 'plugins_extractor.chrome'
  2. import os
  3. from pathlib import Path
  4. from typing import List, Optional
  5. from pydantic import Field, model_validator
  6. from pydantic_pkgr import bin_abspath
  7. from abx.archivebox.base_configset import BaseConfigSet
  8. from abx.archivebox.base_binary import env
  9. from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
  10. from archivebox.misc.logging import STDERR
  11. from archivebox.misc.util import dedupe
  12. CHROMIUM_BINARY_NAMES_LINUX = [
  13. "chromium",
  14. "chromium-browser",
  15. "chromium-browser-beta",
  16. "chromium-browser-unstable",
  17. "chromium-browser-canary",
  18. "chromium-browser-dev",
  19. ]
  20. CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
  21. CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
  22. CHROME_BINARY_NAMES_LINUX = [
  23. "google-chrome",
  24. "google-chrome-stable",
  25. "google-chrome-beta",
  26. "google-chrome-canary",
  27. "google-chrome-unstable",
  28. "google-chrome-dev",
  29. "chrome"
  30. ]
  31. CHROME_BINARY_NAMES_MACOS = [
  32. "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
  33. "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
  34. ]
  35. CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
  36. APT_DEPENDENCIES = [
  37. 'apt-transport-https', 'at-spi2-common', 'chromium-browser',
  38. 'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
  39. 'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
  40. 'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
  41. 'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
  42. ]
  43. def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
  44. for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
  45. abspath = bin_abspath(bin_name, PATH=env.PATH)
  46. if abspath:
  47. return abspath
  48. return None
  49. def create_macos_app_symlink(target: Path, shortcut: Path):
  50. """
  51. on macOS, some binaries are inside of .app, so we need to
  52. create a tiny bash script instead of a symlink
  53. (so that ../ parent relationships are relative to original .app instead of callsite dir)
  54. """
  55. # TODO: should we enforce this? is it useful in any other situation?
  56. # if platform.system().lower() != 'darwin':
  57. # raise Exception(...)
  58. shortcut.unlink(missing_ok=True)
  59. shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
  60. shortcut.chmod(0o777) # make sure its executable by everyone
  61. ###################### Config ##########################
  62. class ChromeConfig(BaseConfigSet):
  63. USE_CHROME: bool = Field(default=True)
  64. # Chrome Binary
  65. CHROME_BINARY: str = Field(default='chrome')
  66. CHROME_DEFAULT_ARGS: List[str] = Field(default=[
  67. '--virtual-time-budget=15000',
  68. '--disable-features=DarkMode',
  69. "--run-all-compositor-stages-before-draw",
  70. "--hide-scrollbars",
  71. "--autoplay-policy=no-user-gesture-required",
  72. "--no-first-run",
  73. "--use-fake-ui-for-media-stream",
  74. "--use-fake-device-for-media-stream",
  75. "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
  76. ])
  77. CHROME_EXTRA_ARGS: List[str] = Field(default=[])
  78. # Chrome Options Tuning
  79. CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
  80. CHROME_HEADLESS: bool = Field(default=True)
  81. CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
  82. CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
  83. CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
  84. # Cookies & Auth
  85. CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
  86. CHROME_USER_DATA_DIR: Path | None = Field(default=None)
  87. CHROME_PROFILE_NAME: str = Field(default='Default')
  88. # Extractor Toggles
  89. SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
  90. SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
  91. SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')
  92. @model_validator(mode='after')
  93. def validate_use_chrome(self):
  94. if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
  95. STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
  96. STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
  97. STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
  98. STDERR.print()
  99. STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
  100. STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
  101. STDERR.print()
  102. # if user has specified a user data dir, make sure its valid
  103. if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
  104. # check to make sure user_data_dir/<profile_name> exists
  105. if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
  106. STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
  107. STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
  108. STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
  109. STDERR.print(' For more info see:')
  110. STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
  111. if '/Default' in str(self.CHROME_USER_DATA_DIR):
  112. STDERR.print()
  113. STDERR.print(' Try removing /Default from the end e.g.:')
  114. STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
  115. # hard error is too annoying here, instead just set it to nothing
  116. # raise SystemExit(2)
  117. self.update_in_place(CHROME_USER_DATA_DIR=None)
  118. else:
  119. if self.CHROME_USER_DATA_DIR is not None:
  120. self.update_in_place(CHROME_USER_DATA_DIR=None)
  121. return self
  122. def chrome_args(self, **options) -> List[str]:
  123. """helper to build up a chrome shell command with arguments"""
  124. # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
  125. options = self.model_copy(update=options)
  126. cmd_args = [*options.CHROME_DEFAULT_ARGS, *options.CHROME_EXTRA_ARGS]
  127. if options.CHROME_HEADLESS:
  128. cmd_args += ["--headless=new"] # expects chrome version >= 111
  129. if not options.CHROME_SANDBOX:
  130. # assume this means we are running inside a docker container
  131. # in docker, GPU support is limited, sandboxing is unecessary,
  132. # and SHM is limited to 64MB by default (which is too low to be usable).
  133. cmd_args += (
  134. "--no-sandbox",
  135. "--no-zygote",
  136. "--disable-dev-shm-usage",
  137. "--disable-software-rasterizer",
  138. "--disable-sync",
  139. # "--password-store=basic",
  140. )
  141. # set window size for screenshot/pdf/etc. rendering
  142. cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)
  143. if not options.CHROME_CHECK_SSL_VALIDITY:
  144. cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
  145. if options.CHROME_USER_AGENT:
  146. cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)
  147. # this no longer works on newer chrome version for some reason, just causes chrome to hang indefinitely:
  148. # if options.CHROME_TIMEOUT:
  149. # cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)
  150. if options.CHROME_USER_DATA_DIR:
  151. cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
  152. cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME or 'Default'))
  153. return dedupe(cmd_args)
# Module-level ChromeConfig instance, constructed once when this module is imported.
CHROME_CONFIG = ChromeConfig()