apps.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. __package__ = 'archivebox.plugins_pkg.playwright'
  2. import platform
  3. from pathlib import Path
  4. from typing import List, Optional, Dict, ClassVar
  5. # Depends on other PyPI/vendor packages:
  6. from pydantic import InstanceOf, computed_field, Field
  7. from pydantic_pkgr import (
  8. BinName,
  9. BinProvider,
  10. BinProviderName,
  11. ProviderLookupDict,
  12. InstallArgs,
  13. PATHStr,
  14. HostBinPath,
  15. bin_abspath,
  16. OPERATING_SYSTEM,
  17. DEFAULT_ENV_PATH,
  18. )
  19. from archivebox.config import CONSTANTS
  20. # Depends on other Django apps:
  21. from abx.archivebox.base_plugin import BasePlugin
  22. from abx.archivebox.base_configset import BaseConfigSet
  23. from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
  24. # from abx.archivebox.base_extractor import BaseExtractor
  25. # from abx.archivebox.base_queue import BaseQueue
  26. from abx.archivebox.base_hook import BaseHook
  27. from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
  28. ###################### Config ##########################
class PlaywrightConfigs(BaseConfigSet):
    # Config set for the playwright plugin. It currently declares no options:
    # the commented-out fields below are placeholders only.
    # NOTE(review): the 'wget' default in the first placeholder looks
    # copy-pasted from another plugin - confirm before enabling it.
    # PLAYWRIGHT_BINARY: str = Field(default='wget')
    # PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None)
    # PLAYWRIGHT_EXTRA_ARGS: List[str] = []
    # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
    pass
# Module-level config instance; registered as one of the plugin's hooks below.
PLAYWRIGHT_CONFIG = PlaywrightConfigs()
# Alias for the shared browsers directory constant. It is not referenced by
# any other code visible in this module.
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
class PlaywrightBinary(BaseBinary):
    # Describes the `playwright` executable itself (not the browsers it installs).
    name: BinName = "playwright"
    # Providers that may supply the binary, in the order they are listed:
    # the three pip providers first, then the plain environment provider.
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
# Module-level binary instance; PlaywrightBinProvider reads its `name` and
# calls `load()` on it to find the installer's absolute path.
PLAYWRIGHT_BINARY = PlaywrightBinary()
  41. class PlaywrightBinProvider(BaseBinProvider):
  42. name: BinProviderName = "playwright"
  43. INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
  44. PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
  45. puppeteer_browsers_dir: Optional[Path] = (
  46. Path("~/Library/Caches/ms-playwright").expanduser() # macos playwright cache dir
  47. if OPERATING_SYSTEM == "darwin" else
  48. Path("~/.cache/ms-playwright").expanduser() # linux playwright cache dir
  49. )
  50. puppeteer_install_args: List[str] = ["install"] # --with-deps
  51. packages_handler: ProviderLookupDict = Field(default={
  52. "chrome": lambda: ["chromium"],
  53. }, exclude=True)
  54. _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}
  55. @computed_field
  56. @property
  57. def INSTALLER_BIN_ABSPATH(self) -> HostBinPath | None:
  58. return PLAYWRIGHT_BINARY.load().abspath
  59. def setup(self) -> None:
  60. assert SYS_PIP_BINPROVIDER.INSTALLER_BIN_ABSPATH, "Pip bin provider not initialized"
  61. if self.puppeteer_browsers_dir:
  62. self.puppeteer_browsers_dir.mkdir(parents=True, exist_ok=True)
  63. def installed_browser_bins(self, browser_name: str = "*") -> List[Path]:
  64. if browser_name == 'chrome':
  65. browser_name = 'chromium'
  66. # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
  67. if platform.system().lower() == "darwin":
  68. # ~/Library/caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium
  69. return sorted(
  70. self.puppeteer_browsers_dir.glob(
  71. f"{browser_name}-*/*-mac*/*.app/Contents/MacOS/*"
  72. )
  73. )
  74. # ~/Library/caches/ms-playwright/chromium-1097/chrome-linux/chromium
  75. return sorted(self.puppeteer_browsers_dir.glob(f"{browser_name}-*/*-linux/*"))
  76. def on_get_abspath(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
  77. assert bin_name == "chrome", "Only chrome is supported using the @puppeteer/browsers install method currently."
  78. # already loaded, return abspath from cache
  79. if bin_name in self._browser_abspaths:
  80. return self._browser_abspaths[bin_name]
  81. # first time loading, find browser in self.puppeteer_browsers_dir by searching filesystem for installed binaries
  82. matching_bins = [abspath for abspath in self.installed_browser_bins() if bin_name in str(abspath)]
  83. if matching_bins:
  84. newest_bin = matching_bins[-1] # already sorted alphabetically, last should theoretically be highest version number
  85. self._browser_abspaths[bin_name] = newest_bin
  86. return self._browser_abspaths[bin_name]
  87. # playwright sometimes installs google-chrome-stable via apt into system $PATH, check there as well
  88. abspath = bin_abspath('google-chrome-stable', PATH=env.PATH)
  89. if abspath:
  90. self._browser_abspaths[bin_name] = abspath
  91. return self._browser_abspaths[bin_name]
  92. return None
  93. def on_install(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
  94. """playwright install chrome"""
  95. self.setup()
  96. assert bin_name == "chrome", "Only chrome is supported using the playwright install method currently."
  97. if not self.INSTALLER_BIN_ABSPATH:
  98. raise Exception(
  99. f"{self.__class__.__name__} install method is not available on this host ({self.INSTALLER_BIN} not found in $PATH)"
  100. )
  101. packages = packages or self.on_get_packages(bin_name)
  102. # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')
  103. install_args = [*self.puppeteer_install_args]
  104. proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages])
  105. if proc.returncode != 0:
  106. print(proc.stdout.strip())
  107. print(proc.stderr.strip())
  108. raise Exception(f"{self.__class__.__name__}: install got returncode {proc.returncode} while installing {packages}: {packages}")
  109. # [email protected] /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
  110. output_info = proc.stdout.strip().split("\n")[-1]
  111. browser_abspath = output_info.split(" ", 1)[-1]
  112. # browser_version = output_info.split('@', 1)[-1].split(' ', 1)[0]
  113. self._browser_abspaths[bin_name] = Path(browser_abspath)
  114. return proc.stderr.strip() + "\n" + proc.stdout.strip()
# Module-level provider instance; registered as one of the plugin's hooks below.
PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
class PlaywrightPlugin(BasePlugin):
    # Plugin definition that bundles this module's config, bin provider and
    # binary objects as hooks.
    app_label: str = 'playwright'
    verbose_name: str = 'Playwright (PIP)'

    # Hook instances contributed by this plugin (all defined in this module).
    hooks: List[InstanceOf[BaseHook]] = [
        PLAYWRIGHT_CONFIG,
        PLAYWRIGHT_BINPROVIDER,
        PLAYWRIGHT_BINARY,
    ]
# Module-level plugin instance.
PLUGIN = PlaywrightPlugin()
# PLUGIN.register(settings)
# Exposes the plugin's AppConfig attribute under the name DJANGO_APP.
DJANGO_APP = PLUGIN.AppConfig