apps.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. __package__ = 'archivebox.plugins_pkg.playwright'
  2. import os
  3. import platform
  4. from pathlib import Path
  5. from typing import List, Optional, Dict, ClassVar
  6. # Depends on other PyPI/vendor packages:
  7. from pydantic import InstanceOf, computed_field, Field
  8. from pydantic_pkgr import (
  9. BinName,
  10. BinProvider,
  11. BinProviderName,
  12. ProviderLookupDict,
  13. InstallArgs,
  14. PATHStr,
  15. HostBinPath,
  16. bin_abspath,
  17. OPERATING_SYSTEM,
  18. DEFAULT_ENV_PATH,
  19. )
  20. from archivebox.config import CONSTANTS
  21. # Depends on other Django apps:
  22. from abx.archivebox.base_plugin import BasePlugin
  23. from abx.archivebox.base_configset import BaseConfigSet
  24. from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
  25. # from abx.archivebox.base_extractor import BaseExtractor
  26. # from abx.archivebox.base_queue import BaseQueue
  27. from abx.archivebox.base_hook import BaseHook
  28. from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
  29. ###################### Config ##########################
  class PlaywrightConfigs(BaseConfigSet):
      """Config set for the playwright plugin; it currently declares no options."""

      # Option sketches that are not active yet (kept as a reference for later):
      #   PLAYWRIGHT_BINARY: str = Field(default='wget')
      #   PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None)
      #   PLAYWRIGHT_EXTRA_ARGS: List[str] = []
      #   PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
  # Module-level instance of the playwright config set (listed in the plugin's hooks).
  PLAYWRIGHT_CONFIG = PlaywrightConfigs()

  # Browsers directory as defined by the ArchiveBox constants.
  LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
  class PlaywrightBinary(BaseBinary):
      """Binary definition for the ``playwright`` command-line tool."""

      name: BinName = "playwright"

      # Supported providers, in the order declared: lib pip, venv pip, system pip, env.
      binproviders_supported: List[InstanceOf[BinProvider]] = [
          LIB_PIP_BINPROVIDER,
          VENV_PIP_BINPROVIDER,
          SYS_PIP_BINPROVIDER,
          env,
      ]
  # Module-level instance of the playwright binary definition; the bin provider
  # below reads its name and loads it to find the installer's absolute path.
  PLAYWRIGHT_BINARY = PlaywrightBinary()
  class PlaywrightBinProvider(BaseBinProvider):
      """Bin provider that installs and locates browsers using the ``playwright`` CLI.

      Only the ``chrome`` binary name is supported; it is mapped to playwright's
      ``chromium`` package. Resolved browser paths are cached on the class in
      ``_browser_abspaths``.
      """

      name: BinProviderName = "playwright"
      INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name

      PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"

      # Directory that is searched for installed browsers.
      playwright_browsers_dir: Optional[Path] = (
          Path("~/Library/Caches/ms-playwright").expanduser()  # macos playwright cache dir
          if OPERATING_SYSTEM == "darwin" else
          Path("~/.cache/ms-playwright").expanduser()          # linux playwright cache dir
      )
      # Arguments placed before the package names on the installer command line.
      playwright_install_args: List[str] = ["install"]         # --with-deps

      # Maps a requested binary name to the playwright package(s) to install.
      packages_handler: ProviderLookupDict = Field(default={
          "chrome": lambda: ["chromium"],
      }, exclude=True)

      # Cache of bin_name -> resolved browser path (ClassVar, so shared by all instances).
      _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}

      @computed_field
      @property
      def INSTALLER_BIN_ABSPATH(self) -> HostBinPath | None:
          """Absolute path of the playwright installer binary, as reported by loading PLAYWRIGHT_BINARY."""
          return PLAYWRIGHT_BINARY.load().abspath

      def setup(self) -> None:
          """Check that the system pip provider is usable and create the browsers dir if one is set."""
          assert SYS_PIP_BINPROVIDER.INSTALLER_BIN_ABSPATH, "Pip bin provider not initialized"

          if self.playwright_browsers_dir:
              self.playwright_browsers_dir.mkdir(parents=True, exist_ok=True)

      def installed_browser_bins(self, browser_name: str = "*") -> List[Path]:
          """Return the sorted paths under the browsers dir that match ``browser_name``.

          ``browser_name`` is used as a glob prefix (``"*"`` matches every browser);
          ``"chrome"`` is treated as ``"chromium"``. Returns an empty list when no
          browsers dir is configured.
          """
          # playwright_browsers_dir is Optional: nothing to search without it
          if not self.playwright_browsers_dir:
              return []

          if browser_name == 'chrome':
              browser_name = 'chromium'

          # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
          if platform.system().lower() == "darwin":
              # ~/Library/caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium
              pattern = f"{browser_name}-*/*-mac*/*.app/Contents/MacOS/*"
          else:
              # ~/.cache/ms-playwright/chromium-1097/chrome-linux/chromium
              # NOTE(review): this matches every entry inside the *-linux directory,
              # not only the browser executable -- confirm that is intended.
              pattern = f"{browser_name}-*/*-linux/*"

          return sorted(self.playwright_browsers_dir.glob(pattern))

      def on_get_abspath(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
          """Resolve the path of the ``chrome`` browser, or return None if it is not found.

          Checks the class-level cache first, then the installed browsers on disk,
          then ``google-chrome-stable`` on the env provider's PATH.
          """
          assert bin_name == "chrome", "Only chrome is supported using the playwright install method currently."

          # already loaded, return abspath from cache
          if bin_name in self._browser_abspaths:
              return self._browser_abspaths[bin_name]

          # first time loading, find browser in self.playwright_browsers_dir by searching filesystem for installed binaries
          matching_bins = [abspath for abspath in self.installed_browser_bins() if bin_name in str(abspath)]
          if matching_bins:
              # sorted as plain strings, so the last entry is only the newest
              # version when the version numbers happen to sort lexicographically
              newest_bin = matching_bins[-1]
              self._browser_abspaths[bin_name] = newest_bin
              return self._browser_abspaths[bin_name]

          # playwright sometimes installs google-chrome-stable via apt into system $PATH, check there as well
          abspath = bin_abspath('google-chrome-stable', PATH=env.PATH)
          if abspath:
              self._browser_abspaths[bin_name] = abspath
              return self._browser_abspaths[bin_name]

          return None

      def on_install(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
          """Run ``playwright install <packages>`` for ``bin_name`` (only ``chrome`` is supported).

          Returns the installed browser's path as a string when it can be parsed
          from the installer output and is an executable file; otherwise returns
          the installer's combined stderr and stdout.

          Raises:
              AssertionError: if ``bin_name`` is not ``"chrome"`` or setup() fails.
              Exception: if the installer binary is not found or exits non-zero.
          """
          self.setup()
          assert bin_name == "chrome", "Only chrome is supported using the playwright install method currently."

          # INSTALLER_BIN_ABSPATH loads the binary on every access, so read it once
          installer_abspath = self.INSTALLER_BIN_ABSPATH
          if not installer_abspath:
              raise Exception(
                  f"{self.__class__.__name__} install method is not available on this host ({self.INSTALLER_BIN} not found in $PATH)"
              )
          packages = packages or self.on_get_packages(bin_name)

          # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {installer_abspath} install {packages}')

          install_args = [*self.playwright_install_args]

          proc = self.exec(bin_name=installer_abspath, cmd=[*install_args, *packages])

          if proc.returncode != 0:
              print(proc.stdout.strip())
              print(proc.stderr.strip())
              raise Exception(f"{self.__class__.__name__}: install got returncode {proc.returncode} while installing {bin_name}: {packages}")

          # example installer output line:
          # playwright build v1010 downloaded to /home/squash/.cache/ms-playwright/ffmpeg-1010
          output_lines = [line for line in proc.stdout.strip().split('\n') if '/chrome-' in line]
          if output_lines and self.playwright_browsers_dir:
              # str.split() only accepts a str separator, so convert the Path first
              browsers_dir = str(self.playwright_browsers_dir)
              # drop the leading "/" left over from the split: joining an absolute
              # segment onto a Path would discard the browsers dir entirely
              relpath = output_lines[0].split(browsers_dir)[-1].strip().lstrip('/')
              abspath = self.playwright_browsers_dir / relpath
              if os.path.isfile(abspath) and os.access(abspath, os.X_OK):
                  self._browser_abspaths[bin_name] = abspath
                  return str(abspath)

          return proc.stderr.strip() + "\n" + proc.stdout.strip()
  # Module-level instance of the playwright bin provider (listed in the plugin's hooks).
  PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
  class PlaywrightPlugin(BasePlugin):
      """Plugin definition that bundles the playwright config, bin provider, and binary."""

      app_label: str = 'playwright'
      verbose_name: str = 'Playwright (PIP)'

      # Hooks in declaration order: config set, bin provider, binary.
      hooks: List[InstanceOf[BaseHook]] = [PLAYWRIGHT_CONFIG, PLAYWRIGHT_BINPROVIDER, PLAYWRIGHT_BINARY]
  # Module-level plugin instance.
  PLUGIN = PlaywrightPlugin()
  # PLUGIN.register(settings)

  # App config object exposed by the plugin instance.
  DJANGO_APP = PLUGIN.AppConfig