apps.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. __package__ = 'archivebox.plugins_pkg.puppeteer'
  2. import platform
  3. from pathlib import Path
  4. from typing import List, Optional, Dict, ClassVar
  5. # Depends on other PyPI/vendor packages:
  6. from pydantic import InstanceOf, Field
  7. from pydantic_pkgr import (
  8. BinProvider,
  9. BinName,
  10. BinProviderName,
  11. ProviderLookupDict,
  12. InstallArgs,
  13. PATHStr,
  14. HostBinPath,
  15. )
  16. from archivebox.config import CONSTANTS
  17. # Depends on other Django apps:
  18. from abx.archivebox.base_plugin import BasePlugin
  19. from abx.archivebox.base_configset import BaseConfigSet
  20. from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
  21. # from abx.archivebox.base_extractor import BaseExtractor
  22. # from abx.archivebox.base_queue import BaseQueue
  23. from abx.archivebox.base_hook import BaseHook
  24. # Depends on Other Plugins:
  25. from plugins_pkg.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
  26. ###################### Config ##########################
  class PuppeteerConfigs(BaseConfigSet):
      """Config set for the puppeteer plugin; it currently declares no options."""

      # Candidate options, left commented out (not active):
      # PUPPETEER_BINARY: str = Field(default='wget')
      # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
      # PUPPETEER_EXTRA_ARGS: List[str] = []
      # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']


  # Module-level config instance (listed in PuppeteerPlugin.hooks further down this file).
  PUPPETEER_CONFIG = PuppeteerConfigs()

  # Directory passed as `--path` to `@puppeteer/browsers install` by PuppeteerBinProvider.
  LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
  class PuppeteerBinary(BaseBinary):
      """Declares the `puppeteer` binary and the providers that may supply it."""

      name: BinName = 'puppeteer'

      # Providers listed in the order they were originally declared: lib npm, system npm, env.
      binproviders_supported: List[InstanceOf[BinProvider]] = [
          LIB_NPM_BINPROVIDER,
          SYS_NPM_BINPROVIDER,
          env,
      ]


  # Module-level binary instance (listed in PuppeteerPlugin.hooks further down this file).
  PUPPETEER_BINARY = PuppeteerBinary()
  class PuppeteerBinProvider(BaseBinProvider):
      """BinProvider that installs browsers via `npx @puppeteer/browsers install`.

      Only the `chrome` browser is supported at the moment. Installed browser
      binaries are discovered by globbing `puppeteer_browsers_dir`, and resolved
      paths are memoized in the class-level `_browser_abspaths` dict.
      """

      name: BinProviderName = "puppeteer"
      INSTALLER_BIN: BinName = "npx"

      PATH: PATHStr = str(CONSTANTS.LIB_BIN_DIR)

      # Where browsers are installed to / searched for.
      puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
      # Arguments passed to INSTALLER_BIN before the package specs (e.g. chrome@stable).
      puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]

      # Maps a bin name to a callable returning the package specs to install for it.
      packages_handler: ProviderLookupDict = Field(default={
          "chrome": lambda:
              ['chrome@stable'],
      }, exclude=True)

      # Cache of bin_name -> resolved browser abspath; ClassVar, so shared by all instances.
      _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}

      def setup(self) -> None:
          """Check that the system NPM provider is ready and create the browsers dir."""
          assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized"

          if self.puppeteer_browsers_dir:
              self.puppeteer_browsers_dir.mkdir(parents=True, exist_ok=True)

      def installed_browser_bins(self, browser_name: str='*') -> List[Path]:
          """Return installed browser binaries, ordered so the newest build comes last.

          Args:
              browser_name: browser directory name to look in (glob syntax allowed),
                  defaults to '*' (all browsers).

          Returns:
              Paths sorted by (browser dir name, numeric version, path). An empty list
              is returned when `puppeteer_browsers_dir` is unset or nothing matches.
          """
          browsers_dir = self.puppeteer_browsers_dir
          if not browsers_dir:
              # the field is Optional; without a directory there is nothing to search
              return []

          # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
          if platform.system().lower() == 'darwin':
              # /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
              pattern = f'{browser_name}/mac*/chrome*/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing'
          else:
              # /data/lib/browsers/chrome/linux-131.0.6730.0/chrome-linux64/chrome
              pattern = f"{browser_name}/linux*/chrome*/chrome"

          def version_sort_key(abspath: Path):
              # Layout is <browsers_dir>/<browser>/<platform>-<version>/...
              # Compare versions numerically: a plain string sort would put
              # "linux-99.0.0.0" after "linux-131.0.0.0".
              browser, build_dir = abspath.relative_to(browsers_dir).parts[:2]
              version = tuple(
                  int(part) if part.isdecimal() else -1
                  for part in build_dir.rpartition('-')[2].split('.')
              )
              return (browser, version, str(abspath))

          return sorted(browsers_dir.glob(pattern), key=version_sort_key)

      def on_get_abspath(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
          """Return the abspath of the newest installed `bin_name` browser, or None.

          Raises:
              AssertionError: if `bin_name` is anything other than 'chrome'.
          """
          assert bin_name == 'chrome', 'Only chrome is supported using the @puppeteer/browsers install method currently.'

          # already loaded, return abspath from cache
          if bin_name in self._browser_abspaths:
              return self._browser_abspaths[bin_name]

          # first time loading, find browser in self.puppeteer_browsers_dir by searching filesystem for installed binaries.
          # Search only the <bin_name>/ directory: a substring match on the full path would
          # also accept e.g. chromium/.../chrome, which sorts after chrome/.
          matching_bins = self.installed_browser_bins(bin_name)
          if not matching_bins:
              return None

          newest_bin = matching_bins[-1]  # sorted by version, last is the highest version number
          self._browser_abspaths[bin_name] = newest_bin
          return newest_bin

      def on_install(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
          """npx @puppeteer/browsers install chrome@stable

          Args:
              bin_name: browser to install; only 'chrome' is accepted.
              packages: package specs to install; defaults to `self.on_get_packages(bin_name)`.

          Returns:
              The installer's stripped stderr and stdout joined by a newline.

          Raises:
              AssertionError: if `bin_name` is not 'chrome' (or setup() fails its check).
              Exception: if INSTALLER_BIN is unavailable or the installer exits non-zero.
          """
          self.setup()
          assert bin_name == 'chrome', 'Only chrome is supported using the @puppeteer/browsers install method currently.'

          if not self.INSTALLER_BIN_ABSPATH:
              raise Exception(
                  f"{self.__class__.__name__} install method is not available on this host ({self.INSTALLER_BIN} not found in $PATH)"
              )
          packages = packages or self.on_get_packages(bin_name)

          # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')

          proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*self.puppeteer_install_args, *packages])

          if proc.returncode != 0:
              print(proc.stdout.strip())
              print(proc.stderr.strip())
              raise Exception(f"{self.__class__.__name__}: install got returncode {proc.returncode} while installing {bin_name}: {packages}")

          # The last stdout line is expected to look like "<browser>@<version> <abspath>", e.g.:
          # chrome@129.0.6668.58 /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
          stdout_lines = proc.stdout.strip().splitlines()
          output_info = stdout_lines[-1] if stdout_lines else ''
          _, _, browser_abspath = output_info.partition(' ')
          browser_abspath = browser_abspath.strip()
          # browser_version = output_info.split('@', 1)[-1].split(' ', 1)[0]

          if browser_abspath:
              self._browser_abspaths[bin_name] = Path(browser_abspath)
          else:
              # Output was empty or not in the expected format: drop any stale cache entry
              # so the next on_get_abspath() call rescans the filesystem instead.
              self._browser_abspaths.pop(bin_name, None)

          return proc.stderr.strip() + "\n" + proc.stdout.strip()


  # Module-level provider instance (listed in PuppeteerPlugin.hooks further down this file).
  PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
  96. # ALTERNATIVE INSTALL METHOD using Ansible:
  97. # install_playbook = self.plugin_dir / 'install_puppeteer.yml'
  98. # chrome_bin = run_playbook(install_playbook, data_dir=DATA_DIR, quiet=quiet).BINARIES.chrome
  99. # return self.__class__.model_validate(
  100. # {
  101. # **self.model_dump(),
  102. # "loaded_abspath": chrome_bin.symlink,
  103. # "loaded_version": chrome_bin.version,
  104. # "loaded_binprovider": env,
  105. # "binproviders_supported": self.binproviders_supported,
  106. # }
  107. # )
  class PuppeteerPlugin(BasePlugin):
      """Plugin definition that bundles the puppeteer config, binprovider and binary hooks."""

      app_label: str = "puppeteer"
      verbose_name: str = "Puppeteer (NPM)"

      # Hook instances created earlier in this module.
      hooks: List[InstanceOf[BaseHook]] = [
          PUPPETEER_CONFIG,
          PUPPETEER_BINPROVIDER,
          PUPPETEER_BINARY,
      ]


  # Module-level plugin instance and the AppConfig it exposes.
  PLUGIN = PuppeteerPlugin()
  # PLUGIN.register(settings)
  DJANGO_APP = PLUGIN.AppConfig