models.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
# Explicitly set the package this module belongs to; Python uses
# ``__package__`` to resolve explicit relative imports.
__package__ = 'archivebox.plugins.defaults'
  2. # import shutil
import os
import re
from typing import List, Dict, Any
from pathlib import Path

from django.db import models, transaction
from django.utils.functional import cached_property
from solo.models import SingletonModel  # type: ignore[import-untyped]

from config import bin_path, bin_version
# Type alias for the config mappings passed to extractor methods
# (string keys, arbitrary values).
ConfigDict = Dict[str, Any]
  11. # def bin_path(binary: str) -> str | None:
  12. # return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
  13. # def bin_version(bin_path: str, cmd: str | None=None) -> str | None:
  14. # return '0.0.0'
  15. # def pretty_path(path: Path) -> str:
  16. # """take a Path object and return the path as a string relative to the current directory"""
  17. # if not path:
  18. # return ''
  19. # return str(path.expanduser().resolve().relative_to(Path.cwd().resolve()))
  20. class ArchiveBoxBaseDependency(models.Model):
  21. singleton_instance_id = 1
  22. id = models.AutoField(default=singleton_instance_id, primary_key=True)
  23. NAME = 'DEFAULT'
  24. LABEL = "Default"
  25. REQUIRED = False
  26. PARENT_DEPENDENCIES: List[str] = []
  27. BIN_DEPENDENCIES: List[str] = []
  28. APT_DEPENDENCIES: List[str] = []
  29. BREW_DEPENDENCIES: List[str] = []
  30. PIP_DEPENDENCIES: List[str] = []
  31. NPM_DEPENDENCIES: List[str] = []
  32. DEFAULT_BINARY: str | None = '/bin/bash'
  33. DEFAULT_START_CMD: str | None = '/bin/bash -c "while true; do sleep 1; done"'
  34. DEFAULT_PID_FILE: str | None = 'logs/{NAME}_WORKER.pid'
  35. DEFAULT_STOP_CMD: str | None = 'kill "$(<{PID_FILE})"'
  36. DEFAULT_VERSION_COMMAND: str | None = '{BINARY} --version'
  37. DEFAULT_ARGS: str | None = ''
  38. VERSION_CMD = '{BINARY} --version'
  39. ENABLED = models.BooleanField(default=True, editable=False)
  40. BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
  41. ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)
  42. # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
  43. # WORKERS = models.IntegerField(default=1)
  44. class Meta:
  45. abstract = True
  46. app_label = 'defaults'
  47. def __str__(self):
  48. return f"{self.LABEL} Dependency Configuration"
  49. def __json__(self):
  50. return {
  51. 'type': 'ArchiveBoxDependency',
  52. '__class__': self.__class__.__name__,
  53. 'NAME': self.NAME,
  54. 'LABEL': self.LABEL,
  55. 'ENABLED': self.ENABLED,
  56. 'BINARY': self.BINARY,
  57. 'ARGS': self.ARGS,
  58. # 'START_CMD': self.START_CMD,
  59. # 'WORKERS': self.WORKERS,
  60. }
  61. @cached_property
  62. def bin_path(self) -> str:
  63. return bin_path(self.BINARY or self.DEFAULT_BINARY)
  64. @cached_property
  65. def bin_version(self) -> str | None:
  66. print(f'ArchiveBoxBaseDependency.bin_version({self.bin_path}, cmd={self.VERSION_CMD.format(BINARY=self.BINARY)})')
  67. return bin_version(self.bin_path, cmd=self.VERSION_CMD.format(BINARY=self.BINARY))
  68. # return bin_version(self.bin_path, cmd=self.VERSION_CMD)
  69. @cached_property
  70. def is_valid(self) -> bool:
  71. return bool(self.bin_path and self.bin_version)
  72. @cached_property
  73. def is_enabled(self) -> bool:
  74. return bool(self.ENABLED and self.is_valid)
  75. @cached_property
  76. def pretty_version(self) -> str:
  77. if self.is_enabled:
  78. if self.is_valid:
  79. color, symbol, note, version = 'green', '√', 'valid', ''
  80. parsed_version_num = re.search(r'[\d\.]+', self.bin_version)
  81. if parsed_version_num:
  82. version = f'v{parsed_version_num[0]}'
  83. if not self.bin_version:
  84. color, symbol, note, version = 'red', 'X', 'invalid', '?'
  85. else:
  86. color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
  87. path = pretty_path(self.bin_path)
  88. return ' '.join((
  89. ANSI[color],
  90. symbol,
  91. ANSI['reset'],
  92. name.ljust(21),
  93. version.ljust(14),
  94. ANSI[color],
  95. note.ljust(8),
  96. ANSI['reset'],
  97. path.ljust(76),
  98. ))
  99. # @helper
  100. def install_parents(self, config):
  101. return {
  102. # parent_dependency.NAME: parent_dependency.get_solo().install_self()
  103. parent_dependency: parent_dependency
  104. for parent_dependency in self.PARENT_DEPENDENCIES
  105. }
  106. # @helper
  107. def install_self(self, config):
  108. assert all(self.install_parents(config=config).values())
  109. BashEnvironmentDependency.get_solo().install_pkgs(self.BIN_DEPENDENCIES)
  110. AptEnvironmentDependency.get_solo().install_pkgs(self.APT_DEPENDENCIES)
  111. BrewEnvironmentDependency.get_solo().install_pkgs(self.BREW_DEPENDENCIES)
  112. PipEnvironmentDependency.get_solo().install_pkgs(self.PIP_DEPENDENCIES)
  113. NPMEnvironmentDependency.get_solo().install_pkgs(self.NPM_DEPENDENCIES)
  114. assert self.is_valid
  115. return self.bin_version
  116. # @task
  117. def run(args, pwd, timeout):
  118. errors = None
  119. timer = TimedProgress(timeout, prefix=' ')
  120. try:
  121. proc = run(cmd=[self.bin_path, *args], pwd=pwd, timeout=timeout)
  122. except Exception as err:
  123. errors = err
  124. finally:
  125. timer.end()
  126. return proc, timer, errors
  127. class ArchiveBoxDefaultDependency(ArchiveBoxBaseDependency, SingletonModel):
  128. singleton_instance_id = 1
  129. id = models.AutoField(default=singleton_instance_id, primary_key=True)
  130. ENABLED = models.BooleanField(default=True, editable=True)
  131. class Meta: # pyright: ignore [reportIncompatibleVariableOverride]
  132. abstract = False
  133. app_label = 'defaults'
  134. verbose_name = 'Default Configuration: Dependencies'
  135. class ArchiveBoxBaseExtractor(models.Model):
  136. singleton_instance_id = 1
  137. id = models.AutoField(default=singleton_instance_id, primary_key=True)
  138. NAME = 'DEFAULT'
  139. LABEL = 'Default'
  140. DEFAULT_DEPENDENCY = ArchiveBoxDefaultDependency
  141. DEPENDENCY = DEFAULT_DEPENDENCY
  142. DEFAULT_ENABLED = True
  143. DEFAULT_CMD = ['{DEPENDENCY.BINARY}', '{ARGS}', '{url}']
  144. DEFAULT_ARGS = ['--timeout={TIMEOUT}']
  145. DEFAULT_TIMEOUT = '{TIMEOUT}'
  146. # DEFAULT_USER_AGENT = '{USER_AGENT}'
  147. # DEFAULT_COOKIES_TXT = '{COOKIES_TXT}'
  148. ENABLED = models.BooleanField(default=DEFAULT_ENABLED, editable=True)
  149. CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
  150. ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)
  151. TIMEOUT = models.CharField(max_length=255, default=DEFAULT_TIMEOUT)
  152. ALIASES = {
  153. 'ENABLED': (f'SAVE_{NAME}', f'USE_{NAME}', f'FETCH_{NAME}'),
  154. }
  155. def __str__(self):
  156. return f"{self.LABEL} Extractor Configuration"
  157. class Meta: # pyright: ignore [reportIncompatibleVariableOverride]
  158. abstract = True
  159. verbose_name = "Default Extractor Configuration"
  160. app_label = 'defaults'
  161. @cached_property
  162. def dependency(self):
  163. return self.DEPENDENCY.get_solo()
  164. def __json__(self):
  165. return {
  166. 'type': 'ArchiveBoxExtractor',
  167. '__class__': self.__class__.__name__,
  168. 'NAME': self.NAME,
  169. 'LABEL': self.LABEL,
  170. 'ENABLED': self.ENABLED,
  171. 'DEPENDENCY': self.dependency.__json__(),
  172. 'ARGS': self.ARGS,
  173. 'CMD': self.CMD,
  174. 'TIMEOUT': self.TIMEOUT,
  175. 'is_valid': self.is_valid,
  176. 'is_enabled': self.is_enabled,
  177. }
  178. def format_args(self, csv: List[str], **config):
  179. un_prefixed_config = {**self.__json__()} # e.g. ENABLED=True
  180. prefixed_config = { # e.g. GALLERYDL_ENABLED=True
  181. f'{self.NAME}_{key}': value
  182. for key, value in un_prefixed_config.items()
  183. }
  184. merged_config = {
  185. **config, # e.g. TIMEOUT=60
  186. **un_prefixed_config, # e.g. ENABLED=True
  187. **prefixed_config, # e.g. GALLERYDL_ENABLED=True
  188. }
  189. formatted_config = [
  190. arg.format(**merged_config)
  191. for arg in csv
  192. ]
  193. return formatted_config
  194. @cached_property
  195. def is_valid(self):
  196. if not self.dependency.is_valid:
  197. return False
  198. # TIMEOUT must be at least 5 seconds
  199. # if self.TIMEOUT < 5:
  200. # return False
  201. # assert Path(self.COOKIES_TXT).exists()
  202. # TODO: validate user agent with uaparser
  203. # TODO: validate args, cookies.txt?
  204. return True
  205. @cached_property
  206. def is_enabled(self):
  207. return self.ENABLED and self.is_valid and self.dependency.is_enabled
  208. def save(self, *args, **kwargs):
  209. # assert self.is_valid
  210. with transaction.atomic():
  211. result = super().save(*args, **kwargs)
  212. # post to message bus:
  213. print({
  214. 'type': f'{self.__class__.__name__}.save',
  215. 'diff': self.__json__(),
  216. 'kwargs': kwargs,
  217. })
  218. # potential consumers of this event:
  219. # - event logger: write to events.log
  220. # - config file updater: writes to ArchiveBox.conf
  221. # - supervisor: restarts relevant dependencies/extractors
  222. # - etc...
  223. return result
  224. def out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
  225. return (snapshot_dir / self.NAME)
  226. def create_out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
  227. out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)
  228. return out_dir.mkdir(exist_ok=True)
  229. def should_extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
  230. # return False if extractor is disabled
  231. if not self.is_enabled:
  232. return False
  233. out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)
  234. if has_existing_output := out_dir.glob('*'):
  235. return False
  236. if not (has_write_access := os.access(out_dir, os.W_OK | os.X_OK)):
  237. return False
  238. return True
  239. def get_dependency_cmd(self, url: str, extractor_dir: Path, config: ConfigDict):
  240. return [
  241. self.format_args(self.CMD, **config),
  242. url,
  243. *self.format_args(self.ARGS, **config), # TODO: split and requote this properly
  244. ]
  245. # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
  246. def extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
  247. if not self.ENABLED:
  248. return
  249. extractor_dir = self.create_extractor_directory(snapshot_dir)
  250. cmd = self.get_dependency_cmd(url=url, extractor_dir=extractor_dir, config=config)
  251. status, stdout, stderr, output_path = 'failed', '', '', None
  252. try:
  253. proc, timer, errors = self.dependency.run(cmd, cwd=extractor_dir, timeout=self.TIMEOUT)
  254. stdout, stderr = proc.stdout, proc.stderr
  255. if 'ERROR: Unsupported URL' in stderr:
  256. hints = ('gallery-dl doesnt support this type of url yet',)
  257. raise ArchiveError('Failed to save gallerydl', hints)
  258. if proc.returncode == 0 and 'finished' in stdout:
  259. output_path = extractor_dir / 'index.html'
  260. status = 'succeeded'
  261. except Exception as err:
  262. stderr += err
  263. num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)
  264. return ArchiveResult(
  265. cmd=cmd,
  266. pwd=str(out_dir),
  267. cmd_version=self.dependency.bin_version,
  268. cmd_path=self.dependency.bin_path,
  269. cmd_hostname=config.HOSTNAME,
  270. output_path=output_path,
  271. stdout=stdout,
  272. stderr=stderr,
  273. status=status,
  274. num_bytes=num_bytes,
  275. num_files=num_files,
  276. num_dirs=num_dirs,
  277. **timer.stats,
  278. )
  279. class ArchiveBoxDefaultExtractor(ArchiveBoxBaseExtractor, SingletonModel):
  280. singleton_instance_id = 1
  281. id = models.AutoField(default=singleton_instance_id, primary_key=True)
  282. DEPENDENCY = ArchiveBoxDefaultDependency
  283. ENABLED = models.BooleanField(default=True, editable=True)
  284. class Meta:
  285. abstract = False
  286. app_label = 'defaults'
  287. verbose_name = 'Default Configuration: Extractors'