|
|
@@ -0,0 +1,361 @@
|
|
|
+# __package__ = 'archivebox.plugins.defaults'
|
|
|
+
|
|
|
import os
import re
import shutil

from typing import List, Dict, Any
from pathlib import Path

from django.db import models, transaction
from django.utils.functional import cached_property

from solo.models import SingletonModel
|
|
|
+
|
|
|
# Alias for a config mapping: string keys to arbitrary values.
ConfigDict = Dict[str, Any]
|
|
|
+
|
|
|
+
|
|
|
+def bin_path(binary: str) -> str | None:
|
|
|
+ return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
|
|
|
+
|
|
|
+def bin_version(bin_path: str, cmd: str | None=None) -> str | None:
|
|
|
+ return '0.0.0'
|
|
|
+
|
|
|
+
|
|
|
class ArchiveBoxBaseDependency(SingletonModel):
    """Abstract singleton model holding the configuration of one dependency.

    Subclasses override the upper-case class attributes (``NAME``,
    ``LABEL``, the ``*_DEPENDENCIES`` lists, ...) and become concrete by
    declaring ``Meta.abstract = False``. The stored fields are ``ENABLED``,
    ``BINARY`` and ``ARGS``; everything else is derived from them.
    """

    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = "Default"
    REQUIRED = False

    # Dependency classes (each exposing NAME and get_solo()) that must be
    # installed before this one.
    PARENT_DEPENDENCIES = []

    # Package names handed to the matching *EnvironmentDependency installer.
    BIN_DEPENDENCIES = []
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_DEPENDENCIES = []
    NPM_DEPENDENCIES = []

    DEFAULT_BINARY = '/bin/false'
    DEFAULT_START_CMD = '/bin/false'
    DEFAULT_PID_FILE = 'logs/{NAME}_WORKER.pid'
    DEFAULT_STOP_CMD = 'kill "$(<{PID_FILE})"'
    DEFAULT_VERSION_COMMAND = '{CMD} --version'
    DEFAULT_ARGS = ''

    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = True
        app_label = 'defaults'

    def __str__(self):
        # f-string so the label is interpolated rather than printed literally
        return f"{self.LABEL} Dependency Configuration"

    def __json__(self):
        """Return a plain-dict snapshot of this dependency's configuration."""
        return {
            'type': 'ArchiveBoxDependency',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'BINARY': self.BINARY,
            'ARGS': self.ARGS,
            # 'START_CMD': self.START_CMD,
            # 'WORKERS': self.WORKERS,
        }

    @cached_property
    def bin_path(self):
        """Resolved path of the configured binary (module-level ``bin_path``)."""
        return bin_path(self.BINARY or self.DEFAULT_BINARY)

    @cached_property
    def bin_version(self):
        """Version reported for ``bin_path`` (module-level ``bin_version``)."""
        return bin_version(self.bin_path, cmd=self.VERSION_CMD)

    @cached_property
    def is_valid(self):
        """True when both a binary path and a version could be determined."""
        return bool(self.bin_path and self.bin_version)

    @cached_property
    def is_enabled(self):
        """True when the dependency is switched on and also valid."""
        return bool(self.ENABLED and self.is_valid)

    @cached_property
    def pretty_version(self):
        """Return a one-line, ANSI-coloured status summary of this dependency.

        NOTE(review): ``pretty_path`` and ``ANSI`` are not defined or
        imported in the visible part of this module -- confirm where they
        come from.
        """
        # Branch on the stored ENABLED flag (not is_enabled, which already
        # implies validity) so every path assigns all four values.
        if not self.ENABLED:
            color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
        elif self.is_valid:
            color, symbol, note, version = 'green', '√', 'valid', ''

            parsed_version_num = re.search(r'[\d\.]+', self.bin_version)
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'
        else:
            color, symbol, note, version = 'red', 'X', 'invalid', '?'

        path = pretty_path(self.bin_path)

        return ' '.join((
            ANSI[color],
            symbol,
            ANSI['reset'],
            self.NAME.ljust(21),
            version.ljust(14),
            ANSI[color],
            note.ljust(8),
            ANSI['reset'],
            path.ljust(76),
        ))

    # @helper
    def install_parents(self, config):
        """Install every parent dependency.

        Returns a dict mapping each parent's ``NAME`` to the value returned
        by its ``install_self(config)``.
        """
        return {
            parent_dependency.NAME: parent_dependency.get_solo().install_self(config)
            for parent_dependency in self.PARENT_DEPENDENCIES
        }

    # @helper
    def install_self(self, config):
        """Install parents, then this dependency's packages; return its version.

        Raises AssertionError when a parent install returns a falsy value or
        when the dependency is still invalid after installation.

        NOTE(review): the ``*EnvironmentDependency`` classes are not defined
        in the visible part of this module -- confirm their import.
        """
        # Run the installs outside the assert so they still happen under -O.
        parent_results = self.install_parents(config)
        assert all(parent_results.values()), f'Failed to install parents of {self.NAME}'

        BashEnvironmentDependency.get_solo().install_pkgs(self.BIN_DEPENDENCIES)
        AptEnvironmentDependency.get_solo().install_pkgs(self.APT_DEPENDENCIES)
        BrewEnvironmentDependency.get_solo().install_pkgs(self.BREW_DEPENDENCIES)
        PipEnvironmentDependency.get_solo().install_pkgs(self.PIP_DEPENDENCIES)
        NPMEnvironmentDependency.get_solo().install_pkgs(self.NPM_DEPENDENCIES)

        # cached_property stores results in the instance __dict__; drop them
        # so validity is re-evaluated against the freshly installed binary.
        for cached_name in ('bin_path', 'bin_version', 'is_valid', 'is_enabled', 'pretty_version'):
            self.__dict__.pop(cached_name, None)

        assert self.is_valid, f'{self.NAME} is still invalid after installation'
        return self.bin_version

    # @task
    def run(self, args, pwd, timeout):
        """Run the dependency binary with *args* and report the outcome.

        Returns a ``(proc, timer, errors)`` tuple. ``proc`` is ``None`` and
        ``errors`` holds the exception when the command raised; otherwise
        ``errors`` is ``None``.

        NOTE(review): ``TimedProgress`` and the module-level ``run`` called
        below are not defined in the visible part of this module.
        """
        proc = None  # stays None when the command raises
        errors = None
        timer = TimedProgress(timeout, prefix=' ')
        try:
            # `run` here resolves to the module-level function, not this method
            proc = run(cmd=[self.bin_path, *args], pwd=pwd, timeout=timeout)
        except Exception as err:
            errors = err
        finally:
            timer.end()

        return proc, timer, errors
|
|
|
+
|
|
|
class ArchiveBoxDefaultDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Concrete dependency model that keeps all ArchiveBoxBaseDependency defaults."""

    # Same fixed primary key as the abstract base.
    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    class Meta:
        # The base is abstract; mark this subclass concrete.
        abstract = False
        app_label = 'defaults'
|
|
|
+
|
|
|
class ArchiveBoxBaseExtractor(SingletonModel):
    """Abstract singleton model holding the configuration of one extractor.

    An extractor wraps a dependency (``DEPENDENCY``) and builds the command
    line used to archive a URL into ``<snapshot_dir>/<NAME>``.
    """

    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = 'Default'

    DEFAULT_DEPENDENCY = ArchiveBoxDefaultDependency
    DEPENDENCY = DEFAULT_DEPENDENCY

    DEFAULT_ENABLED = True
    DEFAULT_CMD = ['{DEPENDENCY.BINARY}', '{ARGS}', '{url}']
    DEFAULT_ARGS = ['--timeout={TIMEOUT}']
    DEFAULT_TIMEOUT = '{TIMEOUT}'
    # DEFAULT_USER_AGENT = '{USER_AGENT}'
    # DEFAULT_COOKIES_TXT = '{COOKIES_TXT}'

    ENABLED = models.BooleanField(default=DEFAULT_ENABLED, editable=True)

    # NOTE(review): CMD and ARGS are CharFields whose defaults are lists --
    # confirm the intended storage format (list vs. comma-separated string).
    CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)
    TIMEOUT = models.CharField(max_length=255, default=DEFAULT_TIMEOUT)

    # Built once from this class's NAME at class-definition time; subclasses
    # that override NAME do not get updated aliases automatically.
    ALIASES = {
        'ENABLED': (f'SAVE_{NAME}', f'USE_{NAME}', f'FETCH_{NAME}'),
    }

    def __str__(self):
        return f"{self.LABEL} Extractor Configuration"

    class Meta:
        abstract = True
        verbose_name = "Default Extractor Configuration"
        app_label = 'defaults'

    @cached_property
    def dependency(self):
        """Singleton instance of the dependency class this extractor uses."""
        return self.DEPENDENCY.get_solo()

    def __json__(self):
        """Return a plain-dict snapshot of this extractor's configuration."""
        return {
            'type': 'ArchiveBoxExtractor',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'DEPENDENCY': self.dependency.__json__(),
            'ARGS': self.ARGS,
            'CMD': self.CMD,
            'TIMEOUT': self.TIMEOUT,
            'is_valid': self.is_valid,
            'is_enabled': self.is_enabled,
        }

    def format_args(self, csv: List[str], **config):
        """Format each template in *csv* with the merged configuration.

        Later layers win: the caller's ``config``, then this extractor's own
        values (``ENABLED=...``), then the same values prefixed with
        ``NAME`` (``GALLERYDL_ENABLED=...``). Returns the formatted strings
        as a list.
        """
        # A plain string would otherwise be iterated character by character.
        # presumably a str means comma-separated values, as the parameter
        # name suggests -- TODO confirm against the stored CMD/ARGS format.
        if isinstance(csv, str):
            templates = [part.strip() for part in csv.split(',') if part.strip()]
        else:
            templates = list(csv)

        un_prefixed_config = {**self.__json__()}        # e.g. ENABLED=True
        # str.format resolves '{DEPENDENCY.BINARY}' through attribute access,
        # which the dict from __json__() does not support; expose the model.
        un_prefixed_config['DEPENDENCY'] = self.dependency
        prefixed_config = {                             # e.g. GALLERYDL_ENABLED=True
            f'{self.NAME}_{key}': value
            for key, value in un_prefixed_config.items()
        }

        merged_config = {
            **config,                # e.g. TIMEOUT=60
            **un_prefixed_config,    # e.g. ENABLED=True
            **prefixed_config,       # e.g. GALLERYDL_ENABLED=True
        }
        return [template.format(**merged_config) for template in templates]

    @cached_property
    def is_valid(self):
        """True when the underlying dependency is valid."""
        if not self.dependency.is_valid:
            return False

        # TIMEOUT must be at least 5 seconds
        # if self.TIMEOUT < 5:
        #     return False

        # assert Path(self.COOKIES_TXT).exists()
        # TODO: validate user agent with uaparser
        # TODO: validate args, cookies.txt?
        return True

    @cached_property
    def is_enabled(self):
        """True when this extractor and its dependency are enabled and valid."""
        return bool(self.ENABLED and self.is_valid and self.dependency.is_enabled)

    def save(self, *args, **kwargs):
        """Validate, save inside a transaction, and print a save event.

        Raises AssertionError when the configuration is not valid.
        """
        assert self.is_valid, f'{self.NAME} extractor configuration is invalid'

        with transaction.atomic():
            result = super().save(*args, **kwargs)
            # post to message bus:
            print({
                'type': f'{self.__class__.__name__}.save',
                'diff': self.__json__(),
                'kwargs': kwargs,
            })
            # potential consumers of this event:
            #    - event logger: write to events.log
            #    - config file updater: writes to ArchiveBox.conf
            #    - supervisor: restarts relevant dependencies/extractors
            #    - etc...

        return result

    def out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Return the output directory: ``snapshot_dir / NAME``."""
        return (snapshot_dir / self.NAME)

    def create_out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Create the output directory if needed and return its path."""
        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)
        out_dir.mkdir(exist_ok=True)
        # Path.mkdir() returns None, so hand back the path itself.
        return out_dir

    def should_extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Return True when extraction should run for this snapshot.

        False when the extractor is disabled, when the output directory
        already contains anything, or when the output location is not
        writable.
        """
        # return False if extractor is disabled
        if not self.is_enabled:
            return False

        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        # glob() returns a generator, which is always truthy; any() checks
        # whether it actually yields an entry.
        if any(out_dir.glob('*')):
            return False

        # Before the first run the directory does not exist yet, so check
        # the directory it would be created in instead.
        writable_target = out_dir if out_dir.exists() else out_dir.parent
        if not os.access(writable_target, os.W_OK | os.X_OK):
            return False

        return True

    def get_dependency_cmd(self, url: str, extractor_dir: Path, config: ConfigDict):
        """Build the flat argument list: formatted CMD, the URL, formatted ARGS."""
        # Supply `url` so the '{url}' placeholder in CMD can be resolved.
        format_config = {**config, 'url': url}
        return [
            *self.format_args(self.CMD, **format_config),
            url,
            *self.format_args(self.ARGS, **format_config),  # TODO: split and requote this properly
        ]

    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Run the extractor for *url* and return an ``ArchiveResult``.

        Returns ``None`` without doing anything when ``ENABLED`` is false.
        Failures while running the command are appended to ``stderr`` and
        leave ``status`` as ``'failed'`` instead of propagating.

        NOTE(review): ``ArchiveError``, ``get_dir_size`` and
        ``ArchiveResult`` are not defined in the visible part of this
        module. The gallery-dl specific messages below look copied from a
        concrete extractor -- confirm they belong in the base class.
        """
        if not self.ENABLED:
            return

        extractor_dir = self.create_out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        cmd = self.get_dependency_cmd(url=url, extractor_dir=extractor_dir, config=config)

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer_stats = {}  # stays empty if run() fails before returning a timer
        try:
            proc, timer, errors = self.dependency.run(cmd, pwd=extractor_dir, timeout=self.TIMEOUT)
            timer_stats = timer.stats
            if errors is not None:
                # run() captures exceptions instead of raising them
                raise errors
            stdout, stderr = proc.stdout, proc.stderr

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # assumes stderr is text here -- TODO confirm run() returns str streams
            stderr += str(err)

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            cmd=cmd,
            pwd=str(extractor_dir),
            cmd_version=self.dependency.bin_version,
            cmd_path=self.dependency.bin_path,
            cmd_hostname=config['HOSTNAME'],

            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,

            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **timer_stats,
        )
|
|
|
+
|
|
|
+
|
|
|
class ArchiveBoxDefaultExtractor(ArchiveBoxBaseExtractor, SingletonModel):
    """Concrete extractor model that keeps all ArchiveBoxBaseExtractor defaults."""

    # Same fixed primary key as the abstract base.
    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    class Meta:
        # The base is abstract; mark this subclass concrete.
        abstract = False
        app_label = 'defaults'
|