| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- from django.db import models
- from django.utils.functional import cached_property
- from solo.models import SingletonModel
- from archivebox.plugins.defaults.models import (
- ArchiveBoxDefaultDependency,
- ArchiveBoxDefaultExtractor,
- BashEnvironmentDependency,
- PipEnvironmentDependency,
- )
class GalleryDLDependency(ArchiveBoxDefaultDependency, SingletonModel):
    """Singleton settings model describing the optional ``gallery-dl`` binary.

    The upper-case class attributes are static metadata (names, parent
    dependencies, package names, default command pieces); the ``models.*``
    attributes are the persisted, user-editable settings.
    How the base class consumes the static metadata is not visible in this
    file -- presumably for install/version checks; confirm against
    ``ArchiveBoxDefaultDependency``.
    """

    NAME = 'GALLERYDL'
    LABEL = "GalleryDL"
    REQUIRED = False

    # Dependencies listed here are declared as parents of this one.
    PARENT_DEPENDENCIES = [
        BashEnvironmentDependency,
        PipEnvironmentDependency,
    ]

    # Binary/package names per package manager; empty lists mean nothing is
    # declared for that manager.
    BIN_DEPENDENCIES = ['gallery-dl']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ['gallery-dl']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'gallery-dl'
    DEFAULT_START_CMD = None
    DEFAULT_ARGS = []
    # '{BINARY}' is a template placeholder, filled in elsewhere.
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    # Integer default (was the string '1') so unsaved instances expose an int.
    WORKERS = models.IntegerField(default=1)
class GalleryDLExtractor(ArchiveBoxDefaultExtractor, SingletonModel):
    """Singleton extractor model that runs ``gallery-dl`` against a URL.

    The ``models.*`` attributes are persisted settings. The string-valued
    ones (``TIMEOUT``, ``USER_AGENT``, ``COOKIES_TXT``, ``CHECK_SSL_VALIDITY``)
    hold ``str.format`` templates that ``extract()`` renders against the
    config mapping it is given.
    """

    NAME = 'GALLERYDL'
    LABEL = 'gallery-dl'

    # NOTE(review): get_solo() is evaluated while this class body executes,
    # i.e. when the module is imported. That presumably touches the database
    # before migrations/app loading are guaranteed to have run -- consider a
    # lazy lookup (e.g. cached_property) once class-level users are audited.
    DEPENDENCY = GalleryDLDependency.get_solo()

    # https://github.com/mikf/gallery-dl
    DEFAULT_CMD = [
        '{DEPENDENCY.BINARY}',
        '{ARGS}',   # trailing comma was missing: '{ARGS}' and '{url}' were fused
        '{url}',
    ]

    # Extra arguments only. The timeout/cookies/user-agent/verify flags are
    # assembled inside extract(), where ``self`` and ``config`` actually exist;
    # referencing them here raised NameError at class-definition time.
    DEFAULT_ARGS = []

    ENABLED = models.BooleanField(default=True)
    # NOTE(review): CharField with a list default; extract() uses self.CMD as
    # a single argv element. Confirm the intended storage format.
    CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
    # NOTE(review): django.db.models ships no CSVField; confirm where this
    # field type is meant to come from.
    ARGS = models.CSVField(max_length=255, default=DEFAULT_ARGS)

    TIMEOUT = models.CharField(max_length=255, default='{TIMEOUT}')
    USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
    COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')
    # max_length added for consistency with the sibling CharFields.
    CHECK_SSL_VALIDITY = models.CharField(max_length=255, default='{CHECK_SSL_VALIDITY}')

    # @task
    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, out_dir: 'Path', config: 'ConfigDict'):
        """Run gallery-dl for ``url`` and describe the outcome.

        Args:
            url: The URL handed to gallery-dl.
            out_dir: Directory passed to ``create_extractor_directory`` and
                recorded (as a string) in the result's ``pwd``.
            config: Mapping used to render the template-valued settings via
                ``str.format(**config)``; ``config.HOSTNAME`` is also read.

        Returns:
            ``None`` when the extractor is disabled, otherwise an
            ``ArchiveResult`` whose ``status`` is ``'succeeded'`` only if the
            process exited 0 and its stdout contains ``'finished'``.
            Any exception raised while running is appended to ``stderr`` and
            the status stays ``'failed'``.
        """
        # NOTE(review): Path, ConfigDict, split_args, ArchiveError,
        # ArchiveResult and get_dir_size are not imported in the visible part
        # of this file; the annotations are quoted so the class can at least
        # be defined without them.
        if not self.ENABLED:
            return

        extractor_dir = self.create_extractor_directory(out_dir)
        timeout = self.TIMEOUT.format(**config)

        # NOTE(review): confirm these flag names against gallery-dl's CLI.
        cmd = [
            self.CMD,
            url,
            '--timeout', timeout,
            '--cookies', self.COOKIES_TXT.format(**config),
            # Previously rendered COOKIES_TXT here (copy-paste slip).
            '--user-agent', self.USER_AGENT.format(**config),
            '--verify', self.CHECK_SSL_VALIDITY.format(**config),
            *split_args(self.ARGS.format(**config)),
        ]

        status, stdout, stderr, output_path = 'failed', '', '', None
        # Stays empty if run() raises before a timer is returned, so building
        # the result below can no longer fail on an unbound ``timer``.
        timer_stats = {}
        try:
            # The original passed self.GALLERYDL_TIMEOUT, which is not defined
            # in this file; reuse the rendered TIMEOUT setting instead.
            proc, timer, _errors = self.DEPENDENCY.run(
                cmd,
                cwd=extractor_dir,
                timeout=int(timeout),
            )
            timer_stats = timer.stats
            stdout, stderr = proc.stdout, proc.stderr

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # Deliberately broad: every failure is reported through the result
            # rather than raised. ``stderr += err`` raised TypeError (str +
            # exception), so the error is converted to text first.
            stderr = f'{stderr or ""}\n{type(err).__name__}: {err}'

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            cmd=cmd,
            pwd=str(out_dir),
            cmd_version=self.DEPENDENCY.bin_version,
            cmd_path=self.DEPENDENCY.bin_path,
            cmd_hostname=config.HOSTNAME,
            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,
            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **timer_stats,
        )
|