models.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. from django.db import models
  2. from django.utils.functional import cached_property
  3. from solo.models import SingletonModel
  4. from archivebox.plugins.defaults.models import (
  5. ArchiveBoxDefaultDependency,
  6. ArchiveBoxDefaultExtractor,
  7. BashEnvironmentDependency,
  8. PipEnvironmentDependency,
  9. )
  class GalleryDLDependency(ArchiveBoxDefaultDependency, SingletonModel):
      """Singleton model holding the settings for the ``gallery-dl`` dependency.

      The upper-case constants are plain class attributes; how they are
      consumed is decided by ``ArchiveBoxDefaultDependency``, which is not
      visible in this file. ``ENABLED``, ``BINARY`` and ``WORKERS`` are Django
      model fields stored on the singleton row.
      """

      NAME = 'GALLERYDL'
      LABEL = "GalleryDL"
      REQUIRED = False

      PARENT_DEPENDENCIES = [
          BashEnvironmentDependency,
          PipEnvironmentDependency,
      ]

      BIN_DEPENDENCIES = ['gallery-dl']
      APT_DEPENDENCIES = []
      BREW_DEPENDENCIES = []
      PIP_PACKAGES = ['gallery-dl']
      NPM_PACKAGES = []

      DEFAULT_BINARY = 'gallery-dl'
      DEFAULT_START_CMD = None
      DEFAULT_ARGS = []
      VERSION_CMD = '{BINARY} --version'

      ENABLED = models.BooleanField(default=True)
      # Reuse DEFAULT_BINARY so the constant and the field default cannot drift.
      BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
      # Integer default (was the string '1'): an unsaved instance now exposes
      # an int, matching the field type.
      WORKERS = models.IntegerField(default=1)
  class GalleryDLExtractor(ArchiveBoxDefaultExtractor, SingletonModel):
      """Singleton model with the gallery-dl extractor settings and its runner.

      The string fields hold ``str.format`` templates (for example
      ``'{TIMEOUT}'``) that ``extract()`` resolves against the ``config``
      mapping it is given.

      NOTE(review): ``split_args``, ``ArchiveError``, ``get_dir_size``,
      ``ArchiveResult``, ``Path`` and ``ConfigDict`` are used below but are not
      imported in the visible part of this file - add the imports where these
      helpers live.
      """

      NAME = 'GALLERYDL'
      LABEL = 'gallery-dl'

      # NOTE(review): get_solo() runs while the class body executes, i.e. at
      # import time - confirm that fetching the singleton this early is intended.
      DEPENDENCY = GalleryDLDependency.get_solo()

      # https://github.com/mikf/gallery-dl
      DEFAULT_CMD = [
          '{DEPENDENCY.BINARY}',
          '{ARGS}',  # comma was missing: '{ARGS}' and '{url}' were fused into one string
          '{url}',
      ]

      # The standard flags (--timeout, --cookies, --user-agent, --verify) are
      # assembled per call in extract(), where `self` and `config` exist.
      # Referencing them here raised NameError when the class body executed.
      DEFAULT_ARGS = []

      ENABLED = models.BooleanField(default=True)
      # NOTE(review): CMD is a CharField but defaults to the DEFAULT_CMD list,
      # and extract() passes self.CMD as a single argv entry - confirm the
      # intended representation before relying on this default.
      CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
      # django.db.models has no CSVField; extract() treats ARGS as a string
      # (``self.ARGS.format(...)``), so store it as text. blank=True because the
      # default is the empty string.
      ARGS = models.CharField(max_length=255, blank=True, default=' '.join(DEFAULT_ARGS))
      TIMEOUT = models.CharField(max_length=255, default='{TIMEOUT}')
      USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
      COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')
      # max_length added for consistency with the sibling fields.
      CHECK_SSL_VALIDITY = models.CharField(max_length=255, default='{CHECK_SSL_VALIDITY}')

      # @task
      # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
      def extract(self, url: str, out_dir: 'Path', config: 'ConfigDict'):
          """Run gallery-dl against ``url`` and report the outcome.

          Args:
              url: The URL handed to gallery-dl.
              out_dir: Directory passed to ``create_extractor_directory`` and
                  recorded as ``pwd`` on the result.
              config: Mapping used to resolve the ``{...}`` templates stored in
                  the model fields; ``config.HOSTNAME`` is also read.

          Returns:
              ``None`` when the extractor is disabled, otherwise an
              ``ArchiveResult`` whose ``status`` is ``'succeeded'`` only if the
              process exited with code 0 and printed ``finished``; any
              exception raised while running is appended to ``stderr`` and
              leaves ``status`` at ``'failed'``.
          """
          if not self.ENABLED:
              return

          extractor_dir = self.create_extractor_directory(out_dir)
          timeout = self.TIMEOUT.format(**config)

          # NOTE(review): confirm these flag names against gallery-dl's option
          # list (e.g. whether it expects --http-timeout / --no-check-certificate).
          cmd = [
              self.CMD,
              url,
              '--timeout', timeout,
              '--cookies', self.COOKIES_TXT.format(**config),
              # Was fed COOKIES_TXT by mistake.
              '--user-agent', self.USER_AGENT.format(**config),
              '--verify', self.CHECK_SSL_VALIDITY.format(**config),
              *split_args(self.ARGS.format(**config)),
          ]

          status, stdout, stderr, output_path = 'failed', '', '', None
          # Stays None if run() raises, so the result can still be built below.
          timer = None
          try:
              # The class defines TIMEOUT, not GALLERYDL_TIMEOUT; use the resolved
              # value. Converted inside the try so a non-numeric setting is
              # reported as a failed result instead of escaping.
              # NOTE(review): assumes run() expects a numeric timeout - confirm.
              proc, timer, _errors = self.DEPENDENCY.run(
                  cmd,
                  cwd=extractor_dir,
                  timeout=float(timeout),
              )
              stdout, stderr = proc.stdout, proc.stderr

              if 'ERROR: Unsupported URL' in stderr:
                  hints = ('gallery-dl doesnt support this type of url yet',)
                  raise ArchiveError('Failed to save gallerydl', hints)

              if proc.returncode == 0 and 'finished' in stdout:
                  output_path = extractor_dir / 'index.html'
                  status = 'succeeded'
          except Exception as err:
              # Deliberate best-effort: record the failure on the result.
              # str() is required - adding an exception object to a str raises
              # TypeError.
              stderr += str(err)

          num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)
          timer_stats = timer.stats if timer is not None else {}

          return ArchiveResult(
              cmd=cmd,
              pwd=str(out_dir),
              cmd_version=self.DEPENDENCY.bin_version,
              cmd_path=self.DEPENDENCY.bin_path,
              cmd_hostname=config.HOSTNAME,
              output_path=output_path,
              stdout=stdout,
              stderr=stderr,
              status=status,
              num_bytes=num_bytes,
              num_files=num_files,
              num_dirs=num_dirs,
              **timer_stats,
          )
  95. )