import hashlib
import mimetypes
import os
import subprocess
from datetime import timedelta
from pathlib import Path
from typing import ClassVar

from django.utils import timezone

import abx
import archivebox
from archivebox.misc.hashing import get_dir_info
from core.models import ArchiveResult
# from machine.models import Machine, NetworkInterface  # needed by before_extract() below (import path assumed)
# class Extractor:
#     # static class variables
#     name: ClassVar[str] = 'ytdlp'
#     verbose_name: ClassVar[str] = 'YT-DLP'
#     binaries: ClassVar[tuple[str, ...]] = ()
#     daemons: ClassVar[tuple[str, ...]] = ()
#     timeout: ClassVar[int] = 60
#
#     # instance variables
#     ARCHIVERESULT: ArchiveResult
#     CONFIG: dict[str, object]
#     BINARIES: dict[str, object]
#     DAEMONS: dict[str, object]
#
#     def __init__(self, archiveresult: ArchiveResult, extra_config: dict | None=None):
#         assert archiveresult.pk, 'ArchiveResult must be saved to DB before it can be extracted'
#         self.archiveresult = self.ARCHIVERESULT = archiveresult
#         self.CONFIG = archivebox.pm.hook.get_SCOPE_CONFIG(archiveresult=self.archiveresult, extra=extra_config)
#         all_binaries = abx.as_dict(archivebox.pm.hook.get_BINARIES())
#         all_daemons = abx.as_dict(archivebox.pm.hook.get_DAEMONS())
#         self.BINARIES = {
#             binary_name: all_binaries[binary_name]
#             for binary_name in self.binaries
#         }
#         self.DAEMONS = {
#             daemon_name: all_daemons[daemon_name]
#             for daemon_name in self.daemons
#         }
#     def extract(self, config: dict | None=None) -> 'ArchiveResult':
#         """
#         Run the extractor end-to-end. Responsible for:
#         - making sure any binaries the extractor depends on are installed and loaded
#         - creating a new temporary working directory under the snapshot dir to hold extractor output
#         - setting up a timer signal to kill the extractor if it runs too long
#         - passing the extractor the URL, temporary working directory, and config dict of options
#         - running the extractor in a shell subprocess and collecting stdout/stderr
#         - capturing the extractor's exit code
#         - if the extractor exits with 29 (RetryError), setting the status to 'BACKOFF' and retry_at to a datetime in the future
#         - if the extractor exits with 50 (NotApplicable), setting the status to 'SKIPPED' and retry_at to None
#         - setting the correct permissions and ownership on all the output files
#         - generating the merkle tree of all the output files and their hashes (see the write_merkle_index() sketch below)
#         - generating a thumbnail of the main output (or collecting one provided by the extractor)
#         - detecting any special output files that need to be parsed for other parts of the system (content-types?)
#             - metadata.json     -> ArchiveResult.output_json
#             - outlinks.jsonl    -> ArchiveResult.output_links
#             - search_texts.txt  -> ArchiveResult.index_texts
#             - .merkle.json      -> ArchiveResult.output_files
#             - videos.jsonl      -> ArchiveResult.output_videos
#             - audios.jsonl      -> ArchiveResult.output_audios
#             - images.jsonl      -> ArchiveResult.output_images
#             - htmls.jsonl       -> ArchiveResult.output_htmls
#         - saving all the result metadata to the ArchiveResult in the database
#         """
#         archiveresult = self.ARCHIVERESULT
#         # config = get_scope_config(archiveresult=archiveresult.snapshot.url, env=...)
#         self.before_extract()
#
#         error: Exception | None = Exception('Failed to start extractor')
#         stdout = ''
#         stderr = ''
#         try:
#             proc = archiveresult.EXTRACTOR.spawn(
#                 url=archiveresult.snapshot.url,
#                 binaries=self.load_binaries(),
#                 daemons=self.load_daemons(),
#                 cwd=self.OUTPUT_DIR,
#                 config=self.CONFIG,
#             )
#             stdout, stderr = proc.communicate()
#             error = None
#         except Exception as err:
#             error = err
#         finally:
#             self.after_extract(error=error)
#         return archiveresult
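#
#     # Hypothetical helper for the "merkle tree of output files" step referenced in
#     # the docstring above -- a sketch assuming get_dir_info() (imported at the top)
#     # returns a JSON-serializable mapping of output files to their sizes/hashes
#     # (exact signature unverified), and that `json` is imported at the top:
#     def write_merkle_index(self):
#         dir_info = get_dir_info(self.OUTPUT_DIR)
#         (self.OUTPUT_DIR / '.merkle.json').write_text(json.dumps(dir_info, indent=4, default=str))
#         self.archiveresult.output_files = dir_info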
#     def should_extract(self):
#         if self.archiveresult.snapshot.url.startswith('https://youtube.com/'):
#             return True
#         return False
#     def load_binaries(self):
#         return {
#             bin_name: binary.load()
#             for bin_name, binary in self.BINARIES.items()
#         }
#
#     def load_daemons(self):
#         return {
#             daemon_name: daemon.load()
#             for daemon_name, daemon in self.DAEMONS.items()
#         }
#     def output_dir_name(self):
#         # e.g. 'ytdlp'
#         return f'{self.name}'
#
#     @property
#     def OUTPUT_DIR(self):
#         return self.archiveresult.snapshot_dir / self.output_dir_name()
#     def before_extract(self):
#         # create the snapshot_dir/<extractor name>/ output dir, chown/chmod it, etc.
#         binaries = self.load_binaries()
#         daemons = self.load_daemons()
#         cmd = self.archiveresult.EXTRACTOR.get_cmd(binaries=binaries, daemons=daemons)
#         cmd_version = self.archiveresult.EXTRACTOR.get_cmd_version(binaries=binaries, daemons=daemons)
#
#         self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
#         os.chmod(self.OUTPUT_DIR, 0o755)
#
#         self.archiveresult.status = self.archiveresult.StatusChoices.STARTED
#         self.archiveresult.retry_at = timezone.now() + timedelta(seconds=self.timeout)
#         self.archiveresult.start_ts = timezone.now()
#         self.archiveresult.end_ts = None
#         self.archiveresult.output = None
#         self.archiveresult.output_path = str(self.OUTPUT_DIR.relative_to(self.archiveresult.snapshot_dir))
#         self.archiveresult.cmd = cmd
#         self.archiveresult.cmd_version = cmd_version
#         self.archiveresult.machine = Machine.objects.get_current()
#         self.archiveresult.iface = NetworkInterface.objects.get_current()
#         self.archiveresult.save()
#         self.archiveresult.write_indexes()
#     def run_cmd(self, url: str, binaries: dict, daemons: dict, cwd: Path, config: dict):
#         # run the extractor command in a shell subprocess and record its output
#         env = {**os.environ, **{name: str(binary) for name, binary in binaries.items()}}
#         proc = subprocess.run(self.archiveresult.cmd, cwd=cwd, env=env, timeout=self.timeout, shell=True, capture_output=True, text=True)
#         self.archiveresult.stdout = proc.stdout
#         self.archiveresult.stderr = proc.stderr
#         self.archiveresult.returncode = proc.returncode
#         self.archiveresult.save()
#         self.archiveresult.write_indexes()
#     def determine_status(self):
#         if self.archiveresult.returncode == 29:    # RetryError
#             return self.archiveresult.StatusChoices.BACKOFF, timezone.now() + timedelta(seconds=self.timeout)
#         elif self.archiveresult.returncode == 50:  # NotApplicable
#             return self.archiveresult.StatusChoices.SKIPPED, None
#         elif self.archiveresult.returncode == 0:
#             return self.archiveresult.StatusChoices.SUCCEEDED, None
#         else:
#             return self.archiveresult.StatusChoices.FAILED, None
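#
#     # An extractor subprocess opts into the retry/skip behavior above by exiting
#     # with the matching code. A minimal sketch of a hypothetical extractor script
#     # (the condition names are illustrative, not real plugin code):
#     #
#     #     import sys
#     #     if hit_rate_limit:      # hypothetical condition
#     #         sys.exit(29)        # RetryError    -> BACKOFF, retried later
#     #     if not is_video_url:    # hypothetical condition
#     #         sys.exit(50)        # NotApplicable -> SKIPPED, never retried
#     #     sys.exit(0)             # success       -> SUCCEEDED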
#     def collect_outputs(self, cwd: Path):
#         for file in cwd.rglob('*'):
#             if not file.is_file():
#                 continue
#             path = file.relative_to(cwd)
#             os.chmod(file, 0o644)
#             # os.chown(file, ARCHIVEBOX_UID, ARCHIVEBOX_GID)
#             output_file = {
#                 'type': 'FILE',
#                 'path': str(path),
#                 'size': file.stat().st_size,
#                 'ext': file.suffix,
#                 'mimetype': mimetypes.guess_type(file)[0],
#                 'sha256': hashlib.sha256(file.read_bytes()).hexdigest(),
#                 # 'blake3': blake3(file.read_bytes()).hexdigest(),  # needs the third-party blake3 package (not in hashlib)
#                 'created_at': file.stat().st_ctime,
#                 'modified_at': file.stat().st_mtime,
#                 'symlinks': [
#                     'screenshot.png',
#                     'example.com',
#                 ],
#             }
#             self.archiveresult.outputs.append(output_file)
#
#             for outlink in parse_outlinks(file):   # parse_outlinks() assumed to be provided elsewhere
#                 self.archiveresult.outputs.append({
#                     'type': 'OUTLINK',
#                     'url': outlink.target,
#                     'selector': outlink.selector,
#                     'text': outlink.text,
#                 })
#
#             if path.name == 'favicon.ico':
#                 self.archiveresult.outputs.append({
#                     'type': 'FAVICON',
#                     'symlinks': {
#                         'favicon': output_file['path'],
#                         'favicon.ico': output_file['path'],
#                         'favicon.png': str(path.with_suffix('.png')),
#                     },
#                     'path': output_file['path'],
#                 })
#
#             if path.suffix == '.pdf':
#                 self.archiveresult.outputs.append({
#                     'type': 'PDF',
#                     'path': str(path),
#                 })
#
#             if mimetypes.guess_type(file)[0] == 'text/plain':
#                 self.archiveresult.outputs.append({
#                     'type': 'SEARCHTEXT',
#                     'path': str(path),
#                     'archiveresult_id': self.archiveresult.id,
#                 })
#     def after_extract(self, error: Exception | None=None):
#         status, retry_at = self.determine_status()
#
#         self.archiveresult.error = f'{type(error).__name__}: {error}' if error else None
#         self.archiveresult.status = self.archiveresult.StatusChoices.FAILED if error else status
#         self.archiveresult.retry_at = retry_at
#         self.archiveresult.end_ts = timezone.now()
#         self.archiveresult.output = self.archiveresult.outputs[0]['path']
#         self.archiveresult.save()
#         self.archiveresult.write_indexes()
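#
# # A minimal sketch of how a concrete extractor plugin might subclass this interface.
# # YtdlpExtractor, its binary names, and its URL check are hypothetical examples, not a real plugin:
# class YtdlpExtractor(Extractor):
#     name: ClassVar[str] = 'ytdlp'
#     verbose_name: ClassVar[str] = 'YT-DLP'
#     binaries: ClassVar[tuple[str, ...]] = ('ytdlp', 'ffmpeg')
#     timeout: ClassVar[int] = 120
#
#     def should_extract(self):
#         return 'youtube.com/watch' in self.archiveresult.snapshot.url
#
# # Example of how an orchestrator might drive it, assuming a saved ArchiveResult row:
# #
# #     archiveresult = ArchiveResult.objects.get(pk=...)
# #     extractor = YtdlpExtractor(archiveresult)
# #     if extractor.should_extract():
# #         extractor.extract()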