# models.py (12 KB)
  1. __package__ = 'archivebox.core'
  2. import uuid
  3. from pathlib import Path
  4. from typing import Dict, Optional, List
  5. from datetime import datetime, timedelta
  6. from collections import defaultdict
  7. from django.db import models, transaction
  8. from django.utils.functional import cached_property
  9. from django.utils.text import slugify
  10. from django.db.models import Case, When, Value, IntegerField
  11. from ..util import parse_date
  12. from ..index.schema import Link
  13. from ..config import CONFIG
  14. from ..system import get_dir_size
  15. #EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
  16. EXTRACTORS = [("title", "title"), ("wget", "wget")]
  17. STATUS_CHOICES = [
  18. ("succeeded", "succeeded"),
  19. ("failed", "failed"),
  20. ("skipped", "skipped")
  21. ]
  22. try:
  23. JSONField = models.JSONField
  24. except AttributeError:
  25. import jsonfield
  26. JSONField = jsonfield.JSONField
  27. class Tag(models.Model):
  28. """
  29. Based on django-taggit model
  30. """
  31. name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
  32. slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
  33. class Meta:
  34. verbose_name = "Tag"
  35. verbose_name_plural = "Tags"
  36. def __str__(self):
  37. return self.name
  38. def slugify(self, tag, i=None):
  39. slug = slugify(tag)
  40. if i is not None:
  41. slug += "_%d" % i
  42. return slug
  43. def save(self, *args, **kwargs):
  44. if self._state.adding and not self.slug:
  45. self.slug = self.slugify(self.name)
  46. with transaction.atomic():
  47. slugs = set(
  48. type(self)
  49. ._default_manager.filter(slug__startswith=self.slug)
  50. .values_list("slug", flat=True)
  51. )
  52. i = None
  53. while True:
  54. slug = self.slugify(self.name, i)
  55. if slug not in slugs:
  56. self.slug = slug
  57. return super().save(*args, **kwargs)
  58. i = 1 if i is None else i+1
  59. else:
  60. return super().save(*args, **kwargs)
  61. class Snapshot(models.Model):
  62. id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
  63. url = models.URLField(unique=True)
  64. timestamp = models.CharField(max_length=32, unique=True, db_index=True)
  65. title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
  66. added = models.DateTimeField(auto_now_add=True, db_index=True)
  67. updated = models.DateTimeField(null=True, blank=True, db_index=True)
  68. tags = models.ManyToManyField(Tag)
  69. keys = ('id', 'url', 'timestamp', 'title', 'tags', 'updated', 'base_url')
  70. def __repr__(self) -> str:
  71. title = self.title or '-'
  72. return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
  73. def __str__(self) -> str:
  74. title = self.title or '-'
  75. return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
  76. def field_names():
  77. fields = self._meta.get_field_names()
  78. exclude = ["tags", "archiveresult"] # Exclude relationships for now
  79. return [field.name for field in fields if field.name not in exclude]
  80. @classmethod
  81. def from_json(cls, info: dict):
  82. info = {k: v for k, v in info.items() if k in cls.keys}
  83. if "tags" in info:
  84. # TODO: Handle tags
  85. info.pop("tags")
  86. info.pop("base_url", None)
  87. return cls(**info)
  88. def get_history(self) -> dict:
  89. """
  90. Generates the history dictionary out of the stored ArchiveResults
  91. """
  92. history_list = self.archiveresult_set.all()
  93. history = defaultdict(list)
  94. for history_item in history_list:
  95. history[history_item.extractor].append(
  96. {
  97. "cmd": history_item.cmd,
  98. "cmd_version": history_item.cmd_version,
  99. "end_ts": history_item.end_ts.isoformat(),
  100. "start_ts": history_item.start_ts.isoformat(),
  101. "pwd": history_item.pwd,
  102. "output": history_item.output,
  103. "schema": "ArchiveResult",
  104. "status": history_item.status
  105. }
  106. )
  107. return dict(history)
  108. def as_json(self, *args) -> dict:
  109. """
  110. Returns the snapshot in json format.
  111. id is converted to str
  112. history is extracted from ArchiveResult
  113. """
  114. args = args or self.keys
  115. output = {
  116. key: getattr(self, key)
  117. if key != 'tags' else self.tags_str()
  118. for key in args
  119. }
  120. if "id" in output.keys():
  121. output["id"] = str(output["id"])
  122. output["history"] = self.get_history()
  123. return output
  124. def as_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
  125. from ..index.csv import to_csv
  126. return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
  127. def as_link(self) -> Link:
  128. return Link.from_json(self.as_json())
  129. def as_link_with_details(self) -> Link:
  130. from ..index import load_link_details
  131. return load_link_details(self.as_link())
  132. def tags_str(self) -> str:
  133. return ','.join(self.tags.order_by('name').values_list('name', flat=True))
  134. @cached_property
  135. def bookmarked(self):
  136. return parse_date(self.timestamp)
  137. @cached_property
  138. def bookmarked_date(self) -> Optional[str]:
  139. from ..util import ts_to_date
  140. max_ts = (datetime.now() + timedelta(days=30)).timestamp()
  141. if self.timestamp and self.timestamp.replace('.', '').isdigit():
  142. if 0 < float(self.timestamp) < max_ts:
  143. return ts_to_date(datetime.fromtimestamp(float(self.timestamp)))
  144. else:
  145. return str(self.timestamp)
  146. return None
  147. @cached_property
  148. def is_archived(self) -> bool:
  149. from ..config import ARCHIVE_DIR
  150. from ..util import domain
  151. output_paths = (
  152. domain(self.url),
  153. 'output.pdf',
  154. 'screenshot.png',
  155. 'output.html',
  156. 'media',
  157. 'singlefile.html'
  158. )
  159. return any(
  160. (Path(ARCHIVE_DIR) / self.timestamp / path).exists()
  161. for path in output_paths
  162. )
  163. @cached_property
  164. def archive_dates(self) -> List[datetime]:
  165. return [
  166. result.start_ts
  167. for result in self.archiveresult_set.all()
  168. ]
  169. @cached_property
  170. def oldest_archive_date(self) -> Optional[datetime]:
  171. oldest = self.archiveresult_set.all().order_by("-start_ts")[:1]
  172. if len(oldest) > 0:
  173. return oldest[0].start_ts
  174. @cached_property
  175. def num_outputs(self):
  176. return self.archiveresult_set.filter(status='succeeded').count()
  177. @cached_property
  178. def url_hash(self):
  179. return self.as_link().url_hash
  180. @cached_property
  181. def base_url(self) -> str:
  182. from ..util import base_url
  183. return base_url(self.url)
  184. @cached_property
  185. def snapshot_dir(self):
  186. from ..config import CONFIG
  187. return Path(CONFIG['ARCHIVE_DIR']) / self.timestamp
  188. @cached_property
  189. def archive_path(self):
  190. from ..config import ARCHIVE_DIR_NAME
  191. return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
  192. @cached_property
  193. def archive_size(self) -> float:
  194. try:
  195. return get_dir_size(self.archive_path)[0]
  196. except Exception:
  197. return 0
  198. @cached_property
  199. def history(self):
  200. # TODO: use ArchiveResult for this instead of json
  201. return self.as_link_with_details().history
  202. @cached_property
  203. def latest_title(self):
  204. if ('title' in self.history
  205. and self.history['title']
  206. and (self.history['title'][-1].status == 'succeeded')
  207. and self.history['title'][-1].output.strip()):
  208. return self.history['title'][-1].output.strip()
  209. return None
  210. @cached_property
  211. def domain(self) -> str:
  212. from ..util import domain
  213. return domain(self.url)
  214. @cached_property
  215. def is_static(self) -> bool:
  216. from ..util import is_static_file
  217. return is_static_file(self.url)
  218. @cached_property
  219. def details(self) -> Dict:
  220. # TODO: Define what details are, and return them accordingly
  221. return {"history": {}}
  222. @property
  223. def extension(self) -> str:
  224. from ..util import extension
  225. return extension(self.url)
  226. def canonical_outputs(self) -> Dict[str, Optional[str]]:
  227. """predict the expected output paths that should be present after archiving"""
  228. from ..extractors.wget import wget_output_path
  229. canonical = {
  230. 'index_path': 'index.html',
  231. 'favicon_path': 'favicon.ico',
  232. 'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
  233. 'wget_path': wget_output_path(self),
  234. 'warc_path': 'warc',
  235. 'singlefile_path': 'singlefile.html',
  236. 'readability_path': 'readability/content.html',
  237. 'mercury_path': 'mercury/content.html',
  238. 'pdf_path': 'output.pdf',
  239. 'screenshot_path': 'screenshot.png',
  240. 'dom_path': 'output.html',
  241. 'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
  242. 'git_path': 'git',
  243. 'media_path': 'media',
  244. }
  245. if self.is_static:
  246. # static binary files like PDF and images are handled slightly differently.
  247. # they're just downloaded once and aren't archived separately multiple times,
  248. # so the wget, screenshot, & pdf urls should all point to the same file
  249. static_path = wget_output_path(self)
  250. canonical.update({
  251. 'title': self.basename,
  252. 'wget_path': static_path,
  253. 'pdf_path': static_path,
  254. 'screenshot_path': static_path,
  255. 'dom_path': static_path,
  256. 'singlefile_path': static_path,
  257. 'readability_path': static_path,
  258. 'mercury_path': static_path,
  259. })
  260. return canonical
  261. def save_tags(self, tags=()):
  262. tags_id = []
  263. for tag in tags:
  264. tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
  265. self.tags.clear()
  266. self.tags.add(*tags_id)
  267. class ArchiveResultManager(models.Manager):
  268. def indexable(self, sorted: bool = True):
  269. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
  270. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  271. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
  272. if sorted:
  273. precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  274. qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
  275. return qs
  276. class ArchiveResult(models.Model):
  277. snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
  278. cmd = JSONField()
  279. pwd = models.CharField(max_length=256)
  280. cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True)
  281. output = models.CharField(max_length=512)
  282. start_ts = models.DateTimeField()
  283. end_ts = models.DateTimeField()
  284. status = models.CharField(max_length=16, choices=STATUS_CHOICES)
  285. extractor = models.CharField(choices=EXTRACTORS, max_length=32)
  286. objects = ArchiveResultManager()
  287. def __str__(self):
  288. return self.extractor
  289. class Meta:
  290. ordering = ["-start_ts"]