models.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. __package__ = 'archivebox.core'
  2. import uuid
  3. from django.db import models
  4. from django.utils.functional import cached_property
  5. from django.utils.text import slugify
  6. from django.core.cache import cache
  7. from django.db.models import Case, When, Value, IntegerField
  8. from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
  9. from ..system import get_dir_size
  10. from ..util import parse_date, base_url, hashurl
  11. from ..index.schema import Link
  12. from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
  # Django ``choices`` pairs (stored value, display label) for the extractor
  # column; both halves are the first element of each default archive method.
  EXTRACTORS = [(method[0],) * 2 for method in get_default_archive_methods()]

  # Django ``choices`` pairs for the result status column (value == label).
  STATUS_CHOICES = [
      (status, status)
      for status in ("succeeded", "failed", "skipped")
  ]

  # Use the JSONField that ships on ``django.db.models`` when it exists,
  # otherwise fall back to the third-party ``jsonfield`` package.
  if hasattr(models, 'JSONField'):
      JSONField = models.JSONField
  else:
      import jsonfield
      JSONField = jsonfield.JSONField
  class Tag(models.Model):
      """
      A uniquely named tag with an auto-generated unique slug.

      Based on the django-taggit model.
      """

      name = models.CharField(unique=True, blank=False, max_length=100)
      # The slug is derived from ``name`` the first time the tag is saved;
      # never set it manually.
      slug = models.SlugField(unique=True, blank=True, max_length=100)

      class Meta:
          verbose_name = "Tag"
          verbose_name_plural = "Tags"

      def __str__(self):
          return self.name

      def slugify(self, tag, i=None):
          """Return the slug for ``tag``, suffixed with ``_<i>`` when ``i`` is given."""
          base = slugify(tag)
          if i is None:
              return base
          return base + "_%d" % i

      def save(self, *args, **kwargs):
          """
          Save the tag, generating a unique slug for new rows that have none.

          Existing rows, and new rows that already carry a slug, are saved
          unchanged.
          """
          needs_slug = self._state.adding and not self.slug
          if not needs_slug:
              return super().save(*args, **kwargs)

          self.slug = self.slugify(self.name)

          # If the name is different but its slug conflicts with another tag's
          # slug, append a counter (_1, _2, ...) until an unused slug is found.
          # NOTE: this lookup and the save below are not wrapped in a transaction.
          taken = set(
              type(self)
              ._default_manager.filter(slug__startswith=self.slug)
              .values_list("slug", flat=True)
          )
          candidate = self.slugify(self.name, None)
          counter = 0
          while candidate in taken:
              counter += 1
              candidate = self.slugify(self.name, counter)

          self.slug = candidate
          return super().save(*args, **kwargs)
  class Snapshot(models.Model):
      """
      One archived URL, identified by a UUID primary key.

      ``url`` and ``timestamp`` are both unique. ``timestamp`` is also used as
      the name of the snapshot's directory under ``ARCHIVE_DIR``.
      """

      id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

      url = models.URLField(unique=True)
      timestamp = models.CharField(max_length=32, unique=True, db_index=True)

      title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

      added = models.DateTimeField(auto_now_add=True, db_index=True)
      updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
      tags = models.ManyToManyField(Tag, blank=True)

      # Attribute names exchanged with the JSON representation
      # (see from_json / as_json).
      keys = ('url', 'timestamp', 'title', 'tags', 'updated')

      def __repr__(self) -> str:
          # Same text as __str__; kept as a single implementation.
          return self.__str__()

      def __str__(self) -> str:
          title = self.title or '-'
          return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'

      @classmethod
      def from_json(cls, info: dict):
          """
          Build an unsaved Snapshot from ``info``, ignoring keys not in ``cls.keys``.

          NOTE(review): 'tags' is in ``keys`` and is forwarded to the model
          constructor as-is; callers presumably remove it first - confirm.
          """
          info = {k: v for k, v in info.items() if k in cls.keys}
          return cls(**info)

      def as_json(self, *args) -> dict:
          """
          Return a dict of the requested attribute names (default: ``self.keys``).

          'tags' is rendered through ``tags_str()``; every other key is read
          with ``getattr``.
          """
          args = args or self.keys
          return {
              key: (self.tags_str() if key == 'tags' else getattr(self, key))
              for key in args
          }

      def as_link(self) -> Link:
          """Convert this snapshot to a ``Link`` via its JSON representation."""
          return Link.from_json(self.as_json())

      def as_link_with_details(self) -> Link:
          """Return ``as_link()`` passed through ``load_link_details``."""
          # Imported here rather than at module level (presumably to avoid a
          # circular import - confirm).
          from ..index import load_link_details
          return load_link_details(self.as_link())

      def _tags_cache_key(self) -> str:
          """Cache key under which ``tags_str()`` stores its result."""
          return f'{self.id}-{(self.updated or self.added).timestamp()}-tags'

      def tags_str(self) -> str:
          """
          Return the tag names, sorted by name and joined with ','.

          The result is cached under a key built from the snapshot id and its
          ``updated`` (or ``added``) timestamp.
          """
          calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
          return cache.get_or_set(self._tags_cache_key(), calc_tags_str)

      @cached_property
      def bookmarked(self):
          return parse_date(self.timestamp)

      @cached_property
      def bookmarked_date(self):
          # TODO: remove this
          return self.bookmarked

      @cached_property
      def is_archived(self):
          return self.as_link().is_archived

      @cached_property
      def num_outputs(self):
          """Number of related ArchiveResults whose status is 'succeeded'."""
          return self.archiveresult_set.filter(status='succeeded').count()

      @cached_property
      def url_hash(self):
          return hashurl(self.url)

      @cached_property
      def base_url(self):
          return base_url(self.url)

      @cached_property
      def link_dir(self):
          """Path of this snapshot's directory under ``ARCHIVE_DIR``, as a string."""
          return str(ARCHIVE_DIR / self.timestamp)

      @cached_property
      def archive_path(self):
          return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

      @cached_property
      def archive_size(self):
          """
          Size of ``link_dir`` as reported by ``get_dir_size`` (0 on any error).

          Cached under a key built from the id prefix and the ``updated``
          (or ``added``) timestamp.
          """
          cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'

          def calc_dir_size():
              # Best effort: a directory that cannot be measured counts as empty.
              try:
                  return get_dir_size(self.link_dir)[0]
              except Exception:
                  return 0

          return cache.get_or_set(cache_key, calc_dir_size)

      @cached_property
      def history(self):
          # TODO: use ArchiveResult for this instead of json
          return self.as_link_with_details().history

      @cached_property
      def latest_title(self):
          """
          Return the best available title, or None when there is none.

          Order of preference: the stored ``title``, then the longest
          successful 'title' ArchiveResult output in the database, then the
          longest successful non-blank title in the Link's JSON history.
          """
          if self.title:
              return self.title   # whoopdedoo that was easy

          try:
              # take longest successful title from ArchiveResult db history
              db_titles = (
                  self.archiveresult_set
                  .filter(extractor='title', status='succeeded', output__isnull=False)
                  .values_list('output', flat=True)
              )
              return sorted(db_titles, key=len)[-1]
          except IndexError:
              # no matching rows: fall through to the JSON history
              pass

          try:
              # take longest successful title from Link json index file history
              json_titles = (
                  result.output.strip()
                  for result in self.history['title']
                  if result.status == 'succeeded' and result.output.strip()
              )
              return sorted(json_titles, key=len)[-1]
          except (KeyError, IndexError):
              # no 'title' entry in the history, or no usable title in it
              pass

          return None

      def save_tags(self, tags=()):
          """
          Replace this snapshot's tags with the given tag names.

          Names are stripped of surrounding whitespace before the Tag lookup,
          so " foo" and "foo" resolve to the same Tag. Blank names are skipped
          and missing tags are created.
          """
          # De-duplicate while keeping order so each name is looked up once.
          names = dict.fromkeys(tag.strip() for tag in tags)
          tag_ids = [
              Tag.objects.get_or_create(name=name)[0].id
              for name in names
              if name
          ]
          self.tags.clear()
          self.tags.add(*tag_ids)

          # The tags_str() cache key only depends on id and updated/added, and
          # neither changes here, so drop the cached value explicitly.
          if self.updated or self.added:
              cache.delete(self._tags_cache_key())
  class ArchiveResultManager(models.Manager):
      """Manager adding an ``indexable()`` query over ArchiveResults."""

      def indexable(self, sorted: bool = True):
          """
          Return succeeded results whose extractor is listed in
          ``ARCHIVE_METHODS_INDEXING_PRECEDENCE``.

          When ``sorted`` is true the queryset is annotated with
          ``indexing_precedence`` (the value paired with each extractor,
          1000 as the default) and ordered ascending by it.
          """
          indexable_names = [entry[0] for entry in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
          results = self.get_queryset().filter(
              extractor__in=indexable_names,
              status='succeeded',
          )
          if not sorted:
              return results

          rank_cases = [
              When(extractor=name, then=Value(rank))
              for name, rank in ARCHIVE_METHODS_INDEXING_PRECEDENCE
          ]
          ranking = Case(*rank_cases, default=Value(1000), output_field=IntegerField())
          return results.annotate(indexing_precedence=ranking).order_by('indexing_precedence')
  class ArchiveResult(models.Model):
      """The recorded outcome of one extractor run for a Snapshot."""

      # Explicit integer primary key, plus a separate random UUID per row.
      id = models.AutoField(
          auto_created=True,
          primary_key=True,
          serialize=False,
          verbose_name='ID',
      )
      uuid = models.UUIDField(default=uuid.uuid4, editable=False)

      # Deleting the Snapshot deletes its results.
      snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
      extractor = models.CharField(choices=EXTRACTORS, max_length=32)

      # Presumably the command that was run, its working directory and the
      # tool version - confirm against the code that writes these rows.
      cmd = JSONField()
      pwd = models.CharField(max_length=256)
      cmd_version = models.CharField(
          max_length=128,
          default=None,
          null=True,
          blank=True,
      )
      output = models.CharField(max_length=1024)

      start_ts = models.DateTimeField(db_index=True)
      end_ts = models.DateTimeField()
      status = models.CharField(max_length=16, choices=STATUS_CHOICES)

      objects = ArchiveResultManager()

      def __str__(self):
          return self.extractor