models.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, List, Dict
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import json
  5. import uuid
  6. from uuid import uuid4
  7. from pathlib import Path
  8. from django.db import models
  9. from django.utils.functional import cached_property
  10. from django.utils.text import slugify
  11. from django.core.cache import cache
  12. from django.urls import reverse
  13. from django.db.models import Case, When, Value, IntegerField
  14. from django.contrib.auth.models import User # noqa
  15. from abid_utils.models import ABIDModel, ABIDField
  16. from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
  17. from ..system import get_dir_size
  18. from ..util import parse_date, base_url
  19. from ..index.schema import Link
  20. from ..index.html import snapshot_icons
  21. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  22. EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
  23. STATUS_CHOICES = [
  24. ("succeeded", "succeeded"),
  25. ("failed", "failed"),
  26. ("skipped", "skipped")
  27. ]
  28. # class BaseModel(models.Model):
  29. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  30. # # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
  31. # #
  32. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  33. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  34. # class Meta(TypedModelMeta):
  35. # abstract = True
  36. class Tag(ABIDModel):
  37. """
  38. Based on django-taggit model + ABID base.
  39. """
  40. abid_prefix = 'tag_'
  41. abid_ts_src = 'self.created' # TODO: add created/modified time
  42. abid_uri_src = 'self.name'
  43. abid_subtype_src = '"03"'
  44. abid_rand_src = 'self.id'
  45. # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
  46. id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  47. uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
  48. abid = ABIDField(prefix=abid_prefix)
  49. name = models.CharField(unique=True, blank=False, max_length=100)
  50. slug = models.SlugField(unique=True, blank=True, max_length=100)
  51. # slug is autoset on save from name, never set it manually
  52. class Meta(TypedModelMeta):
  53. verbose_name = "Tag"
  54. verbose_name_plural = "Tags"
  55. def __str__(self):
  56. return self.name
  57. def slugify(self, tag, i=None):
  58. slug = slugify(tag)
  59. if i is not None:
  60. slug += "_%d" % i
  61. return slug
  62. def save(self, *args, **kwargs):
  63. if self._state.adding and not self.slug:
  64. self.slug = self.slugify(self.name)
  65. # if name is different but slug conficts with another tags slug, append a counter
  66. # with transaction.atomic():
  67. slugs = set(
  68. type(self)
  69. ._default_manager.filter(slug__startswith=self.slug)
  70. .values_list("slug", flat=True)
  71. )
  72. i = None
  73. while True:
  74. slug = self.slugify(self.name, i)
  75. if slug not in slugs:
  76. self.slug = slug
  77. return super().save(*args, **kwargs)
  78. i = 1 if i is None else i+1
  79. else:
  80. return super().save(*args, **kwargs)
  81. class Snapshot(ABIDModel):
  82. abid_prefix = 'snp_'
  83. abid_ts_src = 'self.added'
  84. abid_uri_src = 'self.url'
  85. abid_subtype_src = '"01"'
  86. abid_rand_src = 'self.id'
  87. id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk
  88. uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
  89. abid = ABIDField(prefix=abid_prefix)
  90. url = models.URLField(unique=True, db_index=True)
  91. timestamp = models.CharField(max_length=32, unique=True, db_index=True)
  92. title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
  93. added = models.DateTimeField(auto_now_add=True, db_index=True)
  94. updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
  95. tags = models.ManyToManyField(Tag, blank=True)
  96. keys = ('url', 'timestamp', 'title', 'tags', 'updated')
  97. def __repr__(self) -> str:
  98. title = self.title or '-'
  99. return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
  100. def __str__(self) -> str:
  101. title = self.title or '-'
  102. return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
  103. @classmethod
  104. def from_json(cls, info: dict):
  105. info = {k: v for k, v in info.items() if k in cls.keys}
  106. return cls(**info)
  107. def as_json(self, *args) -> dict:
  108. args = args or self.keys
  109. return {
  110. key: getattr(self, key)
  111. if key != 'tags' else self.tags_str()
  112. for key in args
  113. }
  114. def as_link(self) -> Link:
  115. return Link.from_json(self.as_json())
  116. def as_link_with_details(self) -> Link:
  117. from ..index import load_link_details
  118. return load_link_details(self.as_link())
  119. def tags_str(self, nocache=True) -> str | None:
  120. cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
  121. calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
  122. if nocache:
  123. tags_str = calc_tags_str()
  124. cache.set(cache_key, tags_str)
  125. return tags_str
  126. return cache.get_or_set(cache_key, calc_tags_str)
  127. def icons(self) -> str:
  128. return snapshot_icons(self)
  129. @cached_property
  130. def extension(self) -> str:
  131. from ..util import extension
  132. return extension(self.url)
  133. @cached_property
  134. def bookmarked(self):
  135. return parse_date(self.timestamp)
  136. @cached_property
  137. def bookmarked_date(self):
  138. # TODO: remove this
  139. return self.bookmarked
  140. @cached_property
  141. def is_archived(self):
  142. return self.as_link().is_archived
  143. @cached_property
  144. def num_outputs(self) -> int:
  145. return self.archiveresult_set.filter(status='succeeded').count()
  146. @cached_property
  147. def base_url(self):
  148. return base_url(self.url)
  149. @cached_property
  150. def link_dir(self):
  151. return str(ARCHIVE_DIR / self.timestamp)
  152. @cached_property
  153. def archive_path(self):
  154. return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
  155. @cached_property
  156. def archive_size(self):
  157. cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'
  158. def calc_dir_size():
  159. try:
  160. return get_dir_size(self.link_dir)[0]
  161. except Exception:
  162. return 0
  163. return cache.get_or_set(cache_key, calc_dir_size)
  164. @cached_property
  165. def thumbnail_url(self) -> Optional[str]:
  166. result = self.archiveresult_set.filter(
  167. extractor='screenshot',
  168. status='succeeded'
  169. ).only('output').last()
  170. if result:
  171. return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
  172. return None
  173. @cached_property
  174. def headers(self) -> Optional[Dict[str, str]]:
  175. try:
  176. return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
  177. except Exception:
  178. pass
  179. return None
  180. @cached_property
  181. def status_code(self) -> Optional[str]:
  182. return self.headers and self.headers.get('Status-Code')
  183. @cached_property
  184. def history(self) -> dict:
  185. # TODO: use ArchiveResult for this instead of json
  186. return self.as_link_with_details().history
  187. @cached_property
  188. def latest_title(self) -> Optional[str]:
  189. if self.title:
  190. return self.title # whoopdedoo that was easy
  191. try:
  192. # take longest successful title from ArchiveResult db history
  193. return sorted(
  194. self.archiveresult_set\
  195. .filter(extractor='title', status='succeeded', output__isnull=False)\
  196. .values_list('output', flat=True),
  197. key=lambda r: len(r),
  198. )[-1]
  199. except IndexError:
  200. pass
  201. try:
  202. # take longest successful title from Link json index file history
  203. return sorted(
  204. (
  205. result.output.strip()
  206. for result in self.history['title']
  207. if result.status == 'succeeded' and result.output.strip()
  208. ),
  209. key=lambda r: len(r),
  210. )[-1]
  211. except (KeyError, IndexError):
  212. pass
  213. return None
  214. def save_tags(self, tags: List[str]=()) -> None:
  215. tags_id = []
  216. for tag in tags:
  217. if tag.strip():
  218. tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
  219. self.tags.clear()
  220. self.tags.add(*tags_id)
  221. # def get_storage_dir(self, create=True, symlink=True) -> Path:
  222. # date_str = self.added.strftime('%Y%m%d')
  223. # domain_str = domain(self.url)
  224. # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
  225. # if create and not abs_storage_dir.is_dir():
  226. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  227. # if symlink:
  228. # LINK_PATHS = [
  229. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  230. # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
  231. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
  232. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
  233. # ]
  234. # for link_path in LINK_PATHS:
  235. # link_path.parent.mkdir(parents=True, exist_ok=True)
  236. # try:
  237. # link_path.symlink_to(abs_storage_dir)
  238. # except FileExistsError:
  239. # link_path.unlink()
  240. # link_path.symlink_to(abs_storage_dir)
  241. # return abs_storage_dir
  242. class ArchiveResultManager(models.Manager):
  243. def indexable(self, sorted: bool = True):
  244. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  245. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
  246. if sorted:
  247. precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  248. qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
  249. return qs
  250. class ArchiveResult(ABIDModel):
  251. abid_prefix = 'res_'
  252. abid_ts_src = 'self.snapshot.added'
  253. abid_uri_src = 'self.snapshot.url'
  254. abid_subtype_src = 'self.extractor'
  255. abid_rand_src = 'self.uuid'
  256. EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
  257. # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
  258. id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk
  259. uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
  260. abid = ABIDField(prefix=abid_prefix)
  261. snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
  262. extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
  263. cmd = models.JSONField()
  264. pwd = models.CharField(max_length=256)
  265. cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
  266. output = models.CharField(max_length=1024)
  267. start_ts = models.DateTimeField(db_index=True)
  268. end_ts = models.DateTimeField()
  269. status = models.CharField(max_length=16, choices=STATUS_CHOICES)
  270. objects = ArchiveResultManager()
  271. class Meta(TypedModelMeta):
  272. verbose_name = 'Result'
  273. def __str__(self):
  274. return self.extractor
  275. @cached_property
  276. def snapshot_dir(self):
  277. return Path(self.snapshot.link_dir)
  278. @property
  279. def extractor_module(self):
  280. return EXTRACTORS[self.extractor]
  281. def output_path(self) -> str:
  282. """return the canonical output filename or directory name within the snapshot dir"""
  283. return self.extractor_module.get_output_path()
  284. def embed_path(self) -> str:
  285. """
  286. return the actual runtime-calculated path to the file on-disk that
  287. should be used for user-facing iframe embeds of this result
  288. """
  289. if hasattr(self.extractor_module, 'get_embed_path'):
  290. return self.extractor_module.get_embed_path(self)
  291. return self.extractor_module.get_output_path()
  292. def legacy_output_path(self):
  293. link = self.snapshot.as_link()
  294. return link.canonical_outputs().get(f'{self.extractor}_path')
  295. def output_exists(self) -> bool:
  296. return Path(self.output_path()).exists()
  297. # def get_storage_dir(self, create=True, symlink=True):
  298. # date_str = self.snapshot.added.strftime('%Y%m%d')
  299. # domain_str = domain(self.snapshot.url)
  300. # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
  301. # if create and not abs_storage_dir.is_dir():
  302. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  303. # if symlink:
  304. # LINK_PATHS = [
  305. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  306. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
  307. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
  308. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
  309. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
  310. # ]
  311. # for link_path in LINK_PATHS:
  312. # link_path.parent.mkdir(parents=True, exist_ok=True)
  313. # try:
  314. # link_path.symlink_to(abs_storage_dir)
  315. # except FileExistsError:
  316. # link_path.unlink()
  317. # link_path.symlink_to(abs_storage_dir)
  318. # return abs_storage_dir
  319. # def symlink_index(self, create=True):
  320. # abs_result_dir = self.get_storage_dir(create=create)