models.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, List, Dict
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import json
  5. import random
  6. import uuid
  7. from uuid import uuid4
  8. from pathlib import Path
  9. from django.db import models
  10. from django.utils.functional import cached_property
  11. from django.utils.text import slugify
  12. from django.core.cache import cache
  13. from django.urls import reverse, reverse_lazy
  14. from django.db.models import Case, When, Value, IntegerField
  15. from django.contrib.auth.models import User # noqa
  16. from abid_utils.models import ABIDModel, ABIDField
  17. from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
  18. from ..system import get_dir_size
  19. from ..util import parse_date, base_url
  20. from ..index.schema import Link
  21. from ..index.html import snapshot_icons
  22. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  23. EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
  24. STATUS_CHOICES = [
  25. ("succeeded", "succeeded"),
  26. ("failed", "failed"),
  27. ("skipped", "skipped")
  28. ]
  29. # class BaseModel(models.Model):
  30. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  31. # # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
  32. # #
  33. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  34. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  35. # class Meta(TypedModelMeta):
  36. # abstract = True
  37. class Tag(ABIDModel):
  38. """
  39. Based on django-taggit model + ABID base.
  40. """
  41. abid_prefix = 'tag_'
  42. abid_ts_src = 'self.created' # TODO: add created/modified time
  43. abid_uri_src = 'self.name'
  44. abid_subtype_src = '"03"'
  45. abid_rand_src = 'self.id'
  46. # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
  47. id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  48. uuid = models.UUIDField(default=uuid.uuid4, editable=True, unique=True)
  49. abid = ABIDField(prefix=abid_prefix)
  50. name = models.CharField(unique=True, blank=False, max_length=100)
  51. slug = models.SlugField(unique=True, blank=True, max_length=100)
  52. # slug is autoset on save from name, never set it manually
  53. class Meta(TypedModelMeta):
  54. verbose_name = "Tag"
  55. verbose_name_plural = "Tags"
  56. def __str__(self):
  57. return self.name
  58. def slugify(self, tag, i=None):
  59. slug = slugify(tag)
  60. if i is not None:
  61. slug += "_%d" % i
  62. return slug
  63. def save(self, *args, **kwargs):
  64. if self._state.adding and not self.slug:
  65. self.slug = self.slugify(self.name)
  66. # if name is different but slug conficts with another tags slug, append a counter
  67. # with transaction.atomic():
  68. slugs = set(
  69. type(self)
  70. ._default_manager.filter(slug__startswith=self.slug)
  71. .values_list("slug", flat=True)
  72. )
  73. i = None
  74. while True:
  75. slug = self.slugify(self.name, i)
  76. if slug not in slugs:
  77. self.slug = slug
  78. return super().save(*args, **kwargs)
  79. i = 1 if i is None else i+1
  80. else:
  81. return super().save(*args, **kwargs)
  82. @property
  83. def api_url(self) -> str:
  84. # /api/v1/core/snapshot/{uulid}
  85. return reverse_lazy('api-1:get_tag', args=[self.abid])
  86. @property
  87. def api_docs_url(self) -> str:
  88. return f'/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
class SnapshotTag(models.Model):
    # Row pairing one Snapshot with one Tag.
    # NOTE(review): presumably meant as the join table for Snapshot.tags, but no
    # `through=` pointing at this model is visible in this file — confirm.

    # `snapshot` is both the primary key and a one-to-one link, so this table can
    # hold at most one row per Snapshot; the row is deleted along with its Snapshot.
    snapshot = models.OneToOneField('Snapshot', primary_key=True, on_delete=models.CASCADE, to_field='id')
    # The row is deleted along with its Tag.
    tag = models.ForeignKey(Tag, on_delete=models.CASCADE, to_field='id')
  92. class Snapshot(ABIDModel):
  93. abid_prefix = 'snp_'
  94. abid_ts_src = 'self.added'
  95. abid_uri_src = 'self.url'
  96. abid_subtype_src = '"01"'
  97. abid_rand_src = 'self.old_id'
  98. old_id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk
  99. id = models.UUIDField(default=uuid.uuid4, editable=True, unique=True)
  100. abid = ABIDField(prefix=abid_prefix)
  101. url = models.URLField(unique=True, db_index=True)
  102. timestamp = models.CharField(max_length=32, unique=True, db_index=True)
  103. title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
  104. added = models.DateTimeField(auto_now_add=True, db_index=True)
  105. updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
  106. tags = models.ManyToManyField(Tag, blank=True)
  107. keys = ('url', 'timestamp', 'title', 'tags', 'updated')
  108. def __repr__(self) -> str:
  109. title = self.title or '-'
  110. return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
  111. def __str__(self) -> str:
  112. title = self.title or '-'
  113. return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
  114. @classmethod
  115. def from_json(cls, info: dict):
  116. info = {k: v for k, v in info.items() if k in cls.keys}
  117. return cls(**info)
  118. def as_json(self, *args) -> dict:
  119. args = args or self.keys
  120. return {
  121. key: getattr(self, key)
  122. if key != 'tags' else self.tags_str()
  123. for key in args
  124. }
  125. def as_link(self) -> Link:
  126. return Link.from_json(self.as_json())
  127. def as_link_with_details(self) -> Link:
  128. from ..index import load_link_details
  129. return load_link_details(self.as_link())
  130. def tags_str(self, nocache=True) -> str | None:
  131. cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
  132. calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
  133. if nocache:
  134. tags_str = calc_tags_str()
  135. cache.set(cache_key, tags_str)
  136. return tags_str
  137. return cache.get_or_set(cache_key, calc_tags_str)
  138. def icons(self) -> str:
  139. return snapshot_icons(self)
  140. @property
  141. def api_url(self) -> str:
  142. # /api/v1/core/snapshot/{uulid}
  143. return reverse_lazy('api-1:get_snapshot', args=[self.abid])
  144. @property
  145. def api_docs_url(self) -> str:
  146. return f'/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'
  147. @cached_property
  148. def extension(self) -> str:
  149. from ..util import extension
  150. return extension(self.url)
  151. @cached_property
  152. def bookmarked(self):
  153. return parse_date(self.timestamp)
  154. @cached_property
  155. def bookmarked_date(self):
  156. # TODO: remove this
  157. return self.bookmarked
  158. @cached_property
  159. def is_archived(self):
  160. return self.as_link().is_archived
  161. @cached_property
  162. def num_outputs(self) -> int:
  163. return self.archiveresult_set.filter(status='succeeded').count()
  164. @cached_property
  165. def base_url(self):
  166. return base_url(self.url)
  167. @cached_property
  168. def link_dir(self):
  169. return str(ARCHIVE_DIR / self.timestamp)
  170. @cached_property
  171. def archive_path(self):
  172. return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
  173. @cached_property
  174. def archive_size(self):
  175. cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'
  176. def calc_dir_size():
  177. try:
  178. return get_dir_size(self.link_dir)[0]
  179. except Exception:
  180. return 0
  181. return cache.get_or_set(cache_key, calc_dir_size)
  182. @cached_property
  183. def thumbnail_url(self) -> Optional[str]:
  184. result = self.archiveresult_set.filter(
  185. extractor='screenshot',
  186. status='succeeded'
  187. ).only('output').last()
  188. if result:
  189. return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
  190. return None
  191. @cached_property
  192. def headers(self) -> Optional[Dict[str, str]]:
  193. try:
  194. return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
  195. except Exception:
  196. pass
  197. return None
  198. @cached_property
  199. def status_code(self) -> Optional[str]:
  200. return self.headers and self.headers.get('Status-Code')
  201. @cached_property
  202. def history(self) -> dict:
  203. # TODO: use ArchiveResult for this instead of json
  204. return self.as_link_with_details().history
  205. @cached_property
  206. def latest_title(self) -> Optional[str]:
  207. if self.title:
  208. return self.title # whoopdedoo that was easy
  209. try:
  210. # take longest successful title from ArchiveResult db history
  211. return sorted(
  212. self.archiveresult_set\
  213. .filter(extractor='title', status='succeeded', output__isnull=False)\
  214. .values_list('output', flat=True),
  215. key=lambda r: len(r),
  216. )[-1]
  217. except IndexError:
  218. pass
  219. try:
  220. # take longest successful title from Link json index file history
  221. return sorted(
  222. (
  223. result.output.strip()
  224. for result in self.history['title']
  225. if result.status == 'succeeded' and result.output.strip()
  226. ),
  227. key=lambda r: len(r),
  228. )[-1]
  229. except (KeyError, IndexError):
  230. pass
  231. return None
  232. def save_tags(self, tags: List[str]=()) -> None:
  233. tags_id = []
  234. for tag in tags:
  235. if tag.strip():
  236. tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
  237. self.tags.clear()
  238. self.tags.add(*tags_id)
  239. # def get_storage_dir(self, create=True, symlink=True) -> Path:
  240. # date_str = self.added.strftime('%Y%m%d')
  241. # domain_str = domain(self.url)
  242. # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
  243. # if create and not abs_storage_dir.is_dir():
  244. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  245. # if symlink:
  246. # LINK_PATHS = [
  247. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  248. # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
  249. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
  250. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
  251. # ]
  252. # for link_path in LINK_PATHS:
  253. # link_path.parent.mkdir(parents=True, exist_ok=True)
  254. # try:
  255. # link_path.symlink_to(abs_storage_dir)
  256. # except FileExistsError:
  257. # link_path.unlink()
  258. # link_path.symlink_to(abs_storage_dir)
  259. # return abs_storage_dir
  260. class ArchiveResultManager(models.Manager):
  261. def indexable(self, sorted: bool = True):
  262. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  263. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
  264. if sorted:
  265. precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  266. qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
  267. return qs
  268. def rand_int_id():
  269. return random.getrandbits(32)
  270. class ArchiveResult(ABIDModel):
  271. abid_prefix = 'res_'
  272. abid_ts_src = 'self.snapshot.added'
  273. abid_uri_src = 'self.snapshot.url'
  274. abid_subtype_src = 'self.extractor'
  275. abid_rand_src = 'self.id'
  276. EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
  277. old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')
  278. id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID')
  279. abid = ABIDField(prefix=abid_prefix)
  280. snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id')
  281. extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
  282. cmd = models.JSONField()
  283. pwd = models.CharField(max_length=256)
  284. cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
  285. output = models.CharField(max_length=1024)
  286. start_ts = models.DateTimeField(db_index=True)
  287. end_ts = models.DateTimeField()
  288. status = models.CharField(max_length=16, choices=STATUS_CHOICES)
  289. objects = ArchiveResultManager()
  290. class Meta(TypedModelMeta):
  291. verbose_name = 'Result'
  292. def __str__(self):
  293. return self.extractor
  294. def save(self, *args, **kwargs):
  295. super().save(*args, **kwargs)
  296. assert str(self.id) == str(self.abid.uuid)
  297. @cached_property
  298. def snapshot_dir(self):
  299. return Path(self.snapshot.link_dir)
  300. @property
  301. def api_url(self) -> str:
  302. # /api/v1/core/archiveresult/{uulid}
  303. return reverse_lazy('api-1:get_archiveresult', args=[self.abid])
  304. @property
  305. def api_docs_url(self) -> str:
  306. return f'/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'
  307. @property
  308. def extractor_module(self):
  309. return EXTRACTORS[self.extractor]
  310. def output_path(self) -> str:
  311. """return the canonical output filename or directory name within the snapshot dir"""
  312. return self.extractor_module.get_output_path()
  313. def embed_path(self) -> str:
  314. """
  315. return the actual runtime-calculated path to the file on-disk that
  316. should be used for user-facing iframe embeds of this result
  317. """
  318. if hasattr(self.extractor_module, 'get_embed_path'):
  319. return self.extractor_module.get_embed_path(self)
  320. return self.extractor_module.get_output_path()
  321. def legacy_output_path(self):
  322. link = self.snapshot.as_link()
  323. return link.canonical_outputs().get(f'{self.extractor}_path')
  324. def output_exists(self) -> bool:
  325. return Path(self.output_path()).exists()
  326. # def get_storage_dir(self, create=True, symlink=True):
  327. # date_str = self.snapshot.added.strftime('%Y%m%d')
  328. # domain_str = domain(self.snapshot.url)
  329. # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
  330. # if create and not abs_storage_dir.is_dir():
  331. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  332. # if symlink:
  333. # LINK_PATHS = [
  334. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  335. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
  336. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
  337. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
  338. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
  339. # ]
  340. # for link_path in LINK_PATHS:
  341. # link_path.parent.mkdir(parents=True, exist_ok=True)
  342. # try:
  343. # link_path.symlink_to(abs_storage_dir)
  344. # except FileExistsError:
  345. # link_path.unlink()
  346. # link_path.symlink_to(abs_storage_dir)
  347. # return abs_storage_dir
  348. # def symlink_index(self, create=True):
  349. # abs_result_dir = self.get_storage_dir(create=create)