models.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, List, Dict
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import json
  5. import random
  6. import uuid
  7. from uuid import uuid4
  8. from pathlib import Path
  9. from django.db import models
  10. from django.utils.functional import cached_property
  11. from django.utils.text import slugify
  12. from django.core.cache import cache
  13. from django.urls import reverse, reverse_lazy
  14. from django.db.models import Case, When, Value, IntegerField
  15. from abid_utils.models import ABIDModel, ABIDField
  16. from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
  17. from ..system import get_dir_size
  18. from ..util import parse_date, base_url
  19. from ..index.schema import Link
  20. from ..index.html import snapshot_icons
  21. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  22. EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
  23. STATUS_CHOICES = [
  24. ("succeeded", "succeeded"),
  25. ("failed", "failed"),
  26. ("skipped", "skipped")
  27. ]
  28. def rand_int_id():
  29. return random.getrandbits(32)
  30. # class BaseModel(models.Model):
  31. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  32. # # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
  33. # #
  34. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  35. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  36. # class Meta(TypedModelMeta):
  37. # abstract = True
  38. class Tag(ABIDModel):
  39. """
  40. Based on django-taggit model + ABID base.
  41. """
  42. abid_prefix = 'tag_'
  43. abid_ts_src = 'self.created' # TODO: add created/modified time
  44. abid_uri_src = 'self.slug'
  45. abid_subtype_src = '"03"'
  46. abid_rand_src = 'self.old_id'
  47. old_id = models.BigIntegerField(unique=True, default=rand_int_id, serialize=False, verbose_name='Old ID') # legacy PK
  48. id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
  49. abid = ABIDField(prefix=abid_prefix)
  50. name = models.CharField(unique=True, blank=False, max_length=100)
  51. slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
  52. # slug is autoset on save from name, never set it manually
  53. class Meta(TypedModelMeta):
  54. verbose_name = "Tag"
  55. verbose_name_plural = "Tags"
  56. def __str__(self):
  57. return self.name
  58. # @property
  59. # def old_id(self):
  60. # return self.id
  61. def slugify(self, tag, i=None):
  62. slug = slugify(tag)
  63. if i is not None:
  64. slug += "_%d" % i
  65. return slug
  66. def save(self, *args, **kwargs):
  67. if self._state.adding and not self.slug:
  68. self.slug = self.slugify(self.name)
  69. # if name is different but slug conficts with another tags slug, append a counter
  70. # with transaction.atomic():
  71. slugs = set(
  72. type(self)
  73. ._default_manager.filter(slug__startswith=self.slug)
  74. .values_list("slug", flat=True)
  75. )
  76. i = None
  77. while True:
  78. slug = self.slugify(self.name, i)
  79. if slug not in slugs:
  80. self.slug = slug
  81. return super().save(*args, **kwargs)
  82. i = 1 if i is None else i+1
  83. else:
  84. return super().save(*args, **kwargs)
  85. @property
  86. def api_url(self) -> str:
  87. # /api/v1/core/snapshot/{uulid}
  88. return reverse_lazy('api-1:get_tag', args=[self.abid])
  89. @property
  90. def api_docs_url(self) -> str:
  91. return f'/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
  92. class SnapshotTag(models.Model):
  93. id = models.AutoField(primary_key=True)
  94. snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
  95. tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
  96. class Meta:
  97. db_table = 'core_snapshot_tags'
  98. unique_together = [('snapshot', 'tag')]
  99. class Snapshot(ABIDModel):
  100. abid_prefix = 'snp_'
  101. abid_ts_src = 'self.added'
  102. abid_uri_src = 'self.url'
  103. abid_subtype_src = '"01"'
  104. abid_rand_src = 'self.old_id'
  105. old_id = models.UUIDField(default=uuid.uuid4, editable=False, unique=True) # legacy pk
  106. id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True)
  107. abid = ABIDField(prefix=abid_prefix)
  108. url = models.URLField(unique=True, db_index=True)
  109. timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
  110. title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
  111. tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
  112. added = models.DateTimeField(auto_now_add=True, db_index=True)
  113. updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
  114. keys = ('url', 'timestamp', 'title', 'tags', 'updated')
  115. @property
  116. def uuid(self):
  117. return self.id
  118. def __repr__(self) -> str:
  119. title = (self.title_stripped or '-')[:64]
  120. return f'[{self.timestamp}] {self.url[:64]} ({title})'
  121. def __str__(self) -> str:
  122. title = (self.title_stripped or '-')[:64]
  123. return f'[{self.timestamp}] {self.url[:64]} ({title})'
  124. def save(self, *args, **kwargs):
  125. super().save(*args, **kwargs)
  126. try:
  127. assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
  128. except AssertionError as e:
  129. print(e)
  130. @classmethod
  131. def from_json(cls, info: dict):
  132. info = {k: v for k, v in info.items() if k in cls.keys}
  133. return cls(**info)
  134. def as_json(self, *args) -> dict:
  135. args = args or self.keys
  136. return {
  137. key: getattr(self, key)
  138. if key != 'tags' else self.tags_str()
  139. for key in args
  140. }
  141. def as_link(self) -> Link:
  142. return Link.from_json(self.as_json())
  143. def as_link_with_details(self) -> Link:
  144. from ..index import load_link_details
  145. return load_link_details(self.as_link())
  146. def tags_str(self, nocache=True) -> str | None:
  147. cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
  148. calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
  149. if nocache:
  150. tags_str = calc_tags_str()
  151. cache.set(cache_key, tags_str)
  152. return tags_str
  153. return cache.get_or_set(cache_key, calc_tags_str)
  154. def icons(self) -> str:
  155. return snapshot_icons(self)
  156. @property
  157. def api_url(self) -> str:
  158. # /api/v1/core/snapshot/{uulid}
  159. return reverse_lazy('api-1:get_snapshot', args=[self.abid])
  160. @property
  161. def api_docs_url(self) -> str:
  162. return f'/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'
  163. @cached_property
  164. def title_stripped(self) -> str:
  165. return (self.title or '').replace("\n", " ").replace("\r", "")
  166. @cached_property
  167. def extension(self) -> str:
  168. from ..util import extension
  169. return extension(self.url)
  170. @cached_property
  171. def bookmarked(self):
  172. return parse_date(self.timestamp)
  173. @cached_property
  174. def bookmarked_date(self):
  175. # TODO: remove this
  176. return self.bookmarked
  177. @cached_property
  178. def is_archived(self):
  179. return self.as_link().is_archived
  180. @cached_property
  181. def num_outputs(self) -> int:
  182. return self.archiveresult_set.filter(status='succeeded').count()
  183. @cached_property
  184. def base_url(self):
  185. return base_url(self.url)
  186. @cached_property
  187. def link_dir(self):
  188. return str(ARCHIVE_DIR / self.timestamp)
  189. @cached_property
  190. def archive_path(self):
  191. return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
  192. @cached_property
  193. def archive_size(self):
  194. cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'
  195. def calc_dir_size():
  196. try:
  197. return get_dir_size(self.link_dir)[0]
  198. except Exception:
  199. return 0
  200. return cache.get_or_set(cache_key, calc_dir_size)
  201. @cached_property
  202. def thumbnail_url(self) -> Optional[str]:
  203. result = self.archiveresult_set.filter(
  204. extractor='screenshot',
  205. status='succeeded'
  206. ).only('output').last()
  207. if result:
  208. return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
  209. return None
  210. @cached_property
  211. def headers(self) -> Optional[Dict[str, str]]:
  212. try:
  213. return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
  214. except Exception:
  215. pass
  216. return None
  217. @cached_property
  218. def status_code(self) -> Optional[str]:
  219. return self.headers and self.headers.get('Status-Code')
  220. @cached_property
  221. def history(self) -> dict:
  222. # TODO: use ArchiveResult for this instead of json
  223. return self.as_link_with_details().history
  224. @cached_property
  225. def latest_title(self) -> Optional[str]:
  226. if self.title:
  227. return self.title # whoopdedoo that was easy
  228. try:
  229. # take longest successful title from ArchiveResult db history
  230. return sorted(
  231. self.archiveresult_set\
  232. .filter(extractor='title', status='succeeded', output__isnull=False)\
  233. .values_list('output', flat=True),
  234. key=lambda r: len(r),
  235. )[-1]
  236. except IndexError:
  237. pass
  238. try:
  239. # take longest successful title from Link json index file history
  240. return sorted(
  241. (
  242. result.output.strip()
  243. for result in self.history['title']
  244. if result.status == 'succeeded' and result.output.strip()
  245. ),
  246. key=lambda r: len(r),
  247. )[-1]
  248. except (KeyError, IndexError):
  249. pass
  250. return None
  251. def save_tags(self, tags: List[str]=()) -> None:
  252. tags_id = []
  253. for tag in tags:
  254. if tag.strip():
  255. tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
  256. self.tags.clear()
  257. self.tags.add(*tags_id)
  258. # def get_storage_dir(self, create=True, symlink=True) -> Path:
  259. # date_str = self.added.strftime('%Y%m%d')
  260. # domain_str = domain(self.url)
  261. # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
  262. # if create and not abs_storage_dir.is_dir():
  263. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  264. # if symlink:
  265. # LINK_PATHS = [
  266. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  267. # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
  268. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
  269. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
  270. # ]
  271. # for link_path in LINK_PATHS:
  272. # link_path.parent.mkdir(parents=True, exist_ok=True)
  273. # try:
  274. # link_path.symlink_to(abs_storage_dir)
  275. # except FileExistsError:
  276. # link_path.unlink()
  277. # link_path.symlink_to(abs_storage_dir)
  278. # return abs_storage_dir
  279. class ArchiveResultManager(models.Manager):
  280. def indexable(self, sorted: bool = True):
  281. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  282. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
  283. if sorted:
  284. precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  285. qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
  286. return qs
  287. class ArchiveResult(ABIDModel):
  288. abid_prefix = 'res_'
  289. abid_ts_src = 'self.snapshot.added'
  290. abid_uri_src = 'self.snapshot.url'
  291. abid_subtype_src = 'self.extractor'
  292. abid_rand_src = 'self.id'
  293. EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
  294. old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')
  295. id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID')
  296. abid = ABIDField(prefix=abid_prefix)
  297. snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id')
  298. extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
  299. cmd = models.JSONField()
  300. pwd = models.CharField(max_length=256)
  301. cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
  302. output = models.CharField(max_length=1024)
  303. start_ts = models.DateTimeField(db_index=True)
  304. end_ts = models.DateTimeField()
  305. status = models.CharField(max_length=16, choices=STATUS_CHOICES)
  306. objects = ArchiveResultManager()
  307. class Meta(TypedModelMeta):
  308. verbose_name = 'Archive Result'
  309. verbose_name_plural = 'Archive Results Log'
  310. def __str__(self):
  311. return self.extractor
  312. def save(self, *args, **kwargs):
  313. super().save(*args, **kwargs)
  314. try:
  315. assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
  316. except AssertionError as e:
  317. print(e)
  318. @property
  319. def uuid(self):
  320. return self.id
  321. @cached_property
  322. def snapshot_dir(self):
  323. return Path(self.snapshot.link_dir)
  324. @property
  325. def api_url(self) -> str:
  326. # /api/v1/core/archiveresult/{uulid}
  327. return reverse_lazy('api-1:get_archiveresult', args=[self.abid])
  328. @property
  329. def api_docs_url(self) -> str:
  330. return f'/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'
  331. @property
  332. def extractor_module(self):
  333. return EXTRACTORS[self.extractor]
  334. def output_path(self) -> str:
  335. """return the canonical output filename or directory name within the snapshot dir"""
  336. return self.extractor_module.get_output_path()
  337. def embed_path(self) -> str:
  338. """
  339. return the actual runtime-calculated path to the file on-disk that
  340. should be used for user-facing iframe embeds of this result
  341. """
  342. if hasattr(self.extractor_module, 'get_embed_path'):
  343. return self.extractor_module.get_embed_path(self)
  344. return self.extractor_module.get_output_path()
  345. def legacy_output_path(self):
  346. link = self.snapshot.as_link()
  347. return link.canonical_outputs().get(f'{self.extractor}_path')
  348. def output_exists(self) -> bool:
  349. return Path(self.output_path()).exists()
  350. # def get_storage_dir(self, create=True, symlink=True):
  351. # date_str = self.snapshot.added.strftime('%Y%m%d')
  352. # domain_str = domain(self.snapshot.url)
  353. # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
  354. # if create and not abs_storage_dir.is_dir():
  355. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  356. # if symlink:
  357. # LINK_PATHS = [
  358. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  359. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
  360. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
  361. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
  362. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
  363. # ]
  364. # for link_path in LINK_PATHS:
  365. # link_path.parent.mkdir(parents=True, exist_ok=True)
  366. # try:
  367. # link_path.symlink_to(abs_storage_dir)
  368. # except FileExistsError:
  369. # link_path.unlink()
  370. # link_path.symlink_to(abs_storage_dir)
  371. # return abs_storage_dir
  372. # def symlink_index(self, create=True):
  373. # abs_result_dir = self.get_storage_dir(create=create)