models.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, List, Dict
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import json
  5. import random
  6. import uuid
  7. from uuid import uuid4
  8. from pathlib import Path
  9. from django.db import models
  10. from django.utils.functional import cached_property
  11. from django.utils.text import slugify
  12. from django.core.cache import cache
  13. from django.urls import reverse, reverse_lazy
  14. from django.db.models import Case, When, Value, IntegerField
  15. from abid_utils.models import ABIDModel, ABIDField
  16. from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
  17. from ..system import get_dir_size
  18. from ..util import parse_date, base_url
  19. from ..index.schema import Link
  20. from ..index.html import snapshot_icons
  21. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  # Django `choices` pairs (stored value, display label) built from the names of
  # the registered extractors; each name is used as both value and label.
  EXTRACTOR_CHOICES = [(name, name) for name in EXTRACTORS]

  # Django `choices` pairs (stored value, display label) for a result status.
  STATUS_CHOICES = [
      (status, status)
      for status in ("succeeded", "failed", "skipped")
  ]
  def rand_int_id() -> int:
      """Return a random non-negative integer drawn from 32 random bits."""
      id_bits = 32
      return random.getrandbits(id_bits)
  30. # class BaseModel(models.Model):
  31. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  32. # # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
  33. # #
  34. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  35. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  36. # class Meta(TypedModelMeta):
  37. # abstract = True
  class Tag(ABIDModel):
      """
      A uniquely-named label with an auto-generated unique slug.

      Based on django-taggit model + ABID base.
      """

      # ABID settings consumed by the ABIDModel base class. The *_src values are
      # strings that look like Python expressions -- presumably evaluated against
      # the instance by ABIDModel (TODO confirm in abid_utils).
      abid_prefix = 'tag_'
      abid_ts_src = 'self.created'        # TODO: add created/modified time
      abid_uri_src = 'self.slug'
      abid_subtype_src = '"03"'
      abid_rand_src = 'self.old_id'

      # legacy PK
      old_id = models.BigIntegerField(
          unique=True,
          default=rand_int_id,
          serialize=False,
          verbose_name='Old ID',
      )
      id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
      abid = ABIDField(prefix=abid_prefix)

      name = models.CharField(unique=True, blank=False, max_length=100)
      # slug is autoset on save from name, never set it manually
      slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)

      class Meta(TypedModelMeta):
          verbose_name = "Tag"
          verbose_name_plural = "Tags"

      def __str__(self):
          return self.name

      # @property
      # def old_id(self):
      #     return self.id

      def slugify(self, tag, i=None):
          """Return the slug for `tag`, with "_<i>" appended when `i` is given."""
          base = slugify(tag)   # module-level django.utils.text.slugify, not this method
          if i is None:
              return base
          return base + "_%d" % i

      def save(self, *args, **kwargs):
          """
          Save the tag, first deriving a unique slug from `name` when needed.

          A slug is only generated for an instance that is being added and has
          no slug yet; in every other case the save is passed straight through.
          """
          needs_slug = self._state.adding and not self.slug
          if not needs_slug:
              return super().save(*args, **kwargs)

          self.slug = self.slugify(self.name)

          # if name is different but slug conflicts with another tag's slug, append a counter
          # NOTE: the lookup and the save below are not wrapped in a transaction
          # with transaction.atomic():
          taken = set(
              type(self)
              ._default_manager.filter(slug__startswith=self.slug)
              .values_list("slug", flat=True)
          )

          counter = None
          candidate = self.slugify(self.name, counter)
          while candidate in taken:
              counter = 1 if counter is None else counter + 1
              candidate = self.slugify(self.name, counter)

          self.slug = candidate
          return super().save(*args, **kwargs)

      @property
      def api_url(self) -> str:
          """Lazily-reversed URL of the 'api-1:get_tag' route for this tag's ABID."""
          return reverse_lazy('api-1:get_tag', args=[self.abid])

      @property
      def api_docs_url(self) -> str:
          """Anchor into the API docs page for the get_tag operation."""
          return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
  class SnapshotTag(models.Model):
      """Join row linking one Snapshot to one Tag, stored in `core_snapshot_tags`."""

      id = models.AutoField(primary_key=True)

      snapshot = models.ForeignKey(
          'Snapshot',
          to_field='id',
          on_delete=models.CASCADE,
          db_column='snapshot_id',
      )
      tag = models.ForeignKey(
          Tag,
          to_field='id',
          on_delete=models.CASCADE,
          db_column='tag_id',
      )

      class Meta:
          db_table = 'core_snapshot_tags'
          # a given (snapshot, tag) pair may appear at most once
          unique_together = [('snapshot', 'tag')]
  class Snapshot(ABIDModel):
      """
      A single URL tracked by the archive, identified by a unique `url` and a
      unique `timestamp`, with helpers for its tags, on-disk files and results.
      """

      # ABID settings consumed by the ABIDModel base class. The *_src values are
      # strings that look like Python expressions -- presumably evaluated against
      # the instance by ABIDModel (TODO confirm in abid_utils).
      abid_prefix = 'snp_'
      abid_ts_src = 'self.added'
      abid_uri_src = 'self.url'
      abid_subtype_src = '"01"'
      abid_rand_src = 'self.old_id'

      old_id = models.UUIDField(default=uuid.uuid4, editable=False, unique=True)  # legacy pk
      id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True)
      abid = ABIDField(prefix=abid_prefix)

      url = models.URLField(unique=True, db_index=True)
      timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)

      title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

      tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))

      added = models.DateTimeField(auto_now_add=True, db_index=True)
      updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)

      # attribute names accepted by from_json() and emitted by as_json() by default
      keys = ('url', 'timestamp', 'title', 'tags', 'updated')

      @property
      def uuid(self):
          """Alias for the primary key."""
          return self.id

      @property
      def title_stripped(self) -> str:
          """
          Return the title on a single line ('' when there is no title).

          __repr__/__str__ read this attribute but nothing in this class defined
          it. NOTE(review): if ABIDModel already provides `title_stripped`, this
          definition overrides it -- confirm against abid_utils.
          """
          return (self.title or '').replace('\n', ' ').replace('\r', '')

      def __repr__(self) -> str:
          """Return '[timestamp] url (title)', with url and title cut to 64 chars."""
          title = (self.title_stripped or '-')[:64]
          return f'[{self.timestamp}] {self.url[:64]} ({title})'

      def __str__(self) -> str:
          """Same text as __repr__."""
          return self.__repr__()

      def save(self, *args, **kwargs):
          """Save the row, then print a warning if `id` disagrees with the ABID's uuid."""
          super().save(*args, **kwargs)

          # Best-effort consistency check: a mismatch (or any AssertionError raised
          # while computing self.ABID) is printed, never propagated to the caller.
          try:
              assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
          except AssertionError as e:
              print(e)

      @classmethod
      def from_json(cls, info: dict):
          """Build an unsaved instance from `info`, ignoring entries not named in `cls.keys`."""
          accepted = {name: value for name, value in info.items() if name in cls.keys}
          return cls(**accepted)

      def as_json(self, *args) -> dict:
          """
          Return a dict of the named attributes (default: `self.keys`).

          'tags' is emitted as the comma-separated string from tags_str().
          """
          wanted = args or self.keys
          return {
              key: (self.tags_str() if key == 'tags' else getattr(self, key))
              for key in wanted
          }

      def as_link(self) -> Link:
          """Convert this snapshot to a `Link` via its JSON form."""
          return Link.from_json(self.as_json())

      def as_link_with_details(self) -> Link:
          """Like as_link(), then passed through `load_link_details`."""
          from ..index import load_link_details   # imported here, not at module top, as in the original
          return load_link_details(self.as_link())

      def tags_str(self, nocache=True) -> str | None:
          """
          Return the tag names joined by ',' and ordered by name.

          With `nocache` (the default) the string is recomputed and written to
          the cache; otherwise a cached value is used when one exists.
          """
          cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'

          def calc_tags_str() -> str:
              return ','.join(self.tags.order_by('name').values_list('name', flat=True))

          if nocache:
              tags_str = calc_tags_str()
              cache.set(cache_key, tags_str)
              return tags_str
          return cache.get_or_set(cache_key, calc_tags_str)

      def icons(self) -> str:
          """Return the result of `snapshot_icons` for this snapshot."""
          return snapshot_icons(self)

      @property
      def api_url(self) -> str:
          """Lazily-reversed URL of the 'api-1:get_snapshot' route for this snapshot's ABID."""
          # /api/v1/core/snapshot/{uulid}
          return reverse_lazy('api-1:get_snapshot', args=[self.abid])

      @property
      def api_docs_url(self) -> str:
          """Anchor into the API docs page for the get_snapshot operation."""
          return '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'

      @cached_property
      def extension(self) -> str:
          """Result of `util.extension` applied to the URL."""
          from ..util import extension
          return extension(self.url)

      @cached_property
      def bookmarked(self):
          """The `timestamp` field parsed with `parse_date`."""
          return parse_date(self.timestamp)

      @cached_property
      def bookmarked_date(self):
          """Alias of `bookmarked`."""
          # TODO: remove this
          return self.bookmarked

      @cached_property
      def is_archived(self):
          """`is_archived` as reported by the equivalent Link."""
          return self.as_link().is_archived

      @cached_property
      def num_outputs(self) -> int:
          """Number of related ArchiveResults whose status is 'succeeded'."""
          return self.archiveresult_set.filter(status='succeeded').count()

      @cached_property
      def base_url(self):
          """Result of `util.base_url` applied to the URL."""
          return base_url(self.url)

      @cached_property
      def link_dir(self):
          """`ARCHIVE_DIR / timestamp`, as a string."""
          return str(ARCHIVE_DIR / self.timestamp)

      @cached_property
      def archive_path(self):
          """'<ARCHIVE_DIR_NAME>/<timestamp>'."""
          return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

      @cached_property
      def archive_size(self):
          """
          First element of `get_dir_size(link_dir)`, or 0 if that call fails.

          The value is stored in the cache under a key that includes the last
          updated/added time.
          """
          cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'

          def calc_dir_size():
              try:
                  return get_dir_size(self.link_dir)[0]
              except Exception:
                  # best-effort: any failure while measuring is reported as size 0
                  return 0

          return cache.get_or_set(cache_key, calc_dir_size)

      @cached_property
      def thumbnail_url(self) -> Optional[str]:
          """URL for the output of a succeeded 'screenshot' result, or None if there is none."""
          result = self.archiveresult_set.filter(
              extractor='screenshot',
              status='succeeded'
          ).only('output').last()
          if result:
              return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
          return None

      @cached_property
      def headers(self) -> Optional[Dict[str, str]]:
          """Parsed contents of `headers.json` in link_dir, or None if it cannot be read or parsed."""
          try:
              return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
          except Exception:
              # best-effort: a missing, unreadable or malformed file means "no headers"
              return None

      @cached_property
      def status_code(self) -> Optional[str]:
          """
          The 'Status-Code' entry of `headers`, or None.

          Returns None (rather than the falsy headers object itself, e.g. `{}`)
          when no headers are available.
          """
          headers = self.headers
          if not headers:
              return None
          return headers.get('Status-Code')

      @cached_property
      def history(self) -> dict:
          """`history` of the detailed Link for this snapshot."""
          # TODO: use ArchiveResult for this instead of json
          return self.as_link_with_details().history

      @cached_property
      def latest_title(self) -> Optional[str]:
          """
          Return the best available title, or None.

          Order of preference: the `title` field, then the longest output of a
          succeeded 'title' ArchiveResult, then the longest non-blank succeeded
          'title' output in `history`.
          """
          if self.title:
              return self.title   # whoopdedoo that was easy

          try:
              # take longest successful title from ArchiveResult db history
              db_titles = (
                  self.archiveresult_set
                  .filter(extractor='title', status='succeeded', output__isnull=False)
                  .values_list('output', flat=True)
              )
              return sorted(db_titles, key=len)[-1]
          except IndexError:
              pass

          try:
              # take longest successful title from Link json index file history
              return sorted(
                  (
                      result.output.strip()
                      for result in self.history['title']
                      if result.status == 'succeeded' and result.output.strip()
                  ),
                  key=len,
              )[-1]
          except (KeyError, IndexError):
              pass

          return None

      def save_tags(self, tags: List[str]=()) -> None:
          """
          Replace this snapshot's tags with the given tag names.

          Names that are blank after stripping are skipped; a Tag row is fetched
          or created for every other name.
          NOTE(review): the name is stored unstripped, so ' foo' and 'foo' become
          different tags -- confirm callers pass already-stripped names.
          """
          tag_ids = [
              Tag.objects.get_or_create(name=tag)[0].pk
              for tag in tags
              if tag.strip()
          ]
          self.tags.clear()
          self.tags.add(*tag_ids)
  255. # def get_storage_dir(self, create=True, symlink=True) -> Path:
  256. # date_str = self.added.strftime('%Y%m%d')
  257. # domain_str = domain(self.url)
  258. # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
  259. # if create and not abs_storage_dir.is_dir():
  260. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  261. # if symlink:
  262. # LINK_PATHS = [
  263. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  264. # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
  265. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
  266. # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
  267. # ]
  268. # for link_path in LINK_PATHS:
  269. # link_path.parent.mkdir(parents=True, exist_ok=True)
  270. # try:
  271. # link_path.symlink_to(abs_storage_dir)
  272. # except FileExistsError:
  273. # link_path.unlink()
  274. # link_path.symlink_to(abs_storage_dir)
  275. # return abs_storage_dir
  class ArchiveResultManager(models.Manager):
      """Manager that adds the `indexable()` queryset helper."""

      def indexable(self, sorted: bool = True):
          """
          Return succeeded results whose extractor is listed in
          ARCHIVE_METHODS_INDEXING_PRECEDENCE.

          When `sorted` is true, rows are annotated with `indexing_precedence`
          (the value paired with their extractor, 1000 if none matches) and
          ordered by it.
          """
          methods = [entry[0] for entry in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
          results = self.get_queryset().filter(extractor__in=methods, status='succeeded')
          if not sorted:
              return results

          ranking = Case(
              *(
                  When(extractor=method, then=Value(rank))
                  for method, rank in ARCHIVE_METHODS_INDEXING_PRECEDENCE
              ),
              default=Value(1000),
              output_field=IntegerField(),
          )
          return results.annotate(indexing_precedence=ranking).order_by('indexing_precedence')
  class ArchiveResult(ABIDModel):
      """One recorded run of a single extractor against a Snapshot."""

      # ABID settings consumed by the ABIDModel base class. The *_src values are
      # strings that look like Python expressions -- presumably evaluated against
      # the instance by ABIDModel (TODO confirm in abid_utils).
      abid_prefix = 'res_'
      abid_ts_src = 'self.snapshot.added'
      abid_uri_src = 'self.snapshot.url'
      abid_subtype_src = 'self.extractor'
      abid_rand_src = 'self.id'

      # re-export of the module-level choices list as a class attribute
      EXTRACTOR_CHOICES = EXTRACTOR_CHOICES

      old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')
      id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID')
      abid = ABIDField(prefix=abid_prefix)

      snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id')

      extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
      cmd = models.JSONField()
      pwd = models.CharField(max_length=256)
      cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
      output = models.CharField(max_length=1024)
      start_ts = models.DateTimeField(db_index=True)
      end_ts = models.DateTimeField()
      status = models.CharField(max_length=16, choices=STATUS_CHOICES)

      objects = ArchiveResultManager()

      class Meta(TypedModelMeta):
          verbose_name = 'Archive Result'
          verbose_name_plural = 'Archive Results Log'

      def __str__(self):
          return self.extractor

      def save(self, *args, **kwargs):
          """Save the row, then print a warning if `id` disagrees with the ABID's uuid."""
          super().save(*args, **kwargs)

          # Best-effort consistency check: a mismatch (or any AssertionError raised
          # while computing self.ABID) is printed, never propagated to the caller.
          try:
              assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
          except AssertionError as e:
              print(e)

      @property
      def uuid(self):
          """Alias for the primary key."""
          return self.id

      @cached_property
      def snapshot_dir(self):
          """The parent snapshot's `link_dir` as a Path."""
          return Path(self.snapshot.link_dir)

      @property
      def api_url(self) -> str:
          """Lazily-reversed URL of the 'api-1:get_archiveresult' route for this result's ABID."""
          # /api/v1/core/archiveresult/{uulid}
          return reverse_lazy('api-1:get_archiveresult', args=[self.abid])

      @property
      def api_docs_url(self) -> str:
          """Anchor into the API docs page for the get_archiveresult operation."""
          return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'

      @property
      def extractor_module(self):
          """The EXTRACTORS entry registered under this result's extractor name."""
          return EXTRACTORS[self.extractor]

      def output_path(self) -> str:
          """return the canonical output filename or directory name within the snapshot dir"""
          return self.extractor_module.get_output_path()

      def embed_path(self) -> str:
          """
          return the actual runtime-calculated path to the file on-disk that
          should be used for user-facing iframe embeds of this result
          """
          module = self.extractor_module
          if hasattr(module, 'get_embed_path'):
              return module.get_embed_path(self)
          # extractor has no embed-specific hook: fall back to its canonical output path
          return module.get_output_path()

      def legacy_output_path(self):
          """Look up '<extractor>_path' in the canonical outputs of the snapshot's Link."""
          link = self.snapshot.as_link()
          return link.canonical_outputs().get(f'{self.extractor}_path')

      def output_exists(self) -> bool:
          """
          Return True if this result's canonical output exists on disk.

          output_path() is a name within the snapshot dir, so it is resolved
          against `snapshot_dir` instead of the process working directory. An
          absolute output path is unaffected, because joining a Path with an
          absolute path yields that absolute path.
          """
          return (self.snapshot_dir / self.output_path()).exists()
  347. # def get_storage_dir(self, create=True, symlink=True):
  348. # date_str = self.snapshot.added.strftime('%Y%m%d')
  349. # domain_str = domain(self.snapshot.url)
  350. # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
  351. # if create and not abs_storage_dir.is_dir():
  352. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  353. # if symlink:
  354. # LINK_PATHS = [
  355. # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  356. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
  357. # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
  358. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
  359. # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
  360. # ]
  361. # for link_path in LINK_PATHS:
  362. # link_path.parent.mkdir(parents=True, exist_ok=True)
  363. # try:
  364. # link_path.symlink_to(abs_storage_dir)
  365. # except FileExistsError:
  366. # link_path.unlink()
  367. # link_path.symlink_to(abs_storage_dir)
  368. # return abs_storage_dir
  369. # def symlink_index(self, create=True):
  370. # abs_result_dir = self.get_storage_dir(create=create)