models.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, List, Dict, Iterable
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import json
  5. import random
  6. import uuid
  7. from uuid import uuid4
  8. from pathlib import Path
  9. from django.db import models
  10. from django.utils import timezone
  11. from django.utils.functional import cached_property
  12. from django.utils.text import slugify
  13. from django.core.cache import cache
  14. from django.urls import reverse, reverse_lazy
  15. from django.db.models import Case, When, Value, IntegerField
  16. from django.conf import settings
  17. from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
  18. from ..system import get_dir_size
  19. from ..util import parse_date, base_url
  20. from ..index.schema import Link
  21. from ..index.html import snapshot_icons
  22. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  23. def rand_int_id():
  24. return random.getrandbits(32)
  25. # class BaseModel(models.Model):
  26. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  27. # # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
  28. # #
  29. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  30. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  31. # class Meta(TypedModelMeta):
  32. # abstract = True
  33. class Tag(ABIDModel):
  34. """
  35. Based on django-taggit model + ABID base.
  36. """
  37. abid_prefix = 'tag_'
  38. abid_ts_src = 'self.created'
  39. abid_uri_src = 'self.slug'
  40. abid_subtype_src = '"03"'
  41. abid_rand_src = 'self.old_id'
  42. old_id = models.BigIntegerField(unique=True, default=rand_int_id, serialize=False, verbose_name='Old ID') # legacy PK
  43. id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
  44. abid = ABIDField(prefix=abid_prefix)
  45. name = models.CharField(unique=True, blank=False, max_length=100)
  46. slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
  47. # slug is autoset on save from name, never set it manually
  48. snapshot_set: models.Manager['Snapshot']
  49. class Meta(TypedModelMeta):
  50. verbose_name = "Tag"
  51. verbose_name_plural = "Tags"
  52. def __str__(self):
  53. return self.name
  54. # @property
  55. # def old_id(self):
  56. # return self.id
  57. def slugify(self, tag, i=None):
  58. slug = slugify(tag)
  59. if i is not None:
  60. slug += "_%d" % i
  61. return slug
  62. def save(self, *args, **kwargs):
  63. if self._state.adding and not self.slug:
  64. self.slug = self.slugify(self.name)
  65. # if name is different but slug conficts with another tags slug, append a counter
  66. # with transaction.atomic():
  67. slugs = set(
  68. type(self)
  69. ._default_manager.filter(slug__startswith=self.slug)
  70. .values_list("slug", flat=True)
  71. )
  72. i = None
  73. while True:
  74. slug = self.slugify(self.name, i)
  75. if slug not in slugs:
  76. self.slug = slug
  77. return super().save(*args, **kwargs)
  78. i = 1 if i is None else i+1
  79. else:
  80. return super().save(*args, **kwargs)
  81. @property
  82. def api_url(self) -> str:
  83. # /api/v1/core/snapshot/{uulid}
  84. return reverse_lazy('api-1:get_tag', args=[self.abid])
  85. @property
  86. def api_docs_url(self) -> str:
  87. return f'/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
class SnapshotTag(models.Model):
    """Explicit through-table for the Snapshot<->Tag many-to-many (see Snapshot.tags)."""
    id = models.AutoField(primary_key=True)

    # db_column/to_field pinned explicitly so the legacy table layout is preserved
    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]  # each tag can only be applied to a snapshot once
class SnapshotManager(models.Manager):
    def get_queryset(self):
        # always prefetch tags + archiveresults up-front so per-row accessors
        # (tags_str, latest_title, thumbnail_url, ...) can use the prefetch cache
        # instead of issuing N+1 queries
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
class Snapshot(ABIDModel):
    """One archived URL: a unique (url, timestamp) entry whose extractor outputs
    (ArchiveResults) live on disk under ARCHIVE_DIR/<timestamp>/.
    """
    abid_prefix = 'snp_'
    abid_ts_src = 'self.added'
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.old_id'

    old_id = models.UUIDField(default=uuid.uuid4, editable=False, unique=True)  # legacy pk
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True)
    abid = ABIDField(prefix=abid_prefix)

    url = models.URLField(unique=True, db_index=True)
    # timestamp doubles as the on-disk directory name (see link_dir/archive_path below)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
    added = AutoDateTimeField(default=timezone.now, db_index=True)
    updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)

    # field names included in as_json()/from_json() round-trips
    keys = ('url', 'timestamp', 'title', 'tags', 'updated')

    archiveresult_set: models.Manager['ArchiveResult']

    objects = SnapshotManager()

    def __repr__(self) -> str:
        title = (self.title_stripped or '-')[:64]
        return f'[{self.timestamp}] {self.url[:64]} ({title})'

    def __str__(self) -> str:
        title = (self.title_stripped or '-')[:64]
        return f'[{self.timestamp}] {self.url[:64]} ({title})'

    @classmethod
    def from_json(cls, info: dict):
        """Build an (unsaved) Snapshot from a dict, ignoring keys not in cls.keys."""
        info = {k: v for k, v in info.items() if k in cls.keys}
        return cls(**info)

    def as_json(self, *args) -> dict:
        """Serialize the named fields (default: self.keys) to a plain dict.

        `tags` is flattened to a comma-joined string via tags_str().
        """
        args = args or self.keys
        return {
            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
            for key in args
        }

    def as_link(self) -> Link:
        # bridge to the legacy filesystem-index Link schema
        return Link.from_json(self.as_json())

    def as_link_with_details(self) -> Link:
        # like as_link() but also loads archive history from the on-disk json index
        from ..index import load_link_details
        return load_link_details(self.as_link())

    def tags_str(self, nocache=True) -> str | None:
        """Return this snapshot's tag names as a sorted, comma-joined string.

        Cache key includes the updated/added timestamp so stale entries
        are naturally bypassed after a save.
        """
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            # tags are pre-fetched already, use them directly (best because db is always freshest)
            tags_str = calc_tags_str()
            return tags_str
        if nocache:
            # recompute and refresh the cache entry
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str
        return cache.get_or_set(cache_key, calc_tags_str)

    def icons(self) -> str:
        # html fragment of per-extractor status icons for the admin/index views
        return snapshot_icons(self)

    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_snapshot', args=[self.abid])

    @property
    def api_docs_url(self) -> str:
        return f'/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'

    @cached_property
    def title_stripped(self) -> str:
        # title with newlines flattened, safe for single-line display
        return (self.title or '').replace("\n", " ").replace("\r", "")

    @cached_property
    def extension(self) -> str:
        # file extension guessed from the URL path
        from ..util import extension
        return extension(self.url)

    @cached_property
    def bookmarked(self):
        # timestamp is a stringified epoch; parse back to a datetime
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def is_archived(self):
        # delegates to the legacy Link schema's is_archived logic
        return self.as_link().is_archived

    @cached_property
    def num_outputs(self) -> int:
        # DONT DO THIS: it will trigger a separate query for every snapshot
        # return self.archiveresult_set.filter(status='succeeded').count()
        # this is better (counts in python over the prefetched result set):
        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))

    @cached_property
    def base_url(self):
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        # absolute path of this snapshot's output directory
        return str(settings.CONFIG.ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        # path relative to the data dir, e.g. 'archive/<timestamp>'
        return '{}/{}'.format(settings.CONFIG.ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        """Total on-disk size (bytes) of the snapshot dir, cached per (pk, updated)."""
        cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'
        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                # missing/unreadable dir counts as zero rather than erroring
                return 0
        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def thumbnail_url(self) -> Optional[str]:
        """URL of the most recent successful screenshot output, or None."""
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            # results already prefetched: pick the newest successful screenshot in python
            result = (sorted(
                (
                    result
                    for result in self.archiveresult_set.all()
                    if result.extractor == 'screenshot' and result.status =='succeeded' and result.output
                ),
                key=lambda result: result.created,
            ) or [None])[-1]
        else:
            # no prefetch cache: ask the db for the last successful screenshot
            result = self.archiveresult_set.filter(
                extractor='screenshot',
                status='succeeded'
            ).only('output').last()
        if result:
            return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
        return None

    @cached_property
    def headers(self) -> Optional[Dict[str, str]]:
        """Parsed headers.json from the snapshot dir, or None if missing/invalid."""
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:
            pass
        return None

    @cached_property
    def status_code(self) -> Optional[str]:
        return self.headers.get('Status-Code') if self.headers else None

    @cached_property
    def history(self) -> dict:
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history

    @cached_property
    def latest_title(self) -> Optional[str]:
        """Best-available page title: stored title, else longest successful
        'title' extractor output (db, then legacy json index), else None.
        """
        if self.title:
            return self.title  # whoopdedoo that was easy
        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            try:
                # NOTE(review): when prefetched but no title results exist, this
                # returns None immediately instead of falling through to the
                # db/json fallbacks below — confirm that's intended
                return (sorted(
                    (
                        result.output.strip()
                        for result in self.archiveresult_set.all()
                        if result.extractor == 'title' and result.status =='succeeded' and result.output
                    ),
                    key=lambda title: len(title),
                ) or [None])[-1]
            except IndexError:
                pass
        try:
            # take longest successful title from ArchiveResult db history
            return sorted(
                self.archiveresult_set\
                    .filter(extractor='title', status='succeeded', output__isnull=False)\
                    .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass
        try:
            # take longest successful title from Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass
        return None

    def save_tags(self, tags: Iterable[str]=()) -> None:
        """Replace this snapshot's tags with the given names, creating Tags as needed.

        Blank/whitespace-only names are skipped.
        """
        tags_id = []
        for tag in tags:
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        self.tags.clear()
        self.tags.add(*tags_id)

    # def get_storage_dir(self, create=True, symlink=True) -> Path:
    #     date_str = self.added.strftime('%Y%m%d')
    #     domain_str = domain(self.url)
    #     abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
    #             Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
    #             Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir
  300. class ArchiveResultManager(models.Manager):
  301. def indexable(self, sorted: bool = True):
  302. """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
  303. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  304. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
  305. if sorted:
  306. precedence = [
  307. When(extractor=method, then=Value(precedence))
  308. for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
  309. ]
  310. qs = qs.annotate(
  311. indexing_precedence=Case(
  312. *precedence,
  313. default=Value(1000),
  314. output_field=IntegerField()
  315. )
  316. ).order_by('indexing_precedence')
  317. return qs
class ArchiveResult(ABIDModel):
    """The outcome of running one extractor (wget, screenshot, title, ...) against one Snapshot."""
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.added'
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.old_id'

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )
    STATUS_CHOICES = [
        ("succeeded", "succeeded"),
        ("failed", "failed"),
        ("skipped", "skipped")
    ]

    old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')  # legacy pk
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id')
    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
    cmd = models.JSONField()                 # argv list the extractor was run with
    pwd = models.CharField(max_length=256)   # working dir the command ran in
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024)  # output filename/dirname or error string
    start_ts = models.DateTimeField(db_index=True)
    end_ts = models.DateTimeField()
    status = models.CharField(max_length=16, choices=STATUS_CHOICES)

    objects = ArchiveResultManager()

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

    def __str__(self):
        return self.extractor

    @cached_property
    def snapshot_dir(self):
        # absolute path of the parent snapshot's output directory
        return Path(self.snapshot.link_dir)

    @property
    def api_url(self) -> str:
        # /api/v1/core/archiveresult/{uulid}
        return reverse_lazy('api-1:get_archiveresult', args=[self.abid])

    @property
    def api_docs_url(self) -> str:
        return f'/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'

    @property
    def extractor_module(self):
        # the extractor's python module, looked up by registered name
        return EXTRACTORS[self.extractor]

    def output_path(self) -> str:
        """return the canonical output filename or directory name within the snapshot dir"""
        return self.extractor_module.get_output_path()

    def embed_path(self) -> str:
        """
        return the actual runtime-calculated path to the file on-disk that
        should be used for user-facing iframe embeds of this result
        """
        if get_embed_path_func := getattr(self.extractor_module, 'get_embed_path', None):
            return get_embed_path_func(self)
        return self.extractor_module.get_output_path()

    def legacy_output_path(self):
        # output path as computed by the legacy Link schema's canonical_outputs()
        link = self.snapshot.as_link()
        return link.canonical_outputs().get(f'{self.extractor}_path')

    def output_exists(self) -> bool:
        # NOTE(review): output_path() is relative to the snapshot dir, so this
        # Path(...).exists() resolves against the process cwd — confirm callers
        # chdir into the snapshot dir first
        return Path(self.output_path()).exists()

    # def get_storage_dir(self, create=True, symlink=True):
    #     date_str = self.snapshot.added.strftime('%Y%m%d')
    #     domain_str = domain(self.snapshot.url)
    #     abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
    #             # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
    #             Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
    #             Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir

    # def symlink_index(self, create=True):
    #     abs_result_dir = self.get_storage_dir(create=create)