# archivebox/core/models.py
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, List, Dict, Iterable
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import json
  5. import random
  6. import uuid
  7. from uuid import uuid4
  8. from pathlib import Path
  9. from django.db import models
  10. from django.utils.functional import cached_property
  11. from django.utils.text import slugify
  12. from django.core.cache import cache
  13. from django.urls import reverse, reverse_lazy
  14. from django.db.models import Case, When, Value, IntegerField
  15. from django.conf import settings
  16. from abid_utils.models import ABIDModel, ABIDField
  17. from ..system import get_dir_size
  18. from ..util import parse_date, base_url
  19. from ..index.schema import Link
  20. from ..index.html import snapshot_icons
  21. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  22. def rand_int_id():
  23. return random.getrandbits(32)
  24. # class BaseModel(models.Model):
  25. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  26. # # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
  27. # #
  28. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  29. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  30. # class Meta(TypedModelMeta):
  31. # abstract = True
class Tag(ABIDModel):
    """
    Based on django-taggit model + ABID base.
    """
    # ABID (ArchiveBox ID) configuration: which model attrs feed the ABID components
    abid_prefix = 'tag_'
    abid_ts_src = 'self.created'  # TODO: add created/modified time
    abid_uri_src = 'self.slug'
    abid_subtype_src = '"03"'
    abid_rand_src = 'self.old_id'
    old_id = models.BigIntegerField(unique=True, default=rand_int_id, serialize=False, verbose_name='Old ID') # legacy PK
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
    abid = ABIDField(prefix=abid_prefix)
    name = models.CharField(unique=True, blank=False, max_length=100)
    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
    # slug is autoset on save from name, never set it manually
    # reverse accessor populated by Snapshot.tags (M2M through SnapshotTag)
    snapshot_set: models.Manager['Snapshot']
    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"
    def __str__(self):
        return self.name
    # @property
    # def old_id(self):
    #     return self.id
    def slugify(self, tag, i=None):
        """Slugify *tag*; append a "_<i>" suffix when deduplicating against existing slugs."""
        # NOTE: calls the module-level django slugify() that this method intentionally shadows
        slug = slugify(tag)
        if i is not None:
            slug += "_%d" % i
        return slug
    def save(self, *args, **kwargs):
        """On first save, auto-derive a unique slug from the name before persisting."""
        if self._state.adding and not self.slug:
            self.slug = self.slugify(self.name)
            # if name is different but slug conflicts with another tags slug, append a counter
            # with transaction.atomic():
            # fetch all potentially-colliding slugs in one query, then probe in memory
            slugs = set(
                type(self)
                ._default_manager.filter(slug__startswith=self.slug)
                .values_list("slug", flat=True)
            )
            i = None
            while True:
                slug = self.slugify(self.name, i)
                if slug not in slugs:
                    self.slug = slug
                    return super().save(*args, **kwargs)
                i = 1 if i is None else i+1
        else:
            # slug already set (or updating an existing row): save unchanged
            return super().save(*args, **kwargs)
    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_tag', args=[self.abid])
    @property
    def api_docs_url(self) -> str:
        # static anchor into the generated API docs page
        return f'/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
class SnapshotTag(models.Model):
    """Explicit through-table for the Snapshot<->Tag many-to-many relation."""
    id = models.AutoField(primary_key=True)
    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
    class Meta:
        # keep the legacy table name so existing rows are reused
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]
  94. class SnapshotManager(models.Manager):
  95. def get_queryset(self):
  96. return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
class Snapshot(ABIDModel):
    """A single archived URL: one row per unique URL, with many ArchiveResults per extractor run."""
    # ABID configuration: timestamp from when it was added, URI from the archived URL
    abid_prefix = 'snp_'
    abid_ts_src = 'self.added'
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.old_id'
    old_id = models.UUIDField(default=uuid.uuid4, editable=False, unique=True) # legacy pk
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True)
    abid = ABIDField(prefix=abid_prefix)
    url = models.URLField(unique=True, db_index=True)
    # timestamp doubles as the on-disk archive dir name (see link_dir/archive_path below)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
    added = models.DateTimeField(auto_now_add=True, db_index=True)
    updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
    # attributes included in the as_json()/from_json() round-trip
    keys = ('url', 'timestamp', 'title', 'tags', 'updated')
    archiveresult_set: models.Manager['ArchiveResult']
    objects = SnapshotManager()
    def __repr__(self) -> str:
        title = (self.title_stripped or '-')[:64]
        return f'[{self.timestamp}] {self.url[:64]} ({title})'
    def __str__(self) -> str:
        title = (self.title_stripped or '-')[:64]
        return f'[{self.timestamp}] {self.url[:64]} ({title})'
    @classmethod
    def from_json(cls, info: dict):
        """Build an unsaved Snapshot from a dict, keeping only keys listed in cls.keys."""
        info = {k: v for k, v in info.items() if k in cls.keys}
        return cls(**info)
    def as_json(self, *args) -> dict:
        """Serialize the fields named in *args (default: self.keys); tags become a comma-joined string."""
        args = args or self.keys
        return {
            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
            for key in args
        }
    def as_link(self) -> Link:
        """Convert to the legacy Link schema used by the filesystem index."""
        return Link.from_json(self.as_json())
    def as_link_with_details(self) -> Link:
        """Like as_link(), but also loads per-extractor history from the on-disk JSON index."""
        from ..index import load_link_details
        return load_link_details(self.as_link())
    def tags_str(self, nocache=True) -> str | None:
        """Return tag names sorted and comma-joined, using prefetch cache or the django cache."""
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        # cache key changes whenever the row is modified, so stale entries expire naturally
        cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            # tags are pre-fetched already, use them directly (best because db is always freshest)
            tags_str = calc_tags_str()
            return tags_str
        if nocache:
            # recompute from db, but refresh the cache entry as a side effect
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str
        return cache.get_or_set(cache_key, calc_tags_str)
    def icons(self) -> str:
        """Render the HTML icon list summarizing which extractors succeeded."""
        return snapshot_icons(self)
    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_snapshot', args=[self.abid])
    @property
    def api_docs_url(self) -> str:
        # static anchor into the generated API docs page
        return f'/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'
    @cached_property
    def title_stripped(self) -> str:
        # title with newlines flattened so it is safe for single-line display
        return (self.title or '').replace("\n", " ").replace("\r", "")
    @cached_property
    def extension(self) -> str:
        """File extension guessed from the URL (may be '' for extensionless URLs)."""
        from ..util import extension
        return extension(self.url)
    @cached_property
    def bookmarked(self):
        """Datetime parsed from the string timestamp field."""
        return parse_date(self.timestamp)
    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked
    @cached_property
    def is_archived(self):
        # delegates to the legacy Link schema's is_archived logic
        return self.as_link().is_archived
    @cached_property
    def num_outputs(self) -> int:
        """Count of succeeded ArchiveResults, computed in python over the prefetched set."""
        # DONT DO THIS: it will trigger a separate query for every snapshot
        # return self.archiveresult_set.filter(status='succeeded').count()
        # this is better:
        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))
    @cached_property
    def base_url(self):
        """URL with scheme stripped (see util.base_url)."""
        return base_url(self.url)
    @cached_property
    def link_dir(self):
        """Absolute path of this snapshot's archive directory on disk."""
        return str(settings.CONFIG.ARCHIVE_DIR / self.timestamp)
    @cached_property
    def archive_path(self):
        """Relative path of the archive dir (ARCHIVE_DIR_NAME/timestamp), used in URLs."""
        return '{}/{}'.format(settings.CONFIG.ARCHIVE_DIR_NAME, self.timestamp)
    @cached_property
    def archive_size(self):
        """Total on-disk size of the snapshot dir in bytes, memoized in the django cache."""
        cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'
        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                # dir may not exist yet (snapshot queued but not archived)
                return 0
        return cache.get_or_set(cache_key, calc_dir_size)
    @cached_property
    def thumbnail_url(self) -> Optional[str]:
        """URL of the most recent successful screenshot output, or None if none exists."""
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            # results already prefetched: pick the newest successful screenshot in python
            result = (sorted(
                (
                    result
                    for result in self.archiveresult_set.all()
                    if result.extractor == 'screenshot' and result.status =='succeeded' and result.output
                ),
                key=lambda result: result.created,
            ) or [None])[-1]
        else:
            result = self.archiveresult_set.filter(
                extractor='screenshot',
                status='succeeded'
            ).only('output').last()
        if result:
            return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
        return None
    @cached_property
    def headers(self) -> Optional[Dict[str, str]]:
        """Parsed headers.json from the snapshot dir, or None if missing/unparseable."""
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:
            # best-effort: missing or corrupt headers file is not an error
            pass
        return None
    @cached_property
    def status_code(self) -> Optional[str]:
        """HTTP status code string recorded by the headers extractor, if available."""
        return self.headers.get('Status-Code') if self.headers else None
    @cached_property
    def history(self) -> dict:
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history
    @cached_property
    def latest_title(self) -> Optional[str]:
        """Best-known page title: explicit title field, else longest successful title extraction."""
        if self.title:
            return self.title # whoopdedoo that was easy
        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            try:
                return (sorted(
                    (
                        result.output.strip()
                        for result in self.archiveresult_set.all()
                        if result.extractor == 'title' and result.status =='succeeded' and result.output
                    ),
                    key=lambda title: len(title),
                ) or [None])[-1]
            except IndexError:
                pass
        try:
            # take longest successful title from ArchiveResult db history
            return sorted(
                self.archiveresult_set\
                    .filter(extractor='title', status='succeeded', output__isnull=False)\
                    .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass
        try:
            # take longest successful title from Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass
        return None
    def save_tags(self, tags: Iterable[str]=()) -> None:
        """Replace this snapshot's tags with the given names, creating Tag rows as needed."""
        tags_id = []
        for tag in tags:
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        # clear-then-add makes this a full replacement, not an append
        self.tags.clear()
        self.tags.add(*tags_id)
  278. # def get_storage_dir(self, create=True, symlink=True) -> Path:
  279. # date_str = self.added.strftime('%Y%m%d')
  280. # domain_str = domain(self.url)
  281. # abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
  282. # if create and not abs_storage_dir.is_dir():
  283. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  284. # if symlink:
  285. # LINK_PATHS = [
  286. # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  287. # # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
  288. # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
  289. # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
  290. # ]
  291. # for link_path in LINK_PATHS:
  292. # link_path.parent.mkdir(parents=True, exist_ok=True)
  293. # try:
  294. # link_path.symlink_to(abs_storage_dir)
  295. # except FileExistsError:
  296. # link_path.unlink()
  297. # link_path.symlink_to(abs_storage_dir)
  298. # return abs_storage_dir
  299. class ArchiveResultManager(models.Manager):
  300. def indexable(self, sorted: bool = True):
  301. """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
  302. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  303. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
  304. if sorted:
  305. precedence = [
  306. When(extractor=method, then=Value(precedence))
  307. for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
  308. ]
  309. qs = qs.annotate(
  310. indexing_precedence=Case(
  311. *precedence,
  312. default=Value(1000),
  313. output_field=IntegerField()
  314. )
  315. ).order_by('indexing_precedence')
  316. return qs
class ArchiveResult(ABIDModel):
    """The output of running one extractor (wget, screenshot, title, ...) against one Snapshot."""
    # ABID configuration: timestamp/URI inherited from the parent snapshot
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.added'
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.old_id'
    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )
    STATUS_CHOICES = [
        ("succeeded", "succeeded"),
        ("failed", "failed"),
        ("skipped", "skipped")
    ]
    old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)
    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id')
    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
    # cmd/pwd record the exact command and working dir used, for reproducibility
    cmd = models.JSONField()
    pwd = models.CharField(max_length=256)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024)
    start_ts = models.DateTimeField(db_index=True)
    end_ts = models.DateTimeField()
    status = models.CharField(max_length=16, choices=STATUS_CHOICES)
    objects = ArchiveResultManager()
    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'
    def __str__(self):
        return self.extractor
    @cached_property
    def snapshot_dir(self):
        """Path of the parent snapshot's archive directory."""
        return Path(self.snapshot.link_dir)
    @property
    def api_url(self) -> str:
        # /api/v1/core/archiveresult/{uulid}
        return reverse_lazy('api-1:get_archiveresult', args=[self.abid])
    @property
    def api_docs_url(self) -> str:
        # static anchor into the generated API docs page
        return f'/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'
    @property
    def extractor_module(self):
        # resolve the extractor name to its implementation module
        return EXTRACTORS[self.extractor]
    def output_path(self) -> str:
        """return the canonical output filename or directory name within the snapshot dir"""
        return self.extractor_module.get_output_path()
    def embed_path(self) -> str:
        """
        return the actual runtime-calculated path to the file on-disk that
        should be used for user-facing iframe embeds of this result
        """
        if get_embed_path_func := getattr(self.extractor_module, 'get_embed_path', None):
            # extractor provides a custom embed path (e.g. a specific file inside its output dir)
            return get_embed_path_func(self)
        return self.extractor_module.get_output_path()
    def legacy_output_path(self):
        """Output path as recorded by the legacy Link JSON index schema."""
        link = self.snapshot.as_link()
        return link.canonical_outputs().get(f'{self.extractor}_path')
    def output_exists(self) -> bool:
        """True if the canonical output file/dir actually exists on disk."""
        return Path(self.output_path()).exists()
  391. # def get_storage_dir(self, create=True, symlink=True):
  392. # date_str = self.snapshot.added.strftime('%Y%m%d')
  393. # domain_str = domain(self.snapshot.url)
  394. # abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
  395. # if create and not abs_storage_dir.is_dir():
  396. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  397. # if symlink:
  398. # LINK_PATHS = [
  399. # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  400. # # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
  401. # # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
  402. # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
  403. # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
  404. # ]
  405. # for link_path in LINK_PATHS:
  406. # link_path.parent.mkdir(parents=True, exist_ok=True)
  407. # try:
  408. # link_path.symlink_to(abs_storage_dir)
  409. # except FileExistsError:
  410. # link_path.unlink()
  411. # link_path.symlink_to(abs_storage_dir)
  412. # return abs_storage_dir
  413. # def symlink_index(self, create=True):
  414. # abs_result_dir = self.get_storage_dir(create=create)