models.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, Dict, Iterable
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import os
  5. import json
  6. from pathlib import Path
  7. from django.db import models
  8. from django.utils.functional import cached_property
  9. from django.utils.text import slugify
  10. from django.core.cache import cache
  11. from django.urls import reverse, reverse_lazy
  12. from django.db.models import Case, When, Value, IntegerField
  13. from django.contrib import admin
  14. from django.conf import settings
  15. from archivebox.config import CONSTANTS
  16. from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
  17. from queues.tasks import bg_archive_snapshot
  18. # from crawls.models import Crawl
  19. # from machine.models import Machine, NetworkInterface
  20. from archivebox.misc.system import get_dir_size
  21. from archivebox.misc.util import parse_date, base_url
  22. from ..index.schema import Link
  23. from ..index.html import snapshot_icons
  24. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  25. # class BaseModel(models.Model):
  26. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  27. # # ulid/created_at/modified_at/created_by/is_deleted/as_json/from_json/etc.
  28. # #
  29. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  30. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  31. # class Meta(TypedModelMeta):
  32. # abstract = True
  33. class Tag(ABIDModel):
  34. """
  35. Loosely based on django-taggit model + ABID base.
  36. """
  37. abid_prefix = 'tag_'
  38. abid_ts_src = 'self.created_at'
  39. abid_uri_src = 'self.slug'
  40. abid_subtype_src = '"03"'
  41. abid_rand_src = 'self.id'
  42. abid_drift_allowed = True
  43. id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
  44. abid = ABIDField(prefix=abid_prefix)
  45. created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='tag_set')
  46. created_at = AutoDateTimeField(default=None, null=False, db_index=True)
  47. modified_at = models.DateTimeField(auto_now=True)
  48. name = models.CharField(unique=True, blank=False, max_length=100)
  49. slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
  50. # slug is autoset on save from name, never set it manually
  51. snapshot_set: models.Manager['Snapshot']
  52. # crawl_set: models.Manager['Crawl']
  53. class Meta(TypedModelMeta):
  54. verbose_name = "Tag"
  55. verbose_name_plural = "Tags"
  56. def __str__(self):
  57. return self.name
  58. def slugify(self, tag, i=None):
  59. slug = slugify(tag)
  60. if i is not None:
  61. slug += "_%d" % i
  62. return slug
  63. def clean(self, *args, **kwargs):
  64. self.slug = self.slug or self.slugify(self.name)
  65. super().clean(*args, **kwargs)
  66. def save(self, *args, **kwargs):
  67. if self._state.adding:
  68. self.slug = self.slugify(self.name)
  69. # if name is different but slug conficts with another tags slug, append a counter
  70. # with transaction.atomic():
  71. slugs = set(
  72. type(self)
  73. ._default_manager.filter(slug__startswith=self.slug)
  74. .values_list("slug", flat=True)
  75. )
  76. i = None
  77. while True:
  78. slug = self.slugify(self.name, i)
  79. if slug not in slugs:
  80. self.slug = slug
  81. return super().save(*args, **kwargs)
  82. i = 1 if i is None else i+1
  83. else:
  84. return super().save(*args, **kwargs)
  85. @property
  86. def api_url(self) -> str:
  87. # /api/v1/core/snapshot/{uulid}
  88. return reverse_lazy('api-1:get_tag', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}'
  89. @property
  90. def api_docs_url(self) -> str:
  91. return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
class SnapshotTag(models.Model):
    """Explicit many-to-many through table linking Snapshots to Tags."""
    id = models.AutoField(primary_key=True)

    # to_field='id' pins both FKs to the UUID primary key columns
    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        db_table = 'core_snapshot_tags'
        # a given tag can only be applied to a given snapshot once
        unique_together = [('snapshot', 'tag')]
  99. # class CrawlTag(models.Model):
  100. # id = models.AutoField(primary_key=True)
  101. # crawl = models.ForeignKey('Crawl', db_column='crawl_id', on_delete=models.CASCADE, to_field='id')
  102. # tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
  103. # class Meta:
  104. # db_table = 'core_crawl_tags'
  105. # unique_together = [('crawl', 'tag')]
  106. class SnapshotManager(models.Manager):
  107. def get_queryset(self):
  108. return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
class Snapshot(ABIDModel):
    """
    One archived URL: a unique url captured at a point in time, whose
    per-extractor outputs are stored as ArchiveResult rows and written to
    disk under ``./archive/<timestamp>/``.
    """
    abid_prefix = 'snp_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set')
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)  # loaded from self._init_timestamp
    modified_at = models.DateTimeField(auto_now=True)

    # legacy ts fields
    bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)

    # crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')

    url = models.URLField(unique=True, db_index=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

    # field names included in JSON serialization (see from_json / as_json)
    keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at')

    archiveresult_set: models.Manager['ArchiveResult']

    objects = SnapshotManager()

    def save(self, *args, **kwargs):
        """Save, backfilling bookmarked_at (NOT NULL) from created_at if unset."""
        if not self.bookmarked_at:
            self.bookmarked_at = self.created_at or self._init_timestamp
        super().save(*args, **kwargs)

    def archive(self, overwrite=False, methods=None):
        """Queue a background (re-)archive of this snapshot; returns the task result handle."""
        result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
        return result

    def __repr__(self) -> str:
        title = (self.title_stripped or '-')[:64]
        return f'[{self.timestamp}] {self.url[:64]} ({title})'

    def __str__(self) -> str:
        title = (self.title_stripped or '-')[:64]
        return f'[{self.timestamp}] {self.url[:64]} ({title})'

    @classmethod
    def from_json(cls, info: dict):
        """Alternate constructor: build an (unsaved) Snapshot from a dict, ignoring unknown keys."""
        info = {k: v for k, v in info.items() if k in cls.keys}
        return cls(**info)

    def as_json(self, *args) -> dict:
        """Serialize the fields named in *args (default: cls.keys) to a plain dict; tags become a CSV string."""
        args = args or self.keys
        return {
            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
            for key in args
        }

    def as_link(self) -> Link:
        """Convert to the legacy index.schema.Link representation."""
        return Link.from_json(self.as_json())

    def as_link_with_details(self) -> Link:
        """Like as_link() but with per-extractor history loaded from the on-disk JSON index."""
        from ..index import load_link_details
        return load_link_details(self.as_link())

    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        """Return this snapshot's tag names as a sorted comma-separated string (cached)."""
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        # cache key includes the last-touched timestamp so stale entries self-invalidate
        cache_key = f'{self.pk}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-tags'

        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            # tags are pre-fetched already, use them directly (best because db is always freshest)
            tags_str = calc_tags_str()
            return tags_str

        if nocache:
            # recompute from db and refresh the cache entry
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str
        return cache.get_or_set(cache_key, calc_tags_str)

    def icons(self) -> str:
        """Render the HTML icon strip showing which extractors succeeded."""
        return snapshot_icons(self)

    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_snapshot', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'

    def get_absolute_url(self):
        return f'/{self.archive_path}'

    @cached_property
    def title_stripped(self) -> str:
        # title with newlines flattened so it is safe for single-line display
        return (self.title or '').replace("\n", " ").replace("\r", "")

    @cached_property
    def extension(self) -> str:
        from archivebox.misc.util import extension
        return extension(self.url)

    @cached_property
    def bookmarked(self):
        # the legacy timestamp field is a stringified unix epoch; parse it to a datetime
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def is_archived(self):
        return self.as_link().is_archived

    @cached_property
    def num_outputs(self) -> int:
        # DONT DO THIS: it will trigger a separate query for every snapshot
        # return self.archiveresult_set.filter(status='succeeded').count()
        # this is better (counts in python using the prefetched result set):
        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))

    @cached_property
    def base_url(self):
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        # absolute filesystem path to this snapshot's output dir
        return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        # url-style relative path: archive/<timestamp>
        return '{}/{}'.format(CONSTANTS.ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        """Total on-disk size of this snapshot's output dir in bytes (cached; 0 on error)."""
        cache_key = f'{str(self.pk)[:12]}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-size'

        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                # dir may not exist yet or be unreadable; treat as empty
                return 0

        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def thumbnail_url(self) -> Optional[str]:
        """URL of the most recent successful screenshot output, or None."""
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            # use the prefetched results to avoid an extra query; take the newest match
            result = (sorted(
                (
                    result
                    for result in self.archiveresult_set.all()
                    if result.extractor == 'screenshot' and result.status =='succeeded' and result.output
                ),
                key=lambda result: result.created_at,
            ) or [None])[-1]
        else:
            result = self.archiveresult_set.filter(
                extractor='screenshot',
                status='succeeded'
            ).only('output').last()
        if result:
            return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
        return None

    @cached_property
    def headers(self) -> Optional[Dict[str, str]]:
        """Parsed headers.json from the snapshot dir, or None if missing/unparseable."""
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:
            pass
        return None

    @cached_property
    def status_code(self) -> Optional[str]:
        return self.headers.get('Status-Code') if self.headers else None

    @cached_property
    def history(self) -> dict:
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history

    @cached_property
    def latest_title(self) -> Optional[str]:
        """Best-known page title: the title field, else the longest successful 'title' extractor output."""
        if self.title:
            return self.title  # whoopdedoo that was easy

        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            try:
                return (sorted(
                    (
                        result.output.strip()
                        for result in self.archiveresult_set.all()
                        if result.extractor == 'title' and result.status =='succeeded' and result.output
                    ),
                    key=lambda title: len(title),
                ) or [None])[-1]
            except IndexError:
                pass

        try:
            # take longest successful title from ArchiveResult db history
            return sorted(
                self.archiveresult_set\
                    .filter(extractor='title', status='succeeded', output__isnull=False)\
                    .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass

        try:
            # take longest successful title from Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass

        return None

    def save_tags(self, tags: Iterable[str]=()) -> None:
        """Replace this snapshot's tags with the given names, creating Tag rows as needed."""
        tags_id = []
        for tag in tags:
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        self.tags.clear()
        self.tags.add(*tags_id)

    # def get_storage_dir(self, create=True, symlink=True) -> Path:
    #     date_str = self.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir
  326. class ArchiveResultManager(models.Manager):
  327. def indexable(self, sorted: bool = True):
  328. """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
  329. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  330. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
  331. if sorted:
  332. precedence = [
  333. When(extractor=method, then=Value(precedence))
  334. for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
  335. ]
  336. qs = qs.annotate(
  337. indexing_precedence=Case(
  338. *precedence,
  339. default=Value(1000),
  340. output_field=IntegerField()
  341. )
  342. ).order_by('indexing_precedence')
  343. return qs
class ArchiveResult(ABIDModel):
    """
    The output of running one extractor (wget, screenshot, title, ...) against
    one Snapshot: the command run, its timing, status, and output path.
    """
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.created_at'
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )
    STATUS_CHOICES = [
        ("succeeded", "succeeded"),
        ("failed", "failed"),
        ("skipped", "skipped")
    ]

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set')
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id')

    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
    # cmd: the argv list that was executed, pwd: the working dir it ran in
    cmd = models.JSONField()
    pwd = models.CharField(max_length=256)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024)
    start_ts = models.DateTimeField(db_index=True)
    end_ts = models.DateTimeField()
    status = models.CharField(max_length=16, choices=STATUS_CHOICES)

    # the network interface that was used to download this result
    # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')

    objects = ArchiveResultManager()

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

    def __str__(self):
        # return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}'
        return self.extractor

    # TODO: finish connecting machine.models
    # @cached_property
    # def machine(self):
    #     return self.iface.machine if self.iface else None

    @cached_property
    def snapshot_dir(self):
        # absolute path to the parent snapshot's output dir
        return Path(self.snapshot.link_dir)

    @property
    def api_url(self) -> str:
        # /api/v1/core/archiveresult/{uulid}
        return reverse_lazy('api-1:get_archiveresult', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'

    def get_absolute_url(self):
        return f'/{self.snapshot.archive_path}/{self.output_path()}'

    @property
    def extractor_module(self):
        # the plugin module that produced this result, looked up by extractor name
        return EXTRACTORS[self.extractor]

    def output_path(self) -> str:
        """return the canonical output filename or directory name within the snapshot dir"""
        return self.extractor_module.get_output_path()

    def embed_path(self) -> str:
        """
        return the actual runtime-calculated path to the file on-disk that
        should be used for user-facing iframe embeds of this result
        """
        # extractors may define a custom get_embed_path; fall back to the canonical output path
        if get_embed_path_func := getattr(self.extractor_module, 'get_embed_path', None):
            return get_embed_path_func(self)
        return self.extractor_module.get_output_path()

    def legacy_output_path(self):
        # output path as computed by the legacy Link schema (pre-plugin system)
        link = self.snapshot.as_link()
        return link.canonical_outputs().get(f'{self.extractor}_path')

    def output_exists(self) -> bool:
        # NOTE(review): output_path() is relative to the snapshot dir, so this
        # check presumably depends on the process cwd — confirm against callers
        return os.access(self.output_path(), os.R_OK)

    # def get_storage_dir(self, create=True, symlink=True):
    #     date_str = self.snapshot.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.snapshot.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir

    # def symlink_index(self, create=True):
    #     abs_result_dir = self.get_storage_dir(create=create)