__package__ = 'archivebox.core'

import os
import json
from pathlib import Path
from typing import Optional, Dict, Iterable, Any

from django.db import models
from django.db.models import QuerySet, Case, When, Value, IntegerField
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.contrib import admin
from django.conf import settings
from django_stubs_ext.db.models import TypedModelMeta

import abx

from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import parse_date, base_url
from archivebox.index.schema import Link
from archivebox.index.html import snapshot_icons
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithOutputDir

from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl

# from machine.models import Machine, NetworkInterface


class Tag(ABIDModel):
    """
    Loosely based on django-taggit model + ABID base.
    """
    abid_prefix = 'tag_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.slug'
    abid_subtype_src = '"03"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='tag_set')
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    name = models.CharField(unique=True, blank=False, max_length=100)
    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
    # slug is autoset on save from name, never set it manually

    snapshot_set: models.Manager['Snapshot']
    # crawl_set: models.Manager['Crawl']

    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

    def __str__(self):
        return self.name

    def slugify(self, tag, i=None):
        slug = slugify(tag)
        if i is not None:
            slug += "_%d" % i
        return slug

    def clean(self, *args, **kwargs):
        self.slug = self.slug or self.slugify(self.name)
        super().clean(*args, **kwargs)

    def save(self, *args, **kwargs):
        if self._state.adding:
            self.slug = self.slugify(self.name)

            # if the name is different but the slug conflicts with another tag's slug, append a counter
            # with transaction.atomic():
            slugs = set(
                type(self)
                ._default_manager.filter(slug__startswith=self.slug)
                .values_list("slug", flat=True)
            )
            i = None
            while True:
                slug = self.slugify(self.name, i)
                if slug not in slugs:
                    self.slug = slug
                    return super().save(*args, **kwargs)
                i = 1 if i is None else i + 1
        else:
            return super().save(*args, **kwargs)
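
    # A quick sketch of the slug de-duplication above (example names are hypothetical):
    #   Tag(name='Python').save()   -> slug == 'python'
    #   Tag(name='Python!').save()  -> slugify('Python!') == 'python' is already taken,
    #                                  so the counter loop assigns slug == 'python_1'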

    @property
    def api_url(self) -> str:
        # /api/v1/core/tag/{uulid}
        return reverse_lazy('api-1:get_tag', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag'


class SnapshotTag(models.Model):
    id = models.AutoField(primary_key=True)

    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]


# class CrawlTag(models.Model):
#     id = models.AutoField(primary_key=True)
#
#     crawl = models.ForeignKey('Crawl', db_column='crawl_id', on_delete=models.CASCADE, to_field='id')
#     tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
#
#     class Meta:
#         db_table = 'core_crawl_tags'
#         unique_together = [('crawl', 'tag')]


def validate_timestamp(value):
    assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
    assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
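
# e.g. validate_timestamp('1712345678.0')  -> passes (example values are illustrative)
#      validate_timestamp('2024-04-05')    -> AssertionError (not a float string)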


class SnapshotManager(models.Manager):
    def get_queryset(self):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
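
# Note: the prefetch above is what lets Snapshot.tags_str(), num_outputs, thumbnail_url,
# and latest_title below read from _prefetched_objects_cache instead of issuing one
# extra query per row when rendering large snapshot lists.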


class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
    abid_prefix = 'snp_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    state_machine_name = 'core.statemachines.SnapshotMachine'
    state_field_name = 'status'
    retry_at_field_name = 'retry_at'
    StatusChoices = ModelWithStateMachine.StatusChoices
    active_state = StatusChoices.STARTED

    output_dir_parent = 'snapshots'

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)  # loaded from self._init_timestamp
    modified_at = models.DateTimeField(auto_now=True)

    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')

    bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)

    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore

    url = models.URLField(unique=True, db_index=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False, validators=[validate_timestamp])
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    # config = models.JSONField(default=dict, null=False, blank=False, editable=True)

    keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at', 'created_at', 'status', 'retry_at', 'abid', 'id')

    archiveresult_set: models.Manager['ArchiveResult']

    objects = SnapshotManager()

    def save(self, *args, **kwargs):
        print(f'Snapshot[{self.ABID}].save()')
        if self.pk:
            existing_snapshot = self.__class__.objects.filter(pk=self.pk).first()
            if existing_snapshot and existing_snapshot.status == self.StatusChoices.SEALED:
                if self.as_json() != existing_snapshot.as_json():
                    raise Exception(f'Snapshot {self.pk} is already sealed, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_snapshot.as_json()}')

        if not self.bookmarked_at:
            self.bookmarked_at = self.created_at or self._init_timestamp

        if not self.timestamp:
            self.timestamp = str(self.bookmarked_at.timestamp())

        super().save(*args, **kwargs)
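
    # e.g. a snapshot bookmarked at 2024-01-01 00:00:00 UTC gets
    # timestamp == '1704067200.0' (str of the unix epoch float), which also serves
    # as its directory name under CONSTANTS.ARCHIVE_DIR (values illustrative)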

    def archive(self, overwrite=False, methods=None):
        result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
        return result

    def __repr__(self) -> str:
        url = self.url or '<no url set>'
        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
        if self.id and self.url:
            return f'[{self.ABID}] {url[:64]} @ {created_at}'
        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} @ {created_at}'

    def __str__(self) -> str:
        return repr(self)

    @classmethod
    def from_json(cls, info: dict):
        info = {k: v for k, v in info.items() if k in cls.keys}
        return cls(**info)

    def as_json(self, *args) -> dict:
        args = args or self.keys
        return {
            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
            for key in args
        }

    def as_link(self) -> Link:
        return Link.from_json(self.as_json())

    def as_link_with_details(self) -> Link:
        from ..index import load_link_details
        return load_link_details(self.as_link())

    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        cache_key = f'{self.pk}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-tags'

        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            # tags are pre-fetched already, use them directly (best because db is always freshest)
            tags_str = calc_tags_str()
            return tags_str

        if nocache:
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str

        return cache.get_or_set(cache_key, calc_tags_str)
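
    # Note: the cache key embeds (downloaded_at or bookmarked_at).timestamp(), so a
    # stale cached tags string is simply abandoned whenever the snapshot is
    # re-downloaded (a new downloaded_at produces a new key)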

    def icons(self) -> str:
        return snapshot_icons(self)

    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_snapshot', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'

    def get_absolute_url(self):
        return f'/{self.archive_path}'

    @cached_property
    def title_stripped(self) -> str:
        return (self.title or '').replace("\n", " ").replace("\r", "")

    @cached_property
    def extension(self) -> str:
        from archivebox.misc.util import extension
        return extension(self.url)

    @cached_property
    def bookmarked(self):
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def is_archived(self):
        return self.as_link().is_archived

    @cached_property
    def num_outputs(self) -> int:
        # DONT DO THIS: it will trigger a separate query for every snapshot
        # return self.archiveresult_set.filter(status='succeeded').count()
        # this is better:
        return sum(1 for result in self.archiveresult_set.all() if result.status == 'succeeded')

    @cached_property
    def base_url(self):
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        return '{}/{}'.format(CONSTANTS.ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        cache_key = f'{str(self.pk)[:12]}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-size'

        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                return 0

        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def thumbnail_url(self) -> Optional[str]:
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            # results are pre-fetched already, pick the latest succeeded screenshot in memory
            result = (sorted(
                (
                    result
                    for result in self.archiveresult_set.all()
                    if result.extractor == 'screenshot' and result.status == 'succeeded' and result.output
                ),
                key=lambda result: result.created_at,
            ) or [None])[-1]
        else:
            result = self.archiveresult_set.filter(
                extractor='screenshot',
                status='succeeded'
            ).only('output').last()
        if result:
            return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
        return None
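
    # note: the 'Snapshot' URL route serves files out of the snapshot's archive dir,
    # so the returned value looks like /archive/<timestamp>/<output>
    # (path shape assumed from archive_path above, not verified against urls.py)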

    @cached_property
    def headers(self) -> Optional[Dict[str, str]]:
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:
            pass
        return None

    @cached_property
    def status_code(self) -> Optional[str]:
        return self.headers.get('Status-Code') if self.headers else None

    @cached_property
    def history(self) -> dict:
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history

    @cached_property
    def latest_title(self) -> Optional[str]:
        if self.title:
            return self.title   # whoopdedoo that was easy

        # check if the ArchiveResult set has already been prefetched, if so use it instead of fetching it from the db again
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            try:
                return (sorted(
                    (
                        result.output.strip()
                        for result in self.archiveresult_set.all()
                        if result.extractor == 'title' and result.status == 'succeeded' and result.output
                    ),
                    key=lambda title: len(title),
                ) or [None])[-1]
            except IndexError:
                pass

        try:
            # take the longest successful title from the ArchiveResult db history
            return sorted(
                self.archiveresult_set
                    .filter(extractor='title', status='succeeded', output__isnull=False)
                    .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass

        try:
            # take the longest successful title from the Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass

        return None
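
    # fallback order above: explicit self.title -> prefetched ArchiveResults in memory
    # -> ArchiveResult rows in the db -> legacy Link json index history -> None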

    def save_tags(self, tags: Iterable[str]=()) -> None:
        tags_id = []
        for tag in tags:
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        self.tags.clear()
        self.tags.add(*tags_id)

    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
        pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
        return pending_archiveresults

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']

        # config = get_scope_config(snapshot=self)
        config = {'EXTRACTORS': ','.join(ALL_EXTRACTORS)}

        if config.get('EXTRACTORS', 'auto') == 'auto':
            EXTRACTORS = ALL_EXTRACTORS
        else:
            EXTRACTORS = config.get('EXTRACTORS', '').split(',')

        archiveresults = []

        for extractor in EXTRACTORS:
            if not extractor:
                continue

            if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists():
                continue

            archiveresult, created = ArchiveResult.objects.get_or_create(
                snapshot=self,
                extractor=extractor,
                defaults={
                    'status': ArchiveResult.INITIAL_STATE,
                    'retry_at': timezone.now(),
                },
            )
            if archiveresult.status == ArchiveResult.INITIAL_STATE:
                archiveresults.append(archiveresult)

        return archiveresults
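
    # A rough usage sketch (caller code hypothetical): a worker might do
    #   snapshot.create_pending_archiveresults()
    #   for archiveresult in snapshot.pending_archiveresults():
    #       ...  # run each extractor; its state machine advances it out of QUEUED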

    # def migrate_output_dir(self):
    #     """Move the output files to the new folder structure if needed"""
    #     print(f'{self}.migrate_output_dir()')
    #     self.migrate_from_0_7_2()
    #     self.migrate_from_0_8_6()
    #     # ... future migrations here

    # def migrate_from_0_7_2(self):
    #     """Migrate the folder structure from 0.7.2 to the current version"""
    #     # migrate any existing output_dir into data/archiveresults/<extractor>/YYYY-MM-DD/<domain>/<abid>
    #     # create self.output_dir if it doesn't exist
    #     # move loose files in snapshot_dir into self.output_dir
    #     # update self.pwd = self.output_dir
    #     print(f'{self}.migrate_from_0_7_2()')

    # def migrate_from_0_8_6(self):
    #     """Migrate the folder structure from 0.8.6 to the current version"""
    #     # ... future migration code here ...
    #     print(f'{self}.migrate_from_0_8_6()')

    # def save_json_index(self):
    #     """Save the json index file to ./.index.json"""
    #     print(f'{self}.save_json_index()')

    # def save_symlinks_index(self):
    #     """Update the symlink farm indexes to point to the new location of self.output_dir"""
    #     # ln -s self.output_dir data/index/results_by_type/wget/YYYY-MM-DD/example.com/<abid>
    #     # ln -s self.output_dir data/index/results_by_day/YYYY-MM-DD/example.com/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_domain/example.com/YYYY-MM-DD/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_abid/<abid>
    #     # ln -s self.output_dir data/archive/<snapshot_timestamp>/<extractor>
    #     print(f'{self}.save_symlinks_index()')

    # def save_html_index(self):
    #     """Save the html index file to ./.index.html"""
    #     print(f'{self}.save_html_index()')

    # def save_merkle_index(self):
    #     """Calculate the recursive sha256 of all the files in the output path and save it to ./.checksum.json"""
    #     print(f'{self}.save_merkle_index()')

    # def save_search_index(self):
    #     """Pass any indexable text to the search backend indexer (e.g. sonic, SQLiteFTS5, etc.)"""
    #     print(f'{self}.save_search_index()')

    # def get_storage_dir(self, create=True, symlink=True) -> Path:
    #     date_str = self.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
        INDEXABLE_METHODS = [r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')

        if sorted:
            precedence = [
                When(extractor=method, then=Value(precedence))
                for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
            ]
            qs = qs.annotate(
                indexing_precedence=Case(
                    *precedence,
                    default=Value(1000),
                    output_field=IntegerField(),
                )
            ).order_by('indexing_precedence')
        return qs
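
# e.g. assuming ARCHIVE_METHODS_INDEXING_PRECEDENCE == [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
# (illustrative values), ArchiveResult.objects.indexable() yields succeeded readability
# results first, then singlefile, dom, wget, with anything unlisted sorted last (1000)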


class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.created_at'
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'                # pending, initial
        STARTED = 'started', 'Started'             # active
        BACKOFF = 'backoff', 'Waiting to retry'    # pending
        SUCCEEDED = 'succeeded', 'Succeeded'       # final
        FAILED = 'failed', 'Failed'                # final
        SKIPPED = 'skipped', 'Skipped'             # final

    state_machine_name = 'core.statemachines.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
    state_field_name = 'status'
    active_state = StatusChoices.STARTED

    output_dir_parent = 'archiveresults'

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)

    cmd = models.JSONField(default=None, null=True, blank=True)
    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')

    # the network interface that was used to download this result
    # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')

    objects = ArchiveResultManager()

    keys = ('snapshot_id', 'extractor', 'cmd', 'pwd', 'cmd_version', 'output', 'start_ts', 'end_ts', 'created_at', 'status', 'retry_at', 'abid', 'id')

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

    def __repr__(self):
        snapshot_id = getattr(self, 'snapshot_id', None)
        url = self.snapshot.url if snapshot_id else '<no url set>'
        created_at = self.snapshot.created_at.strftime("%Y-%m-%d %H:%M") if snapshot_id else '<no timestamp set>'
        extractor = self.extractor or '<no extractor set>'
        if self.id and snapshot_id:
            return f'[{self.ABID}] {url[:64]} @ {created_at} -> {extractor}'
        return f'[{self.abid_prefix}****not*saved*yet****] {url} @ {created_at} -> {extractor}'

    def __str__(self):
        return repr(self)

    def save(self, *args, write_indexes: bool=False, **kwargs):
        print(f'ArchiveResult[{self.ABID}].save()')
        # if (self.pk and self.__class__.objects.filter(pk=self.pk).values_list('status', flat=True)[0] in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]):
        #     raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further.')
        if self.pk:
            existing_archiveresult = self.__class__.objects.filter(pk=self.pk).first()
            if existing_archiveresult and existing_archiveresult.status in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]:
                if self.as_json() != existing_archiveresult.as_json():
                    raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_archiveresult.as_json()}')
        super().save(*args, **kwargs)
        # DONT DO THIS:
        # self.snapshot.update_for_workers()  # this should be done manually wherever its needed, not in here as a side-effect on save()

    # TODO: finish connecting machine.models
    # @cached_property
    # def machine(self):
    #     return self.iface.machine if self.iface else None

    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.link_dir)

    @cached_property
    def url(self):
        return self.snapshot.url

    @property
    def api_url(self) -> str:
        # /api/v1/core/archiveresult/{uulid}
        return reverse_lazy('api-1:get_archiveresult', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'

    def get_absolute_url(self):
        return f'/{self.snapshot.archive_path}/{self.extractor}'

    @property
    def extractor_module(self) -> Any | None:
        return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)

    def embed_path(self) -> str | None:
        """
        return the actual runtime-calculated path to the file on-disk that
        should be used for user-facing iframe embeds of this result
        """
        try:
            return self.extractor_module.get_embed_path(self)
        except Exception as e:
            print(f'Error getting embed path for {self.extractor} extractor: {e}')
            return None

    def legacy_output_path(self):
        link = self.snapshot.as_link()
        return link.canonical_outputs().get(f'{self.extractor}_path')

    def output_exists(self) -> bool:
        output_path = Path(self.snapshot_dir) / self.extractor
        return os.path.exists(output_path)

    def create_output_dir(self):
        output_dir = Path(self.snapshot_dir) / self.extractor
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir
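
    # e.g. for a snapshot with timestamp '1704067200.0' and extractor 'screenshot',
    # this is <ARCHIVE_DIR>/1704067200.0/screenshot/ (derived from snapshot.link_dir
    # above; example values are illustrative)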

    def as_json(self, *args) -> dict:
        args = args or self.keys
        return {
            key: getattr(self, key)
            for key in args
        }

    def write_indexes(self):
        """Write the ArchiveResult json, html, and merkle indexes to the output dir, and pass searchable text to the search backend"""
        super().write_indexes()
        self.save_search_index()

    # def migrate_output_dir(self):
    #     """Move the output files to the new folder structure if needed"""
    #     print(f'{self}.migrate_output_dir()')
    #     self.migrate_from_0_7_2()
    #     self.migrate_from_0_8_6()
    #     # ... future migrations here

    # def migrate_from_0_7_2(self):
    #     """Migrate the folder structure from 0.7.2 to the current version"""
    #     # migrate any existing output_dir into data/archiveresults/<extractor>/YYYY-MM-DD/<domain>/<abid>
    #     # create self.output_dir if it doesn't exist
    #     # move loose files in snapshot_dir into self.output_dir
    #     # update self.pwd = self.output_dir
    #     print(f'{self}.migrate_from_0_7_2()')

    # def migrate_from_0_8_6(self):
    #     """Migrate the folder structure from 0.8.6 to the current version"""
    #     # ... future migration code here ...
    #     print(f'{self}.migrate_from_0_8_6()')

    # def save_json_index(self):
    #     """Save the json index file to ./.index.json"""
    #     print(f'{self}.save_json_index()')

    # def save_symlinks_index(self):
    #     """Update the symlink farm indexes to point to the new location of self.output_dir"""
    #     # ln -s self.output_dir data/index/results_by_type/wget/YYYY-MM-DD/example.com/<abid>
    #     # ln -s self.output_dir data/index/results_by_day/YYYY-MM-DD/example.com/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_domain/example.com/YYYY-MM-DD/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_abid/<abid>
    #     # ln -s self.output_dir data/archive/<snapshot_timestamp>/<extractor>
    #     print(f'{self}.save_symlinks_index()')

    # def save_html_index(self):
    #     """Save the html index file to ./.index.html"""
    #     print(f'{self}.save_html_index()')

    # def save_merkle_index(self):
    #     """Calculate the recursive sha256 of all the files in the output path and save it to ./.checksum.json"""
    #     print(f'{self}.save_merkle_index()')

    def save_search_index(self):
        """Pass any indexable text to the search backend indexer (e.g. sonic, SQLiteFTS5, etc.)"""
        print(f'{self}.save_search_index()')

    # def get_storage_dir(self, create=True, symlink=True):
    #     date_str = self.snapshot.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.snapshot.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir

    # def symlink_index(self, create=True):
    #     abs_result_dir = self.get_storage_dir(create=create)