__package__ = 'archivebox.core'

from typing import Optional, Dict, Iterable, Any
from django_stubs_ext.db.models import TypedModelMeta

import os
import json
from pathlib import Path

from django.db import models
from django.db.models import QuerySet
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.db.models import Case, When, Value, IntegerField
from django.contrib import admin
from django.conf import settings

import abx

from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import parse_date, base_url
from archivebox.index.schema import Link
from archivebox.index.html import snapshot_icons
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithOutputDir, ModelWithConfig
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl

# from machine.models import Machine, NetworkInterface


class Tag(ABIDModel):
    """
    Loosely based on django-taggit model + ABID base.
    """
    abid_prefix = 'tag_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.slug'
    abid_subtype_src = '"03"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='tag_set')
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    name = models.CharField(unique=True, blank=False, max_length=100)
    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
    # slug is autoset on save from name, never set it manually

    snapshot_set: models.Manager['Snapshot']
    # crawl_set: models.Manager['Crawl']

    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

    def __str__(self):
        return self.name

    def slugify(self, tag, i=None):
        slug = slugify(tag)
        if i is not None:
            slug += "_%d" % i
        return slug

    def clean(self, *args, **kwargs):
        self.slug = self.slug or self.slugify(self.name)
        super().clean(*args, **kwargs)

    def save(self, *args, **kwargs):
        if self._state.adding:
            self.slug = self.slugify(self.name)

            # if the name is different but the slug conflicts with another tag's slug, append a counter
            # with transaction.atomic():
            slugs = set(
                type(self)
                ._default_manager.filter(slug__startswith=self.slug)
                .values_list("slug", flat=True)
            )

            i = None
            while True:
                slug = self.slugify(self.name, i)
                if slug not in slugs:
                    self.slug = slug
                    return super().save(*args, **kwargs)
                i = 1 if i is None else i + 1
        else:
            return super().save(*args, **kwargs)

    @property
    def api_url(self) -> str:
        # /api/v1/core/tag/{uulid}
        return reverse_lazy('api-1:get_tag', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
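
# Usage sketch (not from the original file; assumes an existing `user` row):
# Tag.save() auto-slugifies the name and appends a counter on collisions, so two
# names that slugify identically still end up with unique slugs:
#
#   tag_a = Tag.objects.create(name='Foo Bar', created_by=user)    # slug == 'foo-bar'
#   tag_b = Tag.objects.create(name='Foo Bar!', created_by=user)   # slug == 'foo-bar_1'

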
class SnapshotTag(models.Model):
    id = models.AutoField(primary_key=True)

    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]


# class CrawlTag(models.Model):
#     id = models.AutoField(primary_key=True)
#
#     crawl = models.ForeignKey('Crawl', db_column='crawl_id', on_delete=models.CASCADE, to_field='id')
#     tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
#
#     class Meta:
#         db_table = 'core_crawl_tags'
#         unique_together = [('crawl', 'tag')]


def validate_timestamp(value):
    assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
    assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
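
# Validation sketch: timestamps are stored as stringified unix floats (they also
# serve as the Snapshot directory names under archive/), so:
#
#   validate_timestamp('1733428346.0')   # ok
#   validate_timestamp('1733428346')     # ok
#   validate_timestamp('')               # AssertionError: must be a non-empty string
#   validate_timestamp('2024-12-05')     # AssertionError: must be a float str

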
class SnapshotManager(models.Manager):
    def get_queryset(self):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')   # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
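
# Note: because the default manager prefetches `tags` and `archiveresult_set`,
# loops like the sketch below should stay at a fixed number of queries instead
# of issuing two extra queries per row:
#
#   for snapshot in Snapshot.objects.all():
#       print(snapshot.tags_str(), snapshot.num_outputs)   # served from the prefetch cache

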
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDModel):
    abid_prefix = 'snp_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    state_machine_name = 'core.statemachines.SnapshotMachine'
    state_field_name = 'status'
    retry_at_field_name = 'retry_at'
    StatusChoices = ModelWithStateMachine.StatusChoices
    active_state = StatusChoices.STARTED

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)   # loaded from self._init_timestamp
    modified_at = models.DateTimeField(auto_now=True)

    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)

    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')

    bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)

    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)   # type: ignore

    url = models.URLField(unique=True, db_index=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False, validators=[validate_timestamp])
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

    keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at', 'created_at', 'status', 'retry_at', 'abid', 'id')

    archiveresult_set: models.Manager['ArchiveResult']

    objects = SnapshotManager()

    ### Inherited from ModelWithStateMachine #################################
    # StatusChoices = ModelWithStateMachine.StatusChoices
    #
    # status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
    # retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    #
    # state_machine_name = 'core.statemachines.SnapshotMachine'
    # state_field_name = 'status'
    # retry_at_field_name = 'retry_at'
    # active_state = StatusChoices.STARTED
    ##########################################################################

    ### Inherited from ModelWithConfig #######################################
    # config = models.JSONField(default=dict, null=False, blank=False, editable=True)
    ##########################################################################

    ### Inherited from ModelWithOutputDir:
    # output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
    # self.save(): creates OUTPUT_DIR, writes index.json, writes indexes
    # self.output_dir_parent -> str 'archive/snapshots/<YYYY-MM-DD>/<example.com>'
    # self.output_dir_name   -> '<abid>'
    # self.output_dir_str    -> 'archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>'
    # self.OUTPUT_DIR        -> Path('/data/archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>')

    ### Inherited from ABIDModel:
    # id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    # abid = ABIDField(prefix=abid_prefix)
    # created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    # created_at = AutoDateTimeField(default=None, null=False, db_index=True)   # loaded from self._init_timestamp
    # modified_at = models.DateTimeField(auto_now=True)
    # abid_prefix = 'snp_'
    # abid_ts_src = 'self.created_at'
    # abid_uri_src = 'self.url'
    # abid_subtype_src = '"01"'
    # abid_rand_src = 'self.id'
    # abid_drift_allowed = True
    # self.clean() -> sets self._timestamp
    # self.save() -> issues new ABID if creating new, otherwise uses existing ABID
    # self.ABID -> ABID
    # self.api_url -> '/api/v1/core/snapshot/{uulid}'
    # self.api_docs_url -> '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'
    # self.admin_change_url -> '/admin/core/snapshot/{pk}/change/'
    # self.get_absolute_url() -> '/{self.archive_path}'
    # self.update_for_workers() -> bool
    # self.as_json() -> dict[str, Any]

    def save(self, *args, **kwargs):
        print(f'Snapshot[{self.ABID}].save()')
        if self.pk:
            existing_snapshot = self.__class__.objects.filter(pk=self.pk).first()
            if existing_snapshot and existing_snapshot.status == self.StatusChoices.SEALED:
                if self.as_json() != existing_snapshot.as_json():
                    raise Exception(f'Snapshot {self.pk} is already sealed, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_snapshot.as_json()}')

        if not self.bookmarked_at:
            self.bookmarked_at = self.created_at or self._init_timestamp

        if not self.timestamp:
            self.timestamp = str(self.bookmarked_at.timestamp())

        super().save(*args, **kwargs)

        # make sure the crawl has this url in its urls log
        if self.crawl and self.url not in self.crawl.urls:
            self.crawl.urls += f'\n{self.url}'
            self.crawl.save()
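
    # Sealing sketch: once status reaches SEALED, any re-save that changes the
    # as_json() representation raises (a no-op re-save is still permitted):
    #
    #   snapshot.status = Snapshot.StatusChoices.SEALED
    #   snapshot.save()
    #   snapshot.title = 'changed after sealing'
    #   snapshot.save()    # raises Exception: already sealed, cannot be modified
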
    def archive(self, overwrite=False, methods=None):
        result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
        return result

    def __repr__(self) -> str:
        url = self.url or '<no url set>'
        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
        if self.id and self.url:
            return f'[{self.ABID}] {url[:64]} @ {created_at}'
        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} @ {created_at}'

    def __str__(self) -> str:
        return repr(self)

    @classmethod
    def from_json(cls, info: dict):
        info = {k: v for k, v in info.items() if k in cls.keys}
        return cls(**info)

    def as_json(self, *args) -> dict:
        args = args or self.keys
        return {
            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
            for key in args
        }

    def as_link(self) -> Link:
        return Link.from_json(self.as_json())

    def as_link_with_details(self) -> Link:
        from ..index import load_link_details
        return load_link_details(self.as_link())
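
    # Round-trip sketch (simple fields only; note that 'tags' serializes to a
    # string via tags_str() and cannot be passed back into the model constructor):
    #
    #   data = snapshot.as_json('url', 'timestamp', 'title')
    #   clone = Snapshot.from_json(data)    # unsaved Snapshot with those fields set
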
    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        cache_key = f'{self.pk}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-tags'

        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            # tags are pre-fetched already, use them directly (best because db is always freshest)
            tags_str = calc_tags_str()
            return tags_str

        if nocache:
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str
        return cache.get_or_set(cache_key, calc_tags_str)
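
    # Caching sketch: the cache key embeds pk + the latest relevant timestamp, so
    # stale entries are simply never read again once downloaded_at/bookmarked_at
    # advances:
    #
    #   snapshot.tags_str()                # recomputes (and re-caches) when not prefetched
    #   snapshot.tags_str(nocache=False)   # serves from the django cache when available
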
    def icons(self) -> str:
        return snapshot_icons(self)

    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_snapshot', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'

    def get_absolute_url(self):
        return f'/{self.archive_path}'

    @cached_property
    def title_stripped(self) -> str:
        return (self.title or '').replace("\n", " ").replace("\r", "")

    @cached_property
    def extension(self) -> str:
        from archivebox.misc.util import extension
        return extension(self.url)

    @cached_property
    def bookmarked(self):
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def is_archived(self):
        return self.as_link().is_archived

    @cached_property
    def num_outputs(self) -> int:
        # DONT DO THIS: it will trigger a separate query for every snapshot
        # return self.archiveresult_set.filter(status='succeeded').count()

        # this is better, it uses the already-prefetched archiveresult_set:
        return sum(1 for result in self.archiveresult_set.all() if result.status == 'succeeded')

    @cached_property
    def base_url(self):
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        return '{}/{}'.format(CONSTANTS.ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        cache_key = f'{str(self.pk)[:12]}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-size'

        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                return 0

        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def thumbnail_url(self) -> Optional[str]:
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            result = (sorted(
                (
                    result
                    for result in self.archiveresult_set.all()
                    if result.extractor == 'screenshot' and result.status == 'succeeded' and result.output
                ),
                key=lambda result: result.created_at,
            ) or [None])[-1]
        else:
            result = self.archiveresult_set.filter(
                extractor='screenshot',
                status='succeeded',
            ).only('output').last()

        if result:
            return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
        return None

    @cached_property
    def headers(self) -> Optional[Dict[str, str]]:
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:
            pass
        return None

    @cached_property
    def status_code(self) -> Optional[str]:
        return self.headers.get('Status-Code') if self.headers else None

    @cached_property
    def history(self) -> dict:
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history

    @cached_property
    def latest_title(self) -> Optional[str]:
        if self.title:
            return self.title   # whoopdedoo that was easy

        # check if the ArchiveResult set has already been prefetched; if so, use it instead of fetching from the db again
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            try:
                return (sorted(
                    (
                        result.output.strip()
                        for result in self.archiveresult_set.all()
                        if result.extractor == 'title' and result.status == 'succeeded' and result.output
                    ),
                    key=lambda title: len(title),
                ) or [None])[-1]
            except IndexError:
                pass

        try:
            # take longest successful title from ArchiveResult db history
            return sorted(
                self.archiveresult_set
                    .filter(extractor='title', status='succeeded', output__isnull=False)
                    .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass

        try:
            # take longest successful title from Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass

        return None

    def save_tags(self, tags: Iterable[str] = ()) -> None:
        tags_id = []
        for tag in tags:
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        self.tags.clear()
        self.tags.add(*tags_id)
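
    # Usage sketch: save_tags() replaces the existing tag set rather than appending:
    #
    #   snapshot.save_tags(['news', 'python'])   # tags are now exactly {news, python}
    #   snapshot.save_tags(['python'])           # tags are now exactly {python}
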
    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
        pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
        return pending_archiveresults

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']

        # config = get_scope_config(snapshot=self)
        config = {'EXTRACTORS': ','.join(ALL_EXTRACTORS)}

        if config.get('EXTRACTORS', 'auto') == 'auto':
            EXTRACTORS = ALL_EXTRACTORS
        else:
            EXTRACTORS = config.get('EXTRACTORS', '').split(',')

        archiveresults = []
        for extractor in EXTRACTORS:
            if not extractor:
                continue
            if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists():
                continue
            archiveresult, created = ArchiveResult.objects.get_or_create(
                snapshot=self,
                extractor=extractor,
                defaults={
                    'status': ArchiveResult.INITIAL_STATE,
                    'retry_at': timezone.now(),
                },
            )
            if archiveresult.status == ArchiveResult.INITIAL_STATE:
                archiveresults.append(archiveresult)
        return archiveresults
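
    # Worker-flow sketch (normally the state machine drives this):
    #
    #   created = snapshot.create_pending_archiveresults()   # one queued ArchiveResult per extractor, existing ones skipped
    #   pending = snapshot.pending_archiveresults()          # everything not yet in a final or active state
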
    # def migrate_output_dir(self):
    #     """Move the output files to the new folder structure if needed"""
    #     print(f'{self}.migrate_output_dir()')
    #     self.migrate_from_0_7_2()
    #     self.migrate_from_0_8_6()
    #     # ... future migrations here

    # def migrate_from_0_7_2(self):
    #     """Migrate the folder structure from 0.7.2 to the current version"""
    #     # migrate any existing output_dir into data/archiveresults/<extractor>/YYYY-MM-DD/<domain>/<abid>
    #     # create self.output_dir if it doesn't exist
    #     # move loose files in snapshot_dir into self.output_dir
    #     # update self.pwd = self.output_dir
    #     print(f'{self}.migrate_from_0_7_2()')

    # def migrate_from_0_8_6(self):
    #     """Migrate the folder structure from 0.8.6 to the current version"""
    #     # ... future migration code here ...
    #     print(f'{self}.migrate_from_0_8_6()')

    # def save_json_index(self):
    #     """Save the json index file to ./.index.json"""
    #     print(f'{self}.save_json_index()')
    #     pass

    # def save_symlinks_index(self):
    #     """Update the symlink farm indexes to point to the new location of self.output_dir"""
    #     # ln -s self.output_dir data/index/results_by_type/wget/YYYY-MM-DD/example.com/<abid>
    #     # ln -s self.output_dir data/index/results_by_day/YYYY-MM-DD/example.com/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_domain/example.com/YYYY-MM-DD/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_abid/<abid>
    #     # ln -s self.output_dir data/archive/<snapshot_timestamp>/<extractor>
    #     print(f'{self}.save_symlinks_index()')

    # def save_html_index(self):
    #     """Save the html index file to ./.index.html"""
    #     print(f'{self}.save_html_index()')
    #     pass

    # def save_merkle_index(self):
    #     """Calculate the recursive sha256 of all the files in the output path and save it to ./.checksum.json"""
    #     print(f'{self}.save_merkle_index()')
    #     pass

    # def save_search_index(self):
    #     """Pass any indexable text to the search backend indexer (e.g. sonic, SQLiteFTS5, etc.)"""
    #     print(f'{self}.save_search_index()')
    #     pass

    # def get_storage_dir(self, create=True, symlink=True) -> Path:
    #     date_str = self.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
    #
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #
    #     return abs_storage_dir


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
        INDEXABLE_METHODS = [r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')

        if sorted:
            precedence = [
                When(extractor=method, then=Value(precedence))
                for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
            ]
            qs = qs.annotate(
                indexing_precedence=Case(
                    *precedence,
                    default=Value(1000),
                    output_field=IntegerField(),
                )
            ).order_by('indexing_precedence')
        return qs
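
# Query sketch: .indexable() returns succeeded results whose output is usable for
# full-text search, best extractor first per ARCHIVE_METHODS_INDEXING_PRECEDENCE:
#
#   for result in ArchiveResult.objects.indexable():
#       index_texts(result)   # `index_texts` is a hypothetical search-backend hook

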
class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.created_at'
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'                # pending, initial
        STARTED = 'started', 'Started'             # active
        BACKOFF = 'backoff', 'Waiting to retry'    # pending
        SUCCEEDED = 'succeeded', 'Succeeded'       # final
        FAILED = 'failed', 'Failed'                # final
        SKIPPED = 'skipped', 'Skipped'             # final

    state_machine_name = 'core.statemachines.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
    state_field_name = 'status'
    active_state = StatusChoices.STARTED

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)   # type: ignore

    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
    cmd = models.JSONField(default=None, null=True, blank=True)
    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')

    # the network interface that was used to download this result
    # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')

    objects = ArchiveResultManager()

    keys = ('snapshot_id', 'extractor', 'cmd', 'pwd', 'cmd_version', 'output', 'start_ts', 'end_ts', 'created_at', 'status', 'retry_at', 'abid', 'id')

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

    def __repr__(self):
        snapshot_id = getattr(self, 'snapshot_id', None)
        url = self.snapshot.url if snapshot_id else '<no url set>'
        created_at = self.snapshot.created_at.strftime("%Y-%m-%d %H:%M") if snapshot_id else '<no timestamp set>'
        extractor = self.extractor or '<no extractor set>'
        if self.id and snapshot_id:
            return f'[{self.ABID}] {url[:64]} @ {created_at} -> {extractor}'
        return f'[{self.abid_prefix}****not*saved*yet****] {url} @ {created_at} -> {extractor}'

    def __str__(self):
        return repr(self)

    def save(self, *args, write_indexes: bool = False, **kwargs):
        print(f'ArchiveResult[{self.ABID}].save()')
        # if (self.pk and self.__class__.objects.filter(pk=self.pk).values_list('status', flat=True)[0] in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]):
        #     raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further.')
        if self.pk:
            existing_archiveresult = self.__class__.objects.filter(pk=self.pk).first()
            if existing_archiveresult and existing_archiveresult.status in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]:
                if self.as_json() != existing_archiveresult.as_json():
                    raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_archiveresult.as_json()}')
        super().save(*args, **kwargs)
        # DONT DO THIS:
        # self.snapshot.update_for_workers()   # this should be done manually wherever it's needed, not in here as a side-effect on save()

    # TODO: finish connecting machine.models
    # @cached_property
    # def machine(self):
    #     return self.iface.machine if self.iface else None

    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.link_dir)

    @cached_property
    def url(self):
        return self.snapshot.url

    @property
    def api_url(self) -> str:
        # /api/v1/core/archiveresult/{uulid}
        return reverse_lazy('api-1:get_archiveresult', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'

    def get_absolute_url(self):
        return f'/{self.snapshot.archive_path}/{self.extractor}'

    @property
    def extractor_module(self) -> Any | None:
        return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)

    @property
    def EXTRACTOR(self) -> object:
        # return self.extractor_module
        return self.extractor_module(archiveresult=self)

    def embed_path(self) -> str | None:
        """
        return the actual runtime-calculated path to the file on-disk that
        should be used for user-facing iframe embeds of this result
        """
        try:
            return self.extractor_module.get_embed_path(self)
        except Exception as e:
            print(f'Error getting embed path for {self.extractor} extractor: {e}')
            return None

    def legacy_output_path(self):
        link = self.snapshot.as_link()
        return link.canonical_outputs().get(f'{self.extractor}_path')

    def output_exists(self) -> bool:
        output_path = Path(self.snapshot_dir) / self.extractor
        return os.path.exists(output_path)

    def create_output_dir(self):
        output_dir = Path(self.snapshot_dir) / self.extractor
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir
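
    # Filesystem sketch: extractor outputs live in a subdirectory of the snapshot
    # dir, i.e. archive/<timestamp>/<extractor>/:
    #
    #   out_dir = result.create_output_dir()   # mkdir -p <snapshot_dir>/<extractor>
    #   result.output_exists()                 # True once that path exists on disk
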
    def as_json(self, *args) -> dict:
        args = args or self.keys
        return {
            key: getattr(self, key)
            for key in args
        }

    def write_indexes(self):
        """Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend"""
        super().write_indexes()
        self.save_search_index()
        # self.save_outlinks_to_crawl()

    # def save_outlinks_to_crawl(self):
    #     """Save the output of this ArchiveResult to the Crawl's urls field"""
    #     if self.output_urls:
    #         self.snapshot.crawl.urls += f'\n{self.url}'
    #         self.snapshot.crawl.save()

    # def migrate_output_dir(self):
    #     """Move the output files to the new folder structure if needed"""
    #     print(f'{self}.migrate_output_dir()')
    #     self.migrate_from_0_7_2()
    #     self.migrate_from_0_8_6()
    #     # ... future migrations here

    # def migrate_from_0_7_2(self):
    #     """Migrate the folder structure from 0.7.2 to the current version"""
    #     # migrate any existing output_dir into data/archiveresults/<extractor>/YYYY-MM-DD/<domain>/<abid>
    #     # create self.output_dir if it doesn't exist
    #     # move loose files in snapshot_dir into self.output_dir
    #     # update self.pwd = self.output_dir
    #     print(f'{self}.migrate_from_0_7_2()')

    # def migrate_from_0_8_6(self):
    #     """Migrate the folder structure from 0.8.6 to the current version"""
    #     # ... future migration code here ...
    #     print(f'{self}.migrate_from_0_8_6()')

    # def save_json_index(self):
    #     """Save the json index file to ./.index.json"""
    #     print(f'{self}.save_json_index()')
    #     pass

    # def save_symlinks_index(self):
    #     """Update the symlink farm indexes to point to the new location of self.output_dir"""
    #     # ln -s self.output_dir data/index/results_by_type/wget/YYYY-MM-DD/example.com/<abid>
    #     # ln -s self.output_dir data/index/results_by_day/YYYY-MM-DD/example.com/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_domain/example.com/YYYY-MM-DD/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_abid/<abid>
    #     # ln -s self.output_dir data/archive/<snapshot_timestamp>/<extractor>
    #     print(f'{self}.save_symlinks_index()')

    # def save_html_index(self):
    #     """Save the html index file to ./.index.html"""
    #     print(f'{self}.save_html_index()')
    #     pass

    # def save_merkle_index(self):
    #     """Calculate the recursive sha256 of all the files in the output path and save it to ./.checksum.json"""
    #     print(f'{self}.save_merkle_index()')
    #     pass

    def save_search_index(self):
        """Pass any indexable text to the search backend indexer (e.g. sonic, SQLiteFTS5, etc.)"""
        print(f'{self}.save_search_index()')

    # def get_storage_dir(self, create=True, symlink=True):
    #     date_str = self.snapshot.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.snapshot.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
    #
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #
    #     return abs_storage_dir

    # def symlink_index(self, create=True):
    #     abs_result_dir = self.get_storage_dir(create=create)