# archivebox/core/models.py

__package__ = 'archivebox.core'

from typing import Optional, Dict, Iterable, Any, Self   # Self requires Python >= 3.11

from django_stubs_ext.db.models import TypedModelMeta

import os
import json
from pathlib import Path

from django.db import models
from django.db.models import QuerySet, Case, When, Value, IntegerField
from django.core.validators import MinValueValidator, MaxValueValidator
from django.contrib.contenttypes.fields import GenericRelation
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.contrib import admin
from django.conf import settings

import abx

from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import parse_date, base_url, basename, is_static_file, domain as url_domain
from archivebox.misc.hashing import get_dir_info
from archivebox.index.schema import Link
from archivebox.index.html import snapshot_icons
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import (
    ABIDModel, ABIDField, AutoDateTimeField, get_or_create_system_user_pk,
    ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags,   # ModelWithStateMachine
    ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
)
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from tags.models import KVTag
from machine.models import NetworkInterface   # Machine is not connected yet, see TODO on ArchiveResult below
from crawls.models import Seed, Crawl, CrawlSchedule


class Tag(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ABIDModel):
    """
    Old tag model, loosely based on the django-taggit model + ABID base.
    Being phased out in favor of archivebox.tags.models.ATag.
    """
    abid_prefix = 'tag_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.slug'
    abid_subtype_src = '"03"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'slug')

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    name = models.CharField(unique=True, blank=False, max_length=100)
    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
    # slug is autoset on save from name, never set it manually

    snapshot_set: models.Manager['Snapshot']
    # crawl_set: models.Manager['Crawl']

    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

    def __str__(self):
        return self.name

    def slugify(self, tag, i=None):
        slug = slugify(tag)
        if i is not None:
            slug += "_%d" % i
        return slug

    def clean(self, *args, **kwargs):
        self.slug = self.slug or self.slugify(self.name)
        super().clean(*args, **kwargs)

    def save(self, *args, **kwargs):
        if self._state.adding:
            self.slug = self.slugify(self.name)
            # if the name is different but the slug conflicts with another tag's slug, append a counter
            # with transaction.atomic():
            slugs = set(
                type(self)
                ._default_manager.filter(slug__startswith=self.slug)
                .values_list("slug", flat=True)
            )
            i = None
            while True:
                slug = self.slugify(self.name, i)
                if slug not in slugs:
                    self.slug = slug
                    return super().save(*args, **kwargs)
                i = 1 if i is None else i + 1
        else:
            return super().save(*args, **kwargs)

    @property
    def api_url(self) -> str:
        # /api/v1/core/tag/{uulid}
        return reverse_lazy('api-1:get_tag', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
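
# Example (illustrative) of the slug collision handling in Tag.save() above:
#   Tag.objects.create(name='Foo Bar')    # -> slug='foo-bar'
#   Tag.objects.create(name='Foo Bar!')   # -> name differs but slug collides, so the counter yields 'foo-bar_1'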


class SnapshotTag(models.Model):
    id = models.AutoField(primary_key=True)

    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]


def validate_timestamp(value):
    assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
    assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
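
# Example (illustrative): validate_timestamp only accepts stringified unix timestamps:
#   validate_timestamp('1713935999.123')   # ok
#   validate_timestamp('')                 # AssertionError (empty string)
#   validate_timestamp(1713935999.123)     # AssertionError (not a str)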


class SnapshotManager(models.Manager):
    def filter(self, *args, **kwargs):
        """add support for .filter(domain='example.com') to the Snapshot queryset"""
        domain = kwargs.pop('domain', None)
        qs = super().filter(*args, **kwargs)
        if domain:
            qs = qs.filter(url__icontains=f'://{domain}')
        return qs

    def get_queryset(self):
        return (
            super().get_queryset()
            .prefetch_related('tags', 'archiveresult_set')
            # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
        )
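
# Example (illustrative) of the custom domain filter above:
#   Snapshot.objects.filter(domain='example.com')
#   # equivalent to: Snapshot.objects.filter(url__icontains='://example.com')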


class Snapshot(
    ModelWithReadOnlyFields,
    ModelWithSerializers,
    ModelWithUUID,
    ModelWithKVTags,
    ABIDModel,
    ModelWithOutputDir,
    ModelWithConfig,
    ModelWithNotes,
    ModelWithHealthStats,
    ModelWithStateMachine,
):
    ### ModelWithSerializers
    # cls.from_dict() -> Self
    # self.as_json() -> dict[str, Any]
    # self.as_jsonl_row() -> str
    # self.as_csv_row() -> str
    # self.as_html_icon(), .as_html_embed(), .as_html_row(), ...

    ### ModelWithReadOnlyFields
    read_only_fields = ('id', 'abid', 'created_at', 'created_by_id', 'url', 'timestamp', 'bookmarked_at', 'crawl_id')

    ### ABIDModel (defined before the field definitions below that reference abid_prefix)
    abid_prefix = 'snp_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True
    # self.clean() -> sets self._timestamp
    # self.save() -> issues new ABID if creating new, otherwise uses existing ABID
    # self.ABID -> ABID
    # self.api_url -> '/api/v1/core/snapshot/{uulid}'
    # self.api_docs_url -> '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'
    # self.admin_change_url -> '/admin/core/snapshot/{pk}/change/'
    # self.get_absolute_url() -> '/{self.archive_path}'
    # self.update_for_workers() -> bool

    ### ModelWithStateMachine (defined before the status field below that references StatusChoices)
    state_machine_name = 'core.statemachines.SnapshotMachine'
    state_field_name = 'status'
    retry_at_field_name = 'retry_at'
    StatusChoices = ModelWithStateMachine.StatusChoices
    active_state = StatusChoices.STARTED

    ### Immutable fields:
    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)   # loaded from self._init_timestamp
    url = models.URLField(unique=True, db_index=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False, validators=[validate_timestamp])
    bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore

    ### Mutable fields:
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
    modified_at = models.DateTimeField(auto_now=True)

    ### ModelWithStateMachine
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)

    ### ModelWithConfig
    config = models.JSONField(default=dict, null=False, blank=False, editable=True)

    ### ModelWithNotes
    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')

    ### ModelWithOutputDir
    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
    # self.output_dir_parent -> str 'archive/snapshots/<YYYY-MM-DD>/<example.com>'
    # self.output_dir_name -> '<abid>'
    # self.output_dir_str -> 'archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>'
    # self.OUTPUT_DIR -> Path('/data/archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>')
    # self.save(): creates OUTPUT_DIR, writes index.json, writes indexes

    # old-style tags (dedicated ManyToMany Tag model above):
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))

    # new-style tags (new key-value tags defined by tags.models.KVTag & ModelWithKVTags):
    kvtag_set = tag_set = GenericRelation(
        KVTag,
        related_query_name="snapshot",
        content_type_field="obj_type",
        object_id_field="obj_id",
        order_by=('created_at',),
    )

    ### Relations & Managers
    objects = SnapshotManager()
    archiveresult_set: models.Manager['ArchiveResult']

    def save(self, *args, **kwargs):
        print(f'Snapshot[{self.ABID}].save()')
        if self.pk:
            existing_snapshot = self.__class__.objects.filter(pk=self.pk).first()
            if existing_snapshot and existing_snapshot.status == self.StatusChoices.SEALED:
                if self.as_json() != existing_snapshot.as_json():
                    raise Exception(f'Snapshot {self.pk} is already sealed, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_snapshot.as_json()}')

        if not self.bookmarked_at:
            self.bookmarked_at = self.created_at or self._init_timestamp

        if not self.timestamp:
            self.timestamp = str(self.bookmarked_at.timestamp())

        super().save(*args, **kwargs)

        # make sure the crawl has this url in its urls log
        if self.crawl and self.url not in self.crawl.urls:
            self.crawl.urls += f'\n{self.url}'
            self.crawl.save()

    @property
    def output_dir_parent(self) -> str:
        return 'archive'

    @property
    def output_dir_name(self) -> str:
        return str(self.timestamp)

    def archive(self, overwrite=False, methods=None):
        result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
        return result
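
    # Example (illustrative) of queueing re-archiving via the helper above
    # (the kwargs mirror archive()'s own signature, which forwards to bg_archive_snapshot):
    #   snapshot.archive(overwrite=True, methods=['wget', 'screenshot'])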

    def __repr__(self) -> str:
        url = self.url or '<no url set>'
        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
        if self.id and self.url:
            return f'[{self.ABID}] {url[:64]} @ {created_at}'
        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} @ {created_at}'

    def __str__(self) -> str:
        return repr(self)

    @classmethod
    def from_json(cls, fields: dict[str, Any]) -> Self:
        # print('LEGACY from_json()')
        return cls.from_dict(fields)

    def as_json(self, *args, **kwargs) -> dict:
        json_dict = super().as_json(*args, **kwargs)
        if 'tags' in json_dict:
            json_dict['tags'] = self.tags_str(nocache=False)
        return json_dict

    def as_link(self) -> Link:
        return Link.from_json(self.as_json())

    def as_link_with_details(self) -> Link:
        from ..index import load_link_details
        return load_link_details(self.as_link())

    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        cache_key = f'{self.pk}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-tags'

        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            # tags are pre-fetched already, use them directly (best because db is always freshest)
            tags_str = calc_tags_str()
            return tags_str

        if nocache:
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str

        return cache.get_or_set(cache_key, calc_tags_str)
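
    # Example (illustrative) of the tags_str() caching behavior above (prefetched tags bypass the cache entirely):
    #   snapshot.tags_str()                # nocache=True: recomputes from the db and refreshes the cache entry
    #   snapshot.tags_str(nocache=False)   # returns the cached value if present, otherwise computes + caches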

    def icons(self) -> str:
        return snapshot_icons(self)

    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_snapshot', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'

    def get_absolute_url(self):
        return f'/{self.archive_path}'

    @cached_property
    def title_stripped(self) -> str:
        return (self.title or '').replace("\n", " ").replace("\r", "")

    @cached_property
    def extension(self) -> str:
        from archivebox.misc.util import extension
        return extension(self.url)

    @cached_property
    def bookmarked(self):
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def domain(self) -> str:
        return url_domain(self.url)

    @cached_property
    def is_archived(self):
        return self.as_link().is_archived

    @cached_property
    def num_outputs(self) -> int:
        # DONT DO THIS: it will trigger a separate query for every snapshot
        # return self.archiveresult_set.filter(status='succeeded').count()
        # this is better:
        return sum(1 for result in self.archiveresult_set.all() if result.status == 'succeeded')

    @cached_property
    def base_url(self):
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        return '{}/{}'.format(CONSTANTS.ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        cache_key = f'{str(self.pk)[:12]}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-size'

        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                return 0

        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def thumbnail_url(self) -> Optional[str]:
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            # archiveresults are already prefetched, find the latest successful screenshot in-memory
            result = (sorted(
                (
                    result
                    for result in self.archiveresult_set.all()
                    if result.extractor == 'screenshot' and result.status == 'succeeded' and result.output
                ),
                key=lambda result: result.created_at,
            ) or [None])[-1]
        else:
            result = self.archiveresult_set.filter(
                extractor='screenshot',
                status='succeeded',
            ).only('output').last()

        if result:
            return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
        return None

    @cached_property
    def headers(self) -> Optional[Dict[str, str]]:
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:
            pass
        return None

    @cached_property
    def status_code(self) -> Optional[str]:
        return self.headers.get('Status-Code') if self.headers else None

    @cached_property
    def history(self) -> dict:
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history

    @cached_property
    def latest_title(self) -> Optional[str]:
        if self.title:
            return self.title   # whoopdedoo that was easy

        # check if the ArchiveResult set has already been prefetched, if so use it instead of fetching from the db again
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            try:
                return (sorted(
                    (
                        result.output.strip()
                        for result in self.archiveresult_set.all()
                        if result.extractor == 'title' and result.status == 'succeeded' and result.output
                    ),
                    key=lambda title: len(title),
                ) or [None])[-1]
            except IndexError:
                pass

        try:
            # take the longest successful title from the ArchiveResult db history
            return sorted(
                self.archiveresult_set
                .filter(extractor='title', status='succeeded', output__isnull=False)
                .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass

        try:
            # take the longest successful title from the Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass

        return None

    def save_tags(self, tags: Iterable[str]=()) -> None:
        tags_id = []
        for tag in tags:
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        self.tags.clear()
        self.tags.add(*tags_id)
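
    # Example (illustrative): replace the old-style tags on a snapshot in one call:
    #   snapshot.save_tags(['news', 'python'])   # get_or_create()'s each Tag, then clears + re-adds the M2M rows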

    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
        pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
        return pending_archiveresults

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']

        # config = get_scope_config(snapshot=self)
        config = {'EXTRACTORS': ','.join(ALL_EXTRACTORS)}

        if config.get('EXTRACTORS', 'auto') == 'auto':
            EXTRACTORS = ALL_EXTRACTORS
        else:
            EXTRACTORS = config.get('EXTRACTORS', '').split(',')

        archiveresults = []
        for extractor in EXTRACTORS:
            if not extractor:
                continue
            if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists():
                continue
            archiveresult, created = ArchiveResult.objects.get_or_create(
                snapshot=self,
                extractor=extractor,
                defaults={
                    'status': ArchiveResult.INITIAL_STATE,
                    'retry_at': timezone.now(),
                },
            )
            if archiveresult.status == ArchiveResult.INITIAL_STATE:
                archiveresults.append(archiveresult)
        return archiveresults
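
    # Example (illustrative) of the typical worker flow using the two helpers above:
    #   pending = snapshot.create_pending_archiveresults()   # one queued ArchiveResult per missing extractor
    #   assert all(r.status == ArchiveResult.INITIAL_STATE for r in pending)
    #   snapshot.pending_archiveresults()                    # queryset of results that are not yet final or active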

    # def migrate_output_dir(self):
    #     """Move the output files to the new folder structure if needed"""
    #     print(f'{self}.migrate_output_dir()')
    #     self.migrate_from_0_7_2()
    #     self.migrate_from_0_8_6()
    #     # ... future migrations here

    # def migrate_from_0_7_2(self):
    #     """Migrate the folder structure from 0.7.2 to the current version"""
    #     # migrate any existing output_dir into data/archiveresults/<extractor>/YYYY-MM-DD/<domain>/<abid>
    #     # create self.output_dir if it doesn't exist
    #     # move loose files in snapshot_dir into self.output_dir
    #     # update self.pwd = self.output_dir
    #     print(f'{self}.migrate_from_0_7_2()')

    # def migrate_from_0_8_6(self):
    #     """Migrate the folder structure from 0.8.6 to the current version"""
    #     # ... future migration code here ...
    #     print(f'{self}.migrate_from_0_8_6()')

    # def save_json_index(self):
    #     """Save the json index file to ./.index.json"""
    #     print(f'{self}.save_json_index()')

    # def save_symlinks_index(self):
    #     """Update the symlink farm indexes to point to the new location of self.output_dir"""
    #     # ln -s self.output_dir data/index/results_by_type/wget/YYYY-MM-DD/example.com/<abid>
    #     # ln -s self.output_dir data/index/results_by_day/YYYY-MM-DD/example.com/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_domain/example.com/YYYY-MM-DD/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_abid/<abid>
    #     # ln -s self.output_dir data/archive/<snapshot_timestamp>/<extractor>
    #     print(f'{self}.save_symlinks_index()')

    # def save_html_index(self):
    #     """Save the html index file to ./.index.html"""
    #     print(f'{self}.save_html_index()')

    # def save_merkle_index(self):
    #     """Calculate the recursive sha256 of all the files in the output path and save it to ./.checksum.json"""
    #     print(f'{self}.save_merkle_index()')

    # def save_search_index(self):
    #     """Pass any indexable text to the search backend indexer (e.g. sonic, SQLiteFTS5, etc.)"""
    #     print(f'{self}.save_search_index()')

    # def get_storage_dir(self, create=True, symlink=True) -> Path:
    #     date_str = self.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
        INDEXABLE_METHODS = [r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')

        if sorted:
            precedence = [
                When(extractor=method, then=Value(precedence))
                for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
            ]
            qs = qs.annotate(
                indexing_precedence=Case(
                    *precedence,
                    default=Value(1000),
                    output_field=IntegerField(),
                )
            ).order_by('indexing_precedence')
        return qs
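
# Example (illustrative) of consuming the indexable() queryset above; index_text() is
# a hypothetical stand-in for whatever search backend call is actually used:
#   for result in ArchiveResult.objects.indexable():   # ordered best-quality-first
#       index_text(result.snapshot_id, result.output)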


class ArchiveResult(
    ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel,
    ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine,
):
    ### ABIDModel
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.created_at'
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    ### ModelWithStateMachine
    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'                # pending, initial
        STARTED = 'started', 'Started'             # active
        BACKOFF = 'backoff', 'Waiting to retry'    # pending
        SUCCEEDED = 'succeeded', 'Succeeded'       # final
        FAILED = 'failed', 'Failed'                # final
        SKIPPED = 'skipped', 'Skipped'             # final

    state_machine_name = 'core.statemachines.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
    state_field_name = 'status'
    active_state = StatusChoices.STARTED

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )

    ### ModelWithReadOnlyFields
    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'snapshot', 'extractor', 'pwd')

    ### Immutable fields:
    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)

    ### Mutable fields:
    cmd = models.JSONField(default=None, null=True, blank=True)
    modified_at = models.DateTimeField(auto_now=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    ### ModelWithStateMachine
    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    ### ModelWithNotes
    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')

    ### ModelWithHealthStats
    # ...

    ### ModelWithKVTags
    # tag_set = GenericRelation(KVTag, related_query_name='archiveresult')

    ### ModelWithOutputDir
    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)

    # machine = models.ForeignKey(Machine, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Machine Used')
    iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')

    objects = ArchiveResultManager()

    keys = ('snapshot_id', 'extractor', 'cmd', 'pwd', 'cmd_version', 'output', 'start_ts', 'end_ts', 'created_at', 'status', 'retry_at', 'abid', 'id')

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'

    def __repr__(self):
        snapshot_id = getattr(self, 'snapshot_id', None)
        url = self.snapshot.url if snapshot_id else '<no url set>'
        created_at = self.snapshot.created_at.strftime("%Y-%m-%d %H:%M") if snapshot_id else '<no timestamp set>'
        extractor = self.extractor or '<no extractor set>'
        if self.id and snapshot_id:
            return f'[{self.ABID}] {url[:64]} @ {created_at} -> {extractor}'
        return f'[{self.abid_prefix}****not*saved*yet****] {url} @ {created_at} -> {extractor}'

    def __str__(self):
        return repr(self)

    def save(self, *args, write_indexes: bool = False, **kwargs):
        print(f'ArchiveResult[{self.ABID}].save()')
        # if (self.pk and self.__class__.objects.filter(pk=self.pk).values_list('status', flat=True)[0] in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]):
        #     raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further.')
        if self.pk:
            existing_archiveresult = self.__class__.objects.filter(pk=self.pk).first()
            if existing_archiveresult and existing_archiveresult.status in [self.StatusChoices.FAILED, self.StatusChoices.SUCCEEDED, self.StatusChoices.SKIPPED]:
                if self.as_json() != existing_archiveresult.as_json():
                    raise Exception(f'ArchiveResult {self.pk} is in a final state, it cannot be modified any further. NEW: {self.as_json()} != Existing: {existing_archiveresult.as_json()}')
        super().save(*args, **kwargs)
        # DONT DO THIS:
        # self.snapshot.update_for_workers()   # this should be done manually wherever it's needed, not in here as a side-effect on save()

    # TODO: finish connecting machine.models
    # @cached_property
    # def machine(self):
    #     return self.iface.machine if self.iface else None

    @cached_property
    def snapshot_dir(self):
        return Path(self.snapshot.link_dir)

    @cached_property
    def url(self):
        return self.snapshot.url

    @property
    def api_url(self) -> str:
        # /api/v1/core/archiveresult/{uulid}
        return reverse_lazy('api-1:get_archiveresult', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'

    def get_absolute_url(self):
        return f'/{self.snapshot.archive_path}/{self.extractor}'

    @property
    def extractor_module(self) -> Any | None:
        return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)

    @property
    def EXTRACTOR(self) -> object:
        # return self.extractor_module
        return self.extractor_module(archiveresult=self)

    def embed_path(self) -> str | None:
        """
        return the actual runtime-calculated path to the file on-disk that
        should be used for user-facing iframe embeds of this result
        """
        try:
            return self.extractor_module.get_embed_path(self)
        except Exception as e:
            print(f'Error getting embed path for {self.extractor} extractor: {e}')
            return None

    def legacy_output_path(self):
        return self.canonical_outputs().get(f'{self.extractor}_path')

    def output_exists(self) -> bool:
        output_path = Path(self.snapshot_dir) / self.extractor
        return os.path.exists(output_path)

    def create_output_dir(self):
        output_dir = Path(self.snapshot_dir) / self.extractor
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Predict the expected output paths that should be present after archiving"""
        # TODO: banish this awful duplication from the codebase and import these
        # from their respective extractor files
        # NOTE: ArchiveResult has no domain/timestamp/base_url of its own, so we go
        # through self.snapshot and the url helpers from archivebox.misc.util
        from abx_plugin_favicon.config import FAVICON_CONFIG
        canonical = {
            'index_path': 'index.html',
            'favicon_path': 'favicon.ico',
            'google_favicon_path': FAVICON_CONFIG.FAVICON_PROVIDER.format(self.snapshot.domain),
            'wget_path': f'warc/{self.snapshot.timestamp}',
            'warc_path': 'warc/',
            'singlefile_path': 'singlefile.html',
            'readability_path': 'readability/content.html',
            'mercury_path': 'mercury/content.html',
            'htmltotext_path': 'htmltotext.txt',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
            'archive_org_path': f'https://web.archive.org/web/{self.snapshot.base_url}',
            'git_path': 'git/',
            'media_path': 'media/',
            'headers_path': 'headers.json',
        }
        if is_static_file(self.snapshot.url):
            # static files (a direct pdf/image/etc. url) all just point at the wget-downloaded file itself
            static_path = f'warc/{self.snapshot.timestamp}'
            canonical.update({
                'title': basename(self.snapshot.url),
                'wget_path': static_path,
                'pdf_path': static_path,
                'screenshot_path': static_path,
                'dom_path': static_path,
                'singlefile_path': static_path,
                'readability_path': static_path,
                'mercury_path': static_path,
                'htmltotext_path': static_path,
            })
        return canonical
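
    # Example (illustrative) of the mapping above for a normal (non-static) snapshot:
    #   result.canonical_outputs()['screenshot_path'] == 'screenshot.png'
    #   result.canonical_outputs()['warc_path'] == 'warc/'
    #   # for static files (e.g. a direct .pdf url) most *_path keys collapse to the wget warc path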

    @property
    def output_dir_name(self) -> str:
        return self.extractor

    @property
    def output_dir_parent(self) -> str:
        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))

    @cached_property
    def output_files(self) -> dict[str, dict]:
        dir_info = get_dir_info(self.OUTPUT_DIR, max_depth=6)
        with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
            json.dump(dir_info, f)
        return dir_info

    def announce_event(self, output_type: str, event: dict):
        event = {
            **event,
            'type': output_type,
        }

        # if the event references a file, make sure it exists on disk
        if 'path' in event:
            file_path = Path(self.OUTPUT_DIR) / event['path']
            assert file_path.exists(), f'ArchiveResult[{self.ABID}].announce_event(): File does not exist: {file_path} ({event})'

        with open(self.OUTPUT_DIR / '.events.jsonl', 'a') as f:
            f.write(json.dumps(event, sort_keys=True, default=str) + '\n')

    def events(self, filter_type: str | None=None) -> list[dict]:
        events = []
        try:
            with open(self.OUTPUT_DIR / '.events.jsonl', 'r') as f:
                for line in f:
                    event = json.loads(line)
                    if filter_type is None or event['type'] == filter_type:
                        events.append(event)
        except FileNotFoundError:
            pass
        return events
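
    # Example (illustrative) round-trip through the .events.jsonl log above
    # (the referenced file must already exist in OUTPUT_DIR or announce_event raises):
    #   result.announce_event('screenshot', {'path': 'screenshot.png'})   # appends one JSONL line
    #   result.events(filter_type='screenshot')   # -> [{'path': 'screenshot.png', 'type': 'screenshot'}]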

    def write_indexes(self):
        """Write the ArchiveResult json, html, and merkle indexes to the output dir, and pass searchable text to the search backend"""
        super().write_indexes()
        self.save_search_index()
        # self.save_outlinks_to_crawl()

    # def save_outlinks_to_crawl(self):
    #     """Save the outlinks found by this ArchiveResult to its Crawl's urls field"""
    #     if self.output_urls:
    #         self.snapshot.crawl.urls += f'\n{self.url}'
    #         self.snapshot.crawl.save()

    # def migrate_output_dir(self):
    #     """Move the output files to the new folder structure if needed"""
    #     print(f'{self}.migrate_output_dir()')
    #     self.migrate_from_0_7_2()
    #     self.migrate_from_0_8_6()
    #     # ... future migrations here

    # def migrate_from_0_7_2(self):
    #     """Migrate the folder structure from 0.7.2 to the current version"""
    #     # migrate any existing output_dir into data/archiveresults/<extractor>/YYYY-MM-DD/<domain>/<abid>
    #     # create self.output_dir if it doesn't exist
    #     # move loose files in snapshot_dir into self.output_dir
    #     # update self.pwd = self.output_dir
    #     print(f'{self}.migrate_from_0_7_2()')

    # def migrate_from_0_8_6(self):
    #     """Migrate the folder structure from 0.8.6 to the current version"""
    #     # ... future migration code here ...
    #     print(f'{self}.migrate_from_0_8_6()')

    # def save_json_index(self):
    #     """Save the json index file to ./.index.json"""
    #     print(f'{self}.save_json_index()')

    # def save_symlinks_index(self):
    #     """Update the symlink farm indexes to point to the new location of self.output_dir"""
    #     # ln -s self.output_dir data/index/results_by_type/wget/YYYY-MM-DD/example.com/<abid>
    #     # ln -s self.output_dir data/index/results_by_day/YYYY-MM-DD/example.com/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_domain/example.com/YYYY-MM-DD/wget/<abid>
    #     # ln -s self.output_dir data/index/results_by_abid/<abid>
    #     # ln -s self.output_dir data/archive/<snapshot_timestamp>/<extractor>
    #     print(f'{self}.save_symlinks_index()')

    # def save_html_index(self):
    #     """Save the html index file to ./.index.html"""
    #     print(f'{self}.save_html_index()')

    # def save_merkle_index(self):
    #     """Calculate the recursive sha256 of all the files in the output path and save it to ./.checksum.json"""
    #     print(f'{self}.save_merkle_index()')

    def save_search_index(self):
        """Pass any indexable text to the search backend indexer (e.g. sonic, SQLiteFTS5, etc.)"""
        print(f'{self}.save_search_index()')

    # def get_storage_dir(self, create=True, symlink=True):
    #     date_str = self.snapshot.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.snapshot.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir

    # def symlink_index(self, create=True):
    #     abs_result_dir = self.get_storage_dir(create=create)


# @abx.hookimpl.on_archiveresult_created
# def exec_archiveresult_extractor_effects(archiveresult):
#     config = get_scope_config(...)
#
#     # abx.archivebox.writes.update_archiveresult_started(archiveresult, start_ts=timezone.now())
#     # abx.archivebox.events.on_archiveresult_updated(archiveresult)
#
#     # check if it should be skipped
#     if not abx.archivebox.reads.get_archiveresult_should_run(archiveresult, config):
#         abx.archivebox.writes.update_archiveresult_skipped(archiveresult, status='skipped')
#         abx.archivebox.events.on_archiveresult_skipped(archiveresult, config)
#         return
#
#     # run the extractor method and save the output back to the archiveresult
#     try:
#         output = abx.archivebox.effects.exec_archiveresult_extractor(archiveresult, config)
#         abx.archivebox.writes.update_archiveresult_succeeded(archiveresult, output=output, error=None, end_ts=timezone.now())
#     except Exception as e:
#         abx.archivebox.writes.update_archiveresult_failed(archiveresult, error=e, end_ts=timezone.now())
#
#     # bump the modified time on the archiveresult and Snapshot
#     abx.archivebox.events.on_archiveresult_updated(archiveresult)
#     abx.archivebox.events.on_snapshot_updated(archiveresult.snapshot)


# @abx.hookimpl.reads.get_outlink_parents
# def get_outlink_parents(url, crawl_pk=None, config=None):
#     scope = Q(dst=url)
#     if crawl_pk:
#         scope = scope | Q(via__snapshot__crawl_id=crawl_pk)
#
#     parent = list(Outlink.objects.filter(scope))
#     if not parent:
#         # base case: we reached the top of the chain, no more parents left
#         return []
#
#     # recursive case: there is another parent above us, get its parents
#     yield parent[0]
#     yield from get_outlink_parents(parent[0].src, crawl_pk=crawl_pk, config=config)