models.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. __package__ = 'archivebox.core'
  2. from typing import Optional, Dict, Iterable
  3. from django_stubs_ext.db.models import TypedModelMeta
  4. import os
  5. import json
  6. from pathlib import Path
  7. from datetime import timedelta
  8. from django.db import models
  9. from django.utils.functional import cached_property
  10. from django.utils.text import slugify
  11. from django.utils import timezone
  12. from django.core.cache import cache
  13. from django.urls import reverse, reverse_lazy
  14. from django.db.models import Case, When, Value, IntegerField
  15. from django.contrib import admin
  16. from django.conf import settings
  17. from actors.models import ModelWithStateMachine
  18. from archivebox.config import CONSTANTS
  19. from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
  20. from queues.tasks import bg_archive_snapshot
  21. from crawls.models import Crawl
  22. # from machine.models import Machine, NetworkInterface
  23. from archivebox.misc.system import get_dir_size
  24. from archivebox.misc.util import parse_date, base_url
  25. from ..index.schema import Link
  26. from ..index.html import snapshot_icons
  27. from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
  28. # class BaseModel(models.Model):
  29. # # TODO: migrate all models to a shared base class with all our standard fields and helpers:
  30. # # ulid/created_at/modified_at/created_by/is_deleted/as_json/from_json/etc.
  31. # #
  32. # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
  33. # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
  34. # class Meta(TypedModelMeta):
  35. # abstract = True
class Tag(ABIDModel):
    """
    Loosely based on django-taggit model + ABID base.

    A user-defined label attached to Snapshots via the SnapshotTag join table.
    The slug is derived from `name` on first save and deduplicated with a
    numeric suffix if another tag already claimed the same slug.
    """
    # ABID generation config: which fields feed the tag's ABID components
    abid_prefix = 'tag_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.slug'
    abid_subtype_src = '"03"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='tag_set')
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    name = models.CharField(unique=True, blank=False, max_length=100)
    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
    # slug is autoset on save from name, never set it manually

    snapshot_set: models.Manager['Snapshot']
    # crawl_set: models.Manager['Crawl']

    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

    def __str__(self):
        return self.name

    def slugify(self, tag, i=None):
        """Build a slug from a tag name, appending '_<i>' when deduplicating collisions."""
        slug = slugify(tag)
        if i is not None:
            slug += "_%d" % i
        return slug

    def clean(self, *args, **kwargs):
        # ensure slug is always populated from name if it wasn't set yet
        self.slug = self.slug or self.slugify(self.name)
        super().clean(*args, **kwargs)

    def save(self, *args, **kwargs):
        # On first insert, derive the slug from the name and deduplicate it
        # against existing slugs; on subsequent saves the slug is left alone.
        if self._state.adding:
            self.slug = self.slugify(self.name)
            # if name is different but slug conficts with another tags slug, append a counter
            # with transaction.atomic():
            slugs = set(
                type(self)
                ._default_manager.filter(slug__startswith=self.slug)
                .values_list("slug", flat=True)
            )
            i = None
            while True:
                slug = self.slugify(self.name, i)
                if slug not in slugs:
                    self.slug = slug
                    return super().save(*args, **kwargs)
                i = 1 if i is None else i + 1
        else:
            return super().save(*args, **kwargs)

    @property
    def api_url(self) -> str:
        # /api/v1/core/tag/{uulid}
        return reverse_lazy('api-1:get_tag', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
class SnapshotTag(models.Model):
    """Explicit many-to-many join table linking a Snapshot to a Tag (one row per pairing)."""
    id = models.AutoField(primary_key=True)

    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')

    class Meta:
        # legacy table name kept for backwards compatibility with old migrations
        db_table = 'core_snapshot_tags'
        unique_together = [('snapshot', 'tag')]
  102. # class CrawlTag(models.Model):
  103. # id = models.AutoField(primary_key=True)
  104. # crawl = models.ForeignKey('Crawl', db_column='crawl_id', on_delete=models.CASCADE, to_field='id')
  105. # tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
  106. # class Meta:
  107. # db_table = 'core_crawl_tags'
  108. # unique_together = [('crawl', 'tag')]
  109. class SnapshotManager(models.Manager):
  110. def get_queryset(self):
  111. return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
class Snapshot(ABIDModel, ModelWithStateMachine):
    """
    A single archived URL: one bookmarked page plus all of its ArchiveResult
    outputs stored under ARCHIVE_DIR/<timestamp>/.
    """
    # ABID generation config
    abid_prefix = 'snp_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    # state machine config (see core.statemachines.SnapshotMachine)
    state_machine_name = 'core.statemachines.SnapshotMachine'
    state_field_name = 'status'
    retry_at_field_name = 'retry_at'
    StatusChoices = ModelWithStateMachine.StatusChoices
    active_state = StatusChoices.STARTED

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)  # loaded from self._init_timestamp
    modified_at = models.DateTimeField(auto_now=True)

    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    # legacy ts fields
    bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)

    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore

    url = models.URLField(unique=True, db_index=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

    # fields included when round-tripping via as_json()/from_json()
    keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at')

    archiveresult_set: models.Manager['ArchiveResult']

    objects = SnapshotManager()

    def save(self, *args, **kwargs):
        # backfill bookmarked_at before insert (field is null=False)
        if not self.bookmarked_at:
            self.bookmarked_at = self.created_at or self._init_timestamp
        super().save(*args, **kwargs)

    def archive(self, overwrite=False, methods=None):
        """Queue a background task to (re-)run the archiving extractors for this snapshot."""
        result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
        return result

    def __repr__(self) -> str:
        url = self.url or '<no url set>'
        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
        if self.id and self.url:
            return f'[{self.ABID}] {url[:64]} @ {created_at}'
        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} @ {created_at}'

    def __str__(self) -> str:
        return repr(self)

    @classmethod
    def from_json(cls, info: dict):
        """Build an (unsaved) Snapshot from a dict, ignoring keys not in cls.keys."""
        info = {k: v for k, v in info.items() if k in cls.keys}
        return cls(**info)

    def as_json(self, *args) -> dict:
        # serialize the requested fields (default: cls.keys); tags become a comma-joined string
        args = args or self.keys
        return {
            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
            for key in args
        }

    def as_link(self) -> Link:
        """Convert to the legacy Link schema object used by the old index code."""
        return Link.from_json(self.as_json())

    def as_link_with_details(self) -> Link:
        # deferred import to avoid a circular import with ..index
        from ..index import load_link_details
        return load_link_details(self.as_link())

    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        """Return the snapshot's tag names as a sorted comma-separated string (cached unless nocache)."""
        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
        cache_key = f'{self.pk}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-tags'
        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
            # tags are pre-fetched already, use them directly (best because db is always freshest)
            tags_str = calc_tags_str()
            return tags_str
        if nocache:
            # recompute and refresh the cache entry
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str
        return cache.get_or_set(cache_key, calc_tags_str)

    def icons(self) -> str:
        # HTML fragment of per-extractor status icons for the admin list view
        return snapshot_icons(self)

    @property
    def api_url(self) -> str:
        # /api/v1/core/snapshot/{uulid}
        return reverse_lazy('api-1:get_snapshot', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'

    def get_absolute_url(self):
        return f'/{self.archive_path}'

    @cached_property
    def title_stripped(self) -> str:
        # title with newlines flattened for single-line display contexts
        return (self.title or '').replace("\n", " ").replace("\r", "")

    @cached_property
    def extension(self) -> str:
        from archivebox.misc.util import extension
        return extension(self.url)

    @cached_property
    def bookmarked(self):
        # parse the legacy string timestamp field into a datetime
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def is_archived(self):
        return self.as_link().is_archived

    @cached_property
    def num_outputs(self) -> int:
        # DONT DO THIS: it will trigger a separate query for every snapshot
        # return self.archiveresult_set.filter(status='succeeded').count()
        # this is better (counts in python over the prefetched results):
        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))

    @cached_property
    def base_url(self):
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        # absolute on-disk path of this snapshot's output directory
        return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        # path relative to the data dir, used for building URLs
        return '{}/{}'.format(CONSTANTS.ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        """Total on-disk size of the snapshot dir in bytes (cached; 0 if unreadable)."""
        cache_key = f'{str(self.pk)[:12]}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-size'

        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                # best-effort: missing/unreadable dir counts as 0 bytes
                return 0

        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def thumbnail_url(self) -> Optional[str]:
        """URL of the most recent successful screenshot output, or None."""
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            # archiveresults are already prefetched: pick the newest successful screenshot in python
            result = (sorted(
                (
                    result
                    for result in self.archiveresult_set.all()
                    if result.extractor == 'screenshot' and result.status == 'succeeded' and result.output
                ),
                key=lambda result: result.created_at,
            ) or [None])[-1]
        else:
            result = self.archiveresult_set.filter(
                extractor='screenshot',
                status='succeeded'
            ).only('output').last()
        if result:
            return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
        return None

    @cached_property
    def headers(self) -> Optional[Dict[str, str]]:
        # parse headers.json from the snapshot dir; None if missing/unparseable
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:
            pass
        return None

    @cached_property
    def status_code(self) -> Optional[str]:
        return self.headers.get('Status-Code') if self.headers else None

    @cached_property
    def history(self) -> dict:
        # TODO: use ArchiveResult for this instead of json
        return self.as_link_with_details().history

    @cached_property
    def latest_title(self) -> Optional[str]:
        """Best-known page title: the stored title, else the longest successful title extractor output."""
        if self.title:
            return self.title  # whoopdedoo that was easy
        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
            try:
                return (sorted(
                    (
                        result.output.strip()
                        for result in self.archiveresult_set.all()
                        if result.extractor == 'title' and result.status == 'succeeded' and result.output
                    ),
                    key=lambda title: len(title),
                ) or [None])[-1]
            except IndexError:
                pass
        try:
            # take longest successful title from ArchiveResult db history
            return sorted(
                self.archiveresult_set\
                    .filter(extractor='title', status='succeeded', output__isnull=False)\
                    .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass
        try:
            # take longest successful title from Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass
        return None

    def save_tags(self, tags: Iterable[str]=()) -> None:
        """Replace this snapshot's tags with the given names, creating Tag rows as needed."""
        tags_id = []
        for tag in tags:
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        self.tags.clear()
        self.tags.add(*tags_id)

    def has_pending_archiveresults(self) -> bool:
        # any results not yet in a final-or-active state are still pending
        pending_archiveresults = self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
        return pending_archiveresults.exists()

    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        """Ensure one queued ArchiveResult exists per registered extractor; return them all."""
        archiveresults = []
        for extractor in EXTRACTORS:
            archiveresult, _created = ArchiveResult.objects.get_or_create(
                snapshot=self,
                extractor=extractor,
                status=ArchiveResult.INITIAL_STATE,
            )
            archiveresults.append(archiveresult)
        return archiveresults

    # def get_storage_dir(self, create=True, symlink=True) -> Path:
    #     date_str = self.bookmarked_at.strftime('%Y%m%d')
    #     domain_str = domain(self.url)
    #     abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
    #     if create and not abs_storage_dir.is_dir():
    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
    #     if symlink:
    #         LINK_PATHS = [
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
    #             # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
    #             Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
    #         ]
    #         for link_path in LINK_PATHS:
    #             link_path.parent.mkdir(parents=True, exist_ok=True)
    #             try:
    #                 link_path.symlink_to(abs_storage_dir)
    #             except FileExistsError:
    #                 link_path.unlink()
    #                 link_path.symlink_to(abs_storage_dir)
    #     return abs_storage_dir
  351. class ArchiveResultManager(models.Manager):
  352. def indexable(self, sorted: bool = True):
  353. """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
  354. INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
  355. qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
  356. if sorted:
  357. precedence = [
  358. When(extractor=method, then=Value(precedence))
  359. for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
  360. ]
  361. qs = qs.annotate(
  362. indexing_precedence=Case(
  363. *precedence,
  364. default=Value(1000),
  365. output_field=IntegerField()
  366. )
  367. ).order_by('indexing_precedence')
  368. return qs
class ArchiveResult(ABIDModel, ModelWithStateMachine):
    """The output of running one extractor (wget, screenshot, title, ...) against one Snapshot."""
    # ABID generation config (derived from the parent snapshot + extractor name)
    abid_prefix = 'res_'
    abid_ts_src = 'self.snapshot.created_at'
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'                 # pending, initial
        STARTED = 'started', 'Started'              # active
        BACKOFF = 'backoff', 'Waiting to retry'     # pending
        SUCCEEDED = 'succeeded', 'Succeeded'        # final
        FAILED = 'failed', 'Failed'                 # final
        SKIPPED = 'skipped', 'Skipped'              # final

    # state machine config (see core.statemachines.ArchiveResultMachine)
    state_machine_name = 'core.statemachines.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
    state_field_name = 'status'
    active_state = StatusChoices.STARTED

    EXTRACTOR_CHOICES = (
        ('htmltotext', 'htmltotext'),
        ('git', 'git'),
        ('singlefile', 'singlefile'),
        ('media', 'media'),
        ('archive_org', 'archive_org'),
        ('readability', 'readability'),
        ('mercury', 'mercury'),
        ('favicon', 'favicon'),
        ('pdf', 'pdf'),
        ('headers', 'headers'),
        ('screenshot', 'screenshot'),
        ('dom', 'dom'),
        ('title', 'title'),
        ('wget', 'wget'),
    )

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)

    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
    # command that was run, its working dir, and the binary version used
    cmd = models.JSONField(default=None, null=True, blank=True)
    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    # primary output filename/dir (relative to the snapshot dir)
    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    # the network interface that was used to download this result
    # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')

    objects = ArchiveResultManager()

    class Meta(TypedModelMeta):
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'
  424. def __repr__(self):
  425. snapshot_id = getattr(self, 'snapshot_id', None)
  426. url = self.snapshot.url if snapshot_id else '<no url set>'
  427. created_at = self.snapshot.created_at.strftime("%Y-%m-%d %H:%M") if snapshot_id else '<no timestamp set>'
  428. extractor = self.extractor or '<no extractor set>'
  429. if self.id and snapshot_id:
  430. return f'[{self.ABID}] {url[:64]} @ {created_at} -> {extractor}'
  431. return f'[{self.abid_prefix}****not*saved*yet****] {url} @ {created_at} -> {extractor}'
  432. def __str__(self):
  433. return repr(self)
    # TODO: finish connecting machine.models
    # @cached_property
    # def machine(self):
    #     return self.iface.machine if self.iface else None

    @cached_property
    def snapshot_dir(self):
        # absolute Path of the parent snapshot's output directory on disk
        return Path(self.snapshot.link_dir)

    @cached_property
    def url(self):
        # convenience alias for the parent snapshot's url
        return self.snapshot.url
    @property
    def api_url(self) -> str:
        # /api/v1/core/archiveresult/{uulid}
        return reverse_lazy('api-1:get_archiveresult', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult'

    def get_absolute_url(self):
        # web path to this result's output inside its snapshot's archive dir
        return f'/{self.snapshot.archive_path}/{self.output_path()}'

    @property
    def extractor_module(self):
        # the extractor plugin module responsible for producing this result
        return EXTRACTORS[self.extractor]
    def output_path(self) -> str:
        """return the canonical output filename or directory name within the snapshot dir"""
        return self.extractor_module.get_output_path()
  459. def embed_path(self) -> str:
  460. """
  461. return the actual runtime-calculated path to the file on-disk that
  462. should be used for user-facing iframe embeds of this result
  463. """
  464. if get_embed_path_func := getattr(self.extractor_module, 'get_embed_path', None):
  465. return get_embed_path_func(self)
  466. return self.extractor_module.get_output_path()
    def legacy_output_path(self):
        # fall back to the old Link-schema canonical output lookup for this extractor
        link = self.snapshot.as_link()
        return link.canonical_outputs().get(f'{self.extractor}_path')

    def output_exists(self) -> bool:
        # NOTE(review): this tests output_path() relative to the current working
        # directory, not the snapshot dir — presumably callers chdir into the
        # snapshot dir first; verify before relying on it elsewhere.
        return os.path.exists(self.output_path())

    def create_output_dir(self):
        # ensure the snapshot dir exists, then return the output path inside it
        # (the output itself is not created here)
        snap_dir = self.snapshot_dir
        snap_dir.mkdir(parents=True, exist_ok=True)
        return snap_dir / self.output_path()
  476. # def get_storage_dir(self, create=True, symlink=True):
  477. # date_str = self.snapshot.bookmarked_at.strftime('%Y%m%d')
  478. # domain_str = domain(self.snapshot.url)
  479. # abs_storage_dir = Path(CONSTANTS.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
  480. # if create and not abs_storage_dir.is_dir():
  481. # abs_storage_dir.mkdir(parents=True, exist_ok=True)
  482. # if symlink:
  483. # LINK_PATHS = [
  484. # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
  485. # # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
  486. # # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
  487. # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
  488. # Path(CONSTANTS.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
  489. # ]
  490. # for link_path in LINK_PATHS:
  491. # link_path.parent.mkdir(parents=True, exist_ok=True)
  492. # try:
  493. # link_path.symlink_to(abs_storage_dir)
  494. # except FileExistsError:
  495. # link_path.unlink()
  496. # link_path.symlink_to(abs_storage_dir)
  497. # return abs_storage_dir
  498. # def symlink_index(self, create=True):
  499. # abs_result_dir = self.get_storage_dir(create=create)