views.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009
  1. __package__ = 'archivebox.core'
  2. import os
  3. import sys
  4. from django.utils import timezone
  5. import inspect
  6. from typing import Callable, get_type_hints
  7. from pathlib import Path
  8. from django.shortcuts import render, redirect
  9. from django.http import HttpRequest, HttpResponse, Http404
  10. from django.utils.html import format_html, mark_safe
  11. from django.views import View
  12. from django.views.generic.list import ListView
  13. from django.views.generic import FormView
  14. from django.db.models import Q
  15. from django.contrib import messages
  16. from django.contrib.auth.mixins import UserPassesTestMixin
  17. from django.views.decorators.csrf import csrf_exempt
  18. from django.utils.decorators import method_decorator
  19. from admin_data_views.typing import TableContext, ItemContext
  20. from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
  21. import archivebox
  22. from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
  23. from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
  24. from archivebox.config.configset import get_flat_config, get_config, get_all_configs
  25. from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
  26. from archivebox.misc.serve_static import serve_static_with_byterange_support
  27. from archivebox.misc.logging_util import printable_filesize
  28. from archivebox.search import query_search_index
  29. from archivebox.core.models import Snapshot
  30. from archivebox.core.forms import AddLinkForm
  31. from archivebox.crawls.models import Crawl
  32. from archivebox.hooks import get_enabled_plugins, get_plugin_name
  33. class HomepageView(View):
  34. def get(self, request):
  35. if request.user.is_authenticated:
  36. return redirect('/admin/core/snapshot/')
  37. if SERVER_CONFIG.PUBLIC_INDEX:
  38. return redirect('/public')
  39. return redirect(f'/admin/login/?next={request.path}')
class SnapshotView(View):
    # render static html index from filesystem archive/<timestamp>/index.html

    @staticmethod
    def render_live_index(request, snapshot):
        """Render the live (DB-backed) snapshot index page.

        Collects each plugin's ArchiveResult output (plus extra on-disk files
        discovered via snapshot.canonical_outputs()), then renders the
        'core/snapshot_live.html' template instead of the static index.html.
        """
        TITLE_LOADING_MSG = 'Not yet archived...'

        # Dict of plugin -> ArchiveResult object
        archiveresult_objects = {}
        # Dict of plugin -> result info dict (for template compatibility)
        archiveresults = {}

        results = snapshot.archiveresult_set.all()
        for result in results:
            embed_path = result.embed_path()
            abs_path = result.snapshot_dir / (embed_path or 'None')
            # only show succeeded results whose output file actually exists and is readable
            if (result.status == 'succeeded'
                and embed_path
                and os.access(abs_path, os.R_OK)
                and abs_path.exists()):
                # skip output directories that contain no files with extensions
                if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
                    continue

                # Store the full ArchiveResult object for template tags
                archiveresult_objects[result.plugin] = result
                result_info = {
                    'name': result.plugin,
                    'path': embed_path,
                    'ts': ts_to_date_str(result.end_ts),
                    'size': abs_path.stat().st_size or '?',
                    'result': result,  # Include the full object for template tags
                }
                archiveresults[result.plugin] = result_info

        # Use canonical_outputs for intelligent discovery
        # This method now scans ArchiveResults and uses smart heuristics
        canonical = snapshot.canonical_outputs()

        # Add any newly discovered outputs from canonical_outputs to archiveresults
        snap_dir = Path(snapshot.output_dir)
        for key, path in canonical.items():
            # only consider local '<plugin>_path' entries, skip remote http(s) URLs
            if not key.endswith('_path') or not path or path.startswith('http'):
                continue
            plugin_name = key.replace('_path', '')
            if plugin_name in archiveresults:
                continue  # Already have this from ArchiveResult
            file_path = snap_dir / path
            if not file_path.exists() or not file_path.is_file():
                continue
            try:
                file_size = file_path.stat().st_size
                if file_size >= 15_000:  # Only show files > 15KB
                    archiveresults[plugin_name] = {
                        'name': plugin_name,
                        'path': path,
                        'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
                        'size': file_size,
                        'result': None,
                    }
            except OSError:
                continue

        # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
        # Convert to base names for display ordering
        all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
        preferred_types = tuple(all_plugins)

        # preferred plugin names first, then any other result types found on disk
        all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)

        # the first preferred plugin that produced output becomes the main embed
        best_result = {'path': 'None', 'result': None}
        for result_type in preferred_types:
            if result_type in archiveresults:
                best_result = archiveresults[result_type]
                break

        snapshot_info = snapshot.to_dict(extended=True)

        # find the first WARC file on disk, if any, for the replay link
        try:
            warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
        except IndexError:
            warc_path = 'warc/'

        context = {
            **snapshot_info,
            **snapshot_info.get('canonical', {}),
            'title': htmlencode(
                snapshot.title
                or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG)
            ),
            'extension': snapshot.extension or 'html',
            'tags': snapshot.tags_str() or 'untagged',
            'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending',
            'status': 'archived' if snapshot.is_archived else 'not yet archived',
            'status_color': 'success' if snapshot.is_archived else 'danger',
            'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
            'warc_path': warc_path,
            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
            # known types keep their preferred ordering; unknown ones sort biggest-first
            'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
            'best_result': best_result,
            'snapshot': snapshot,  # Pass the snapshot object for template tags
        }
        return render(template_name='core/snapshot_live.html', request=request, context=context)

    def get(self, request, path):
        """Serve /archive/<slug>/<archivefile>.

        Resolution order:
          1. numeric slug -> look up Snapshot by exact timestamp or id prefix
          2. otherwise treat the whole path as a URL and match url/base_url
        Renders helpful HTML 404 pages for missing or ambiguous matches.
        """
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        snapshot = None

        try:
            slug, archivefile = path.split('/', 1)
        except (IndexError, ValueError):
            # no '/' in path -> default to serving the snapshot's index page
            slug, archivefile = path.split('/', 1)[0], 'index.html'

        # slug is a timestamp
        if slug.replace('.','').isdigit():

            # missing trailing slash -> redirect to index
            if '/' not in path:
                return redirect(f'{path}/index.html')

            try:
                try:
                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
                    if archivefile == 'index.html':
                        # if they requested snapshot index, serve live rendered template instead of static html
                        response = self.render_live_index(request, snapshot)
                    else:
                        response = serve_static_with_byterange_support(
                            request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
                        )
                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                    return response
                except Snapshot.DoesNotExist:
                    # no exact match: a prefix match on timestamp means the slug was ambiguous
                    if Snapshot.objects.filter(timestamp__startswith=slug).exists():
                        raise Snapshot.MultipleObjectsReturned
                    else:
                        raise
            except Snapshot.DoesNotExist:
                # Snapshot does not exist
                return HttpResponse(
                    format_html(
                        (
                            '<center><br/><br/><br/>'
                            'No Snapshot directories match the given timestamp/ID: <code>{}</code><br/><br/>'
                            'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                            '</center>'
                        ),
                        slug,
                        path,
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Snapshot.MultipleObjectsReturned:
                # several Snapshots share the timestamp prefix: list them for the user to pick
                snapshot_hrefs = mark_safe('<br/>').join(
                    format_html(
                        '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                        snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'),
                        snap.timestamp,
                        snap.timestamp,
                        snap.url,
                        snap.title_stripped[:64] or '',
                    )
                    for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at')
                )
                return HttpResponse(
                    format_html(
                        (
                            'Multiple Snapshots match the given timestamp/ID <code>{}</code><br/><pre>'
                        ),
                        slug,
                    ) + snapshot_hrefs + format_html(
                        (
                            '</pre><br/>'
                            'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
                        )
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Http404:
                assert snapshot  # (Snapshot.DoesNotExist is already handled above)
                # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                return HttpResponse(
                    format_html(
                        (
                            '<html><head>'
                            '<title>Snapshot Not Found</title>'
                            #'<script>'
                            #'setTimeout(() => { window.location.reload(); }, 5000);'
                            #'</script>'
                            '</head><body>'
                            '<center><br/><br/><br/>'
                            f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
                            f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, '
                            f'but no files have been saved yet in:<br/><b><a href="/archive/{snapshot.timestamp}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
                            '{}'
                            f'</code></b><br/><br/>'
                            'It\'s possible {} '
                            f'during the last capture on {str(snapshot.bookmarked_at).split(".")[0]},<br/>or that the archiving process has not completed yet.<br/>'
                            f'<pre><code># run this cmd to finish/retry archiving this Snapshot</code><br/>'
                            f'<code style="user-select: all; color: #333">archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                            '<div class="text-align: left; width: 100%; max-width: 400px">'
                            '<i><b>Next steps:</i></b><br/>'
                            f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                            f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                            f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                            f'- go to the <a href="/admin/core/snapshot/?id__exact={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                            '- or return to <a href="/" target="_top">the main index...</a></div>'
                            '</center>'
                            '</body></html>'
                        ),
                        archivefile if str(archivefile) != 'None' else '',
                        f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available',
                    ),
                    content_type="text/html",
                    status=404,
                )

        # slug is a URL
        try:
            try:
                # try exact match on full url / ID first
                snapshot = Snapshot.objects.get(
                    Q(url='http://' + path) | Q(url='https://' + path) | Q(id__icontains=path)
                )
            except Snapshot.DoesNotExist:
                # fall back to match on exact base_url
                try:
                    snapshot = Snapshot.objects.get(
                        Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
                    )
                except Snapshot.DoesNotExist:
                    # fall back to matching base_url as prefix
                    snapshot = Snapshot.objects.get(
                        Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                    )
            return redirect(f'/archive/{snapshot.timestamp}/index.html')
        except Snapshot.DoesNotExist:
            return HttpResponse(
                format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                        '</center>'
                    ),
                    base_url(path),
                    path if '://' in path else f'https://{path}',
                    path,
                ),
                content_type="text/html",
                status=404,
            )
        except Snapshot.MultipleObjectsReturned:
            # several Snapshots match the URL prefix / id fragment: list them all
            snapshot_hrefs = mark_safe('<br/>').join(
                format_html(
                    '{} <code style="font-size: 0.8em">{}</code> <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                    snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'),
                    str(snap.id)[:8],
                    snap.timestamp,
                    snap.timestamp,
                    snap.url,
                    snap.title_stripped[:64] or '',
                )
                for snap in Snapshot.objects.filter(
                    Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                    | Q(id__icontains=path)
                ).only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at')
            )
            return HttpResponse(
                format_html(
                    (
                        'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
                    ),
                    base_url(path),
                ) + snapshot_hrefs + format_html(
                    (
                        '</pre><br/>'
                        'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
                    )
                ),
                content_type="text/html",
                status=404,
            )
  308. class PublicIndexView(ListView):
  309. template_name = 'public_index.html'
  310. model = Snapshot
  311. paginate_by = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
  312. ordering = ['-bookmarked_at', '-created_at']
  313. def get_context_data(self, **kwargs):
  314. return {
  315. **super().get_context_data(**kwargs),
  316. 'VERSION': VERSION,
  317. 'COMMIT_HASH': SHELL_CONFIG.COMMIT_HASH,
  318. 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
  319. }
  320. def get_queryset(self, **kwargs):
  321. qs = super().get_queryset(**kwargs)
  322. query = self.request.GET.get('q', default = '').strip()
  323. if not query:
  324. return qs.distinct()
  325. query_type = self.request.GET.get('query_type')
  326. if not query_type or query_type == 'all':
  327. qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
  328. try:
  329. qs = qs | query_search_index(query)
  330. except Exception as err:
  331. print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
  332. elif query_type == 'fulltext':
  333. try:
  334. qs = qs | query_search_index(query)
  335. except Exception as err:
  336. print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
  337. elif query_type == 'meta':
  338. qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
  339. elif query_type == 'url':
  340. qs = qs.filter(Q(url__icontains=query))
  341. elif query_type == 'title':
  342. qs = qs.filter(Q(title__icontains=query))
  343. elif query_type == 'timestamp':
  344. qs = qs.filter(Q(timestamp__icontains=query))
  345. elif query_type == 'tags':
  346. qs = qs.filter(Q(tags__name__icontains=query))
  347. else:
  348. print(f'[!] Unknown value for query_type: "{query_type}"')
  349. return qs.distinct()
  350. def get(self, *args, **kwargs):
  351. if SERVER_CONFIG.PUBLIC_INDEX or self.request.user.is_authenticated:
  352. response = super().get(*args, **kwargs)
  353. return response
  354. else:
  355. return redirect(f'/admin/login/?next={self.request.path}')
@method_decorator(csrf_exempt, name='dispatch')
class AddView(UserPassesTestMixin, FormView):
    """Web UI 'Add' page: create a Crawl (and optional CrawlSchedule) from submitted URLs."""

    template_name = "add.html"
    form_class = AddLinkForm

    def get_initial(self):
        """Prefill the AddLinkForm with the 'url' GET parameter"""
        if self.request.method == 'GET':
            url = self.request.GET.get('url', None)
            if url:
                # default to https:// if the url was given without a scheme
                return {'url': url if '://' in url else f'https://{url}'}

        return super().get_initial()

    def test_func(self):
        # UserPassesTestMixin gate: anonymous submissions only when PUBLIC_ADD_VIEW is on
        return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated

    def get_context_data(self, **kwargs):
        from archivebox.core.models import Tag
        return {
            **super().get_context_data(**kwargs),
            'title': "Create Crawl",
            # We can't just call request.build_absolute_uri in the template, because it would include query parameters
            'absolute_add_path': self.request.build_absolute_uri(self.request.path),
            'VERSION': VERSION,
            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
            'stdout': '',
            'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
        }

    def form_valid(self, form):
        """Persist the submitted URLs to a sources file, then create a Crawl (+ optional schedule)."""
        urls = form.cleaned_data["url"]
        print(f'[+] Adding URL: {urls}')

        # Extract all form fields
        tag = form.cleaned_data["tag"]
        depth = int(form.cleaned_data["depth"])
        plugins = ','.join(form.cleaned_data.get("plugins", []))
        schedule = form.cleaned_data.get("schedule", "").strip()
        persona = form.cleaned_data.get("persona", "Default")
        overwrite = form.cleaned_data.get("overwrite", False)
        update = form.cleaned_data.get("update", False)
        index_only = form.cleaned_data.get("index_only", False)
        notes = form.cleaned_data.get("notes", "")
        custom_config = form.cleaned_data.get("config", {})

        from archivebox.config.permissions import HOSTNAME

        # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
        sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))

        # 2. create a new Crawl with the URLs from the file
        timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
        urls_content = sources_file.read_text()

        # Build complete config
        config = {
            'ONLY_NEW': not update,
            'INDEX_ONLY': index_only,
            'OVERWRITE': overwrite,
            'DEPTH': depth,
            'PLUGINS': plugins or '',
            'DEFAULT_PERSONA': persona or 'Default',
        }
        # Merge custom config overrides
        config.update(custom_config)

        crawl = Crawl.objects.create(
            urls=urls_content,
            max_depth=depth,
            tags_str=tag,
            notes=notes,
            label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
            created_by_id=self.request.user.pk,
            config=config
        )

        # 3. create a CrawlSchedule if schedule is provided
        if schedule:
            from archivebox.crawls.models import CrawlSchedule
            crawl_schedule = CrawlSchedule.objects.create(
                template=crawl,
                schedule=schedule,
                is_enabled=True,
                label=crawl.label,
                notes=f"Auto-created from add page. {notes}".strip(),
                created_by_id=self.request.user.pk,
            )
            # link the schedule back onto the crawl it was created from
            crawl.schedule = crawl_schedule
            crawl.save(update_fields=['schedule'])

        # 4. start the Orchestrator & wait until it completes
        # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
        # from archivebox.crawls.actors import CrawlActor
        # from archivebox.core.actors import SnapshotActor, ArchiveResultActor

        # rough count of starting URLs for the success message (counts '://' occurrences)
        rough_url_count = urls.count('://')

        # Build success message with schedule link if created
        schedule_msg = ""
        if schedule:
            schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"

        messages.success(
            self.request,
            mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
        )

        # Orchestrator (managed by supervisord) will pick up the queued crawl
        return redirect(crawl.admin_change_url)
  450. class HealthCheckView(View):
  451. """
  452. A Django view that renders plain text "OK" for service discovery tools
  453. """
  454. def get(self, request):
  455. """
  456. Handle a GET request
  457. """
  458. return HttpResponse(
  459. 'OK',
  460. content_type='text/plain',
  461. status=200
  462. )
  463. import json
  464. from django.http import JsonResponse
def live_progress_view(request):
    """Simple JSON endpoint for live progress status - used by admin progress monitor.

    Returns orchestrator state, global counts of Crawls/Snapshots/ArchiveResults
    by status, and a nested list of up to 10 active crawls (each with up to 5
    active snapshots and their per-plugin results).  Any failure is reported as
    a 500 JSON payload containing the error and traceback rather than raising.
    """
    try:
        from archivebox.workers.orchestrator import Orchestrator
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot, ArchiveResult
        from django.db.models import Case, When, Value, IntegerField

        # Get orchestrator status
        orchestrator_running = Orchestrator.is_running()
        total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0

        # Get model counts by status
        crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
        crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()

        # Get recent crawls (last 24 hours)
        from datetime import timedelta
        one_day_ago = timezone.now() - timedelta(days=1)
        crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()

        snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
        snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()

        archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
        archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
        archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
        archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()

        # Build hierarchical active crawls with nested snapshots and archive results
        from django.db.models import Prefetch
        active_crawls_qs = Crawl.objects.filter(
            status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
        ).prefetch_related(
            'snapshot_set',
            'snapshot_set__archiveresult_set',
        ).distinct().order_by('-modified_at')[:10]

        active_crawls = []
        for crawl in active_crawls_qs:
            # Get ALL snapshots for this crawl to count status (already prefetched)
            all_crawl_snapshots = list(crawl.snapshot_set.all())

            # Count snapshots by status from ALL snapshots
            total_snapshots = len(all_crawl_snapshots)
            completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
            started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
            pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)

            # Get only ACTIVE snapshots to display (limit to 5 most recent)
            active_crawl_snapshots = [
                s for s in all_crawl_snapshots
                if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
            ][:5]

            # Count URLs in the crawl (for when snapshots haven't been created yet)
            urls_count = 0
            if crawl.urls:
                urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')])

            # Calculate crawl progress
            crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0

            # Get active snapshots for this crawl (already prefetched)
            active_snapshots_for_crawl = []
            for snapshot in active_crawl_snapshots:
                # Get archive results for this snapshot (already prefetched)
                snapshot_results = snapshot.archiveresult_set.all()

                # Count in memory instead of DB queries
                total_plugins = len(snapshot_results)
                completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
                failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
                pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)

                # Calculate snapshot progress
                snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0

                # Get all extractor plugins for this snapshot (already prefetched, sort in Python)
                # Order: started first, then queued, then completed
                def plugin_sort_key(ar):
                    status_order = {
                        ArchiveResult.StatusChoices.STARTED: 0,
                        ArchiveResult.StatusChoices.QUEUED: 1,
                        ArchiveResult.StatusChoices.SUCCEEDED: 2,
                        ArchiveResult.StatusChoices.FAILED: 3,
                    }
                    # unknown statuses sort last; ties break alphabetically by plugin name
                    return (status_order.get(ar.status, 4), ar.plugin)

                all_plugins = [
                    {
                        'id': str(ar.id),
                        'plugin': ar.plugin,
                        'status': ar.status,
                    }
                    for ar in sorted(snapshot_results, key=plugin_sort_key)
                ]

                active_snapshots_for_crawl.append({
                    'id': str(snapshot.id),
                    'url': snapshot.url[:80],
                    'status': snapshot.status,
                    'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
                    'progress': snapshot_progress,
                    'total_plugins': total_plugins,
                    'completed_plugins': completed_plugins,
                    'failed_plugins': failed_plugins,
                    'pending_plugins': pending_plugins,
                    'all_plugins': all_plugins,
                })

            # Check if crawl can start (for debugging stuck crawls)
            can_start = bool(crawl.urls)
            urls_preview = crawl.urls[:60] if crawl.urls else None

            # Check if retry_at is in the future (would prevent worker from claiming)
            retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
            seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0

            active_crawls.append({
                'id': str(crawl.id),
                'label': str(crawl)[:60],
                'status': crawl.status,
                'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
                'progress': crawl_progress,
                'max_depth': crawl.max_depth,
                'urls_count': urls_count,
                'total_snapshots': total_snapshots,
                'completed_snapshots': completed_snapshots,
                'started_snapshots': started_snapshots,
                'failed_snapshots': 0,
                'pending_snapshots': pending_snapshots,
                'active_snapshots': active_snapshots_for_crawl,
                'can_start': can_start,
                'urls_preview': urls_preview,
                'retry_at_future': retry_at_future,
                'seconds_until_retry': seconds_until_retry,
            })

        return JsonResponse({
            'orchestrator_running': orchestrator_running,
            'total_workers': total_workers,
            'crawls_pending': crawls_pending,
            'crawls_started': crawls_started,
            'crawls_recent': crawls_recent,
            'snapshots_pending': snapshots_pending,
            'snapshots_started': snapshots_started,
            'archiveresults_pending': archiveresults_pending,
            'archiveresults_started': archiveresults_started,
            'archiveresults_succeeded': archiveresults_succeeded,
            'archiveresults_failed': archiveresults_failed,
            'active_crawls': active_crawls,
            'server_time': timezone.now().isoformat(),
        })
    except Exception as e:
        # never raise: the admin progress monitor expects JSON even on failure
        import traceback
        return JsonResponse({
            'error': str(e),
            'traceback': traceback.format_exc(),
            'orchestrator_running': False,
            'total_workers': 0,
            'crawls_pending': 0,
            'crawls_started': 0,
            'crawls_recent': 0,
            'snapshots_pending': 0,
            'snapshots_started': 0,
            'archiveresults_pending': 0,
            'archiveresults_started': 0,
            'archiveresults_succeeded': 0,
            'archiveresults_failed': 0,
            'active_crawls': [],
            'server_time': timezone.now().isoformat(),
        }, status=500)
  617. def find_config_section(key: str) -> str:
  618. CONFIGS = get_all_configs()
  619. if key in CONSTANTS_CONFIG:
  620. return 'CONSTANT'
  621. matching_sections = [
  622. section_id for section_id, section in CONFIGS.items() if key in dict(section)
  623. ]
  624. section = matching_sections[0] if matching_sections else 'DYNAMIC'
  625. return section
def find_config_default(key: str) -> str:
    """Return the default value of a config key as a display string.

    Constants return their value directly; pydantic-style configs are searched
    for a field default.  Callable defaults (lambdas) are rendered as their
    source expression instead of a repr.
    """
    CONFIGS = get_all_configs()
    if key in CONSTANTS_CONFIG:
        return str(CONSTANTS_CONFIG[key])
    default_val = None
    for config in CONFIGS.values():
        if key in dict(config):
            # prefer the pydantic model_fields entry when present, else the plain value
            default_field = getattr(config, 'model_fields', dict(config))[key]
            default_val = default_field.default if hasattr(default_field, 'default') else default_field
            break
    if isinstance(default_val, Callable):
        # show the lambda's body text instead of '<function ...>'
        default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
        # strip a trailing unbalanced ')' left over from the enclosing call site
        if default_val.count(')') > default_val.count('('):
            default_val = default_val[:-1]
    else:
        default_val = str(default_val)
    return default_val
  643. def find_config_type(key: str) -> str:
  644. from typing import get_type_hints, ClassVar
  645. CONFIGS = get_all_configs()
  646. for config in CONFIGS.values():
  647. if hasattr(config, key):
  648. # Try to get from pydantic model_fields first (more reliable)
  649. if hasattr(config, 'model_fields') and key in config.model_fields:
  650. field = config.model_fields[key]
  651. if hasattr(field, 'annotation'):
  652. try:
  653. return str(field.annotation.__name__)
  654. except AttributeError:
  655. return str(field.annotation)
  656. # Fallback to get_type_hints with proper namespace
  657. try:
  658. import typing
  659. namespace = {
  660. 'ClassVar': ClassVar,
  661. 'Optional': typing.Optional,
  662. 'Union': typing.Union,
  663. 'List': typing.List,
  664. 'Dict': typing.Dict,
  665. 'Path': Path,
  666. }
  667. type_hints = get_type_hints(config, globalns=namespace, localns=namespace)
  668. try:
  669. return str(type_hints[key].__name__)
  670. except AttributeError:
  671. return str(type_hints[key])
  672. except Exception:
  673. # If all else fails, return str
  674. pass
  675. return 'str'
  676. def key_is_safe(key: str) -> bool:
  677. for term in ('key', 'password', 'secret', 'token'):
  678. if term in key.lower():
  679. return False
  680. return True
  681. def find_config_source(key: str, merged_config: dict) -> str:
  682. """Determine where a config value comes from."""
  683. import os
  684. from archivebox.machine.models import Machine
  685. # Check if it's from archivebox.machine.config
  686. try:
  687. machine = Machine.current()
  688. if machine.config and key in machine.config:
  689. return 'Machine'
  690. except Exception:
  691. pass
  692. # Check if it's from environment variable
  693. if key in os.environ:
  694. return 'Environment'
  695. # Check if it's from archivebox.config.file
  696. from archivebox.config.configset import BaseConfigSet
  697. file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
  698. if key in file_config:
  699. return 'Config File'
  700. # Otherwise it's using the default
  701. return 'Default'
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
    """Admin-only table of every config key with its type, computed value, source, and default.

    Lists all config-set keys (in reversed section order) followed by the
    read-only CONSTANTS_CONFIG entries. Sensitive values are redacted via
    key_is_safe().
    """
    CONFIGS = get_all_configs()
    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
    # Get merged config that includes Machine.config overrides
    try:
        from archivebox.machine.models import Machine
        machine = Machine.current()
        merged_config = get_config()
    except Exception as e:
        # Fallback if Machine model not available
        merged_config = get_config()
        machine = None
    # column-name -> list-of-cells; rendered by render_with_table_view
    rows = {
        "Section": [],
        "Key": [],
        "Type": [],
        "Value": [],
        "Source": [],
        "Default": [],
        # "Documentation": [],
        # "Aliases": [],
    }
    for section_id, section in reversed(list(CONFIGS.items())):
        for key in dict(section).keys():
            rows['Section'].append(section_id)   # section.replace('_', ' ').title().replace(' Config', '')
            rows['Key'].append(ItemLink(key, key=key))
            rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
            # Use merged config value (includes machine overrides)
            actual_value = merged_config.get(key, getattr(section, key, None))
            rows['Value'].append(mark_safe(f'<code>{actual_value}</code>') if key_is_safe(key) else '******** (redacted)')
            # Show where the value comes from
            source = find_config_source(key, merged_config)
            source_colors = {
                'Machine': 'purple',
                'Environment': 'blue',
                'Config File': 'green',
                'Default': 'gray'
            }
            rows['Source'].append(format_html('<code style="color: {}">{}</code>', source_colors.get(source, 'gray'), source))
            # default value links to a GitHub code search for the key's definition
            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
            # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
            # rows['Aliases'].append(', '.join(find_config_aliases(key)))
    # append the hardcoded constants as a final read-only section
    section = 'CONSTANT'
    for key in CONSTANTS_CONFIG.keys():
        rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
        rows['Key'].append(ItemLink(key, key=key))
        rows['Type'].append(format_html('<code>{}</code>', getattr(type(CONSTANTS_CONFIG[key]), '__name__', str(CONSTANTS_CONFIG[key]))))
        rows['Value'].append(format_html('<code>{}</code>', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
        rows['Source'].append(mark_safe('<code style="color: gray">Constant</code>'))
        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
        # rows['Aliases'].append('')
    return TableContext(
        title="Computed Configuration Values",
        table=rows,
    )
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Admin-only detail page for a single config *key*.

    Shows the key's type, final computed value, which source layer supplies
    it, and an HTML breakdown of every layer (default, config file,
    environment, machine override) that defines it. Sensitive values are
    redacted via key_is_safe().
    """
    import os
    from archivebox.machine.models import Machine
    from archivebox.config.configset import BaseConfigSet
    CONFIGS = get_all_configs()
    FLAT_CONFIG = get_flat_config()
    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
    # Get merged config
    merged_config = get_config()
    # Determine all sources for this config value
    # collected as (source_label, value, css_color) tuples, lowest priority first
    sources_info = []
    # Default value
    default_val = find_config_default(key)
    if default_val:
        sources_info.append(('Default', default_val, 'gray'))
    # Config file value
    if CONSTANTS.CONFIG_FILE.exists():
        file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
        if key in file_config:
            sources_info.append(('Config File', file_config[key], 'green'))
    # Environment variable
    if key in os.environ:
        sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
    # Machine config
    machine = None
    machine_admin_url = None
    try:
        machine = Machine.current()
        machine_admin_url = f'/admin/machine/machine/{machine.id}/change/'
        if machine.config and key in machine.config:
            sources_info.append(('Machine', machine.config[key] if key_is_safe(key) else '********', 'purple'))
    except Exception:
        # machine record unavailable; page still renders without the Machine source/links
        pass
    # Final computed value
    # NOTE(review): CONFIGS is keyed by section id, so CONFIGS.get(key) is
    # presumably always None here — verify the intended fallback chain
    final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
    if not key_is_safe(key):
        final_value = '********'
    # Build sources display
    sources_html = '<br/>'.join([
        f'<b style="color: {color}">{source}:</b> <code>{value}</code>'
        for source, value, color in sources_info
    ])
    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
    aliases = []
    # pick the page header based on whether the key is a constant, a
    # file-configurable key, or a runtime-computed dynamic value
    if key in CONSTANTS_CONFIG:
        section_header = mark_safe(f'[CONSTANTS] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
    elif key in FLAT_CONFIG:
        section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}] &nbsp; <b><code style="color: lightgray">{key}</code></b>')
    else:
        section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
    return ItemContext(
        slug=key,
        title=key,
        data=[
            {
                "name": section_header,
                "description": None,
                "fields": {
                    'Key': key,
                    'Type': find_config_type(key),
                    'Value': final_value,
                    'Source': find_config_source(key, merged_config),
                },
                "help_texts": {
                    'Key': mark_safe(f'''
                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
                        <span style="display: {"inline" if aliases else "none"}">
                            Aliases: {", ".join(aliases)}
                        </span>
                    '''),
                    'Type': mark_safe(f'''
                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
                            See full definition in <code>archivebox/config</code>...
                        </a>
                    '''),
                    'Value': mark_safe(f'''
                        {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
                        <br/><hr/><br/>
                        <b>Configuration Sources (in priority order):</b><br/><br/>
                        {sources_html}
                        <br/><br/>
                        <p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
                            <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                            <br/><br/>
                            <code>archivebox config --set {key}="{
                                val.strip("'")
                                if (val := find_config_default(key)) else
                                (str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                            }"</code>
                        </p>
                    '''),
                    'Source': mark_safe(f'''
                        The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
                        <br/><br/>
                        Priority order (highest to lowest):
                        <ol>
                            <li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
                                {f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
                            </li>
                            <li><b style="color: blue">Environment</b> - Environment variables</li>
                            <li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
                            <li><b style="color: gray">Default</b> - Default value from code</li>
                        </ol>
                        {f'<br/><b>💡 Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
                    '''),
                },
            },
        ],
    )