views.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. __package__ = 'archivebox.core'
  2. from io import StringIO
  3. from contextlib import redirect_stdout
  4. from django.shortcuts import render, redirect
  5. from django.http import HttpResponse, Http404
  6. from django.utils.html import format_html, mark_safe
  7. from django.views import View, static
  8. from django.views.generic.list import ListView
  9. from django.views.generic import FormView
  10. from django.db.models import Q
  11. from django.contrib.auth.mixins import UserPassesTestMixin
  12. from django.views.decorators.csrf import csrf_exempt
  13. from django.utils.decorators import method_decorator
  14. from core.models import Snapshot
  15. from core.forms import AddLinkForm
  16. from ..config import (
  17. OUTPUT_DIR,
  18. PUBLIC_INDEX,
  19. PUBLIC_SNAPSHOTS,
  20. PUBLIC_ADD_VIEW,
  21. VERSION,
  22. COMMIT_HASH,
  23. FOOTER_INFO,
  24. SNAPSHOTS_PER_PAGE,
  25. )
  26. from ..main import add
  27. from ..util import base_url, ansi_to_html
  28. from ..search import query_search_index
  class HomepageView(View):
      """Landing view: send the visitor on to the most useful page for them."""

      def get(self, request):
          """Redirect to the admin snapshot list, the public index, or the login page."""
          if request.user.is_authenticated:
              # Logged-in users go straight to the Snapshot admin list.
              destination = '/admin/core/snapshot/'
          elif PUBLIC_INDEX:
              # Anonymous visitors get the public index when it is enabled.
              destination = '/public'
          else:
              # Otherwise ask for a login, returning to this path afterwards.
              destination = f'/admin/login/?next={request.path}'
          return redirect(destination)
  class SnapshotView(View):
      """Serve archived Snapshot files, resolving the request path to a Snapshot.

      The first path segment (the "slug") is treated as a Snapshot timestamp when
      it consists only of digits and dots; the requested file is then served from
      that Snapshot's ``link_dir``. Any other path is treated as a URL and the
      request is redirected to the matching Snapshot's ``index.html``.
      Every failure mode is answered with a small HTML page and status 404.
      """

      # render static html index from filesystem archive/<timestamp>/index.html

      @staticmethod
      def _html_404(body):
          """Wrap an HTML fragment in a ``text/html`` response with status 404."""
          return HttpResponse(body, content_type="text/html", status=404)

      @classmethod
      def _multiple_matches_response(cls, candidates, kind, needle):
          """Render the "multiple Snapshots match" page.

          ``candidates`` is the Snapshot queryset to list (newest first), ``kind``
          is the label for what was searched ('timestamp/UUID' or 'URL') and
          ``needle`` is the user-supplied value, which is HTML-escaped.
          """
          snapshot_hrefs = mark_safe('<br/>').join(
              format_html(
                  '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                  snap.added.strftime('%Y-%m-%d %H:%M:%S'),
                  snap.timestamp,
                  snap.timestamp,
                  snap.url,
                  snap.title or '',
              )
              for snap in candidates.only('url', 'timestamp', 'title', 'added').order_by('-added')
          )
          heading = format_html(
              'Multiple Snapshots match the given {} <code>{}</code><br/><pre>',
              kind,
              needle,
          )
          # Static markup: a plain string avoids calling format_html() without
          # arguments, which newer Django versions deprecate.
          footer = (
              '</pre><br/>'
              'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
          )
          return cls._html_404(heading + snapshot_hrefs + footer)

      def get(self, request, path):
          """Handle ``GET <path>`` where path is ``<timestamp>/<file>`` or a URL.

          Returns the static file response (with a canonical ``Link`` header), a
          redirect (to login, or to the Snapshot index), or an HTML 404 page.
          """
          # Local import: quote() is only needed by this method.
          from urllib.parse import quote

          if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
              return redirect(f'/admin/login/?next={request.path}')

          if '/' in path:
              slug, archivefile = path.split('/', 1)
          else:
              slug, archivefile = path, 'index.html'

          # slug is a timestamp
          if slug.replace('.', '').isdigit():

              # missing trailing slash -> redirect to index
              if '/' not in path:
                  return redirect(f'{path}/index.html')

              # TODO: add support for archive.org-style URLs where timestamp may be a human-readable date
              # https://web.archivebox.io / web / 2022-01 / https://example.com
              # https://web.archivebox.io / web / 20220505103616 / https://example.com
              # https://web.archivebox.io / web / 2022-05-05__0:36:16 / https://example.com
              # use archivebox.util.parse_date (supports unix timestamps, iso date strings, and lots more etc.)

              try:
                  try:
                      snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
                      response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
                      response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                      return response
                  except Snapshot.DoesNotExist:
                      # No exact match: if the slug is a prefix of existing
                      # timestamps, offer those as choices instead of a bare 404.
                      if Snapshot.objects.filter(timestamp__startswith=slug).exists():
                          raise Snapshot.MultipleObjectsReturned
                      else:
                          raise
              except Snapshot.DoesNotExist:
                  # Snapshot does not exist
                  return self._html_404(format_html(
                      (
                          '<center><br/><br/><br/>'
                          'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
                          'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                          '</center>'
                      ),
                      slug,
                  ))
              except Snapshot.MultipleObjectsReturned:
                  return self._multiple_matches_response(
                      Snapshot.objects.filter(timestamp__startswith=slug),
                      'timestamp/UUID',
                      slug,
                  )
              except Http404:
                  # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                  # All dynamic values go through format_html's named arguments so
                  # they are HTML-escaped and cannot be mistaken for placeholders.
                  return self._html_404(format_html(
                      (
                          '<center><br/><br/><br/>'
                          'Snapshot <a href="/archive/{timestamp}/index.html" target="_top"><b><code>[{timestamp}]</code></b></a> exists in DB, but resource <b><code>{timestamp}/'
                          '{archivefile}'
                          '</code></b> does not exist in <a href="/archive/{timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
                          'Maybe this resource type is not available for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
                          '<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {timestamp}</code></pre><br/><br/>'
                          '<div style="text-align: left; width: 100%; max-width: 400px">'
                          '<i><b>Next steps:</b></i><br/>'
                          '- list all the <a href="/archive/{timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                          '- view the <a href="/archive/{timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                          '- go to the <a href="/admin/core/snapshot/{snapshot_id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                          '- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot_id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                          '- or return to <a href="/" target="_top">the main index...</a></div>'
                          '</center>'
                      ),
                      timestamp=snapshot.timestamp,
                      snapshot_id=snapshot.id,
                      archivefile=archivefile,
                  ))

          # slug is a URL
          try:
              try:
                  # try exact match on full url first
                  snapshot = Snapshot.objects.get(
                      Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
                  )
              except Snapshot.DoesNotExist:
                  # fall back to match on exact base_url
                  try:
                      snapshot = Snapshot.objects.get(
                          Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
                      )
                  except Snapshot.DoesNotExist:
                      # fall back to matching base_url as prefix
                      snapshot = Snapshot.objects.get(
                          Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                      )
              return redirect(f'/archive/{snapshot.timestamp}/index.html')
          except Snapshot.DoesNotExist:
              add_url = path if '://' in path else f'https://{path}'
              return self._html_404(format_html(
                  (
                      '<center><br/><br/><br/>'
                      'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                      'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                      '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                      '</center>'
                  ),
                  base_url(path),
                  # Percent-encode so characters such as '&' or '#' in the URL
                  # stay inside the single `url` query parameter.
                  quote(add_url, safe=''),
                  path,
              ))
          except Snapshot.MultipleObjectsReturned:
              return self._multiple_matches_response(
                  Snapshot.objects.filter(
                      Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                  ),
                  'URL',
                  base_url(path),
              )
  class PublicIndexView(ListView):
      """Paginated listing of Snapshots, newest first, with optional `?q=` search."""

      template_name = 'public_index.html'
      model = Snapshot
      paginate_by = SNAPSHOTS_PER_PAGE
      ordering = ['-added']

      def get_context_data(self, **kwargs):
          """Extend the ListView context with version and footer information."""
          context = dict(super().get_context_data(**kwargs))
          context.update({
              'VERSION': VERSION,
              'COMMIT_HASH': COMMIT_HASH,
              'FOOTER_INFO': FOOTER_INFO,
          })
          return context

      def get_queryset(self, **kwargs):
          """Return the distinct Snapshots to list, narrowed by the `q` GET parameter."""
          snapshots = super().get_queryset(**kwargs)
          query = self.request.GET.get('q')

          # No (or blank) search term: list everything.
          if not (query and query.strip()):
              return snapshots.distinct()

          matches_query = (
              Q(title__icontains=query)
              | Q(url__icontains=query)
              | Q(timestamp__icontains=query)
              | Q(tags__name__icontains=query)
          )
          snapshots = snapshots.filter(matches_query)

          # Best effort: merge in results from the search backend, but keep the
          # plain database matches if the backend call fails.
          try:
              snapshots = snapshots | query_search_index(query)
          except Exception as err:
              print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')

          return snapshots.distinct()

      def get(self, *args, **kwargs):
          """Render the index when it is public or the user is logged in; else redirect to login."""
          if not (PUBLIC_INDEX or self.request.user.is_authenticated):
              return redirect(f'/admin/login/?next={self.request.path}')
          return super().get(*args, **kwargs)
  @method_decorator(csrf_exempt, name='dispatch')
  class AddView(UserPassesTestMixin, FormView):
      """Form view that adds URLs to the archive and shows the command output."""

      template_name = "add.html"
      form_class = AddLinkForm

      def get_initial(self):
          """Prefill the AddLinkForm with the 'url' GET parameter"""
          prefill_url = self.request.GET.get('url', None) if self.request.method == 'GET' else None
          if not prefill_url:
              return super().get_initial()
          # Scheme-less values are assumed to be https.
          if '://' not in prefill_url:
              prefill_url = f'https://{prefill_url}'
          return {'url': prefill_url}

      def test_func(self):
          """Allow access when the add view is public or the user is logged in."""
          return PUBLIC_ADD_VIEW or self.request.user.is_authenticated

      def get_context_data(self, **kwargs):
          """Extend the form context with page title, version info and an empty stdout."""
          return {
              **super().get_context_data(**kwargs),
              'title': "Add URLs",
              # We can't just call request.build_absolute_uri in the template, because it would include query parameters
              'absolute_add_path': self.request.build_absolute_uri(self.request.path),
              'VERSION': VERSION,
              'FOOTER_INFO': FOOTER_INFO,
              'stdout': '',
          }

      def form_valid(self, form):
          """Run `add()` with the submitted values and re-render the page with its output."""
          cleaned = form.cleaned_data
          url = cleaned["url"]
          print(f'[+] Adding URL: {url}')

          input_kwargs = {
              "urls": url,
              "tag": cleaned["tag"],
              # Any depth value other than the string "0" is treated as depth 1.
              "depth": 0 if cleaned["depth"] == "0" else 1,
              "parser": cleaned["parser"],
              "update_all": False,
              "out_dir": OUTPUT_DIR,
          }
          extractors = ','.join(cleaned["archive_methods"])
          if extractors:
              input_kwargs["extractors"] = extractors

          add_stdout = StringIO()
          with redirect_stdout(add_stdout):
              add(**input_kwargs)

          # Read the buffer once and echo it only after the redirect has ended,
          # so the captured text is never written back into its own buffer
          # (which would duplicate it on the rendered page).
          captured = add_stdout.getvalue()
          print(captured)

          context = self.get_context_data()
          context.update({
              "stdout": ansi_to_html(captured.strip()),
              # Present a fresh, empty form for the next submission.
              "form": AddLinkForm(),
          })
          return render(template_name=self.template_name, request=self.request, context=context)
  class HealthCheckView(View):
      """
      Liveness endpoint: answers with the plain text "OK" for service discovery tools
      """

      def get(self, request):
          """
          Reply to a GET request with HTTP 200 and the body "OK"
          """
          return HttpResponse('OK', status=200, content_type='text/plain')