views.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. __package__ = 'archivebox.core'
  2. from io import StringIO
  3. from contextlib import redirect_stdout
  4. from django.shortcuts import render, redirect
  5. from django.http import HttpResponse, Http404
  6. from django.utils.html import format_html, mark_safe
  7. from django.views import View, static
  8. from django.views.generic.list import ListView
  9. from django.views.generic import FormView
  10. from django.db.models import Q
  11. from django.contrib.auth.mixins import UserPassesTestMixin
  12. from django.views.decorators.csrf import csrf_exempt
  13. from django.utils.decorators import method_decorator
  14. from core.models import Snapshot
  15. from core.forms import AddLinkForm
  16. from ..config import (
  17. OUTPUT_DIR,
  18. PUBLIC_INDEX,
  19. PUBLIC_SNAPSHOTS,
  20. PUBLIC_ADD_VIEW,
  21. VERSION,
  22. COMMIT_HASH,
  23. FOOTER_INFO,
  24. SNAPSHOTS_PER_PAGE,
  25. )
  26. from ..main import add
  27. from ..util import base_url, ansi_to_html
  28. from ..search import query_search_index
  29. class HomepageView(View):
  30. def get(self, request):
  31. if request.user.is_authenticated:
  32. return redirect('/admin/core/snapshot/')
  33. if PUBLIC_INDEX:
  34. return redirect('/public')
  35. return redirect(f'/admin/login/?next={request.path}')
  36. class SnapshotView(View):
  37. # render static html index from filesystem archive/<timestamp>/index.html
  38. def get(self, request, path):
  39. if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
  40. return redirect(f'/admin/login/?next={request.path}')
  41. try:
  42. slug, archivefile = path.split('/', 1)
  43. except (IndexError, ValueError):
  44. slug, archivefile = path.split('/', 1)[0], 'index.html'
  45. # slug is a timestamp
  46. if slug.replace('.','').isdigit():
  47. # missing trailing slash -> redirect to index
  48. if '/' not in path:
  49. return redirect(f'{path}/index.html')
  50. try:
  51. try:
  52. snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
  53. response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
  54. response["Link"] = f'<{snapshot.url}>; rel="canonical"'
  55. return response
  56. except Snapshot.DoesNotExist:
  57. if Snapshot.objects.filter(timestamp__startswith=slug).exists():
  58. raise Snapshot.MultipleObjectsReturned
  59. else:
  60. raise
  61. except Snapshot.DoesNotExist:
  62. # Snapshot does not exist
  63. return HttpResponse(
  64. format_html(
  65. (
  66. '<center><br/><br/><br/>'
  67. 'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
  68. 'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
  69. '</center>'
  70. ),
  71. slug,
  72. path,
  73. ),
  74. content_type="text/html",
  75. status=404,
  76. )
  77. except Snapshot.MultipleObjectsReturned:
  78. snapshot_hrefs = mark_safe('<br/>').join(
  79. format_html(
  80. '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
  81. snap.added.strftime('%Y-%m-%d %H:%M:%S'),
  82. snap.timestamp,
  83. snap.timestamp,
  84. snap.url,
  85. snap.title or '',
  86. )
  87. for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
  88. )
  89. return HttpResponse(
  90. format_html(
  91. (
  92. 'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/><pre>'
  93. ),
  94. slug,
  95. ) + snapshot_hrefs + format_html(
  96. (
  97. '</pre><br/>'
  98. 'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
  99. )
  100. ),
  101. content_type="text/html",
  102. status=404,
  103. )
  104. except Http404:
  105. # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
  106. return HttpResponse(
  107. format_html(
  108. (
  109. '<center><br/><br/><br/>'
  110. f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
  111. '{}'
  112. f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
  113. 'Maybe this resource type is not availabe for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
  114. f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
  115. '<div class="text-align: left; width: 100%; max-width: 400px">'
  116. '<i><b>Next steps:</i></b><br/>'
  117. f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
  118. f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
  119. f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
  120. f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
  121. '- or return to <a href="/" target="_top">the main index...</a></div>'
  122. '</center>'
  123. ),
  124. archivefile,
  125. ),
  126. content_type="text/html",
  127. status=404,
  128. )
  129. # slug is a URL
  130. try:
  131. try:
  132. # try exact match on full url first
  133. snapshot = Snapshot.objects.get(
  134. Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
  135. )
  136. except Snapshot.DoesNotExist:
  137. # fall back to match on exact base_url
  138. try:
  139. snapshot = Snapshot.objects.get(
  140. Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
  141. )
  142. except Snapshot.DoesNotExist:
  143. # fall back to matching base_url as prefix
  144. snapshot = Snapshot.objects.get(
  145. Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
  146. )
  147. return redirect(f'/archive/{snapshot.timestamp}/index.html')
  148. except Snapshot.DoesNotExist:
  149. return HttpResponse(
  150. format_html(
  151. (
  152. '<center><br/><br/><br/>'
  153. 'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
  154. 'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
  155. '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
  156. '</center>'
  157. ),
  158. base_url(path),
  159. path if '://' in path else f'https://{path}',
  160. path,
  161. ),
  162. content_type="text/html",
  163. status=404,
  164. )
  165. except Snapshot.MultipleObjectsReturned:
  166. snapshot_hrefs = mark_safe('<br/>').join(
  167. format_html(
  168. '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
  169. snap.added.strftime('%Y-%m-%d %H:%M:%S'),
  170. snap.timestamp,
  171. snap.timestamp,
  172. snap.url,
  173. snap.title or '',
  174. )
  175. for snap in Snapshot.objects.filter(
  176. Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
  177. ).only('url', 'timestamp', 'title', 'added').order_by('-added')
  178. )
  179. return HttpResponse(
  180. format_html(
  181. (
  182. 'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
  183. ),
  184. base_url(path),
  185. ) + snapshot_hrefs + format_html(
  186. (
  187. '</pre><br/>'
  188. 'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
  189. )
  190. ),
  191. content_type="text/html",
  192. status=404,
  193. )
  194. class PublicIndexView(ListView):
  195. template_name = 'public_index.html'
  196. model = Snapshot
  197. paginate_by = SNAPSHOTS_PER_PAGE
  198. ordering = ['-added']
  199. def get_context_data(self, **kwargs):
  200. return {
  201. **super().get_context_data(**kwargs),
  202. 'VERSION': VERSION,
  203. 'COMMIT_HASH': COMMIT_HASH,
  204. 'FOOTER_INFO': FOOTER_INFO,
  205. }
  206. def get_queryset(self, **kwargs):
  207. qs = super().get_queryset(**kwargs)
  208. query = self.request.GET.get('q')
  209. if query and query.strip():
  210. qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
  211. try:
  212. qs = qs | query_search_index(query)
  213. except Exception as err:
  214. print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
  215. return qs.distinct()
  216. def get(self, *args, **kwargs):
  217. if PUBLIC_INDEX or self.request.user.is_authenticated:
  218. response = super().get(*args, **kwargs)
  219. return response
  220. else:
  221. return redirect(f'/admin/login/?next={self.request.path}')
  222. @method_decorator(csrf_exempt, name='dispatch')
  223. class AddView(UserPassesTestMixin, FormView):
  224. template_name = "add.html"
  225. form_class = AddLinkForm
  226. def get_initial(self):
  227. """Prefill the AddLinkForm with the 'url' GET parameter"""
  228. if self.request.method == 'GET':
  229. url = self.request.GET.get('url', None)
  230. if url:
  231. return {'url': url if '://' in url else f'https://{url}'}
  232. return super().get_initial()
  233. def test_func(self):
  234. return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
  235. def get_context_data(self, **kwargs):
  236. return {
  237. **super().get_context_data(**kwargs),
  238. 'title': "Add URLs",
  239. # We can't just call request.build_absolute_uri in the template, because it would include query parameters
  240. 'absolute_add_path': self.request.build_absolute_uri(self.request.path),
  241. 'VERSION': VERSION,
  242. 'FOOTER_INFO': FOOTER_INFO,
  243. 'stdout': '',
  244. }
  245. def form_valid(self, form):
  246. url = form.cleaned_data["url"]
  247. print(f'[+] Adding URL: {url}')
  248. parser = form.cleaned_data["parser"]
  249. tag = form.cleaned_data["tag"]
  250. depth = 0 if form.cleaned_data["depth"] == "0" else 1
  251. extractors = ','.join(form.cleaned_data["archive_methods"])
  252. input_kwargs = {
  253. "urls": url,
  254. "tag": tag,
  255. "depth": depth,
  256. "parser": parser,
  257. "update_all": False,
  258. "out_dir": OUTPUT_DIR,
  259. }
  260. if extractors:
  261. input_kwargs.update({"extractors": extractors})
  262. add_stdout = StringIO()
  263. with redirect_stdout(add_stdout):
  264. add(**input_kwargs)
  265. print(add_stdout.getvalue())
  266. context = self.get_context_data()
  267. context.update({
  268. "stdout": ansi_to_html(add_stdout.getvalue().strip()),
  269. "form": AddLinkForm()
  270. })
  271. return render(template_name=self.template_name, request=self.request, context=context)
  272. class HealthCheckView(View):
  273. """
  274. A Django view that renders plain text "OK" for service discovery tools
  275. """
  276. def get(self, request):
  277. """
  278. Handle a GET request
  279. """
  280. return HttpResponse(
  281. 'OK',
  282. content_type='text/plain',
  283. status=200
  284. )