admin_snapshots.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. __package__ = 'archivebox.core'
  2. import os
  3. from pathlib import Path
  4. from django.contrib import admin, messages
  5. from django.urls import path
  6. from django.utils.html import format_html, mark_safe
  7. from django.utils import timezone
  8. from django import forms
  9. from django.template import Template, RequestContext
  10. from django.contrib.admin.helpers import ActionForm
  11. from django.contrib.admin.widgets import FilteredSelectMultiple
  12. from archivebox.config import DATA_DIR
  13. from archivebox.config.common import SERVER_CONFIG
  14. from archivebox.misc.util import htmldecode, urldecode
  15. from archivebox.misc.paginators import AccelleratedPaginator
  16. from archivebox.misc.logging_util import printable_filesize
  17. from archivebox.search.admin import SearchResultsAdminMixin
  18. from archivebox.index.html import snapshot_icons
  19. from archivebox.extractors import archive_links
  20. from archivebox.base_models.admin import ABIDModelAdmin
  21. from archivebox.workers.tasks import bg_archive_links, bg_add
  22. from core.models import Tag
  23. from core.admin_tags import TagInline
  24. from core.admin_archiveresults import ArchiveResultInline, result_url
  # Extra template context merged into the changelist render by
  # SnapshotAdmin.changelist_view. Currently empty; an earlier version was:
  # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
  GLOBAL_CONTEXT: dict = dict()
  class SnapshotActionForm(ActionForm):
      """Changelist action form extended with an optional tag picker.

      The extra ``tags`` field is submitted together with the chosen admin
      action; the ``add_tags`` / ``remove_tags`` actions read its values from
      ``request.POST``.
      """

      tags = forms.ModelMultipleChoiceField(
          queryset=Tag.objects.all(),
          widget=FilteredSelectMultiple(
              verbose_name='core_tag__name',
              is_stacked=False,
          ),
          required=False,
          label='Edit tags',
      )

      # TODO: allow selecting actions for specific extractors? is this useful?
      # extractor = forms.ChoiceField(
      #     choices=ArchiveResult.EXTRACTOR_CHOICES,
      #     required=False,
      #     widget=forms.MultileChoiceField(attrs={'class': "form-control"})
      # )
  class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
      """Admin configuration for Snapshots: list columns, filters, bulk actions and inlines."""

      # --- changelist (list page) ---
      list_display = (
          'created_at',
          'title_str',
          'status',
          'files',
          'size',
          'url_str',
      )
      sort_fields = (
          'title_str',
          'url_str',
          'created_at',
          'status',
          'crawl',
      )

      # --- detail (change form) ---
      # NOTE: must be defined before `fields`, which unpacks it below.
      readonly_fields = (
          'admin_actions',
          'status_info',
          'tags_str',
          'imported_timestamp',
          'created_at',
          'modified_at',
          'downloaded_at',
          'abid_info',
          'link_dir',
      )
      search_fields = (
          'id',
          'url',
          'abid',
          'timestamp',
          'title',
          'tags__name',
      )
      list_filter = (
          'created_at',
          'downloaded_at',
          'archiveresult__status',
          'created_by',
          'tags__name',
      )
      fields = (
          'url',
          'title',
          'created_by',
          'bookmarked_at',
          'status',
          'retry_at',
          'crawl',
          *readonly_fields,
      )
      ordering = ['-created_at']

      # Bulk actions offered in the changelist dropdown (methods defined on this class).
      actions = [
          'add_tags',
          'remove_tags',
          'update_titles',
          'update_snapshots',
          'resnapshot_snapshot',
          'overwrite_snapshots',
          'delete_snapshots',
      ]
      inlines = [TagInline, ArchiveResultInline]

      # Page size comes from config, clamped to the range 5..5000.
      list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)

      action_form = SnapshotActionForm
      paginator = AccelleratedPaginator

      save_on_top = True
      show_full_result_count = False
  def changelist_view(self, request, extra_context=None):
      """Render the Snapshot changelist, retrying once with minimal context on failure.

      The incoming ``extra_context`` (if any) is merged with ``GLOBAL_CONTEXT``
      and handed to the parent view. If that raises, the error is reported to
      the user via the messages framework and the parent view is called again
      with only ``GLOBAL_CONTEXT``.
      """
      # Keep a reference to the current request on the admin instance
      # (other methods of this class may read ``self.request``).
      self.request = request
      context = extra_context if extra_context else {}
      try:
          return super().changelist_view(request, context | GLOBAL_CONTEXT)
      except Exception as exc:
          error_text = f'Error occurred while loading the page: {str(exc)} {request.GET} {request.POST}'
          self.message_user(request, error_text)
          return super().changelist_view(request, GLOBAL_CONTEXT)
  def get_urls(self):
      """Return the admin URL patterns with the custom ``grid/`` route listed first."""
      inherited_urls = super().get_urls()
      grid_route = path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
      # Custom routes go first so they are matched before the inherited patterns.
      return [grid_route] + inherited_urls
  72. # def get_queryset(self, request):
  73. # # tags_qs = SnapshotTag.objects.all().select_related('tag')
  74. # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
  75. # self.request = request
  76. # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
  @admin.display(
      description="Imported Timestamp"
  )
  def imported_timestamp(self, obj):
      """Render the snapshot's bookmarked date followed by its raw timestamp.

      This is a read-only display column (it is listed in ``readonly_fields``),
      so it is declared with ``admin.display`` rather than ``admin.action``.

      The template only needs the two values below, so a plain ``Context`` is
      used instead of ``RequestContext(self.request, ...)``: in this class
      ``self.request`` is only assigned by ``changelist_view``, so relying on
      it here could fail or pick up a request left over from another view.

      Returns:
          Safe HTML of the form ``<date> (<code><timestamp></code>)``; the
          values are auto-escaped by the template engine.
      """
      from django.template import Context  # local import: only used by this method

      context = Context({
          'bookmarked_date': obj.bookmarked,
          'timestamp': obj.timestamp,
      })
      html = Template("""{{bookmarked_date}} (<code>{{timestamp}}</code>)""")
      return mark_safe(html.render(context))
  87. # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S')
  88. # return f'{pretty_time} ({obj.timestamp})'
  89. # TODO: figure out a different way to do this; you can't nest forms, so this doesn't work
  90. # def action(self, obj):
  91. # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
  92. # # action: update_snapshots
  93. # # select_across: 0
  94. # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
  95. # return format_html(
  96. # '''
  97. # <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
  98. # <input type="hidden" name="csrfmiddlewaretoken" value="{}">
  99. # <input type="hidden" name="_selected_action" value="{}">
  100. # <button name="update_snapshots">Check</button>
  101. # <button name="update_titles">Pull title + favicon</button>
  102. # <button name="update_snapshots">Update</button>
  103. # <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
  104. # <button name="delete_snapshots">Permanently delete</button>
  105. # </form>
  106. # ''',
  107. # csrf.get_token(self.request),
  108. # obj.pk,
  109. # )
  def admin_actions(self, obj):
      """Return three shortcut buttons for a snapshot as safe HTML.

      The links point at the snapshot's summary page, its result-file index,
      and the admin changelist filtered down to this snapshot.
      """
      # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
      buttons_template = '''
          <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a> &nbsp; &nbsp;
          <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
          <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
      '''
      # Placeholders in order: summary link, files link, admin filter link.
      placeholder_values = (obj.timestamp, obj.timestamp, obj.pk)
      return format_html(buttons_template, *placeholder_values)
  def status_info(self, obj):
      """Return a one-block HTML summary of a snapshot's archive status.

      Shows whether it is archived, the output count and size, the favicon,
      and the status code / Server / Content-Type / extension, each falling
      back to ``'-'`` when the value is empty.
      """
      # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
      summary_template = '''
          Archived: {} ({} files {}) &nbsp; &nbsp;
          Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
          Status code: {} &nbsp; &nbsp;<br/>
          Server: {} &nbsp; &nbsp;
          Content type: {} &nbsp; &nbsp;
          Extension: {} &nbsp; &nbsp;
      '''
      archived_icon = '✅' if obj.is_archived else '❌'
      output_count = obj.num_outputs
      size_html = self.size(obj) or '0kb'
      favicon_url = f'/archive/{obj.timestamp}/favicon.ico'
      status_code = obj.status_code or '-'

      # Headers may be empty/None; any missing value is shown as '-'.
      response_headers = obj.headers
      server = (response_headers.get('Server') if response_headers else None) or '-'
      content_type = (response_headers.get('Content-Type') if response_headers else None) or '-'
      extension = obj.extension or '-'

      return format_html(
          summary_template,
          archived_icon,
          output_count,
          size_html,
          favicon_url,
          status_code,
          server,
          content_type,
          extension,
      )
  @admin.display(
      description='Title',
      ordering='title',
  )
  def title_str(self, obj):
      """Render the Title column: favicon link, title link, and tag badges.

      The title text is HTML- and URL-decoded, truncated to 128 characters,
      and replaced by ``'Pending...'`` when empty. Tags whose name is blank
      are skipped.
      """
      tag_links = []
      for tag in obj.tags.all():
          if not str(tag.name).strip():
              continue
          tag_links.append(
              format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
          )
      tags = ''.join(tag_links)

      archive_path = obj.archive_path
      best_title = obj.latest_title or obj.title
      status_class = 'fetched' if best_title else 'pending'
      title_text = urldecode(htmldecode(best_title or ''))[:128] or 'Pending...'

      title_html = format_html(
          '<a href="/{}">'
          '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
          '</a>'
          '<a href="/{}/index.html">'
          '<b class="status-{}">{}</b>'
          '</a>',
          archive_path,
          archive_path,
          archive_path,
          status_class,
          title_text,
      )
      # Each tag link was already escaped by format_html above, so the joined
      # string can be marked safe here.
      tags_html = mark_safe(f' <span class="tags">{tags}</span>')
      return title_html + tags_html
  @admin.display(
      description='Files Saved',
      # ordering='archiveresult_count',
  )
  def files(self, obj):
      """Render the "Files Saved" column by delegating to ``snapshot_icons``."""
      # return '-'
      icons_html = snapshot_icons(obj)
      return icons_html
  @admin.display(
      # ordering='archiveresult_count'
  )
  def size(self, obj):
      """Render the snapshot's archive size as a link to its archive path.

      The size is only looked up when ``index.html`` exists in the snapshot's
      ``link_dir``; otherwise (or when the size is falsy) a dimmed ``...``
      placeholder is shown. Sizes above 50 MiB are rendered in bold.
      """
      index_path = Path(obj.link_dir) / 'index.html'
      archive_size = obj.archive_size if os.access(index_path, os.F_OK) else False

      if not archive_size:
          size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
      else:
          size_txt = printable_filesize(archive_size)
          if archive_size > 50 * 1024 * 1024:  # 52428800 bytes
              size_txt = mark_safe(f'<b>{size_txt}</b>')

      link_template = '<a href="/{}" title="View all files">{}</a>'
      return format_html(link_template, obj.archive_path, size_txt)
  @admin.display(
      description='Original URL',
      ordering='url',
  )
  def url_str(self, obj):
      """Render the original URL as a link whose visible text is capped at 128 characters."""
      full_url = obj.url
      shown_url = obj.url[:128]
      link_template = '<a href="{}"><code style="user-select: all;">{}</code></a>'
      return format_html(link_template, full_url, shown_url)
  def grid_view(self, request, extra_context=None):
      """Render the changelist using the grid template.

      Temporarily overrides ``change_list_template``, ``list_per_page`` and
      ``list_max_show_all`` on this admin instance, delegates to
      ``changelist_view``, and then restores the previous values.

      The restore runs in a ``finally`` block so that an exception raised
      while rendering cannot leave the regular list view stuck with the grid
      settings.

      NOTE(review): the overrides live on the admin instance itself, so if the
      instance is shared between concurrent requests they may briefly observe
      the patched values — confirm whether that matters for this deployment.

      Args:
          request: the current HTTP request.
          extra_context: optional extra template context, passed through.

      Returns:
          Whatever ``changelist_view`` returns for this request.
      """
      # cl = self.get_changelist_instance(request)

      # Save before monkey patching to restore for changelist list view
      saved_change_list_template = self.change_list_template
      saved_list_per_page = self.list_per_page
      saved_list_max_show_all = self.list_max_show_all

      try:
          # Monkey patch here plus core_tags.py
          self.change_list_template = 'private_index_grid.html'
          self.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
          self.list_max_show_all = self.list_per_page

          # Call monkey patched view
          return self.changelist_view(request, extra_context=extra_context)
      finally:
          # Restore values, even if the view raised
          self.change_list_template = saved_change_list_template
          self.list_per_page = saved_list_per_page
          self.list_max_show_all = saved_list_max_show_all
  215. # for debugging, uncomment this to print all requests:
  216. # def changelist_view(self, request, extra_context=None):
  217. # print('[*] Got request', request.method, request.POST)
  218. # return super().changelist_view(request, extra_context=None)
  @admin.action(
      description="ℹ️ Get Title"
  )
  def update_titles(self, request, queryset):
      """Admin action: re-fetch title and favicon for the selected snapshots.

      One or two snapshots are processed synchronously in the request; three
      or more are handed to the background worker and the success message
      includes the link produced by ``result_url``.
      """
      links = []
      for snapshot in queryset:
          links.append(snapshot.as_link())
      link_count = len(links)

      if link_count >= 3:
          # too many to block the request on: run in a background worker
          result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
          messages.success(
              request,
              mark_safe(f"Title and favicon are updating in the background for {link_count} URLs. {result_url(result)}"),
          )
          return

      # run synchronously if there are only 1 or 2 links
      archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
      messages.success(request, f"Title and favicon have been fetched and saved for {link_count} URLs.")
  @admin.action(
      description="⬇️ Get Missing"
  )
  def update_snapshots(self, request, queryset):
      """Admin action: queue a non-overwriting background archive run for the selection."""
      links = []
      for snapshot in queryset:
          links.append(snapshot.as_link())

      task_kwargs = {"overwrite": False, "out_dir": DATA_DIR}
      result = bg_archive_links((links,), kwargs=task_kwargs)

      notice = mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}")
      messages.success(request, notice)
  @admin.action(
      description="🆕 Archive Again"
  )
  def resnapshot_snapshot(self, request, queryset):
      """Admin action: queue a brand-new snapshot for each selected URL.

      For every selected snapshot the URL's existing ``#fragment`` is dropped
      and replaced by the current timestamp, and the resulting URL is passed
      to ``bg_add`` together with the snapshot's tags.

      The success message links to the result of the last queued job. If the
      queryset is empty nothing is queued and a warning is shown instead
      (previously this path raised because ``result`` was never assigned).
      """
      result = None
      queued = 0
      for snapshot in queryset:
          timestamp = timezone.now().isoformat('T', 'seconds')
          new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
          result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
          queued += 1

      if not queued:
          messages.warning(request, "No Snapshots were selected, so nothing was queued.")
          return

      # Count comes from the loop above, which avoids an extra COUNT query.
      messages.success(
          request,
          mark_safe(f"Creating new fresh snapshots for {queued} URLs in the background. {result_url(result)}"),
      )
  @admin.action(
      description="🔄 Redo"
  )
  def overwrite_snapshots(self, request, queryset):
      """Admin action: queue an overwriting background archive run for the selection."""
      links = []
      for snapshot in queryset:
          links.append(snapshot.as_link())

      task_kwargs = {"overwrite": True, "out_dir": DATA_DIR}
      result = bg_archive_links((links,), kwargs=task_kwargs)

      notice = mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}")
      messages.success(request, notice)
  @admin.action(
      description="☠️ Delete"
  )
  def delete_snapshots(self, request, queryset):
      """Admin action: permanently remove the selected snapshots.

      Delegates to the CLI ``remove`` function with ``yes=True`` and
      ``delete=True``, then reports how many snapshots were selected.
      """
      from archivebox.cli.archivebox_remove import remove

      # Capture the count BEFORE removal. Counting afterwards would re-query
      # the database unless the queryset happened to be cached, and could then
      # report 0 for rows that were just deleted.
      num_selected = queryset.count()

      remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)

      messages.success(
          request,
          mark_safe(f"Succesfully deleted {num_selected} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
      )
  @admin.action(
      description="+"
  )
  def add_tags(self, request, queryset):
      """Admin action: attach the tags chosen in the action form to each selected snapshot."""
      # Values submitted by the action form's `tags` field
      # (presumably Tag primary keys — verify against the form widget).
      tags = request.POST.getlist('tags')
      print('[+] Adding tags', tags, 'to Snapshots', queryset)

      for snapshot in queryset:
          snapshot.tags.add(*tags)

      summary = f"Added {len(tags)} tags to {queryset.count()} Snapshots."
      messages.success(request, summary)
  @admin.action(
      description="–"
  )
  def remove_tags(self, request, queryset):
      """Admin action: detach the tags chosen in the action form from each selected snapshot."""
      # Values submitted by the action form's `tags` field
      # (presumably Tag primary keys — verify against the form widget).
      tags = request.POST.getlist('tags')
      print('[-] Removing tags', tags, 'to Snapshots', queryset)

      for snapshot in queryset:
          snapshot.tags.remove(*tags)

      summary = f"Removed {len(tags)} tags from {queryset.count()} Snapshots."
      messages.success(request, summary)