admin_snapshots.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. __package__ = 'archivebox.core'
  2. import os
  3. from pathlib import Path
  4. from django.contrib import admin, messages
  5. from django.urls import path
  6. from django.utils.html import format_html, mark_safe
  7. from django.utils import timezone
  8. from django import forms
  9. from django.template import Template, RequestContext
  10. from django.contrib.admin.helpers import ActionForm
  11. from django.contrib.admin.widgets import FilteredSelectMultiple
  12. from archivebox.config import DATA_DIR
  13. from archivebox.config.common import SERVER_CONFIG
  14. from archivebox.misc.util import htmldecode, urldecode
  15. from archivebox.misc.paginators import AccelleratedPaginator
  16. from archivebox.search.admin import SearchResultsAdminMixin
  17. from archivebox.logging_util import printable_filesize
  18. from archivebox.index.html import snapshot_icons
  19. from archivebox.extractors import archive_links
  20. from archivebox.main import remove
  21. from archivebox.abid_utils.admin import ABIDModelAdmin
  22. from archivebox.queues.tasks import bg_archive_links, bg_add
  23. from core.models import Tag
  24. from core.admin_tags import TagInline
  25. from core.admin_archiveresults import ArchiveResultInline, result_url
  26. # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
  27. GLOBAL_CONTEXT = {}
  class SnapshotActionForm(ActionForm):
      """Changelist action form extended with an optional multi-select of tags.

      The selected values are posted under the ``tags`` key, which the
      ``add_tags`` and ``remove_tags`` admin actions read from ``request.POST``.
      """

      tags = forms.ModelMultipleChoiceField(
          queryset=Tag.objects.all(),
          label='Edit tags',
          required=False,
          # Two-pane filterable picker, not stacked vertically.
          widget=FilteredSelectMultiple(verbose_name='core_tag__name', is_stacked=False),
      )

      # TODO: allow selecting actions for specific extractors? is this useful?
      # extractor = forms.ChoiceField(
      #     choices=ArchiveResult.EXTRACTOR_CHOICES,
      #     required=False,
      #     widget=forms.MultileChoiceField(attrs={'class': "form-control"})
      # )
  class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
      """Admin configuration for snapshots: changelist columns, filters,
      bulk actions, inlines and the read-only fields of the detail form."""

      # --- changelist columns, search and filtering ---
      list_display = (
          'created_at',
          'title_str',
          'files',
          'size',
          'url_str',
      )
      # NOTE(review): `sort_fields` is not a stock ModelAdmin option (Django's
      # own is `sortable_by`); presumably consumed by a parent class - confirm.
      sort_fields = ('title_str', 'url_str', 'created_at')
      # --- detail form ---
      readonly_fields = (
          'admin_actions',
          'status_info',
          'tags_str',
          'imported_timestamp',
          'created_at',
          'modified_at',
          'downloaded_at',
          'abid_info',
          'link_dir',
      )
      search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name')
      list_filter = (
          'created_at',
          'downloaded_at',
          'archiveresult__status',
          'created_by',
          'tags__name',
      )
      # Editable fields first, followed by every read-only field declared above.
      fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields)
      ordering = ['-created_at']
      # --- bulk actions and inlines ---
      actions = [
          'add_tags',
          'remove_tags',
          'update_titles',
          'update_snapshots',
          'resnapshot_snapshot',
          'overwrite_snapshots',
          'delete_snapshots',
      ]
      inlines = [TagInline, ArchiveResultInline]
      # --- pagination / presentation ---
      # Configured page size clamped to the range 5..5000.
      list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
      action_form = SnapshotActionForm
      paginator = AccelleratedPaginator
      save_on_top = True
      show_full_result_count = False
  def changelist_view(self, request, extra_context=None):
      """Render the changelist, retrying once with a minimal context on failure.

      The current request is stored on ``self.request``. The caller's
      ``extra_context`` is merged with ``GLOBAL_CONTEXT`` (the latter wins on
      key clashes). If rendering raises, the error is reported to the user as
      an error-level message and the view is invoked again with only
      ``GLOBAL_CONTEXT``; an exception from that second attempt propagates.
      """
      self.request = request
      extra_context = extra_context or {}
      try:
          return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
      except Exception as err:
          # View-level boundary: show the failure in the admin UI instead of a
          # bare 500 page. Flag it as an error so it is styled accordingly
          # (message_user defaults to INFO).
          self.message_user(
              request,
              f'Error occurred while loading the page: {err} {request.GET} {request.POST}',
              level=messages.ERROR,
          )
          return super().changelist_view(request, GLOBAL_CONTEXT)
  def get_urls(self):
      """Return the inherited admin URL patterns with the ``grid/`` route placed first."""
      inherited = super().get_urls()
      grid_route = path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
      return [grid_route] + inherited
  73. # def get_queryset(self, request):
  74. # # tags_qs = SnapshotTag.objects.all().select_related('tag')
  75. # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
  76. # self.request = request
  77. # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
  @admin.display(
      description="Imported Timestamp"
  )
  def imported_timestamp(self, obj):
      """Read-only field: the bookmarked date followed by the raw timestamp in ``<code>``.

      Rendered through a Django template so the date gets the template
      engine's usual formatting and both values are auto-escaped.
      """
      # Function-scope import keeps this method self-contained. A plain Context
      # is enough for two variables and avoids relying on ``self.request``,
      # which this class only assigns inside ``changelist_view``.
      from django.template import Context

      template = Template("""{{bookmarked_date}} (<code>{{timestamp}}</code>)""")
      context = Context({
          'bookmarked_date': obj.bookmarked,
          'timestamp': obj.timestamp,
      })
      # The template output is already escaped, so it can be marked safe.
      return mark_safe(template.render(context))
  88. # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S')
  89. # return f'{pretty_time} ({obj.timestamp})'
  90. # TODO: figure out a different way to do this; you can't nest forms, so this doesn't work
  91. # def action(self, obj):
  92. # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
  93. # # action: update_snapshots
  94. # # select_across: 0
  95. # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
  96. # return format_html(
  97. # '''
  98. # <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
  99. # <input type="hidden" name="csrfmiddlewaretoken" value="{}">
  100. # <input type="hidden" name="_selected_action" value="{}">
  101. # <button name="update_snapshots">Check</button>
  102. # <button name="update_titles">Pull title + favicon</button>
  103. # <button name="update_snapshots">Update</button>
  104. # <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
  105. # <button name="delete_snapshots">Permanently delete</button>
  106. # </form>
  107. # ''',
  108. # csrf.get_token(self.request),
  109. # obj.pk,
  110. # )
  def admin_actions(self, obj):
      """Read-only field with three link buttons for a snapshot.

      The links point at the snapshot's archive summary page, its result-file
      index, and the admin changelist filtered to this snapshot's primary key.
      """
      buttons_html = '''
          <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a> &nbsp; &nbsp;
          <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
          <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
          '''
      snapshot_ts = obj.timestamp
      # Placeholders, in order: summary link, result-files link, admin filter.
      return format_html(buttons_html, snapshot_ts, snapshot_ts, obj.pk)
  def status_info(self, obj):
      """Read-only field summarising a snapshot's archive status.

      Shows the archived flag, output count and size, the favicon, and the
      status code, ``Server`` / ``Content-Type`` headers and extension, with
      ``-`` standing in for any missing value.
      """
      # Missing/empty headers behave like an empty mapping: both lookups give '-'.
      response_headers = obj.headers or {}
      archived_mark = '✅' if obj.is_archived else '❌'
      favicon_src = f'/archive/{obj.timestamp}/favicon.ico'
      summary_html = '''
          Archived: {} ({} files {}) &nbsp; &nbsp;
          Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
          Status code: {} &nbsp; &nbsp;<br/>
          Server: {} &nbsp; &nbsp;
          Content type: {} &nbsp; &nbsp;
          Extension: {} &nbsp; &nbsp;
          '''
      return format_html(
          summary_html,
          archived_mark,
          obj.num_outputs,
          self.size(obj) or '0kb',
          favicon_src,
          obj.status_code or '-',
          response_headers.get('Server') or '-',
          response_headers.get('Content-Type') or '-',
          obj.extension or '-',
      )
  @admin.display(
      description='Title',
      ordering='title',
  )
  def title_str(self, obj):
      """Changelist "Title" column: favicon link, linked title and tag badges.

      The title is ``obj.latest_title`` falling back to ``obj.title``; it is
      HTML- and URL-decoded, cut to 128 characters, and replaced by
      ``Pending...`` (with the ``status-pending`` class) when empty. Tags with
      a blank name are skipped.
      """
      # Resolve the title once; it drives both the status class and the text.
      title = obj.latest_title or obj.title

      # Each badge is escaped by format_html, so the joined result is safe HTML.
      tag_badges = mark_safe(''.join(
          format_html(
              '<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ',
              tag.pk,
              tag.name,
          )
          for tag in obj.tags.all()
          if str(tag.name).strip()
      ))

      return format_html(
          '<a href="/{}">'
          '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
          '</a>'
          '<a href="/{}/index.html">'
          '<b class="status-{}">{}</b>'
          '</a>'
          ' <span class="tags">{}</span>',
          obj.archive_path,
          obj.archive_path,
          obj.archive_path,
          'fetched' if title else 'pending',
          urldecode(htmldecode(title or ''))[:128] or 'Pending...',
          tag_badges,
      )
  @admin.display(
      description='Files Saved',
      # ordering='archiveresult_count',
  )
  def files(self, obj):
      """Changelist "Files Saved" column; rendering is delegated to ``snapshot_icons``."""
      icons_html = snapshot_icons(obj)
      return icons_html
  @admin.display(
      # ordering='archiveresult_count'
  )
  def size(self, obj):
      """Changelist size column: archive size as a link to the snapshot's files.

      The size is only read when ``index.html`` exists in the snapshot's
      ``link_dir``; otherwise (or when the size is zero) a dimmed ``...``
      placeholder is shown. Sizes above 50 MiB are rendered in bold.
      """
      bold_above_bytes = 50 * 1024 * 1024  # 52428800

      has_index = os.access(Path(obj.link_dir) / 'index.html', os.F_OK)
      byte_count = obj.archive_size if has_index else 0

      if not byte_count:
          label = mark_safe('<span style="opacity: 0.3">...</span>')
      else:
          label = printable_filesize(byte_count)
          if byte_count > bold_above_bytes:
              label = mark_safe(f'<b>{label}</b>')

      return format_html('<a href="/{}" title="View all files">{}</a>', obj.archive_path, label)
  @admin.display(
      description='Original URL',
      ordering='url',
  )
  def url_str(self, obj):
      """Changelist "Original URL" column: the full URL as link target, with
      at most its first 128 characters shown as the selectable label."""
      target = obj.url
      label = target[:128]
      return format_html('<a href="{}"><code style="user-select: all;">{}</code></a>', target, label)
  def grid_view(self, request, extra_context=None):
      """Serve the changelist through the grid template.

      Temporarily overrides ``change_list_template``, ``list_per_page`` and
      ``list_max_show_all`` on this admin instance, delegates to
      ``changelist_view``, and restores the previous values before returning,
      including when the delegated view raises.
      """
      # Save before monkey patching to restore for the regular changelist view.
      saved_settings = (
          self.change_list_template,
          self.list_per_page,
          self.list_max_show_all,
      )

      # Monkey patch here plus core_tags.py
      self.change_list_template = 'private_index_grid.html'
      self.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
      self.list_max_show_all = self.list_per_page

      try:
          return self.changelist_view(request, extra_context=extra_context)
      finally:
          # Restore unconditionally: without this, an exception would leave the
          # admin permanently switched to the grid template and page size.
          # NOTE(review): the overrides live on the admin instance itself, so
          # concurrent requests could observe them - confirm this is acceptable.
          (
              self.change_list_template,
              self.list_per_page,
              self.list_max_show_all,
          ) = saved_settings
  216. # for debugging, uncomment this to print all requests:
  217. # def changelist_view(self, request, extra_context=None):
  218. # print('[*] Got request', request.method, request.POST)
  219. # return super().changelist_view(request, extra_context=None)
  @admin.action(
      description="ℹ️ Get Title"
  )
  def update_titles(self, request, queryset):
      """Admin action: re-fetch title and favicon for the selected snapshots.

      Fewer than three snapshots are processed inline before responding;
      three or more are handed to ``bg_archive_links`` and the success message
      includes the link produced by ``result_url``.
      """
      links = [snapshot.as_link() for snapshot in queryset]

      if len(links) >= 3:
          # Larger selections run in a background worker.
          result = bg_archive_links(
              (links,),
              kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR},
          )
          notice = mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}")
          messages.success(request, notice)
          return

      # One or two links: run synchronously.
      archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
      messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
  @admin.action(
      description="⬇️ Get Missing"
  )
  def update_snapshots(self, request, queryset):
      """Admin action: queue a non-overwriting archive run for the selected
      snapshots via ``bg_archive_links`` and report the job link to the user."""
      links = [snapshot.as_link() for snapshot in queryset]
      job_options = {"overwrite": False, "out_dir": DATA_DIR}
      result = bg_archive_links((links,), kwargs=job_options)
      notice = mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}")
      messages.success(request, notice)
  @admin.action(
      description="🆕 Archive Again"
  )
  def resnapshot_snapshot(self, request, queryset):
      """Admin action: queue a fresh snapshot for each selected snapshot's URL.

      Each URL has its fragment replaced by ``#<current ISO timestamp>`` and is
      submitted through ``bg_add`` together with the snapshot's tags. The
      success message links to the last queued job only.
      """
      queued = 0
      result = None
      for snapshot in queryset:
          timestamp = timezone.now().isoformat('T', 'seconds')
          new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
          result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
          queued += 1

      if not queued:
          # Nothing was iterated, so there is no job to link to; bail out
          # instead of referencing a result that was never produced.
          messages.warning(request, "No Snapshots were selected, nothing was queued.")
          return

      # Report the number actually queued rather than re-counting the queryset.
      messages.success(
          request,
          mark_safe(f"Creating new fresh snapshots for {queued} URLs in the background. {result_url(result)}"),
      )
  @admin.action(
      description="🔄 Redo"
  )
  def overwrite_snapshots(self, request, queryset):
      """Admin action: queue an overwriting archive run for the selected
      snapshots via ``bg_archive_links`` and report the job link to the user."""
      links = [snapshot.as_link() for snapshot in queryset]
      job_options = {"overwrite": True, "out_dir": DATA_DIR}
      result = bg_archive_links((links,), kwargs=job_options)
      notice = mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}")
      messages.success(request, notice)
  @admin.action(
      description="☠️ Delete"
  )
  def delete_snapshots(self, request, queryset):
      """Admin action: delete the selected snapshots via ``remove`` and report
      how many were selected for deletion."""
      # Count before removing: counting afterwards re-runs the query, and with
      # the rows gone (remove is called with delete=True) it would report 0.
      deleted_count = queryset.count()
      remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)
      messages.success(
          request,
          mark_safe(f"Succesfully deleted {deleted_count} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
      )
  @admin.action(
      description="+"
  )
  def add_tags(self, request, queryset):
      """Admin action: attach the tags posted under ``tags`` to every selected
      snapshot, log the operation to stdout and confirm it to the user."""
      tags = request.POST.getlist('tags')
      print('[+] Adding tags', tags, 'to Snapshots', queryset)

      for snapshot in queryset:
          snapshot.tags.add(*tags)

      summary = f"Added {len(tags)} tags to {queryset.count()} Snapshots."
      messages.success(request, summary)
  @admin.action(
      description="–"
  )
  def remove_tags(self, request, queryset):
      """Admin action: detach the tags posted under ``tags`` from every selected
      snapshot, log the operation to stdout and confirm it to the user."""
      tags = request.POST.getlist('tags')
      print('[-] Removing tags', tags, 'to Snapshots', queryset)

      for snapshot in queryset:
          snapshot.tags.remove(*tags)

      summary = f"Removed {len(tags)} tags from {queryset.count()} Snapshots."
      messages.success(request, summary)