admin.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. __package__ = 'archivebox.crawls'
  2. import json
  3. from pathlib import Path
  4. from django import forms
  5. from django.utils.html import format_html, format_html_join, mark_safe
  6. from django.contrib import admin, messages
  7. from django.urls import path
  8. from django.http import JsonResponse
  9. from django.views.decorators.http import require_POST
  10. from django.db.models import Count, Q
  11. from archivebox import DATA_DIR
  12. from django_object_actions import action
  13. from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
  14. from archivebox.core.models import Snapshot
  15. from archivebox.crawls.models import Crawl, CrawlSchedule
  16. def render_snapshots_list(snapshots_qs, limit=20):
  17. """Render a nice inline list view of snapshots with status, title, URL, and progress."""
  18. snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate(
  19. total_results=Count('archiveresult'),
  20. succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
  21. failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
  22. )
  23. if not snapshots:
  24. return mark_safe('<div style="color: #666; font-style: italic; padding: 8px 0;">No Snapshots yet...</div>')
  25. # Status colors matching Django admin and progress monitor
  26. status_colors = {
  27. 'queued': ('#6c757d', '#f8f9fa'), # gray
  28. 'started': ('#856404', '#fff3cd'), # amber
  29. 'sealed': ('#155724', '#d4edda'), # green
  30. 'failed': ('#721c24', '#f8d7da'), # red
  31. }
  32. rows = []
  33. for snapshot in snapshots:
  34. status = snapshot.status or 'queued'
  35. color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa'))
  36. # Calculate progress
  37. total = snapshot.total_results
  38. done = snapshot.succeeded_results + snapshot.failed_results
  39. progress_pct = int((done / total) * 100) if total > 0 else 0
  40. progress_text = f'{done}/{total}' if total > 0 else '-'
  41. # Truncate title and URL
  42. title = (snapshot.title or 'Untitled')[:60]
  43. if len(snapshot.title or '') > 60:
  44. title += '...'
  45. url_display = snapshot.url[:50]
  46. if len(snapshot.url) > 50:
  47. url_display += '...'
  48. # Format date
  49. date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'
  50. rows.append(f'''
  51. <tr style="border-bottom: 1px solid #eee;">
  52. <td style="padding: 6px 8px; white-space: nowrap;">
  53. <span style="display: inline-block; padding: 2px 8px; border-radius: 10px;
  54. font-size: 11px; font-weight: 500; text-transform: uppercase;
  55. color: {color}; background: {bg};">{status}</span>
  56. </td>
  57. <td style="padding: 6px 8px; white-space: nowrap;">
  58. <a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
  59. <img src="/archive/{snapshot.timestamp}/favicon.ico"
  60. style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
  61. onerror="this.style.display='none'"/>
  62. </a>
  63. </td>
  64. <td style="padding: 6px 8px; max-width: 300px;">
  65. <a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
  66. title="{snapshot.title or 'Untitled'}">{title}</a>
  67. </td>
  68. <td style="padding: 6px 8px; max-width: 250px;">
  69. <a href="{snapshot.url}" target="_blank"
  70. style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
  71. title="{snapshot.url}">{url_display}</a>
  72. </td>
  73. <td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
  74. <div style="display: inline-flex; align-items: center; gap: 6px;">
  75. <div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
  76. <div style="width: {progress_pct}%; height: 100%;
  77. background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
  78. transition: width 0.3s;"></div>
  79. </div>
  80. <a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
  81. style="font-size: 11px; color: #417690; min-width: 35px; text-decoration: none;"
  82. title="View archive results">{progress_text}</a>
  83. </div>
  84. </td>
  85. <td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
  86. {date_str}
  87. </td>
  88. </tr>
  89. ''')
  90. total_count = snapshots_qs.count()
  91. footer = ''
  92. if total_count > limit:
  93. footer = f'''
  94. <tr>
  95. <td colspan="6" style="padding: 8px; text-align: center; color: #666; font-size: 12px; background: #f8f9fa;">
  96. Showing {limit} of {total_count} snapshots
  97. </td>
  98. </tr>
  99. '''
  100. return mark_safe(f'''
  101. <div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
  102. <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
  103. <thead>
  104. <tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
  105. <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Status</th>
  106. <th style="padding: 8px; text-align: left; font-weight: 600; color: #333; width: 24px;"></th>
  107. <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Title</th>
  108. <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
  109. <th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
  110. <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
  111. </tr>
  112. </thead>
  113. <tbody>
  114. {''.join(rows)}
  115. {footer}
  116. </tbody>
  117. </table>
  118. </div>
  119. ''')
  120. class CrawlAdminForm(forms.ModelForm):
  121. """Custom form for Crawl admin to render urls field as textarea."""
  122. class Meta:
  123. model = Crawl
  124. fields = '__all__'
  125. widgets = {
  126. 'urls': forms.Textarea(attrs={
  127. 'rows': 8,
  128. 'style': 'width: 100%; font-family: monospace; font-size: 13px;',
  129. 'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #',
  130. }),
  131. }
  132. class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
  133. form = CrawlAdminForm
  134. list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
  135. sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
  136. search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
  137. readonly_fields = ('created_at', 'modified_at', 'snapshots')
  138. fieldsets = (
  139. ('URLs', {
  140. 'fields': ('urls',),
  141. 'classes': ('card', 'wide'),
  142. }),
  143. ('Info', {
  144. 'fields': ('label', 'notes', 'tags_str'),
  145. 'classes': ('card',),
  146. }),
  147. ('Settings', {
  148. 'fields': ('max_depth', 'config'),
  149. 'classes': ('card',),
  150. }),
  151. ('Status', {
  152. 'fields': ('status', 'retry_at'),
  153. 'classes': ('card',),
  154. }),
  155. ('Relations', {
  156. 'fields': ('schedule', 'created_by'),
  157. 'classes': ('card',),
  158. }),
  159. ('Timestamps', {
  160. 'fields': ('created_at', 'modified_at'),
  161. 'classes': ('card',),
  162. }),
  163. ('Snapshots', {
  164. 'fields': ('snapshots',),
  165. 'classes': ('card', 'wide'),
  166. }),
  167. )
  168. list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at')
  169. ordering = ['-created_at', '-retry_at']
  170. list_per_page = 100
  171. actions = ["delete_selected_batched"]
  172. change_actions = ['recrawl']
  173. def get_queryset(self, request):
  174. """Optimize queries with select_related and annotations."""
  175. qs = super().get_queryset(request)
  176. return qs.select_related('schedule', 'created_by').annotate(
  177. num_snapshots_cached=Count('snapshot_set')
  178. )
  179. @admin.action(description='Delete selected crawls')
  180. def delete_selected_batched(self, request, queryset):
  181. """Delete crawls in a single transaction to avoid SQLite concurrency issues."""
  182. from django.db import transaction
  183. total = queryset.count()
  184. # Get list of IDs to delete first (outside transaction)
  185. ids_to_delete = list(queryset.values_list('pk', flat=True))
  186. # Delete everything in a single atomic transaction
  187. with transaction.atomic():
  188. deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete()
  189. messages.success(request, f'Successfully deleted {total} crawls ({deleted_count} total objects including related records).')
  190. @action(label='Recrawl', description='Create a new crawl with the same settings')
  191. def recrawl(self, request, obj):
  192. """Duplicate this crawl as a new crawl with the same URLs and settings."""
  193. from django.utils import timezone
  194. from django.shortcuts import redirect
  195. # Validate URLs (required for crawl to start)
  196. if not obj.urls:
  197. messages.error(request, 'Cannot recrawl: original crawl has no URLs.')
  198. return redirect('admin:crawls_crawl_change', obj.id)
  199. new_crawl = Crawl.objects.create(
  200. urls=obj.urls,
  201. max_depth=obj.max_depth,
  202. tags_str=obj.tags_str,
  203. config=obj.config,
  204. schedule=obj.schedule,
  205. label=f"{obj.label} (recrawl)" if obj.label else "",
  206. notes=obj.notes,
  207. created_by=request.user,
  208. status=Crawl.StatusChoices.QUEUED,
  209. retry_at=timezone.now(),
  210. )
  211. messages.success(
  212. request,
  213. f'Created new crawl {new_crawl.id} with the same settings. '
  214. f'It will start processing shortly.'
  215. )
  216. return redirect('admin:crawls_crawl_change', new_crawl.id)
  217. def num_snapshots(self, obj):
  218. # Use cached annotation from get_queryset to avoid N+1
  219. return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count())
  220. def snapshots(self, obj):
  221. return render_snapshots_list(obj.snapshot_set.all())
  222. @admin.display(description='Schedule', ordering='schedule')
  223. def schedule_str(self, obj):
  224. if not obj.schedule:
  225. return mark_safe('<i>None</i>')
  226. return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
  227. @admin.display(description='URLs', ordering='urls')
  228. def urls_preview(self, obj):
  229. first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
  230. return first_url[:80] + '...' if len(first_url) > 80 else first_url
  231. @admin.display(description='URLs')
  232. def urls_editor(self, obj):
  233. """Editor for crawl URLs."""
  234. widget_id = f'crawl_urls_{obj.pk}'
  235. # Check if it's a local file we can edit
  236. source_file = obj.get_file_path()
  237. is_file = source_file is not None
  238. file_contents = ""
  239. error = None
  240. if is_file and source_file:
  241. try:
  242. file_contents = source_file.read_text().strip()
  243. except Exception as e:
  244. error = f'Error reading {source_file}: {e}'
  245. # Escape for safe HTML embedding
  246. escaped_urls = (obj.urls or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
  247. escaped_file_contents = file_contents.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
  248. # Count lines for auto-expand logic
  249. line_count = len((obj.urls or '').split('\n'))
  250. file_line_count = len(file_contents.split('\n')) if file_contents else 0
  251. uri_rows = min(max(3, line_count), 10)
  252. html = f'''
  253. <div id="{widget_id}_container" style="max-width: 900px;">
  254. <!-- URLs input -->
  255. <div style="margin-bottom: 12px;">
  256. <label style="font-weight: bold; display: block; margin-bottom: 4px;">URLs (one per line):</label>
  257. <textarea id="{widget_id}_urls"
  258. style="width: 100%; font-family: monospace; font-size: 13px;
  259. padding: 8px; border: 1px solid #ccc; border-radius: 4px;
  260. resize: vertical;"
  261. rows="{uri_rows}"
  262. placeholder="https://example.com&#10;https://example2.com&#10;# Comments start with #"
  263. readonly>{escaped_urls}</textarea>
  264. <p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
  265. {line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only
  266. </p>
  267. </div>
  268. {"" if not is_file else f'''
  269. <!-- File contents preview (if first URL is a file://) -->
  270. <div style="margin-bottom: 8px;">
  271. <label style="font-weight: bold; display: block; margin-bottom: 4px;">
  272. File Preview: <code style="font-weight: normal; color: #666;">{source_file}</code>
  273. </label>
  274. {"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
  275. <textarea id="{widget_id}_file_preview"
  276. style="width: 100%; height: {min(400, max(150, file_line_count * 18))}px; font-family: monospace; font-size: 12px;
  277. padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
  278. readonly>{escaped_file_contents}</textarea>
  279. </div>
  280. '''}
  281. </div>
  282. '''
  283. return mark_safe(html)
  284. class CrawlScheduleAdmin(BaseModelAdmin):
  285. list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
  286. sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
  287. search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls')
  288. readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
  289. fieldsets = (
  290. ('Schedule Info', {
  291. 'fields': ('label', 'notes'),
  292. 'classes': ('card',),
  293. }),
  294. ('Configuration', {
  295. 'fields': ('schedule', 'template'),
  296. 'classes': ('card',),
  297. }),
  298. ('Metadata', {
  299. 'fields': ('created_by', 'created_at', 'modified_at'),
  300. 'classes': ('card',),
  301. }),
  302. ('Crawls', {
  303. 'fields': ('crawls',),
  304. 'classes': ('card', 'wide'),
  305. }),
  306. ('Snapshots', {
  307. 'fields': ('snapshots',),
  308. 'classes': ('card', 'wide'),
  309. }),
  310. )
  311. list_filter = ('created_by',)
  312. ordering = ['-created_at']
  313. list_per_page = 100
  314. actions = ["delete_selected"]
  315. @admin.display(description='Template', ordering='template')
  316. def template_str(self, obj):
  317. return format_html('<a href="{}">{}</a>', obj.template.admin_change_url, obj.template)
  318. def num_crawls(self, obj):
  319. return obj.crawl_set.count()
  320. def num_snapshots(self, obj):
  321. return obj.snapshot_set.count()
  322. def crawls(self, obj):
  323. return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
  324. (crawl.admin_change_url, crawl)
  325. for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
  326. )) or mark_safe('<i>No Crawls yet...</i>')
  327. def snapshots(self, obj):
  328. crawl_ids = obj.crawl_set.values_list('pk', flat=True)
  329. return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
  330. def register_admin(admin_site):
  331. admin_site.register(Crawl, CrawlAdmin)
  332. admin_site.register(CrawlSchedule, CrawlScheduleAdmin)