|
|
@@ -17,7 +17,7 @@ from django_object_actions import action
|
|
|
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
|
|
|
|
|
from core.models import Snapshot
|
|
|
-from crawls.models import Seed, Crawl, CrawlSchedule
|
|
|
+from crawls.models import Crawl, CrawlSchedule
|
|
|
|
|
|
|
|
|
def render_snapshots_list(snapshots_qs, limit=20):
|
|
|
@@ -136,100 +136,24 @@ def render_snapshots_list(snapshots_qs, limit=20):
|
|
|
''')
|
|
|
|
|
|
|
|
|
-class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
- list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
|
|
|
- sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
|
|
- search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
|
|
-
|
|
|
- readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
|
|
|
-
|
|
|
- fieldsets = (
|
|
|
- ('Source', {
|
|
|
- 'fields': ('uri', 'contents'),
|
|
|
- 'classes': ('card', 'wide'),
|
|
|
- }),
|
|
|
- ('Info', {
|
|
|
- 'fields': ('label', 'notes', 'tags_str'),
|
|
|
- 'classes': ('card',),
|
|
|
- }),
|
|
|
- ('Settings', {
|
|
|
- 'fields': ('extractor', 'config'),
|
|
|
- 'classes': ('card',),
|
|
|
- }),
|
|
|
- ('Metadata', {
|
|
|
- 'fields': ('created_by', 'created_at', 'modified_at'),
|
|
|
- 'classes': ('card',),
|
|
|
- }),
|
|
|
- ('Crawls', {
|
|
|
- 'fields': ('scheduled_crawls', 'crawls'),
|
|
|
- 'classes': ('card',),
|
|
|
- }),
|
|
|
- ('Snapshots', {
|
|
|
- 'fields': ('snapshots',),
|
|
|
- 'classes': ('card',),
|
|
|
- }),
|
|
|
- )
|
|
|
-
|
|
|
- list_filter = ('extractor', 'created_by')
|
|
|
- ordering = ['-created_at']
|
|
|
- list_per_page = 100
|
|
|
- actions = ["delete_selected"]
|
|
|
-
|
|
|
- def num_crawls(self, obj):
|
|
|
- return obj.crawl_set.count()
|
|
|
-
|
|
|
- def num_snapshots(self, obj):
|
|
|
- return obj.snapshot_set.count()
|
|
|
-
|
|
|
- def scheduled_crawls(self, obj):
|
|
|
- return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
|
|
- (scheduledcrawl.admin_change_url, scheduledcrawl)
|
|
|
- for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
|
|
|
- )) or mark_safe('<i>No Scheduled Crawls yet...</i>')
|
|
|
-
|
|
|
- def crawls(self, obj):
|
|
|
- return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
|
|
- (crawl.admin_change_url, crawl)
|
|
|
- for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
|
|
- )) or mark_safe('<i>No Crawls yet...</i>')
|
|
|
-
|
|
|
- def snapshots(self, obj):
|
|
|
- return render_snapshots_list(obj.snapshot_set.all())
|
|
|
-
|
|
|
- def contents(self, obj):
|
|
|
- source_file = obj.get_file_path()
|
|
|
- if source_file:
|
|
|
- contents = ""
|
|
|
- try:
|
|
|
- contents = source_file.read_text().strip()[:14_000]
|
|
|
- except Exception as e:
|
|
|
- contents = f'Error reading {source_file}: {e}'
|
|
|
-
|
|
|
- return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
|
|
|
-
|
|
|
- return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
- list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
|
|
|
- sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
|
|
|
- search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
|
|
+ list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
|
|
|
+ sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
|
|
|
+ search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
|
|
|
|
|
|
- readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
|
|
|
+ readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor')
|
|
|
|
|
|
fieldsets = (
|
|
|
('URLs', {
|
|
|
- 'fields': ('seed_urls_editor',),
|
|
|
+ 'fields': ('urls_editor',),
|
|
|
'classes': ('card', 'wide'),
|
|
|
}),
|
|
|
('Info', {
|
|
|
- 'fields': ('label', 'notes'),
|
|
|
+ 'fields': ('label', 'notes', 'tags_str'),
|
|
|
'classes': ('card',),
|
|
|
}),
|
|
|
('Settings', {
|
|
|
- 'fields': ('max_depth', 'config'),
|
|
|
+ 'fields': ('max_depth', 'extractor', 'config'),
|
|
|
'classes': ('card',),
|
|
|
}),
|
|
|
('Status', {
|
|
|
@@ -237,7 +161,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
'classes': ('card',),
|
|
|
}),
|
|
|
('Relations', {
|
|
|
- 'fields': ('seed', 'schedule', 'created_by'),
|
|
|
+ 'fields': ('schedule', 'created_by'),
|
|
|
'classes': ('card',),
|
|
|
}),
|
|
|
('Timestamps', {
|
|
|
@@ -250,7 +174,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
}),
|
|
|
)
|
|
|
|
|
|
- list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
|
|
+ list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
|
|
|
ordering = ['-created_at', '-retry_at']
|
|
|
list_per_page = 100
|
|
|
actions = ["delete_selected"]
|
|
|
@@ -258,23 +182,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
|
|
|
@action(label='Recrawl', description='Create a new crawl with the same settings')
|
|
|
def recrawl(self, request, obj):
|
|
|
- """Duplicate this crawl as a new crawl with the same seed and settings."""
|
|
|
+ """Duplicate this crawl as a new crawl with the same URLs and settings."""
|
|
|
from django.utils import timezone
|
|
|
from django.shortcuts import redirect
|
|
|
|
|
|
- # Validate seed has a URI (required for crawl to start)
|
|
|
- if not obj.seed:
|
|
|
- messages.error(request, 'Cannot recrawl: original crawl has no seed.')
|
|
|
- return redirect('admin:crawls_crawl_change', obj.id)
|
|
|
-
|
|
|
- if not obj.seed.uri:
|
|
|
- messages.error(request, 'Cannot recrawl: seed has no URI.')
|
|
|
+ # Validate URLs (required for crawl to start)
|
|
|
+ if not obj.urls:
|
|
|
+ messages.error(request, 'Cannot recrawl: original crawl has no URLs.')
|
|
|
return redirect('admin:crawls_crawl_change', obj.id)
|
|
|
|
|
|
new_crawl = Crawl.objects.create(
|
|
|
- seed=obj.seed,
|
|
|
urls=obj.urls,
|
|
|
+ extractor=obj.extractor,
|
|
|
max_depth=obj.max_depth,
|
|
|
+ tags_str=obj.tags_str,
|
|
|
config=obj.config,
|
|
|
schedule=obj.schedule,
|
|
|
label=f"{obj.label} (recrawl)" if obj.label else "",
|
|
|
@@ -292,43 +213,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
|
|
|
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
|
|
|
|
|
- def get_urls(self):
|
|
|
- urls = super().get_urls()
|
|
|
- custom_urls = [
|
|
|
- path('<path:object_id>/save_seed_contents/',
|
|
|
- self.admin_site.admin_view(self.save_seed_contents_view),
|
|
|
- name='crawls_crawl_save_seed_contents'),
|
|
|
- ]
|
|
|
- return custom_urls + urls
|
|
|
-
|
|
|
- def save_seed_contents_view(self, request, object_id):
|
|
|
- """Handle saving seed file contents via AJAX."""
|
|
|
- if request.method != 'POST':
|
|
|
- return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
|
|
|
-
|
|
|
- try:
|
|
|
- crawl = Crawl.objects.get(pk=object_id)
|
|
|
- except Crawl.DoesNotExist:
|
|
|
- return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
|
|
|
-
|
|
|
- source_file = crawl.seed.get_file_path() if crawl.seed else None
|
|
|
- if not source_file:
|
|
|
- return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
|
|
|
-
|
|
|
- try:
|
|
|
- data = json.loads(request.body)
|
|
|
- contents = data.get('contents', '')
|
|
|
- except json.JSONDecodeError:
|
|
|
- return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
|
|
|
-
|
|
|
- try:
|
|
|
- # Ensure parent directory exists
|
|
|
- source_file.parent.mkdir(parents=True, exist_ok=True)
|
|
|
- source_file.write_text(contents)
|
|
|
- return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
|
|
|
- except Exception as e:
|
|
|
- return JsonResponse({'success': False, 'error': str(e)}, status=500)
|
|
|
-
|
|
|
def num_snapshots(self, obj):
    """Return the number of snapshots attached to ``obj`` via ``snapshot_set``."""
    related_snapshots = obj.snapshot_set
    return related_snapshots.count()
|
|
|
|
|
|
@@ -341,163 +225,68 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
return mark_safe('<i>None</i>')
|
|
|
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
|
|
|
|
|
- @admin.display(description='Seed', ordering='seed')
|
|
|
- def seed_str(self, obj):
|
|
|
- if not obj.seed:
|
|
|
- return mark_safe('<i>None</i>')
|
|
|
- return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
|
|
|
+ @admin.display(description='URLs', ordering='urls')
|
|
|
def urls_preview(self, obj, max_length=80):
    """Return the first URL of ``obj``, shortened for the changelist column.

    Args:
        obj: object exposing ``get_urls_list()``, which returns a sequence of
            URL strings.
        max_length: maximum number of characters kept before ``'...'`` is
            appended (defaults to 80, the previously hard-coded width).

    Returns:
        ``''`` when there are no URLs, the first URL unchanged when it is at
        most ``max_length`` characters long, otherwise its first
        ``max_length`` characters followed by ``'...'``.
    """
    # Fetch the list once: this runs for every row of the changelist, so
    # calling get_urls_list() twice per row doubles the work for no benefit.
    urls = obj.get_urls_list()
    if not urls:
        return ''

    first_url = urls[0]
    if len(first_url) > max_length:
        return first_url[:max_length] + '...'
    return first_url
|
|
|
|
|
|
@admin.display(description='URLs')
|
|
|
- def seed_urls_editor(self, obj):
|
|
|
- """Combined editor showing seed URL and file contents."""
|
|
|
- widget_id = f'seed_urls_{obj.pk}'
|
|
|
-
|
|
|
- # Get the seed URI (or use urls field if no seed)
|
|
|
- seed_uri = ''
|
|
|
- if obj.seed and obj.seed.uri:
|
|
|
- seed_uri = obj.seed.uri
|
|
|
- elif obj.urls:
|
|
|
- seed_uri = obj.urls
|
|
|
def urls_editor(self, obj):
    """Render a read-only HTML view of the crawl's URLs for the change form.

    Shows ``obj.urls`` in a read-only textarea together with a URL count and,
    when ``obj.get_file_path()`` returns a path, a read-only preview of that
    file's text (or the error raised while reading it).

    Args:
        obj: the crawl being displayed; ``pk``, ``urls`` and
            ``get_file_path()`` are read.

    Returns:
        The widget markup wrapped in ``mark_safe``. Every dynamic text value
        (URLs, file path, file contents, read error) is HTML-escaped before
        being interpolated, because the result bypasses template autoescaping.
    """
    # Imported under its own name because this method's markup is plain text
    # built by hand; escape() covers &, <, > and both quote characters.
    from html import escape

    widget_id = f'crawl_urls_{obj.pk}'
    urls_text = obj.urls or ''

    # Check if the crawl points at a local file we can preview
    source_file = obj.get_file_path()
    is_file = source_file is not None
    file_contents = ''
    error = None

    if source_file:
        try:
            file_contents = source_file.read_text().strip()
        except Exception as e:
            # Best-effort preview: show the failure in the widget instead of
            # breaking the whole admin page.
            error = f'Error reading {source_file}: {e}'

    # The textarea height follows the raw line count, but the "N URLs" label
    # only counts real entries: blank lines and "#" comments are not URLs,
    # and an empty field must report 0 rather than 1.
    url_lines = urls_text.split('\n')
    url_count = sum(
        1 for line in url_lines
        if line.strip() and not line.strip().startswith('#')
    )
    uri_rows = min(max(3, len(url_lines)), 10)
    file_line_count = len(file_contents.split('\n')) if file_contents else 0
    preview_height = min(400, max(150, file_line_count * 18))

    # Built separately rather than as a nested triple-quoted f-string, which
    # only parses on Python 3.12+.
    file_section = ''
    if is_file:
        error_html = ''
        if error:
            error_html = f"<div style='color: #dc3545; margin-bottom: 8px;'>{escape(error)}</div>"
        file_section = f'''
        <!-- File contents preview (if first URL is a file://) -->
        <div style="margin-bottom: 8px;">
            <label style="font-weight: bold; display: block; margin-bottom: 4px;">
                File Preview: <code style="font-weight: normal; color: #666;">{escape(str(source_file))}</code>
            </label>
            {error_html}
            <textarea id="{widget_id}_file_preview"
                      style="width: 100%; height: {preview_height}px; font-family: monospace; font-size: 12px;
                             padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
                      readonly>{escape(file_contents)}</textarea>
        </div>
        '''

    # &#10; puts each placeholder example on its own line inside the textarea.
    widget_html = f'''
    <div id="{widget_id}_container" style="max-width: 900px;">
        <!-- URLs input -->
        <div style="margin-bottom: 12px;">
            <label style="font-weight: bold; display: block; margin-bottom: 4px;">URLs (one per line):</label>
            <textarea id="{widget_id}_urls"
                      style="width: 100%; font-family: monospace; font-size: 13px;
                             padding: 8px; border: 1px solid #ccc; border-radius: 4px;
                             resize: vertical;"
                      rows="{uri_rows}"
                      placeholder="https://example.com&#10;https://example2.com&#10;# Comments start with #"
                      readonly>{escape(urls_text)}</textarea>
            <p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
                {url_count} URL{'s' if url_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
            </p>
        </div>
        {file_section}
    </div>
    '''
    return mark_safe(widget_html)
|
|
|
@@ -507,7 +296,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|
|
class CrawlScheduleAdmin(BaseModelAdmin):
|
|
|
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
|
|
|
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
|
|
|
- search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
|
|
|
+ search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls')
|
|
|
|
|
|
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
|
|
|
|
|
|
@@ -561,6 +350,5 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
|
|
|
|
|
|
|
|
def register_admin(admin_site):
    """Register the crawl models and their admin classes on ``admin_site``."""
    registrations = (
        (Crawl, CrawlAdmin),
        (CrawlSchedule, CrawlScheduleAdmin),
    )
    for model, model_admin in registrations:
        admin_site.register(model, model_admin)
|