| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319 |
- """Tests for the core views, especially AddView."""
- import os
- import django
- # Set up Django before importing any Django-dependent modules
- os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
- django.setup()
- from django.test import TestCase, Client
- from django.contrib.auth.models import User
- from django.urls import reverse
- from archivebox.crawls.models import Crawl, CrawlSchedule
- from archivebox.core.models import Tag
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form).

    Covers: authentication gating, form rendering (fields, tag autocomplete,
    plugin presets, helper links), crawl/schedule creation via POST, config
    propagation, validation errors, success messages, source-file side
    effects, and the post-submit redirect.
    """

    def setUp(self):
        """Create an authenticated client and resolve the add-view URL."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='[email protected]',
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """GET /add must not be served to anonymous users."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Depending on configuration the view may redirect to login (302),
        # deny (403), or hide itself (404) — any of these is acceptable.
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """GET /add renders the form with all expected fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Core form fields
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')
        # Plugin selection groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """The tag autocomplete datalist is rendered with existing tags."""
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Plugin preset buttons are rendered on the form page."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Helpful admin links are present on the form page."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')
        # Link to create a new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Creating a basic crawl (no schedule) persists all submitted fields."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })
        # Successful submission redirects to the crawl admin page
        self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()
        self.assertIsNotNone(crawl)
        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)
        # No schedule should be created for a one-off crawl
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """A repeat schedule creates a linked CrawlSchedule row."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })
        self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()
        # Crawl and schedule must reference each other
        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """A cron-format schedule string is stored verbatim."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })
        self.assertEqual(response.status_code, 302)
        schedule = CrawlSchedule.objects.first()
        # Guard: fail with a clear message if the schedule wasn't created
        self.assertIsNotNone(schedule)
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Selected plugins end up in the crawl's PLUGINS config."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        self.assertIsNotNone(crawl)
        plugins = crawl.config.get('PLUGINS', '')
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Crawls can be created with every supported depth value (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })
            self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 5)
        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Advanced options are translated into crawl config keys."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        self.assertIsNotNone(crawl)
        config = crawl.config
        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # The Django test client can't easily POST the KeyValueWidget format;
        # skip explicitly so the gap is visible in test reports instead of
        # silently counting as a passing (empty) test.
        self.skipTest('KeyValueWidget POST format not reproducible via the test client')

    def test_add_empty_urls_fails(self):
        """Submitting without URLs re-renders the form with a field error."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })
        # Validation failure re-renders the form (200), no redirect
        self.assertEqual(response.status_code, 200)
        # Django 4.1+ signature: pass the bound form instance, not the
        # response — the old (response, 'form', ...) form was removed in 5.0.
        self.assertFormError(response.context['form'], 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """A non-URL string fails validation and re-renders the form."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })
        self.assertEqual(response.status_code, 200)
        # The URL regex validation should surface an error in the page
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """The success message omits schedule wording for one-off crawls."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """The success message includes schedule wording and links."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Crawl creation writes the submitted URLs to a sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        self.assertEqual(response.status_code, 302)
        # Local import: CONSTANTS resolves paths from the active data dir
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Comma-separated tags are persisted verbatim on the crawl."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        self.assertIsNotNone(crawl)
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Successful submission redirects to the crawl's admin change page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        crawl = Crawl.objects.first()
        self.assertIsNotNone(crawl)
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
        # fetch_redirect_response=False: the admin page itself isn't under test
        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
|