# tests.py
  1. """Tests for the core views, especially AddView."""
  2. import os
  3. import django
  4. # Set up Django before importing any Django-dependent modules
  5. os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
  6. django.setup()
  7. from django.test import TestCase, Client
  8. from django.contrib.auth.models import User
  9. from django.urls import reverse
  10. from archivebox.crawls.models import Crawl, CrawlSchedule
  11. from archivebox.core.models import Tag
  12. class AddViewTests(TestCase):
  13. """Tests for the AddView (crawl creation form)."""
  14. def setUp(self):
  15. """Set up test user and client."""
  16. self.client = Client()
  17. self.user = User.objects.create_user(
  18. username='testuser',
  19. password='testpass123',
  20. email='[email protected]'
  21. )
  22. self.client.login(username='testuser', password='testpass123')
  23. self.add_url = reverse('add')
  24. def test_add_view_get_requires_auth(self):
  25. """Test that GET /add requires authentication."""
  26. self.client.logout()
  27. response = self.client.get(self.add_url)
  28. # Should redirect to login or show 403/404
  29. self.assertIn(response.status_code, [302, 403, 404])
  30. def test_add_view_get_shows_form(self):
  31. """Test that GET /add shows the form with all fields."""
  32. response = self.client.get(self.add_url)
  33. self.assertEqual(response.status_code, 200)
  34. # Check that form fields are present
  35. self.assertContains(response, 'name="url"')
  36. self.assertContains(response, 'name="tag"')
  37. self.assertContains(response, 'name="depth"')
  38. self.assertContains(response, 'name="notes"')
  39. self.assertContains(response, 'name="schedule"')
  40. self.assertContains(response, 'name="persona"')
  41. self.assertContains(response, 'name="overwrite"')
  42. self.assertContains(response, 'name="update"')
  43. self.assertContains(response, 'name="index_only"')
  44. # Check for plugin groups
  45. self.assertContains(response, 'name="chrome_plugins"')
  46. self.assertContains(response, 'name="archiving_plugins"')
  47. self.assertContains(response, 'name="parsing_plugins"')
  48. def test_add_view_shows_tag_autocomplete(self):
  49. """Test that tag autocomplete datalist is rendered."""
  50. # Create some tags
  51. Tag.objects.create(name='test-tag-1')
  52. Tag.objects.create(name='test-tag-2')
  53. response = self.client.get(self.add_url)
  54. self.assertEqual(response.status_code, 200)
  55. # Check for datalist with tags
  56. self.assertContains(response, 'id="tag-datalist"')
  57. self.assertContains(response, 'test-tag-1')
  58. self.assertContains(response, 'test-tag-2')
  59. def test_add_view_shows_plugin_presets(self):
  60. """Test that plugin preset buttons are rendered."""
  61. response = self.client.get(self.add_url)
  62. self.assertEqual(response.status_code, 200)
  63. self.assertContains(response, 'Quick Archive')
  64. self.assertContains(response, 'Full Chrome')
  65. self.assertContains(response, 'Text Only')
  66. self.assertContains(response, 'Select All')
  67. self.assertContains(response, 'Clear All')
  68. def test_add_view_shows_links_to_resources(self):
  69. """Test that helpful links are present."""
  70. response = self.client.get(self.add_url)
  71. self.assertEqual(response.status_code, 200)
  72. # Link to plugin documentation
  73. self.assertContains(response, '/admin/environment/plugins/')
  74. # Link to create new persona
  75. self.assertContains(response, '/admin/personas/persona/add/')
  76. def test_add_basic_crawl_without_schedule(self):
  77. """Test creating a basic crawl without a schedule."""
  78. response = self.client.post(self.add_url, {
  79. 'url': 'https://example.com\nhttps://example.org',
  80. 'tag': 'test-tag',
  81. 'depth': '0',
  82. 'notes': 'Test crawl notes',
  83. })
  84. # Should redirect to crawl admin page
  85. self.assertEqual(response.status_code, 302)
  86. # Check that crawl was created
  87. self.assertEqual(Crawl.objects.count(), 1)
  88. crawl = Crawl.objects.first()
  89. self.assertIn('https://example.com', crawl.urls)
  90. self.assertIn('https://example.org', crawl.urls)
  91. self.assertEqual(crawl.tags_str, 'test-tag')
  92. self.assertEqual(crawl.max_depth, 0)
  93. self.assertEqual(crawl.notes, 'Test crawl notes')
  94. self.assertEqual(crawl.created_by, self.user)
  95. # No schedule should be created
  96. self.assertIsNone(crawl.schedule)
  97. self.assertEqual(CrawlSchedule.objects.count(), 0)
  98. def test_add_crawl_with_schedule(self):
  99. """Test creating a crawl with a repeat schedule."""
  100. response = self.client.post(self.add_url, {
  101. 'url': 'https://example.com',
  102. 'tag': 'scheduled',
  103. 'depth': '1',
  104. 'notes': 'Daily crawl',
  105. 'schedule': 'daily',
  106. })
  107. self.assertEqual(response.status_code, 302)
  108. # Check that crawl and schedule were created
  109. self.assertEqual(Crawl.objects.count(), 1)
  110. self.assertEqual(CrawlSchedule.objects.count(), 1)
  111. crawl = Crawl.objects.first()
  112. schedule = CrawlSchedule.objects.first()
  113. self.assertEqual(crawl.schedule, schedule)
  114. self.assertEqual(schedule.template, crawl)
  115. self.assertEqual(schedule.schedule, 'daily')
  116. self.assertTrue(schedule.is_enabled)
  117. self.assertEqual(schedule.created_by, self.user)
  118. def test_add_crawl_with_cron_schedule(self):
  119. """Test creating a crawl with a cron format schedule."""
  120. response = self.client.post(self.add_url, {
  121. 'url': 'https://example.com',
  122. 'depth': '0',
  123. 'schedule': '0 */6 * * *', # Every 6 hours
  124. })
  125. self.assertEqual(response.status_code, 302)
  126. schedule = CrawlSchedule.objects.first()
  127. self.assertEqual(schedule.schedule, '0 */6 * * *')
  128. def test_add_crawl_with_plugins(self):
  129. """Test creating a crawl with specific plugins selected."""
  130. response = self.client.post(self.add_url, {
  131. 'url': 'https://example.com',
  132. 'depth': '0',
  133. 'chrome_plugins': ['screenshot', 'dom'],
  134. 'archiving_plugins': ['wget'],
  135. })
  136. self.assertEqual(response.status_code, 302)
  137. crawl = Crawl.objects.first()
  138. plugins = crawl.config.get('PLUGINS', '')
  139. # Should contain the selected plugins
  140. self.assertIn('screenshot', plugins)
  141. self.assertIn('dom', plugins)
  142. self.assertIn('wget', plugins)
  143. def test_add_crawl_with_depth_range(self):
  144. """Test creating crawls with different depth values (0-4)."""
  145. for depth in range(5):
  146. response = self.client.post(self.add_url, {
  147. 'url': f'https://example{depth}.com',
  148. 'depth': str(depth),
  149. })
  150. self.assertEqual(response.status_code, 302)
  151. self.assertEqual(Crawl.objects.count(), 5)
  152. for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
  153. self.assertEqual(crawl.max_depth, i)
  154. def test_add_crawl_with_advanced_options(self):
  155. """Test creating a crawl with advanced options."""
  156. response = self.client.post(self.add_url, {
  157. 'url': 'https://example.com',
  158. 'depth': '0',
  159. 'persona': 'CustomPersona',
  160. 'overwrite': True,
  161. 'update': True,
  162. 'index_only': True,
  163. })
  164. self.assertEqual(response.status_code, 302)
  165. crawl = Crawl.objects.first()
  166. config = crawl.config
  167. self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
  168. self.assertEqual(config.get('OVERWRITE'), True)
  169. self.assertEqual(config.get('ONLY_NEW'), False) # opposite of update
  170. self.assertEqual(config.get('INDEX_ONLY'), True)
  171. def test_add_crawl_with_custom_config(self):
  172. """Test creating a crawl with custom config overrides."""
  173. # Note: Django test client can't easily POST the KeyValueWidget format,
  174. # so this test would need to use the form directly or mock the cleaned_data
  175. # For now, we'll skip this test or mark it as TODO
  176. pass
  177. def test_add_empty_urls_fails(self):
  178. """Test that submitting without URLs fails validation."""
  179. response = self.client.post(self.add_url, {
  180. 'url': '',
  181. 'depth': '0',
  182. })
  183. # Should show form again with errors, not redirect
  184. self.assertEqual(response.status_code, 200)
  185. self.assertFormError(response, 'form', 'url', 'This field is required.')
  186. def test_add_invalid_urls_fails(self):
  187. """Test that invalid URLs fail validation."""
  188. response = self.client.post(self.add_url, {
  189. 'url': 'not-a-url',
  190. 'depth': '0',
  191. })
  192. # Should show form again with errors
  193. self.assertEqual(response.status_code, 200)
  194. # Check for validation error (URL regex should fail)
  195. self.assertContains(response, 'error')
  196. def test_add_success_message_without_schedule(self):
  197. """Test that success message is shown without schedule link."""
  198. response = self.client.post(self.add_url, {
  199. 'url': 'https://example.com\nhttps://example.org',
  200. 'depth': '0',
  201. }, follow=True)
  202. # Check success message mentions crawl creation
  203. messages = list(response.context['messages'])
  204. self.assertEqual(len(messages), 1)
  205. message_text = str(messages[0])
  206. self.assertIn('Created crawl with 2 starting URL', message_text)
  207. self.assertIn('View Crawl', message_text)
  208. self.assertNotIn('scheduled to repeat', message_text)
  209. def test_add_success_message_with_schedule(self):
  210. """Test that success message includes schedule link."""
  211. response = self.client.post(self.add_url, {
  212. 'url': 'https://example.com',
  213. 'depth': '0',
  214. 'schedule': 'weekly',
  215. }, follow=True)
  216. # Check success message mentions schedule
  217. messages = list(response.context['messages'])
  218. self.assertEqual(len(messages), 1)
  219. message_text = str(messages[0])
  220. self.assertIn('Created crawl', message_text)
  221. self.assertIn('scheduled to repeat weekly', message_text)
  222. self.assertIn('View Crawl', message_text)
  223. def test_add_crawl_creates_source_file(self):
  224. """Test that crawl creation saves URLs to sources file."""
  225. response = self.client.post(self.add_url, {
  226. 'url': 'https://example.com',
  227. 'depth': '0',
  228. })
  229. self.assertEqual(response.status_code, 302)
  230. # Check that source file was created in sources/ directory
  231. from archivebox.config import CONSTANTS
  232. sources_dir = CONSTANTS.SOURCES_DIR
  233. # Should have created a source file
  234. source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
  235. self.assertGreater(len(source_files), 0)
  236. def test_multiple_tags_are_saved(self):
  237. """Test that multiple comma-separated tags are saved."""
  238. response = self.client.post(self.add_url, {
  239. 'url': 'https://example.com',
  240. 'depth': '0',
  241. 'tag': 'tag1,tag2,tag3',
  242. })
  243. self.assertEqual(response.status_code, 302)
  244. crawl = Crawl.objects.first()
  245. self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
  246. def test_crawl_redirects_to_admin_change_page(self):
  247. """Test that successful submission redirects to crawl admin page."""
  248. response = self.client.post(self.add_url, {
  249. 'url': 'https://example.com',
  250. 'depth': '0',
  251. })
  252. crawl = Crawl.objects.first()
  253. expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
  254. self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)