test_savepagenow.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. """Integration tests for /web/https://... shortcut (Save Page Now)."""
  2. import os
  3. import subprocess
  4. import sys
  5. import textwrap
  6. from pathlib import Path
  7. from archivebox.tests.conftest import create_test_url
  def _run_savepagenow_script(
      initialized_archive: Path,
      request_url: str,
      expected_url: str,
      *,
      login: bool,
      public_add_view: bool,
  ):
      """Run the create-then-reuse Save Page Now scenario in a child interpreter.

      A throwaway driver script is generated and executed with
      ``sys.executable -c``. The driver sets up Django, issues
      ``GET /web/<request_url>`` twice through Django's test ``Client`` and
      asserts that:

      * both responses are 302 redirects to ``/<snapshot.url_path>``, and
      * exactly one ``Snapshot`` with ``url == expected_url`` exists afterwards.

      Args:
          initialized_archive: Directory exported to the child as ``DATA_DIR``.
          request_url: Text appended to ``/web/`` to form the request path.
          expected_url: URL the resulting ``Snapshot`` row is looked up by.
          login: When true, the driver creates a ``tester`` user and force-logs
              it in before making requests.
          public_add_view: Exported to the child as ``PUBLIC_ADD_VIEW``
              (``'True'`` or ``'False'``).

      Returns:
          The ``subprocess.CompletedProcess`` of the child, with stdout/stderr
          captured as text. A failing assertion inside the driver surfaces as a
          non-zero ``returncode``.

      Raises:
          subprocess.TimeoutExpired: If the child runs longer than 60 seconds.
      """
      # The values are embedded via ``!r`` so they arrive in the driver as
      # Python literals; ``{{...}}`` keeps the driver's own f-string braces intact.
      driver_source = textwrap.dedent(
          f"""
          import os
          os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
          from archivebox.config.django import setup_django
          setup_django()
          from django.test import Client
          from django.contrib.auth import get_user_model
          from archivebox.core.models import Snapshot
          client = Client()
          if {login!r}:
              user = get_user_model().objects.create_user(username='tester', password='pw')
              client.force_login(user)
          target_url = {request_url!r}
          resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
          assert resp.status_code == 302, resp.status_code
          snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
          if snapshot is None:
              raise AssertionError(
                  "snapshot not created; status=%s location=%s count=%s"
                  % (
                      resp.status_code,
                      resp.get('Location'),
                      Snapshot.objects.count(),
                  )
              )
          assert resp['Location'] == f"/{{snapshot.url_path}}"
          resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
          assert resp2.status_code == 302, resp2.status_code
          assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
          assert resp2['Location'] == f"/{{snapshot.url_path}}"
          """
      )

      # Every SAVE_* switch is forced to 'False' (presumably turning the
      # matching extractors off so nothing is actually fetched -- confirm
      # against the ArchiveBox config docs).
      save_flags_off = {
          flag: 'False'
          for flag in (
              'SAVE_ARCHIVEDOTORG',
              'SAVE_TITLE',
              'SAVE_FAVICON',
              'SAVE_WGET',
              'SAVE_WARC',
              'SAVE_PDF',
              'SAVE_SCREENSHOT',
              'SAVE_DOM',
              'SAVE_SINGLEFILE',
              'SAVE_READABILITY',
              'SAVE_MERCURY',
              'SAVE_GIT',
              'SAVE_YTDLP',
              'SAVE_HEADERS',
              'SAVE_HTMLTOTEXT',
          )
      }
      # Start from the parent's environment; later keys override inherited ones.
      child_env = {
          **os.environ,
          'DATA_DIR': str(initialized_archive),
          'USE_COLOR': 'False',
          'SHOW_PROGRESS': 'False',
          'PUBLIC_ADD_VIEW': 'True' if public_add_view else 'False',
          **save_flags_off,
      }

      # Working directory for the child: the third ancestor of this file's path.
      repo_root = Path(__file__).resolve().parents[2]
      command = [sys.executable, '-c', driver_source]
      return subprocess.run(
          command,
          cwd=repo_root,
          env=child_env,
          text=True,
          capture_output=True,
          timeout=60,
      )
  def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: str):
      """Run the "anonymous request is rejected" scenario in a child interpreter.

      The generated driver sets up Django, issues a single unauthenticated
      ``GET /web/<request_url>`` through Django's test ``Client`` and asserts
      that the response status is 404 and that no ``Snapshot`` rows exist.
      The child always runs with ``PUBLIC_ADD_VIEW='False'``.

      Args:
          initialized_archive: Directory exported to the child as ``DATA_DIR``.
          request_url: Text appended to ``/web/`` to form the request path.

      Returns:
          The ``subprocess.CompletedProcess`` of the child, with stdout/stderr
          captured as text. A failing assertion inside the driver surfaces as a
          non-zero ``returncode``.

      Raises:
          subprocess.TimeoutExpired: If the child runs longer than 60 seconds.
      """
      # ``request_url`` is embedded via ``!r`` so it arrives as a Python literal.
      driver_source = textwrap.dedent(
          f"""
          import os
          os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
          from archivebox.config.django import setup_django
          setup_django()
          from django.test import Client
          from archivebox.core.models import Snapshot
          client = Client()
          target_url = {request_url!r}
          resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
          assert resp.status_code == 404, resp.status_code
          assert Snapshot.objects.count() == 0
          """
      )

      # Every SAVE_* switch is forced to 'False' (presumably turning the
      # matching extractors off -- confirm against the ArchiveBox config docs).
      save_flags_off = {
          flag: 'False'
          for flag in (
              'SAVE_ARCHIVEDOTORG',
              'SAVE_TITLE',
              'SAVE_FAVICON',
              'SAVE_WGET',
              'SAVE_WARC',
              'SAVE_PDF',
              'SAVE_SCREENSHOT',
              'SAVE_DOM',
              'SAVE_SINGLEFILE',
              'SAVE_READABILITY',
              'SAVE_MERCURY',
              'SAVE_GIT',
              'SAVE_YTDLP',
              'SAVE_HEADERS',
              'SAVE_HTMLTOTEXT',
          )
      }
      # Start from the parent's environment; later keys override inherited ones.
      child_env = {
          **os.environ,
          'DATA_DIR': str(initialized_archive),
          'USE_COLOR': 'False',
          'SHOW_PROGRESS': 'False',
          'PUBLIC_ADD_VIEW': 'False',
          **save_flags_off,
      }

      # Working directory for the child: the third ancestor of this file's path.
      repo_root = Path(__file__).resolve().parents[2]
      command = [sys.executable, '-c', driver_source]
      return subprocess.run(
          command,
          cwd=repo_root,
          env=child_env,
          text=True,
          capture_output=True,
          timeout=60,
      )
  def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request_url: str, stored_url: str):
      """Run the "redirect to a pre-existing snapshot" scenario in a child interpreter.

      The generated driver sets up Django, then seeds the database with a
      ``Crawl`` (``urls=stored_url``, owned by the system user returned by
      ``get_or_create_system_user_pk``) and a ``Snapshot`` for ``stored_url``
      attached to that crawl. It then issues one unauthenticated
      ``GET /web/<request_url>`` and asserts a 302 redirect whose ``Location``
      is ``/<snapshot.url_path>`` of the seeded snapshot. The child always runs
      with ``PUBLIC_ADD_VIEW='False'``.

      Args:
          initialized_archive: Directory exported to the child as ``DATA_DIR``.
          request_url: Text appended to ``/web/`` to form the request path.
          stored_url: URL used for the seeded ``Crawl`` and ``Snapshot``.

      Returns:
          The ``subprocess.CompletedProcess`` of the child, with stdout/stderr
          captured as text. A failing assertion inside the driver surfaces as a
          non-zero ``returncode``.

      Raises:
          subprocess.TimeoutExpired: If the child runs longer than 60 seconds.
      """
      # Both URLs are embedded via ``!r`` so they arrive as Python literals;
      # ``{{...}}`` keeps the driver's own f-string braces intact.
      driver_source = textwrap.dedent(
          f"""
          import os
          os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
          from archivebox.config.django import setup_django
          setup_django()
          from django.test import Client
          from archivebox.core.models import Snapshot
          from archivebox.crawls.models import Crawl
          from archivebox.base_models.models import get_or_create_system_user_pk
          target_url = {request_url!r}
          stored_url = {stored_url!r}
          created_by_id = get_or_create_system_user_pk()
          crawl = Crawl.objects.create(urls=stored_url, created_by_id=created_by_id)
          snapshot = Snapshot.objects.create(url=stored_url, crawl=crawl)
          client = Client()
          resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
          assert resp.status_code == 302, resp.status_code
          assert resp['Location'] == f"/{{snapshot.url_path}}"
          """
      )

      # Every SAVE_* switch is forced to 'False' (presumably turning the
      # matching extractors off -- confirm against the ArchiveBox config docs).
      save_flags_off = {
          flag: 'False'
          for flag in (
              'SAVE_ARCHIVEDOTORG',
              'SAVE_TITLE',
              'SAVE_FAVICON',
              'SAVE_WGET',
              'SAVE_WARC',
              'SAVE_PDF',
              'SAVE_SCREENSHOT',
              'SAVE_DOM',
              'SAVE_SINGLEFILE',
              'SAVE_READABILITY',
              'SAVE_MERCURY',
              'SAVE_GIT',
              'SAVE_YTDLP',
              'SAVE_HEADERS',
              'SAVE_HTMLTOTEXT',
          )
      }
      # Start from the parent's environment; later keys override inherited ones.
      child_env = {
          **os.environ,
          'DATA_DIR': str(initialized_archive),
          'USE_COLOR': 'False',
          'SHOW_PROGRESS': 'False',
          'PUBLIC_ADD_VIEW': 'False',
          **save_flags_off,
      }

      # Working directory for the child: the third ancestor of this file's path.
      repo_root = Path(__file__).resolve().parents[2]
      command = [sys.executable, '-c', driver_source]
      return subprocess.run(
          command,
          cwd=repo_root,
          env=child_env,
          text=True,
          capture_output=True,
          timeout=60,
      )
  def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
      """A logged-in user can use the /web/<url> shortcut while public add is off.

      Runs the create-and-reuse scenario with ``login=True`` and
      ``public_add_view=False`` and expects the child process to exit cleanly.
      """
      full_url = create_test_url(domain='example.com', path='savepagenow-auth')
      # The request path uses the URL with every 'https://' occurrence removed
      # (presumably just the scheme prefix -- depends on what create_test_url
      # returns); the snapshot is still looked up by the full URL.
      shortcut_target = full_url.replace('https://', '')

      proc = _run_savepagenow_script(
          initialized_archive,
          shortcut_target,
          full_url,
          login=True,
          public_add_view=False,
      )

      # Include the child's output so a failure inside the driver is diagnosable.
      failure_report = (
          "SavePageNow shortcut (logged-in) test failed.\n"
          + f"stdout:\n{proc.stdout}\n"
          + f"stderr:\n{proc.stderr}"
      )
      assert proc.returncode == 0, failure_report
  def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
      """An anonymous visitor can use the /web/<url> shortcut when PUBLIC_ADD_VIEW is on.

      Runs the create-and-reuse scenario with ``login=False`` and
      ``public_add_view=True`` and expects the child process to exit cleanly.
      """
      full_url = create_test_url(domain='example.com', path='savepagenow-public')
      # The request path uses the URL with every 'https://' occurrence removed
      # (presumably just the scheme prefix -- depends on what create_test_url
      # returns); the snapshot is still looked up by the full URL.
      shortcut_target = full_url.replace('https://', '')

      proc = _run_savepagenow_script(
          initialized_archive,
          shortcut_target,
          full_url,
          login=False,
          public_add_view=True,
      )

      # Include the child's output so a failure inside the driver is diagnosable.
      failure_report = (
          "SavePageNow shortcut (public add) test failed.\n"
          + f"stdout:\n{proc.stdout}\n"
          + f"stderr:\n{proc.stderr}"
      )
      assert proc.returncode == 0, failure_report
  def test_web_add_requires_login_when_public_off(initialized_archive):
      """An anonymous request for a new URL gets a 404 when PUBLIC_ADD_VIEW is off.

      Delegates to the not-found scenario runner and expects the child process
      to exit cleanly.
      """
      full_url = create_test_url(domain='example.com', path='savepagenow-404')
      # The request path uses the URL with every 'https://' occurrence removed
      # (presumably just the scheme prefix -- depends on what create_test_url
      # returns).
      shortcut_target = full_url.replace('https://', '')

      proc = _run_savepagenow_not_found_script(initialized_archive, shortcut_target)

      # Include the child's output so a failure inside the driver is diagnosable.
      failure_report = (
          "SavePageNow shortcut (no public add) test failed.\n"
          + f"stdout:\n{proc.stdout}\n"
          + f"stderr:\n{proc.stderr}"
      )
      assert proc.returncode == 0, failure_report
  def test_web_add_redirects_existing_snapshot_when_public_off(initialized_archive):
      """An anonymous request for an already-archived URL is redirected even with public add off.

      Delegates to the existing-snapshot scenario runner, which seeds a snapshot
      for the full URL before making the request, and expects the child process
      to exit cleanly.
      """
      full_url = create_test_url(domain='example.com', path='savepagenow-existing')
      # The request path uses the URL with every 'https://' occurrence removed
      # (presumably just the scheme prefix -- depends on what create_test_url
      # returns); the seeded snapshot is stored under the full URL.
      shortcut_target = full_url.replace('https://', '')

      proc = _run_savepagenow_existing_snapshot_script(
          initialized_archive,
          shortcut_target,
          full_url,
      )

      # Include the child's output so a failure inside the driver is diagnosable.
      failure_report = (
          "SavePageNow shortcut (existing snapshot) test failed.\n"
          + f"stdout:\n{proc.stdout}\n"
          + f"stderr:\n{proc.stderr}"
      )
      assert proc.returncode == 0, failure_report