| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252 |
- """Integration tests for /web/https://... shortcut (Save Page Now)."""
- import os
- import subprocess
- import sys
- import textwrap
- from pathlib import Path
- from archivebox.tests.conftest import create_test_url
# Every SAVE_* setting that the test subprocesses receive with the value 'False'.
_DISABLED_SAVE_FLAGS = (
    'SAVE_ARCHIVEDOTORG',
    'SAVE_TITLE',
    'SAVE_FAVICON',
    'SAVE_WGET',
    'SAVE_WARC',
    'SAVE_PDF',
    'SAVE_SCREENSHOT',
    'SAVE_DOM',
    'SAVE_SINGLEFILE',
    'SAVE_READABILITY',
    'SAVE_MERCURY',
    'SAVE_GIT',
    'SAVE_YTDLP',
    'SAVE_HEADERS',
    'SAVE_HTMLTOTEXT',
)


def _savepagenow_env(initialized_archive: Path, *, public_add_view: bool) -> dict:
    """Build the environment mapping handed to a Save Page Now test subprocess.

    The current process environment is inherited, then overridden so that
    ``DATA_DIR`` points at *initialized_archive*, colour and progress output
    are off, ``PUBLIC_ADD_VIEW`` mirrors *public_add_view*, and every flag in
    ``_DISABLED_SAVE_FLAGS`` is ``'False'``.
    """
    env = {
        **os.environ,
        'DATA_DIR': str(initialized_archive),
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
        'PUBLIC_ADD_VIEW': 'True' if public_add_view else 'False',
    }
    env.update(dict.fromkeys(_DISABLED_SAVE_FLAGS, 'False'))
    return env


def _run_savepagenow_subprocess(initialized_archive: Path, script: str, *, public_add_view: bool) -> subprocess.CompletedProcess:
    """Run *script* with ``python -c`` in a fresh interpreter and capture its output.

    The child runs with the directory two levels above this file's directory
    as its working directory and with the environment from
    ``_savepagenow_env``.

    Raises:
        subprocess.TimeoutExpired: if the child runs for more than 60 seconds.
    """
    project_root = Path(__file__).resolve().parents[2]
    return subprocess.run(
        [sys.executable, '-c', script],
        cwd=project_root,
        env=_savepagenow_env(initialized_archive, public_add_view=public_add_view),
        text=True,
        capture_output=True,
        timeout=60,
        # Callers inspect returncode themselves so they can attach stdout/stderr
        # to the failure message; do not raise on a non-zero exit here.
        check=False,
    )


def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool):
    """Check in a subprocess that ``/web/<request_url>`` creates and then reuses a snapshot.

    The generated script optionally logs in as a freshly created ``tester``
    user, requests ``/web/<request_url>`` twice, and asserts that both
    responses are 302 redirects to ``/<snapshot.url_path>`` of the single
    Snapshot whose ``url`` equals *expected_url*.

    Args:
        initialized_archive: Archive directory exported to the child as ``DATA_DIR``.
        request_url: Text appended to ``/web/`` to form the request path.
        expected_url: URL the created Snapshot is expected to store.
        login: Whether the test client logs in before making requests.
        public_add_view: Value exported to the child as ``PUBLIC_ADD_VIEW``.

    Returns:
        The ``subprocess.CompletedProcess`` with captured text stdout/stderr;
        a failed assertion in the script shows up as a non-zero ``returncode``.
    """
    script = textwrap.dedent(
        f"""
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')

        from archivebox.config.django import setup_django
        setup_django()

        from django.test import Client
        from django.contrib.auth import get_user_model
        from archivebox.core.models import Snapshot

        client = Client()
        if {login!r}:
            user = get_user_model().objects.create_user(username='tester', password='pw')
            client.force_login(user)

        target_url = {request_url!r}

        resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
        assert resp.status_code == 302, resp.status_code

        snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
        if snapshot is None:
            raise AssertionError(
                "snapshot not created; status=%s location=%s count=%s"
                % (
                    resp.status_code,
                    resp.get('Location'),
                    Snapshot.objects.count(),
                )
            )
        assert resp['Location'] == f"/{{snapshot.url_path}}"

        resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
        assert resp2.status_code == 302, resp2.status_code
        assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
        assert resp2['Location'] == f"/{{snapshot.url_path}}"
        """
    )
    return _run_savepagenow_subprocess(initialized_archive, script, public_add_view=public_add_view)


def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: str):
    """Check in a subprocess that an anonymous ``/web/<request_url>`` request is rejected.

    The generated script makes one request without logging in, with
    ``PUBLIC_ADD_VIEW`` set to ``'False'``, and asserts a 404 response and
    that no Snapshot rows exist afterwards.

    Returns:
        The ``subprocess.CompletedProcess`` with captured text stdout/stderr;
        a failed assertion in the script shows up as a non-zero ``returncode``.
    """
    script = textwrap.dedent(
        f"""
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')

        from archivebox.config.django import setup_django
        setup_django()

        from django.test import Client
        from archivebox.core.models import Snapshot

        client = Client()
        target_url = {request_url!r}

        resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
        assert resp.status_code == 404, resp.status_code
        assert Snapshot.objects.count() == 0
        """
    )
    return _run_savepagenow_subprocess(initialized_archive, script, public_add_view=False)


def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request_url: str, stored_url: str):
    """Check in a subprocess that ``/web/<request_url>`` redirects to a pre-existing snapshot.

    The generated script first creates a Crawl and a Snapshot for
    *stored_url* (owned by the system user), then makes one anonymous request
    with ``PUBLIC_ADD_VIEW`` set to ``'False'`` and asserts a 302 redirect to
    ``/<snapshot.url_path>``.

    Returns:
        The ``subprocess.CompletedProcess`` with captured text stdout/stderr;
        a failed assertion in the script shows up as a non-zero ``returncode``.
    """
    script = textwrap.dedent(
        f"""
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')

        from archivebox.config.django import setup_django
        setup_django()

        from django.test import Client
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl
        from archivebox.base_models.models import get_or_create_system_user_pk

        target_url = {request_url!r}
        stored_url = {stored_url!r}
        created_by_id = get_or_create_system_user_pk()
        crawl = Crawl.objects.create(urls=stored_url, created_by_id=created_by_id)
        snapshot = Snapshot.objects.create(url=stored_url, crawl=crawl)

        client = Client()
        resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
        assert resp.status_code == 302, resp.status_code
        assert resp['Location'] == f"/{{snapshot.url_path}}"
        """
    )
    return _run_savepagenow_subprocess(initialized_archive, script, public_add_view=False)
def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
    """A logged-in client can use the /web/<url> shortcut while PUBLIC_ADD_VIEW is off."""
    full_url = create_test_url(domain='example.com', path='savepagenow-auth')
    # The shortcut is requested without the scheme; the snapshot is looked up by the full URL.
    completed = _run_savepagenow_script(
        initialized_archive,
        full_url.replace('https://', ''),
        full_url,
        login=True,
        public_add_view=False,
    )
    failure_report = (
        "SavePageNow shortcut (logged-in) test failed.\n"
        f"stdout:\n{completed.stdout}\n"
        f"stderr:\n{completed.stderr}"
    )
    assert completed.returncode == 0, failure_report
def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
    """An anonymous client can use the /web/<url> shortcut when PUBLIC_ADD_VIEW is on."""
    full_url = create_test_url(domain='example.com', path='savepagenow-public')
    # The shortcut is requested without the scheme; the snapshot is looked up by the full URL.
    completed = _run_savepagenow_script(
        initialized_archive,
        full_url.replace('https://', ''),
        full_url,
        login=False,
        public_add_view=True,
    )
    failure_report = (
        "SavePageNow shortcut (public add) test failed.\n"
        f"stdout:\n{completed.stdout}\n"
        f"stderr:\n{completed.stderr}"
    )
    assert completed.returncode == 0, failure_report
def test_web_add_requires_login_when_public_off(initialized_archive):
    """An anonymous /web/<url> request for a new URL gets a 404 while PUBLIC_ADD_VIEW is off."""
    full_url = create_test_url(domain='example.com', path='savepagenow-404')
    # The shortcut is requested without the scheme.
    completed = _run_savepagenow_not_found_script(
        initialized_archive,
        full_url.replace('https://', ''),
    )
    failure_report = (
        "SavePageNow shortcut (no public add) test failed.\n"
        f"stdout:\n{completed.stdout}\n"
        f"stderr:\n{completed.stderr}"
    )
    assert completed.returncode == 0, failure_report
def test_web_add_redirects_existing_snapshot_when_public_off(initialized_archive):
    """An anonymous /web/<url> request redirects to an already-stored snapshot even with PUBLIC_ADD_VIEW off."""
    full_url = create_test_url(domain='example.com', path='savepagenow-existing')
    # Request the scheme-less form; the pre-created snapshot stores the full URL.
    completed = _run_savepagenow_existing_snapshot_script(
        initialized_archive,
        full_url.replace('https://', ''),
        full_url,
    )
    failure_report = (
        "SavePageNow shortcut (existing snapshot) test failed.\n"
        f"stdout:\n{completed.stdout}\n"
        f"stderr:\n{completed.stderr}"
    )
    assert completed.returncode == 0, failure_report
|