test_title.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536
  1. import os
  2. import sqlite3
  3. from .fixtures import *
  4. def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
  5. """Test that title is extracted from the page."""
  6. disable_extractors_dict.update({"SAVE_TITLE": "true"})
  7. subprocess.run(['archivebox', 'add', 'https://example.com'],
  8. capture_output=True, env=disable_extractors_dict)
  9. os.chdir(tmp_path)
  10. conn = sqlite3.connect("index.sqlite3")
  11. conn.row_factory = sqlite3.Row
  12. c = conn.cursor()
  13. c.execute("SELECT title from archivebox.core.snapshot")
  14. snapshot = c.fetchone()
  15. conn.close()
  16. assert snapshot[0] is not None
  17. assert "Example" in snapshot[0]
  18. def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
  19. """
  20. https://github.com/ArchiveBox/ArchiveBox/issues/330
  21. Unencoded content should not be rendered as it facilitates xss injections
  22. and breaks the layout.
  23. """
  24. disable_extractors_dict.update({"SAVE_TITLE": "true"})
  25. subprocess.run(['archivebox', 'add', 'https://example.com'],
  26. capture_output=True, env=disable_extractors_dict)
  27. list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)
  28. # Should not contain unescaped HTML tags in output
  29. output = list_process.stdout.decode("utf-8")
  30. assert "https://example.com" in output