test_htmltotext.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
"""
Integration tests for the htmltotext plugin.

Each test runs the htmltotext hook script as a standalone subprocess and
inspects its exit code, its JSONL output, and the files it writes.
"""
  5. import json
  6. import subprocess
  7. import sys
  8. import tempfile
  9. from pathlib import Path
  10. import pytest
  11. PLUGIN_DIR = Path(__file__).parent.parent
  12. HTMLTOTEXT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_htmltotext.*'), None)
  13. TEST_URL = 'https://example.com'
  14. def test_hook_script_exists():
  15. assert HTMLTOTEXT_HOOK.exists()
  16. def test_extracts_text_from_html():
  17. with tempfile.TemporaryDirectory() as tmpdir:
  18. tmpdir = Path(tmpdir)
  19. # Create HTML source
  20. (tmpdir / 'singlefile').mkdir()
  21. (tmpdir / 'singlefile' / 'singlefile.html').write_text('<html><body><h1>Example Domain</h1><p>This domain is for examples.</p></body></html>')
  22. result = subprocess.run(
  23. [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
  24. cwd=tmpdir, capture_output=True, text=True, timeout=30
  25. )
  26. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  27. # Parse clean JSONL output
  28. result_json = None
  29. for line in result.stdout.strip().split('\n'):
  30. line = line.strip()
  31. if line.startswith('{'):
  32. try:
  33. record = json.loads(line)
  34. if record.get('type') == 'ArchiveResult':
  35. result_json = record
  36. break
  37. except json.JSONDecodeError:
  38. pass
  39. assert result_json, "Should have ArchiveResult JSONL output"
  40. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  41. # Verify output file (hook writes to current directory)
  42. output_file = tmpdir / 'htmltotext.txt'
  43. assert output_file.exists(), f"htmltotext.txt not created. Files: {list(tmpdir.iterdir())}"
  44. content = output_file.read_text()
  45. assert len(content) > 0, "Content should not be empty"
  46. assert 'Example Domain' in content, "Should contain text from HTML"
  47. def test_fails_gracefully_without_html():
  48. with tempfile.TemporaryDirectory() as tmpdir:
  49. result = subprocess.run(
  50. [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
  51. cwd=tmpdir, capture_output=True, text=True, timeout=30
  52. )
  53. # Should exit with non-zero or emit failure JSONL
  54. # Parse clean JSONL output
  55. result_json = None
  56. for line in result.stdout.strip().split('\n'):
  57. line = line.strip()
  58. if line.startswith('{'):
  59. try:
  60. record = json.loads(line)
  61. if record.get('type') == 'ArchiveResult':
  62. result_json = record
  63. break
  64. except json.JSONDecodeError:
  65. pass
  66. if result_json:
  67. # Should report failure or skip since no HTML source
  68. assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip without HTML: {result_json}"
  69. if __name__ == '__main__':
  70. pytest.main([__file__, '-v'])