test_readability.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. """
  2. Integration tests for readability plugin
  3. Tests verify:
  4. 1. Validate hook checks for readability-extractor binary
  5. 2. Verify deps with abx-pkg
  6. 3. Plugin reports missing dependency correctly
  7. 4. Extraction works against locally generated example.com-style HTML
  8. """
  9. import json
  10. import shutil
  11. import subprocess
  12. import sys
  13. import tempfile
  14. from pathlib import Path
  15. import pytest
  16. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  17. get_plugin_dir,
  18. get_hook_script,
  19. PLUGINS_ROOT,
  20. )
# Plugin directory derived from this test file's location.
# NOTE(review): exact resolution rules live in chrome_test_helpers.get_plugin_dir — confirm there.
PLUGIN_DIR = get_plugin_dir(__file__)
# Snapshot hook script for the readability plugin, looked up by glob pattern inside PLUGIN_DIR.
# The tests below call .exists() on it and pass str(READABILITY_HOOK) to the Python interpreter.
READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*')
# URL passed to the hook via --url in every invocation.
TEST_URL = 'https://example.com'
  24. def create_example_html(tmpdir: Path) -> Path:
  25. """Create sample HTML that looks like example.com with enough content for Readability."""
  26. singlefile_dir = tmpdir / 'singlefile'
  27. singlefile_dir.mkdir()
  28. html_file = singlefile_dir / 'singlefile.html'
  29. html_file.write_text('''
  30. <!DOCTYPE html>
  31. <html>
  32. <head>
  33. <meta charset="utf-8">
  34. <title>Example Domain</title>
  35. <meta name="viewport" content="width=device-width, initial-scale=1">
  36. </head>
  37. <body>
  38. <article>
  39. <header>
  40. <h1>Example Domain</h1>
  41. </header>
  42. <div class="content">
  43. <p>This domain is for use in illustrative examples in documents. You may use this
  44. domain in literature without prior coordination or asking for permission.</p>
  45. <p>Example domains are maintained by the Internet Assigned Numbers Authority (IANA)
  46. to provide a well-known address for documentation purposes. This helps authors create
  47. examples that readers can understand without confusion about actual domain ownership.</p>
  48. <p>The practice of using example domains dates back to the early days of the internet.
  49. These reserved domains ensure that example code and documentation doesn't accidentally
  50. point to real, active websites that might change or disappear over time.</p>
  51. <p>For more information about example domains and their history, you can visit the
  52. IANA website. They maintain several example domains including example.com, example.net,
  53. and example.org, all specifically reserved for this purpose.</p>
  54. <p><a href="https://www.iana.org/domains/example">More information about example domains...</a></p>
  55. </div>
  56. </article>
  57. </body>
  58. </html>
  59. ''')
  60. return html_file
  61. def test_hook_script_exists():
  62. """Verify hook script exists."""
  63. assert READABILITY_HOOK.exists(), f"Hook script not found: {READABILITY_HOOK}"
  64. def test_reports_missing_dependency_when_not_installed():
  65. """Test that script reports DEPENDENCY_NEEDED when readability-extractor is not found."""
  66. with tempfile.TemporaryDirectory() as tmpdir:
  67. tmpdir = Path(tmpdir)
  68. # Create HTML source so it doesn't fail on missing HTML
  69. create_example_html(tmpdir)
  70. # Run with empty PATH so binary won't be found
  71. env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)}
  72. result = subprocess.run(
  73. [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
  74. cwd=tmpdir,
  75. capture_output=True,
  76. text=True,
  77. env=env
  78. )
  79. # Missing binary is a transient error - should exit 1 with no JSONL
  80. assert result.returncode == 1, "Should exit 1 when dependency missing"
  81. # Should NOT emit JSONL (transient error - will be retried)
  82. jsonl_lines = [line for line in result.stdout.strip().split('\n')
  83. if line.strip().startswith('{')]
  84. assert len(jsonl_lines) == 0, "Should not emit JSONL for transient error (missing binary)"
  85. # Should log error to stderr
  86. assert 'readability-extractor' in result.stderr.lower() or 'error' in result.stderr.lower(), \
  87. "Should report error in stderr"
  88. def test_verify_deps_with_abx_pkg():
  89. """Verify readability-extractor is available via abx-pkg."""
  90. from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
  91. readability_binary = Binary(
  92. name='readability-extractor',
  93. binproviders=[NpmProvider(), EnvProvider()],
  94. overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
  95. )
  96. readability_loaded = readability_binary.load()
  97. if readability_loaded and readability_loaded.abspath:
  98. assert True, "readability-extractor is available"
  99. else:
  100. pass
  101. def test_extracts_article_after_installation():
  102. """Test full workflow: extract article using readability-extractor from real HTML."""
  103. # Prerequisites checked by earlier test (install hook should have run)
  104. with tempfile.TemporaryDirectory() as tmpdir:
  105. tmpdir = Path(tmpdir)
  106. # Create example.com HTML for readability to process
  107. create_example_html(tmpdir)
  108. # Run readability extraction (should find the binary)
  109. result = subprocess.run(
  110. [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
  111. cwd=tmpdir,
  112. capture_output=True,
  113. text=True,
  114. timeout=30
  115. )
  116. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  117. # Parse clean JSONL output
  118. result_json = None
  119. for line in result.stdout.strip().split('\n'):
  120. line = line.strip()
  121. if line.startswith('{'):
  122. pass
  123. try:
  124. record = json.loads(line)
  125. if record.get('type') == 'ArchiveResult':
  126. result_json = record
  127. break
  128. except json.JSONDecodeError:
  129. pass
  130. assert result_json, "Should have ArchiveResult JSONL output"
  131. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  132. # Verify output files exist (hook writes to current directory)
  133. html_file = tmpdir / 'content.html'
  134. txt_file = tmpdir / 'content.txt'
  135. json_file = tmpdir / 'article.json'
  136. assert html_file.exists(), "content.html not created"
  137. assert txt_file.exists(), "content.txt not created"
  138. assert json_file.exists(), "article.json not created"
  139. # Verify HTML content contains REAL example.com text
  140. html_content = html_file.read_text()
  141. assert len(html_content) > 100, f"HTML content too short: {len(html_content)} bytes"
  142. assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
  143. assert ('illustrative examples' in html_content.lower() or
  144. 'use in' in html_content.lower() or
  145. 'literature' in html_content.lower()), \
  146. "Missing example.com description in HTML"
  147. # Verify text content contains REAL example.com text
  148. txt_content = txt_file.read_text()
  149. assert len(txt_content) > 50, f"Text content too short: {len(txt_content)} bytes"
  150. assert 'example' in txt_content.lower(), "Missing 'example' in text"
  151. # Verify JSON metadata
  152. json_data = json.loads(json_file.read_text())
  153. assert isinstance(json_data, dict), "article.json should be a dict"
  154. def test_fails_gracefully_without_html_source():
  155. """Test that extraction fails gracefully when no HTML source is available."""
  156. # Prerequisites checked by earlier test (install hook should have run)
  157. with tempfile.TemporaryDirectory() as tmpdir:
  158. tmpdir = Path(tmpdir)
  159. # Don't create any HTML source files
  160. result = subprocess.run(
  161. [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
  162. cwd=tmpdir,
  163. capture_output=True,
  164. text=True,
  165. timeout=30
  166. )
  167. assert result.returncode != 0, "Should fail without HTML source"
  168. combined_output = result.stdout + result.stderr
  169. assert ('no html source' in combined_output.lower() or
  170. 'not found' in combined_output.lower() or
  171. 'ERROR=' in combined_output), \
  172. "Should report missing HTML source"
# Allow running this file directly as a script: hands this file to pytest in verbose mode.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])